# Exploração de Dados - IGDB

### Configurações Iniciais

In [2]:
 pip install minio

Note: you may need to restart the kernel to use updated packages.


In [1]:
import os

from pyspark.sql.types import *
import pyspark.sql.functions as fn
from pyspark.sql import SparkSession
from IPython.core.display import HTML
from minio import Minio

# Stop <pre> output from wrapping so wide Spark tables stay on one line per row
display(HTML("<style>pre { white-space: pre !important; }</style>"))

# MinIO connection settings, shared by the Spark s3a connector and the MinIO client below.
# Credentials are read from the environment when available; the fallbacks are the values this
# notebook previously hard-coded, so it keeps running unchanged on the existing setup.
# NOTE(review): export MINIO_ACCESS_KEY / MINIO_SECRET_KEY and drop the fallbacks before sharing.
MINIO_ENDPOINT = "minio:9000"
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "aulafia")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "aulafia@123")

# Jars required for Spark to talk to MinIO through s3a, joined into a plain comma-separated
# list (no embedded newlines or indentation in the configuration value)
SPARK_JARS = ",".join([
    "/home/jovyan/jars/aws-java-sdk-core-1.11.534.jar",
    "/home/jovyan/jars/aws-java-sdk-dynamodb-1.11.534.jar",
    "/home/jovyan/jars/aws-java-sdk-s3-1.11.534.jar",
    "/home/jovyan/jars/hadoop-aws-3.2.2.jar",
])

# Create (or reuse) the Spark session configured to reach MinIO via the s3a filesystem
spark = (SparkSession.builder
         .config("spark.jars", SPARK_JARS)
         .config("spark.hadoop.fs.s3a.endpoint", f"http://{MINIO_ENDPOINT}")
         .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
         .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
         .config("spark.hadoop.fs.s3a.path.style.access", True)
         .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
         .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
         .getOrCreate()
        )

# MinIO client used to list objects in the lake (plain HTTP, hence secure=False)
minio_client = Minio(MINIO_ENDPOINT, access_key=MINIO_ACCESS_KEY, secret_key=MINIO_SECRET_KEY, secure=False)

In [2]:
# Extraction date used to select which data to read from the lake
extraction_date = "2023-07-02"

## Games

### Leitura (e unificação) dos arquivos JSON

In [3]:
# Bucket that holds the raw extraction files
bucket_name = "raw"

# Object prefix for the games files of the selected extraction date
games_prefix = f"igdb/games/{extraction_date}/"

# s3a:// paths of every object MinIO lists under that prefix
arquivos = [
    f"s3a://{bucket_name}/{obj.object_name}"
    for obj in minio_client.list_objects(bucket_name, prefix=games_prefix)
]

# Fail here with a clear message instead of leaving dfIGDB_Games as None,
# which would only surface later as an AttributeError in another cell
if not arquivos:
    raise FileNotFoundError(
        f"No objects found in bucket '{bucket_name}' under prefix '{games_prefix}'"
    )

dfIGDB_Games = None

# Read each JSON file and merge it into a single DataFrame
for arquivo in arquivos:
    df_temp = spark.read.json(arquivo)

    if dfIGDB_Games is None:
        # First file: it becomes the starting DataFrame
        dfIGDB_Games = df_temp
    else:
        # Later files: union by column name, tolerating columns missing on either side
        dfIGDB_Games = dfIGDB_Games.unionByName(df_temp, allowMissingColumns=True)

### Opcional

In [9]:
# Save the unified games DataFrame as Parquet, overwriting whatever is already at this path
dfIGDB_Games.write.mode('overwrite').parquet('s3a://explore/igdb/games')

In [None]:
# Load the games DataFrame back from the Parquet copy in the explore bucket
dfIGDB_Games = spark.read.parquet('s3a://explore/igdb/games')

### Conhecendo a Base

In [4]:
# Preview the first 20 rows of the resulting DataFrame without truncating values
dfIGDB_Games.show(n=20, truncate=False)

+-------------------------------------------------------+-----------------+-----------------------+--------------------------------------------------------------------+-------------------------------------------------------------------------+------------------------------------+--------+------------------------------------+----------+------+----------+-----------------------------------+--------------+----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+-------+-----+---------+----------+------------+------------------+----------+--------------------+-----+---+---------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [5]:
# Print the DataFrame schema as a tree
dfIGDB_Games.printSchema()

root
 |-- age_ratings: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- aggregated_rating: double (nullable = true)
 |-- aggregated_rating_count: long (nullable = true)
 |-- alternative_names: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- artworks: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- bundles: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- category: long (nullable = true)
 |-- checksum: string (nullable = true)
 |-- collection: long (nullable = true)
 |-- cover: long (nullable = true)
 |-- created_at: long (nullable = true)
 |-- dlcs: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- expanded_games: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- expansions: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- external_games: array (nullable = true)
 |    |-- element: long (containsNull = true)

#### Todos os registros do Dataframe são únicos, considerando a chave ID

In [6]:
# Total number of rows in the games DataFrame
dfIGDB_Games.count()

237506

In [7]:
# Number of distinct id values; equal to the total row count when id is unique
(dfIGDB_Games
 .select("id")
 .distinct()
 .count()
)

237506

#### Describe dos campos numéricos e de texto

In [10]:
# Summary statistics per column; called with no arguments, so the output also
# includes string columns such as name and slug alongside the numeric ones
dfIGDB_Games.describe().show()

+-------+------------------+-----------------------+------------------+--------------------+------------------+------------------+--------------------+--------------------+------------------+------------------+------------------+-----------------+--------+-----------------+------------------+------------------+--------+------------------+--------------------+--------------------+------------------+------------------+--------------------+--------------------+-----------------+------------------+
|summary| aggregated_rating|aggregated_rating_count|          category|            checksum|        collection|             cover|          created_at|  first_release_date|           follows|         franchise|             hypes|               id|    name|      parent_game|            rating|      rating_count|    slug|            status|           storyline|             summary|      total_rating|total_rating_count|          updated_at|                 url|   version_parent|     version_title|


#### Converte os campos de data de Unix time (epoch) para Timestamp

In [23]:
# Convert the epoch-based date columns (created_at, first_release_date, updated_at)
# into Spark timestamps.
# The result is assigned back to dfIGDB_Games, so re-running this cell would feed the
# already-converted timestamp columns into from_unixtime again. Columns whose type is
# already "timestamp" are therefore skipped, which keeps the cell safe to re-run.
_column_types = dict(dfIGDB_Games.dtypes)

for _epoch_column in ("created_at", "first_release_date", "updated_at"):
    if _column_types.get(_epoch_column) == "timestamp":
        continue  # converted by a previous run of this cell

    # A missing column still fails inside withColumn, exactly as before
    dfIGDB_Games = dfIGDB_Games.withColumn(
        _epoch_column,
        fn.to_timestamp(fn.from_unixtime(_epoch_column)),
    )

In [31]:
# Ten most recently updated games, newest first, with full (untruncated) values
(dfIGDB_Games
 .select("name", "updated_at")
 .sort(fn.desc("updated_at"))
 .show(n=10, truncate=False)
)

+--------------------------------------------+-------------------+
|name                                        |updated_at         |
+--------------------------------------------+-------------------+
|Project Spark: Conker's Big Reunion         |2023-07-02 19:13:03|
|Goat Simulator 3                            |2023-07-02 19:12:17|
|Wo Long: Fallen Dynasty                     |2023-07-02 19:04:43|
|Plants vs. Zombies: Battle for Neighborville|2023-07-02 19:03:33|
|MechWarrior 5: Mercenaries                  |2023-07-02 19:03:12|
|Massive Chalice                             |2023-07-02 19:03:12|
|Lost in Random                              |2023-07-02 19:03:11|
|Hello Neighbor 2                            |2023-07-02 19:02:53|
|Frostpunk: Console Edition                  |2023-07-02 19:02:41|
|Far Cry 5                                   |2023-07-02 19:02:38|
+--------------------------------------------+-------------------+
only showing top 10 rows

