# Exploração de Dados - IGDB

### Configurações Iniciais

In [3]:
!spark-submit --version

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 3.2.1
      /_/
                        
Using Scala version 2.12.15, OpenJDK 64-Bit Server VM, 11.0.15
Branch HEAD
Compiled by user hgao on 2022-01-20T19:26:14Z
Revision 4f25b3f71238a00508a356591553f2dfa89f8290
Url https://github.com/apache/spark
Type --help for more information.


In [5]:
from pyspark.sql.types import *
import pyspark.sql.functions as fn
from pyspark.sql import SparkSession
from IPython.core.display import HTML
from delta.tables import DeltaTable

display(HTML("<style>pre { white-space: pre !important; }</style>"))

# Define a sessão do Spark com os jars necessários para conexão com o MINIO
spark = (SparkSession.builder
         .config("spark.jars","""/home/jovyan/jars/aws-java-sdk-core-1.11.534.jar,
                                 /home/jovyan/jars/aws-java-sdk-dynamodb-1.11.534.jar,
                                 /home/jovyan/jars/aws-java-sdk-s3-1.11.534.jar,
                                 /home/jovyan/jars/hadoop-aws-3.2.2.jar""")
         .config("spark.jars.packages", "io.delta:delta-core_2.12:2.0.0")
         #.config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
         #.config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
         .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
         .config("spark.hadoop.fs.s3a.access.key", "aulafia")
         .config("spark.hadoop.fs.s3a.secret.key", "aulafia@123")
         .config("spark.hadoop.fs.s3a.path.style.access", True)
         .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
         .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
         .getOrCreate()
        )

In [7]:
# Nome do bucket
bucket_name = "raw"

# Define o nome da API de onde foram extraídos os dados via arquivo JSON
api_name = 'igdb'

# Define o path do bucket dos dados
bucket_path = 's3a://' + bucket_name + '/' + api_name + '/'

## Games

### Leitura da base

In [8]:
table_name = 'games'

delta_table_path = bucket_path + table_name + '/delta/'

dfIGDB_Games = DeltaTable.forPath(spark, delta_table_path).toDF()

### Conhecendo a Base

In [9]:
# Exibe o DataFrame resultante
dfIGDB_Games.show(20, False)

+-------------------------------------------------------+-----------------+-----------------------+--------------------------------------------------------+-------------------------------------------------------------------------+------------------------------------+--------+------------------------------------+----------+------+----------+------------------------------------------------+--------------+-----------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+-------+-----+---------+----------+------------+------------------+----------+----------------+-----+---+---------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [14]:
# Exibe o Schema do DataFrame
dfIGDB_Games.printSchema()

root
 |-- age_ratings: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- aggregated_rating: double (nullable = true)
 |-- aggregated_rating_count: long (nullable = true)
 |-- alternative_names: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- artworks: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- bundles: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- category: long (nullable = true)
 |-- checksum: string (nullable = true)
 |-- collection: long (nullable = true)
 |-- cover: long (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- dlcs: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- expanded_games: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- expansions: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- external_games: array (nullable = true)
 |    |-- element: long (containsNull = 

#### Todos os registros do Dataframe são únicos, considerando a chave ID

In [10]:
dfIGDB_Games.count()

241909

In [11]:
dfIGDB_Games.select("id").distinct().count()

241909

#### Describe dos campos numéricos

In [15]:
dfIGDB_Games.describe().show()

+-------+------------------+-----------------------+------------------+--------------------+------------------+------------------+-----------------+------------------+-----------------+------------------+--------+-----------------+------------------+------------------+--------+------------------+--------------------+--------------------+------------------+------------------+--------------------+----------------+------------------+
|summary| aggregated_rating|aggregated_rating_count|          category|            checksum|        collection|             cover|          follows|         franchise|            hypes|                id|    name|      parent_game|            rating|      rating_count|    slug|            status|           storyline|             summary|      total_rating|total_rating_count|                 url|  version_parent|     version_title|
+-------+------------------+-----------------------+------------------+--------------------+------------------+------------------+

#### Converte os campos de data para Unix Timestamp

In [13]:
dfIGDB_Games = (dfIGDB_Games
                .withColumn("created_at", fn.to_timestamp(fn.from_unixtime("created_at")))
                .withColumn("first_release_date", fn.to_timestamp(fn.from_unixtime("first_release_date")))
                .withColumn("updated_at", fn.to_timestamp(fn.from_unixtime("updated_at")))
               )

#### Exploração

In [16]:
# Games atualizados recentemente (considerando a última ingestão até a data da análise)
(dfIGDB_Games
 .select("name", "updated_at")
 .orderBy(fn.col("updated_at").desc())
).show(10, False)

+------------------------------------+-------------------+
|name                                |updated_at         |
+------------------------------------+-------------------+
|Assassin's Creed: Liberation HD     |2023-08-06 22:34:07|
|God of War                          |2023-08-06 22:30:18|
|Tom Clancy's Ghost Recon: Wildlands |2023-08-06 22:29:46|
|Men of War                          |2023-08-06 22:11:41|
|King's Bounty: Warriors of the North|2023-08-06 22:11:28|
|The Last Express: Gold Edition      |2023-08-06 22:11:28|
|Truck Racer                         |2023-08-06 22:11:18|
|NeXus: One Core                     |2023-08-06 22:11:17|
|Stay Dead Evolution                 |2023-08-06 22:11:17|
|Rescue 2: Everyday Heroes           |2023-08-06 22:11:13|
+------------------------------------+-------------------+
only showing top 10 rows



In [17]:
# Games mais bem avaliados
(dfIGDB_Games
 .select("id", "name", "rating")
 .orderBy(fn.col("rating").desc())
).show(10, False)

+------+---------------------------------------+-----------------+
|id    |name                                   |rating           |
+------+---------------------------------------+-----------------+
|201897|Vinemon: Sauce Edition                 |100.0            |
|8863  |Age of Wonders III: Golden Realms      |100.0            |
|88973 |Goblin Sword                           |99.94224659928568|
|20196 |Metal Gear Solid: The Legacy Collection|99.68039606212281|
|41888 |Battlefield 3: Premium Edition         |99.67350381707095|
|50261 |Angband                                |99.63776240578409|
|45131 |Grand Theft Auto V: Special Edition    |99.59974553815321|
|204360|Goat Simulator 3                       |99.5558811519708 |
|11226 |Anstoss 3                              |99.53177623237173|
|122661|Gwent: Iron Judgment                   |99.38977451446661|
+------+---------------------------------------+-----------------+
only showing top 10 rows



## Genres

### Leitura da base

In [19]:
table_name = 'genres'

delta_table_path = bucket_path + table_name + '/delta/'

dfIGDB_Genres = DeltaTable.forPath(spark, delta_table_path).toDF()

### Conhecendo a Base

In [31]:
# Exibe o DataFrame resultante
dfIGDB_Genres.show(truncate=False)

+------------------------------------+-------------------+---+--------------------------+-------------------------+-------------------+-----------------------------------------------------+
|checksum                            |created_at         |id |name                      |slug                     |updated_at         |url                                                  |
+------------------------------------+-------------------+---+--------------------------+-------------------------+-------------------+-----------------------------------------------------+
|ef2ff68a-f7bd-d2d0-76cb-c830bd6e3191|2011-02-13 00:00:00|2  |Point-and-click           |point-and-click          |2011-12-08 00:00:00|https://www.igdb.com/genres/point-and-click          |
|2ccc6572-bdde-6ed4-8843-25447ea40782|2011-02-13 00:00:00|4  |Fighting                  |fighting                 |2011-12-07 00:00:00|https://www.igdb.com/genres/fighting                 |
|bb15fd3f-0f46-e5f3-2b40-d046cf9bd2ef|2011-02-13 0

In [28]:
# Exibe o Schema do DataFrame
dfIGDB_Genres.printSchema()

root
 |-- checksum: string (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- slug: string (nullable = true)
 |-- updated_at: timestamp (nullable = true)
 |-- url: string (nullable = true)



In [23]:
dfIGDB_Genres.count()

23

#### Converte os campos de data para Unix Timestamp

In [27]:
dfIGDB_Genres = (dfIGDB_Genres
                .withColumn("created_at", fn.to_timestamp(fn.from_unixtime("created_at")))
                .withColumn("updated_at", fn.to_timestamp(fn.from_unixtime("updated_at")))
               )

#### Exploração

In [71]:
# Número de Games por Gênero
dfGamesByGenre = (dfIGDB_Games
 .select("id", fn.explode_outer("genres").alias("genre_id"))
 .groupBy("genre_id")
 .agg(fn.count("id").alias("games"))
 .orderBy("genre_id")
)

(dfGamesByGenre
 .join(dfIGDB_Genres, dfGamesByGenre.genre_id == dfIGDB_Genres.id, how="left")
 .withColumn("name", fn.when(fn.col("name").isNull(), "Undefined")
                             .otherwise(fn.col("name")))
 .select(fn.col("name").alias("Genre"), fn.col("games").alias("Games"))
 .orderBy(fn.col("Games").desc())
).show(truncate=False)

+--------------------------+-----+
|Genre                     |Games|
+--------------------------+-----+
|Indie                     |76933|
|Adventure                 |63292|
|Undefined                 |48107|
|Simulator                 |35111|
|Strategy                  |32308|
|Role-playing (RPG)        |26614|
|Puzzle                    |20921|
|Shooter                   |17654|
|Arcade                    |17203|
|Platform                  |14739|
|Sport                     |14390|
|Racing                    |10111|
|Visual Novel              |7534 |
|Fighting                  |5146 |
|Turn-based strategy (TBS) |3744 |
|Point-and-click           |3688 |
|Hack and slash/Beat 'em up|3557 |
|Music                     |3026 |
|Card & Board Game         |2978 |
|Tactical                  |2638 |
+--------------------------+-----+
only showing top 20 rows

