# Exploração de Dados - IGDB

### Configurações Iniciais

In [4]:
!spark-submit --version

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 3.2.1
      /_/
                        
Using Scala version 2.12.15, OpenJDK 64-Bit Server VM, 11.0.15
Branch HEAD
Compiled by user hgao on 2022-01-20T19:26:14Z
Revision 4f25b3f71238a00508a356591553f2dfa89f8290
Url https://github.com/apache/spark
Type --help for more information.


In [1]:
from pyspark.sql.types import *
import pyspark.sql.functions as fn
from pyspark.sql import SparkSession
from IPython.core.display import HTML

display(HTML("<style>pre { white-space: pre !important; }</style>"))

# Define a sessão do Spark com os jars necessários para conexão com o MINIO
spark = (SparkSession.builder
         .config("spark.jars","""/home/jovyan/jars/aws-java-sdk-core-1.11.534.jar,
                                 /home/jovyan/jars/aws-java-sdk-dynamodb-1.11.534.jar,
                                 /home/jovyan/jars/aws-java-sdk-s3-1.11.534.jar,
                                 /home/jovyan/jars/hadoop-aws-3.2.2.jar""")
         .config("spark.jars.packages", "io.delta:delta-core_2.12:2.0.0")
         .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
         .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
         .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
         .config("spark.hadoop.fs.s3a.access.key", "aulafia")
         .config("spark.hadoop.fs.s3a.secret.key", "aulafia@123")
         .config("spark.hadoop.fs.s3a.path.style.access", True)
         .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
         .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
         .getOrCreate()
        )

from delta.tables import DeltaTable

In [2]:
# Nome do bucket
bucket_name = "raw"

# Define o nome da API de onde foram extraídos os dados via arquivo JSON
api_name = 'igdb'

# Define o path do bucket dos dados
bucket_path = 's3a://' + bucket_name + '/' + api_name + '/'

## Games

### Leitura da base

In [3]:
table_name = 'games'

delta_table_path = bucket_path + table_name + '/delta/'

dfIGDB_Games = DeltaTable.forPath(spark, delta_table_path).toDF()

### Conhecendo a Base

In [12]:
# Exibe o DataFrame resultante
dfIGDB_Games.show(20, False)

+---------------------------------------+-----------------+-----------------------+--------------------------------------------------------------------+--------------------------------------------------------------+-----------------------------+--------+------------------------------------+----------+------+----------+----+--------------+----------+---------------------------------------------------------------------------------------+------------------+-------+-----+---------+----------+------------+------------------+----------+--------------------+-----+---+-------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [9]:
# Exibe o Schema do DataFrame
dfIGDB_Games.printSchema()

root
 |-- age_ratings: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- aggregated_rating: double (nullable = true)
 |-- aggregated_rating_count: long (nullable = true)
 |-- alternative_names: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- artworks: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- bundles: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- category: long (nullable = true)
 |-- checksum: string (nullable = true)
 |-- collection: long (nullable = true)
 |-- cover: long (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- dlcs: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- expanded_games: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- expansions: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- external_games: array (nullable = true)
 |    |-- element: long (containsNull = 

#### Todos os registros do Dataframe são únicos, considerando a chave ID

In [4]:
dfIGDB_Games.count()

256043

In [6]:
dfIGDB_Games.select("id").distinct().count()

249891

#### Describe dos campos numéricos

In [11]:
dfIGDB_Games.describe().show()

+-------+-----------------+-----------------------+------------------+--------------------+------------------+------------------+------------------+------------------+------------------+------------------+--------+-----------------+------------------+------------------+--------+------------------+--------------------+--------------------+------------------+------------------+--------------------+-----------------+------------------+
|summary|aggregated_rating|aggregated_rating_count|          category|            checksum|        collection|             cover|           follows|         franchise|             hypes|                id|    name|      parent_game|            rating|      rating_count|    slug|            status|           storyline|             summary|      total_rating|total_rating_count|                 url|   version_parent|     version_title|
+-------+-----------------+-----------------------+------------------+--------------------+------------------+----------------

#### Converte os campos de data para Unix Timestamp

In [7]:
dfIGDB_Games = (dfIGDB_Games
                .withColumn("created_at", fn.to_timestamp(fn.from_unixtime("created_at")))
                .withColumn("first_release_date", fn.to_timestamp(fn.from_unixtime("first_release_date")))
           dfIGDB_Games.describe().show()     .withColumn("updated_at", fn.to_timestamp(fn.from_unixtime("updated_at")))
               )

In [7]:
import pandas as pd

pandas_df = dfIGDB_Games.toPandas()

In [34]:
pip install openpyxl


Collecting openpyxl
  Downloading openpyxl-3.1.2-py2.py3-none-any.whl (249 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.0/250.0 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting et-xmlfile
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.2
Note: you may need to restart the kernel to use updated packages.


In [11]:
pandas_df = pandas_df.drop('summary', axis=1)

In [10]:
pandas_df = pandas_df.drop('storyline', axis=1)

In [12]:
pandas_df.count()

age_ratings                 56161
aggregated_rating           14942
aggregated_rating_count     14942
alternative_names           56117
artworks                    92643
bundles                     10890
category                   244906
checksum                   244906
collection                  36584
cover                      181298
created_at                 244906
dlcs                         2536
expanded_games               1119
expansions                    665
external_games             231278
first_release_date         154862
follows                     23321
forks                          48
franchise                    1516
franchises                  18681
game_engines                21096
game_localizations          22475
game_modes                 129052
genres                     196762
hypes                        9550
id                         244906
involved_companies         104535
keywords                    81466
language_supports          111127
multiplayer_mo

In [13]:
pandas_df.to_csv("/home/jovyan/notebooks/dfIGDB_Games.csv", index=False)

#### Exploração

In [80]:
# Games atualizados recentemente (considerando a última ingestão até a data da análise)
(dfIGDB_Games
 .select("name", "updated_at")
 .orderBy(fn.col("updated_at").desc())
).show(10, False)

+---------------------------+----------+
|name                       |updated_at|
+---------------------------+----------+
|TheHunter: Call of the Wild|1692054284|
|Paper Planet               |1692053424|
|For Honor                  |1692053377|
|Ghosts of Tabor            |1692053339|
|Battlefield 2042           |1692053337|
|Colossal Cave              |1692053314|
|The Darkside Detective     |1692050900|
|Roguelight                 |1692050898|
|Neon Tail                  |1692050891|
|Dungeon Nightmares         |1692050884|
+---------------------------+----------+
only showing top 10 rows



In [13]:
# Games mais bem avaliados
(dfIGDB_Games
 .select("id", "name", "rating")
 .orderBy(fn.col("rating").desc())
).show(10, False)

+------+----------------------------------------+-----------------+
|id    |name                                    |rating           |
+------+----------------------------------------+-----------------+
|8863  |Age of Wonders III: Golden Realms       |100.0            |
|88973 |Goblin Sword                            |99.94224659928568|
|20196 |Metal Gear Solid: The Legacy Collection |99.68039606212281|
|41888 |Battlefield 3: Premium Edition          |99.67350381707095|
|50261 |Angband                                 |99.63776240578409|
|45131 |Grand Theft Auto V: Special Edition     |99.59974553815321|
|204360|Goat Simulator 3                        |99.5558811519708 |
|11226 |Anstoss 3                               |99.53177623237173|
|122661|Gwent: Iron Judgment                    |99.38977451446661|
|164664|Sekiro: Shadows Die Twice - GOTY Edition|99.36728476578347|
+------+----------------------------------------+-----------------+
only showing top 10 rows



## Genres

### Leitura da base

In [64]:
table_name = 'genres'

delta_table_path = bucket_path + table_name + '/delta/'

dfIGDB_Genres = DeltaTable.forPath(spark, delta_table_path).toDF()

### Conhecendo a Base

In [15]:
# Exibe o DataFrame resultante
dfIGDB_Genres.show(truncate=False)

+------------------------------------+----------+---+--------------------------+-------------------------+----------+-----------------------------------------------------+
|checksum                            |created_at|id |name                      |slug                     |updated_at|url                                                  |
+------------------------------------+----------+---+--------------------------+-------------------------+----------+-----------------------------------------------------+
|ef2ff68a-f7bd-d2d0-76cb-c830bd6e3191|1297555200|2  |Point-and-click           |point-and-click          |1323302400|https://www.igdb.com/genres/point-and-click          |
|2ccc6572-bdde-6ed4-8843-25447ea40782|1297555200|4  |Fighting                  |fighting                 |1323216000|https://www.igdb.com/genres/fighting                 |
|bb15fd3f-0f46-e5f3-2b40-d046cf9bd2ef|1297555200|5  |Shooter                   |shooter                  |1323216000|https://www.igdb.com/ge

In [66]:
# Exibe o Schema do DataFrame
dfIGDB_Genres.printSchema()

root
 |-- checksum: string (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- slug: string (nullable = true)
 |-- updated_at: timestamp (nullable = true)
 |-- url: string (nullable = true)



In [17]:
dfIGDB_Genres.count()

23

#### Converte os campos de data para Unix Timestamp

In [65]:
dfIGDB_Genres = (dfIGDB_Genres
                .withColumn("created_at", fn.to_timestamp(fn.from_unixtime("created_at")))
                .withColumn("updated_at", fn.to_timestamp(fn.from_unixtime("updated_at")))
               )

#### Exploração

In [67]:
# Número de Games por Gênero (considerando multi-classificação)
dfGamesByGenre = (dfIGDB_Games
 .select("id", fn.explode_outer("genres").alias("genre_id"))
 .groupBy("genre_id")
 .agg(fn.count("id").alias("games"))
 .orderBy("genre_id")
)

total_titles = dfGamesByGenre.agg(fn.sum("games")).collect()[0][0]

(dfGamesByGenre
 .join(dfIGDB_Genres, dfGamesByGenre.genre_id == dfIGDB_Genres.id, how="left")
 .withColumn("name", fn.when(fn.col("name").isNull(), "Undefined")
                             .otherwise(fn.col("name")))
 .withColumn("Percent", fn.round(((fn.col("Games") / total_titles) * 100), 2))
 .select(fn.col("name").alias("Genre"), fn.col("games").alias("Games"), "Percent")
 .orderBy(fn.col("Games").desc())
).show(truncate=False)

+--------------------------+-----+-------+
|Genre                     |Games|Percent|
+--------------------------+-----+-------+
|Indie                     |77364|18.56  |
|Adventure                 |63632|15.27  |
|Undefined                 |48187|11.56  |
|Simulator                 |35272|8.46   |
|Strategy                  |32439|7.78   |
|Role-playing (RPG)        |26778|6.42   |
|Puzzle                    |21036|5.05   |
|Shooter                   |17856|4.28   |
|Arcade                    |17344|4.16   |
|Platform                  |14865|3.57   |
|Sport                     |14439|3.46   |
|Racing                    |10149|2.44   |
|Visual Novel              |7596 |1.82   |
|Fighting                  |5160 |1.24   |
|Turn-based strategy (TBS) |3753 |0.9    |
|Point-and-click           |3710 |0.89   |
|Hack and slash/Beat 'em up|3571 |0.86   |
|Music                     |3048 |0.73   |
|Card & Board Game         |2994 |0.72   |
|Tactical                  |2653 |0.64   |
+----------

## Game Modes

### Leitura da base

In [68]:
table_name = 'game_modes'

delta_table_path = bucket_path + table_name + '/delta/'

dfIGDB_GameModes = DeltaTable.forPath(spark, delta_table_path).toDF()

### Conhecendo a Base

In [69]:
# Exibe o DataFrame resultante
dfIGDB_GameModes.show(truncate=False)

+------------------------------------+----------+---+----------------------------------+--------------------------------+----------+----------------------------------------------------------------+
|checksum                            |created_at|id |name                              |slug                            |updated_at|url                                                             |
+------------------------------------+----------+---+----------------------------------+--------------------------------+----------+----------------------------------------------------------------+
|a43b1688-f968-3541-0897-9735ffde1745|1298937600|1  |Single player                     |single-player                   |1323216000|https://www.igdb.com/game_modes/single-player                   |
|288b548c-11e4-d910-f037-21d4e6a33b38|1298937600|2  |Multiplayer                       |multiplayer                     |1323216000|https://www.igdb.com/game_modes/multiplayer                     |
|e8329d55-

In [72]:
# Exibe o Schema do DataFrame
dfIGDB_GameModes.printSchema()

root
 |-- checksum: string (nullable = true)
 |-- created_at: long (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- slug: string (nullable = true)
 |-- updated_at: long (nullable = true)
 |-- url: string (nullable = true)



In [51]:
dfIGDB_GameModes.count()

6

#### Converte os campos de data para Unix Timestamp

In [73]:
dfIGDB_GameModes = (dfIGDB_GameModes
                .withColumn("created_at", fn.to_timestamp(fn.from_unixtime("created_at")))
                .withColumn("updated_at", fn.to_timestamp(fn.from_unixtime("updated_at")))
               )

#### Exploração

In [75]:
# Número de Games por Modo de Jogo (considerando multi-classificação)
dfGamesByGameMode = (dfIGDB_Games
 .select("id", fn.explode_outer("game_modes").alias("gmode_id"))
 .groupBy("gmode_id")
 .agg(fn.count("id").alias("games"))
 .orderBy("gmode_id")
)

total_titles = dfGamesByGameMode.agg(fn.sum("games")).collect()[0][0]

(dfGamesByGameMode
 .join(dfIGDB_GameModes, dfGamesByGameMode.gmode_id == dfIGDB_GameModes.id, how="left")
 .withColumn("name", fn.when(fn.col("name").isNull(), "Undefined")
                             .otherwise(fn.col("name")))
 .withColumn("Percent", fn.round(((fn.col("Games") / total_titles) * 100), 2))
 .select(fn.col("name").alias("Game Mode"), fn.col("games").alias("Games"), "Percent")
 .orderBy(fn.col("Games").desc())
).show(truncate=False)

+----------------------------------+------+-------+
|Game Mode                         |Games |Percent|
+----------------------------------+------+-------+
|Single player                     |121367|42.54  |
|Undefined                         |115136|40.36  |
|Multiplayer                       |29065 |10.19  |
|Co-operative                      |12109 |4.24   |
|Massively Multiplayer Online (MMO)|3586  |1.26   |
|Split screen                      |3500  |1.23   |
|Battle Royale                     |523   |0.18   |
+----------------------------------+------+-------+



## Player Perspectives

### Leitura da base

In [16]:
table_name = 'player_perspectives'

delta_table_path = bucket_path + table_name + '/delta/'

dfIGDB_PlayerPersp = DeltaTable.forPath(spark, delta_table_path).toDF()

### Conhecendo a Base

In [20]:
# Exibe o DataFrame resultante
dfIGDB_PlayerPersp.show(truncate=False)

+------------------------------------+-------------------+---+---------------------+-------------------------+-------------------+------------------------------------------------------------------+
|checksum                            |created_at         |id |name                 |slug                     |updated_at         |url                                                               |
+------------------------------------+-------------------+---+---------------------+-------------------------+-------------------+------------------------------------------------------------------+
|4e23cb22-7a70-effb-b8e1-151317c6cdbd|2011-03-01 00:00:00|1  |First person         |first-person             |2011-12-07 00:00:00|https://www.igdb.com/player_perspectives/first-person             |
|2788b856-580c-66d0-bef3-d6169034f175|2011-03-01 00:00:00|2  |Third person         |third-person             |2011-12-07 00:00:00|https://www.igdb.com/player_perspectives/third-person             |
|83c59132-

In [19]:
# Exibe o Schema do DataFrame
dfIGDB_PlayerPersp.printSchema()

root
 |-- checksum: string (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- slug: string (nullable = true)
 |-- updated_at: timestamp (nullable = true)
 |-- url: string (nullable = true)



In [11]:
dfIGDB_PlayerPersp.count()

7

#### Converte os campos de data para Unix Timestamp

In [18]:
dfIGDB_PlayerPersp = (dfIGDB_PlayerPersp
                .withColumn("created_at", fn.to_timestamp(fn.from_unixtime("created_at", 'yyyy-MM-dd HH:mm:ss')))
                .withColumn("updated_at", fn.to_timestamp(fn.from_unixtime("updated_at", 'yyyy-MM-dd HH:mm:ss')))
               )

#### Exploração

In [98]:
# Número de Games por Perspectiva de Jogador (considerando multi-classificação)
dfGamesByPersp = (dfIGDB_Games
 .select("id", fn.explode_outer("player_perspectives").alias("persp_id"))
 .groupBy("persp_id")
 .agg(fn.count("id").alias("games"))
 .orderBy("persp_id")
)

total_titles = dfGamesByPersp.agg(fn.sum("games")).collect()[0][0]

(dfGamesByPersp
 .join(dfIGDB_PlayerPersp, dfGamesByPersp.persp_id == dfIGDB_PlayerPersp.id, how="left")
 .withColumn("name", fn.when(fn.col("name").isNull(), "Undefined")
                             .otherwise(fn.col("name")))
 .withColumn("Percent", fn.round(((fn.col("Games") / total_titles) * 100), 2))
 .select(fn.col("name").alias("Player Perspective"), fn.col("games").alias("Games"), "Percent")
 .orderBy(fn.col("Games").desc())
).show(truncate=False)

+---------------------+------+-------+
|Player Perspective   |Games |Percent|
+---------------------+------+-------+
|Undefined            |164837|64.69  |
|Bird view / Isometric|22132 |8.69   |
|Side view            |20627 |8.1    |
|Third person         |20148 |7.91   |
|First person         |17486 |6.86   |
|Text                 |6660  |2.61   |
|Virtual Reality      |2293  |0.9    |
|Auditory             |611   |0.24   |
+---------------------+------+-------+



## Platforms

### Leitura da base

In [3]:
table_name = 'platforms'

delta_table_path = bucket_path + table_name + '/delta/'

dfIGDB_Platforms = DeltaTable.forPath(spark, delta_table_path).toDF()

### Conhecendo a Base

In [4]:
# Exibe o DataFrame resultante
dfIGDB_Platforms.show(20, truncate=False)

+------------+--------------------+--------+------------------------------------+----------+----------+---+-----------------------------------+---------------+-------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+--------------------------------------+-----------------------------------------------------------+--------+
|abbreviation|alternative_name    |category|checksum                            |created_at|generation|id |name                               |platform_family|platform_logo|slug   |summary                                                                                                                                                                      

In [6]:
dfIGDB_Platforms.filter("alternative_name is null").show(truncate = False)
print(dfIGDB_Platforms.filter("alternative_name is null").count())

+-------------+----------------+--------+------------------------------------+----------+----------+---+----------------+---------------+-------------+-------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [7]:
# Exibe o Schema do DataFrame
dfIGDB_Platforms.printSchema()

root
 |-- abbreviation: string (nullable = true)
 |-- alternative_name: string (nullable = true)
 |-- category: long (nullable = true)
 |-- checksum: string (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- generation: long (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- platform_family: long (nullable = true)
 |-- platform_logo: long (nullable = true)
 |-- slug: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- updated_at: timestamp (nullable = true)
 |-- url: string (nullable = true)
 |-- versions: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- websites: array (nullable = true)
 |    |-- element: long (containsNull = true)



In [5]:
dfIGDB_Platforms.count()

200

#### Converte os campos de data para Unix Timestamp

In [6]:
dfIGDB_Platforms = (dfIGDB_Platforms
                .withColumn("created_at", fn.to_timestamp(fn.from_unixtime("created_at")))
                .withColumn("updated_at", fn.to_timestamp(fn.from_unixtime("updated_at")))
               )

#### Exploração

In [20]:
dfPlatformsByCategory = (dfIGDB_Platforms
                         .select("id", "category")
                         .groupby("category")
                         .agg(fn.count("id").alias("platforms"))
                         .orderBy(fn.col("category").desc())
                        )

delta_table_path = 's3a://raw/igdb_enums/platforms/category/delta/'

dfIGDB_Platform_Categories = DeltaTable.forPath(spark, delta_table_path).toDF()

total_plats = dfPlatformsByCategory.agg(fn.sum("platforms")).collect()[0][0]

(dfPlatformsByCategory
 .join(dfIGDB_Platform_Categories, dfPlatformsByCategory.category == dfIGDB_Platform_Categories.value, how="left")
 .withColumn("category", fn.when(fn.col("category").isNull(), "Undefined")
                             .otherwise(fn.col("name")))
 .withColumn("Percent", fn.round(((fn.col("platforms") / total_plats) * 100), 2))
 .select(fn.col("category").alias("Categories"), fn.col("platforms").alias("Platforms"), "Percent")
 .orderBy(fn.col("Platforms").desc())
).show(truncate=False)

+----------------+---------+-------+
|Categories      |Platforms|Percent|
+----------------+---------+-------+
|console         |80       |40.0   |
|computer        |55       |27.5   |
|portable_console|34       |17.0   |
|operating_system|10       |5.0    |
|Undefined       |10       |5.0    |
|platform        |8        |4.0    |
|arcade          |3        |1.5    |
+----------------+---------+-------+



In [113]:
# Número de Games por Plataforma (considerando multi-classificação)
dfGamesByPlatform = (dfIGDB_Games
 .select("id", fn.explode_outer("platforms").alias("platform_id"))
 .groupBy("platform_id")
 .agg(fn.count("id").alias("games"))
 .orderBy("platform_id")
)

total_titles = dfGamesByPlatform.agg(fn.sum("games")).collect()[0][0]

(dfGamesByPlatform
 .join(dfIGDB_Platforms, dfGamesByPlatform.platform_id == dfIGDB_Platforms.id, how="left")
 .withColumn("name", fn.when(fn.col("name").isNull(), "Undefined")
                             .otherwise(fn.col("name")))
 .withColumn("Percent", fn.round(((fn.col("Games") / total_titles) * 100), 2))
 .select(fn.col("name").alias("Platforms"), fn.col("games").alias("Games"), "Percent")
 .orderBy(fn.col("Games").desc())
).show(truncate=False)

+----------------------+-----+-------+
|Platforms             |Games|Percent|
+----------------------+-----+-------+
|PC (Microsoft Windows)|94060|26.09  |
|Undefined             |81010|22.47  |
|Mac                   |20452|5.67   |
|Nintendo Switch       |14773|4.1    |
|iOS                   |12768|3.54   |
|PlayStation 4         |12453|3.45   |
|Linux                 |12354|3.43   |
|Android               |10328|2.87   |
|Xbox One              |10245|2.84   |
|Web browser           |5719 |1.59   |
|PlayStation 3         |3954 |1.1    |
|PlayStation 5         |3896 |1.08   |
|Arcade                |3687 |1.02   |
|PlayStation 2         |3653 |1.01   |
|Xbox Series X|S       |3543 |0.98   |
|Xbox 360              |3395 |0.94   |
|DOS                   |3362 |0.93   |
|PlayStation           |3084 |0.86   |
|Wii                   |2770 |0.77   |
|Commodore C64/128/MAX |2601 |0.72   |
+----------------------+-----+-------+
only showing top 20 rows



## Platform Families

### Leitura da base

In [3]:
table_name = 'platform_families'

delta_table_path = bucket_path + table_name + '/delta/'

dfIGDB_Platform_Families = DeltaTable.forPath(spark, delta_table_path).toDF()

### Conhecendo a Base

In [4]:
# Exibe o DataFrame resultante
dfIGDB_Platform_Families.show(20, truncate=False)

+------------------------------------+---+-----------+-----------+
|checksum                            |id |name       |slug       |
+------------------------------------+---+-----------+-----------+
|eadc0208-7972-9364-88b9-d2a9de75a998|1  |PlayStation|playstation|
|5bb9bcec-4aa7-45b8-129b-a6f88932942e|2  |Xbox       |xbox       |
|bd1a9cc4-3632-33b0-ff4b-feab8cd15001|3  |Sega       |sega       |
|be8b89fa-823c-790a-1d5a-7e74315c1b27|4  |Linux      |linux      |
|398e5c87-f7db-6265-6da6-433441385208|5  |Nintendo   |nintendo   |
+------------------------------------+---+-----------+-----------+



In [5]:
# Exibe o Schema do DataFrame
dfIGDB_Platform_Families.printSchema()

root
 |-- checksum: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- slug: string (nullable = true)



In [6]:
dfIGDB_Platform_Families.count()

5

#### Exploração

In [None]:
# TO-DO

## Platform Logos

### Leitura da base

In [8]:
table_name = 'platform_logos'

delta_table_path = bucket_path + table_name + '/delta/'

dfIGDB_Platform_Logos = DeltaTable.forPath(spark, delta_table_path).toDF()

### Conhecendo a Base

In [10]:
# Exibe o DataFrame resultante
dfIGDB_Platform_Logos.show(20, truncate=False)

+-------------+--------+------------------------------------+------+---+--------------------+--------------------------------------------------------------------+-----+
|alpha_channel|animated|checksum                            |height|id |image_id            |url                                                                 |width|
+-------------+--------+------------------------------------+------+---+--------------------+--------------------------------------------------------------------+-----+
|true         |false   |7ab4aaa1-c6bf-e8b3-1db1-7ff668132f10|1000  |38 |sqgw6vespav1buezgjjn|//images.igdb.com/igdb/image/upload/t_thumb/sqgw6vespav1buezgjjn.jpg|1000 |
|false        |false   |2081ddfe-e71f-6d54-e758-4bdf9520af6b|174   |40 |bezbkk17hk0uobdkhjcv|//images.igdb.com/igdb/image/upload/t_thumb/bezbkk17hk0uobdkhjcv.jpg|1000 |
|true         |false   |be0be98a-a023-52c7-bf1d-7409c4c8179f|700   |42 |lz0cuozrdeealg8hfzxg|//images.igdb.com/igdb/image/upload/t_thumb/lz0cuozrdeealg8hfz

In [9]:
# Exibe o Schema do DataFrame
dfIGDB_Platform_Logos.printSchema()

root
 |-- alpha_channel: boolean (nullable = true)
 |-- animated: boolean (nullable = true)
 |-- checksum: string (nullable = true)
 |-- height: long (nullable = true)
 |-- id: long (nullable = true)
 |-- image_id: string (nullable = true)
 |-- url: string (nullable = true)
 |-- width: long (nullable = true)



In [11]:
dfIGDB_Platform_Logos.count()

346

#### Exploração

In [None]:
# TO-DO

### Leitura da base

In [12]:
table_name = 'platform_versions'

delta_table_path = bucket_path + table_name + '/delta/'

dfIGDB_Platform_Versions = DeltaTable.forPath(spark, delta_table_path).toDF()

### Conhecendo a Base

In [13]:
# Exibe o DataFrame resultante
dfIGDB_Platform_Versions.show(20, truncate=False)

+------------------------------------+----------+-------------------------------------------------------------------+-----------------------------------------------+---------------------------------------------------+---+-----------------+----------------------------------------------------------------------------------------------------------------+-----------------------------------------+------------------------+----------------------------+-------------------------------------------------------------+-----------------------------------------------------------+-------------+------------------------------+-----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------+---------------------------------------------------------------+--------------------------------------------------+-------------------------------------------------------------------------------------

In [14]:
# Exibe o Schema do DataFrame
dfIGDB_Platform_Versions.printSchema()

root
 |-- checksum: string (nullable = true)
 |-- companies: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- connectivity: string (nullable = true)
 |-- cpu: string (nullable = true)
 |-- graphics: string (nullable = true)
 |-- id: long (nullable = true)
 |-- main_manufacturer: long (nullable = true)
 |-- media: string (nullable = true)
 |-- memory: string (nullable = true)
 |-- name: string (nullable = true)
 |-- online: string (nullable = true)
 |-- os: string (nullable = true)
 |-- output: string (nullable = true)
 |-- platform_logo: long (nullable = true)
 |-- platform_version_release_dates: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- resolutions: string (nullable = true)
 |-- slug: string (nullable = true)
 |-- sound: string (nullable = true)
 |-- storage: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- url: string (nullable = true)



In [15]:
dfIGDB_Platform_Versions.count()

371

#### Exploração

In [None]:
# TO-DO

### Leitura da base

In [16]:
table_name = 'platform_version_companies'

delta_table_path = bucket_path + table_name + '/delta/'

dfIGDB_Platform_Version_Companies = DeltaTable.forPath(spark, delta_table_path).toDF()

### Conhecendo a Base

In [17]:
# Exibe o DataFrame resultante
dfIGDB_Platform_Version_Companies.show(20, truncate=False)

+------------------------------------+-------+-------+---------+---+------------+
|checksum                            |comment|company|developer|id |manufacturer|
+------------------------------------+-------+-------+---------+---+------------+
|459b2566-8c93-57b1-7b6f-13f539e2684e|null   |2123   |false    |1  |true        |
|17ea4f71-e7e1-6ee5-fd36-35a72762387a|null   |45     |true     |2  |false       |
|9b310657-674c-6e9c-98d9-12fc245d2e82|null   |128    |true     |3  |false       |
|e9b66b9f-679c-5d39-2c9e-a50e65deb5c1|null   |2348   |false    |4  |true        |
|c007c51b-85b7-f1d5-c013-2f4d849663c1|null   |2123   |false    |5  |true        |
|43332237-22ed-fc58-1078-1ee0ddff5412|null   |128    |true     |6  |false       |
|a87b2e66-b677-57a0-6d0b-2890515d8e8f|null   |2350   |true     |7  |true        |
|36293e0a-e673-eedf-9f0f-4821b6addeb7|null   |128    |true     |8  |false       |
|b9760e01-3fa2-2738-1e23-5153c5ecfe3d|null   |128    |true     |9  |false       |
|fac9909c-5324-e

In [18]:
# Exibe o Schema do DataFrame
dfIGDB_Platform_Version_Companies.printSchema()

root
 |-- checksum: string (nullable = true)
 |-- comment: string (nullable = true)
 |-- company: long (nullable = true)
 |-- developer: boolean (nullable = true)
 |-- id: long (nullable = true)
 |-- manufacturer: boolean (nullable = true)



In [19]:
dfIGDB_Platform_Version_Companies.count()

548

#### Exploração

In [None]:
# TO-DO

### Leitura da base

In [20]:
table_name = 'platform_version_release_dates'

delta_table_path = bucket_path + table_name + '/delta/'

dfIGDB_Platform_Version_Release_Dates = DeltaTable.forPath(spark, delta_table_path).toDF()

### Conhecendo a Base

In [21]:
# Exibe o DataFrame resultante
dfIGDB_Platform_Version_Release_Dates.show(20, truncate=False)

+--------+------------------------------------+----------+------------+---+---+------+----------+----+
|category|checksum                            |date      |human       |id |m  |region|updated_at|y   |
+--------+------------------------------------+----------+------------+---+---+------+----------+----+
|0       |b9365483-40a2-88ab-be8b-c20179db08f8|1256169600|Oct 22, 2009|1  |10 |8     |null      |2009|
|0       |e8d503b7-ff48-b5a0-2549-3e1abca5bfed|1193097600|Oct 23, 2007|2  |10 |2     |null      |2007|
|1       |4009b066-27fb-68d5-5b9d-f4ebbc0fd73a|1162339200|Nov 2006    |3  |11 |5     |null      |2006|
|0       |28c79b81-312f-c1c7-dafa-7a9b84b9fc96|1142467200|Mar 16, 2006|4  |3  |1     |null      |2006|
|0       |499becde-17c2-9ce3-3619-6e657635c93f|1143072000|Mar 23, 2006|5  |3  |3     |null      |2006|
|0       |e0dd767f-af1f-f9c1-43e3-1c1f55ce57b6|1351209600|Oct 26, 2012|6  |10 |8     |null      |2012|
|0       |766e141d-7004-df40-93ea-4946003093de|1170115200|Jan 30, 2007|7 

In [22]:
# Exibe o Schema do DataFrame
dfIGDB_Platform_Version_Release_Dates.printSchema()

root
 |-- category: long (nullable = true)
 |-- checksum: string (nullable = true)
 |-- date: long (nullable = true)
 |-- human: string (nullable = true)
 |-- id: long (nullable = true)
 |-- m: long (nullable = true)
 |-- region: long (nullable = true)
 |-- updated_at: long (nullable = true)
 |-- y: long (nullable = true)



In [23]:
dfIGDB_Platform_Version_Release_Dates.count()

487

#### Exploração

In [None]:
# TO-DO

### Leitura da base

In [24]:
table_name = 'platform_websites'

delta_table_path = bucket_path + table_name + '/delta/'

dfIGDB_Platform_Websites = DeltaTable.forPath(spark, delta_table_path).toDF()

### Conhecendo a Base

In [25]:
# Exibe o DataFrame resultante
dfIGDB_Platform_Websites.show(20, truncate=False)

+--------+------------------------------------+---+-------+---------------------------------------------------+
|category|checksum                            |id |trusted|url                                                |
+--------+------------------------------------+---+-------+---------------------------------------------------+
|1       |8d6e5c1c-e1ac-20aa-87e7-6a5222df7138|1  |false  |http://www.linux.org                               |
|1       |bedd3276-c425-1fb8-8177-ab106986b4e0|2  |false  |http://windows.microsoft.com/                      |
|1       |eba95370-74ac-e96a-c3a0-cedd1f664e74|3  |false  |http://us.playstation.com/ps3/                     |
|1       |58fa5f13-95e2-e504-b8c1-1ca1a65ffe66|4  |false  |http://www.xbox.com/en-US/xbox-360                 |
|1       |bf8b0ec9-4d7a-a954-5630-071035fbd9f8|5  |false  |https://www.apple.com/osx/                         |
|1       |f52efdbb-400a-1f74-d2eb-7d837ed5b266|6  |false  |https://sega.jp/history/hard/segasaturn/     

In [26]:
# Exibe o Schema do DataFrame
dfIGDB_Platform_Websites.printSchema()

root
 |-- category: long (nullable = true)
 |-- checksum: string (nullable = true)
 |-- id: long (nullable = true)
 |-- trusted: boolean (nullable = true)
 |-- url: string (nullable = true)



In [27]:
dfIGDB_Platform_Websites.count()

101

#### Exploração

In [None]:
# TO-DO

## External Games

### Leitura da base

In [4]:
table_name = 'external_games'

delta_table_path = bucket_path + table_name + '/delta/'

dfIGDB_ExtGames = DeltaTable.forPath(spark, delta_table_path).toDF()

### Conhecendo a Base

In [5]:
# Exibe o DataFrame resultante
dfIGDB_ExtGames.show(truncate=False)

+--------+------------------------------------+----------+-----+---+---------------------------------------------------------------+------+----------+----+----+
|category|checksum                            |created_at|game |id |name                                                           |uid   |updated_at|url |year|
+--------+------------------------------------+----------+-----+---+---------------------------------------------------------------+------+----------+----+----+
|1       |e31731d6-678a-31e0-837b-7749db2f03b0|1494460800|28590|1  |Crappy Day Enhanced Edition                                    |636700|1494547200|null|null|
|1       |2e9851e5-0312-b3ca-abe1-e844a38d1da3|1494493232|28591|2  |Zombie Kill                                                    |636430|1633092888|null|null|
|1       |efbc0b6c-aa73-e43a-f587-11108f326d91|1494493238|28592|3  |Tesla vs Lovecraft                                             |636100|1643444761|null|null|
|1       |e61c5382-0391-5a75-0a1c-

In [9]:
# Exibe o Schema do DataFrame
dfIGDB_ExtGames.printSchema()

root
 |-- category: long (nullable = true)
 |-- checksum: string (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- game: long (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- uid: string (nullable = true)
 |-- updated_at: timestamp (nullable = true)
 |-- url: string (nullable = true)
 |-- year: long (nullable = true)



In [7]:
dfIGDB_ExtGames.count()

472539

#### Converte os campos de data para Unix Timestamp

In [8]:
dfIGDB_ExtGames = (dfIGDB_ExtGames
                .withColumn("created_at", fn.to_timestamp(fn.from_unixtime("created_at")))
                .withColumn("updated_at", fn.to_timestamp(fn.from_unixtime("updated_at")))
               )

#### Definição do Enum 'category'

(Os Enums serão criados na camada context, e são fornecidos pelo IGDB na documentação da API. Exemplo utilizado disponível em: https://api-docs.igdb.com/?python#external-game)

In [18]:
schema = StructType([
    StructField("name", StringType(), True),
    StructField("value", IntegerType(), True)
])

data = [
    ("steam", 1),
    ("gog", 5),
    ("youtube", 10),
    ("microsoft", 11),
    ("apple", 13),
    ("twitch", 14),
    ("android", 15),
    ("amazon_asin", 20),
    ("amazon_luna", 22),
    ("amazon_adg", 23),
    ("epic_game_store", 26),
    ("oculus", 28),
    ("utomik", 29),
    ("itch_io", 30),
    ("xbox_marketplace", 31),
    ("kartridge", 32),
    ("playstation_store_us", 36),
    ("focus_entertainment", 37),
    ("xbox_game_pass_ultimate_cloud", 54),
    ("gamejolt", 55)
]

df_EnumCategory = spark.createDataFrame(data, schema=schema)

In [None]:
df_EnumCategory.show()

+--------------------+-----+
|                name|value|
+--------------------+-----+
|               steam|    1|
|                 gog|    5|
|             youtube|   10|
|           microsoft|   11|
|               apple|   13|
|              twitch|   14|
|             android|   15|
|         amazon_asin|   20|
|         amazon_luna|   22|
|          amazon_adg|   23|
|     epic_game_store|   26|
|              oculus|   28|
|              utomik|   29|
|             itch_io|   30|
|    xbox_marketplace|   31|
|           kartridge|   32|
|playstation_store_us|   36|
| focus_entertainment|   37|
|xbox_game_pass_ul...|   54|
|            gamejolt|   55|
+--------------------+-----+



#### Exploração

In [24]:
# Número de Games por Plataforma (considerando multi-classificação)
dfGamesByExtPlatform = (dfIGDB_ExtGames
 .groupBy(fn.col("category"))
 .agg(fn.count("id").alias("Games"))
 .orderBy(fn.col("Games").desc())
)

total_titles = dfGamesByExtPlatform.agg(fn.sum("Games")).collect()[0][0]

(dfGamesByExtPlatform
 .join(df_EnumCategory, dfGamesByExtPlatform.category == df_EnumCategory.value, how="left")
 .withColumn("name", fn.when(fn.col("name").isNull(), "Unknown")
                             .otherwise(fn.col("name")))
 .withColumn("Percent", fn.round(((fn.col("Games") / total_titles) * 100), 2))
 .select(fn.col("name").alias("External Platforms"), fn.col("Games"), "Percent")
 .orderBy(fn.col("Games").desc())
).show(truncate=False)

+-----------------------------+------+-------+
|External Platforms           |Games |Percent|
+-----------------------------+------+-------+
|twitch                       |199858|42.29  |
|steam                        |102246|21.64  |
|Unknown                      |73093 |15.47  |
|amazon_asin                  |32255 |6.83   |
|itch_io                      |11332 |2.4    |
|microsoft                    |10995 |2.33   |
|apple                        |9671  |2.05   |
|playstation_store_us         |7734  |1.64   |
|gog                          |6913  |1.46   |
|epic_game_store              |6239  |1.32   |
|android                      |3016  |0.64   |
|utomik                       |2984  |0.63   |
|xbox_marketplace             |1773  |0.38   |
|youtube                      |1376  |0.29   |
|kartridge                    |925   |0.2    |
|gamejolt                     |674   |0.14   |
|oculus                       |404   |0.09   |
|xbox_game_pass_ultimate_cloud|388   |0.08   |
|amazon_adg  

# Keywords

### Leitura da base

In [5]:
table_name = 'keywords'

delta_table_path = bucket_path + table_name + '/delta/'

dfIGDB_Keywords = DeltaTable.forPath(spark, delta_table_path).toDF()

### Conhecendo a Base

In [6]:
# Exibe o DataFrame resultante
dfIGDB_Keywords.show(truncate=False)

+------------------------------------+----------+---+------------------+----------------+----------+------------------------------------------------+
|checksum                            |created_at|id |name              |slug            |updated_at|url                                             |
+------------------------------------+----------+---+------------------+----------------+----------+------------------------------------------------+
|babab409-2c5d-9e05-1f1e-d2debfbb409e|1302739200|1  |modern warfare    |modern-warfare  |1323216000|https://www.igdb.com/categories/modern-warfare  |
|fe5a303a-fe19-8678-0e94-6caf0af7c0dc|1302739200|3  |aliens            |aliens          |1323216000|https://www.igdb.com/categories/aliens          |
|c3af99c2-270a-9df1-b06e-041b36f90857|1302739200|4  |pirates           |pirates         |1323216000|https://www.igdb.com/categories/pirates         |
|19746d97-9387-b093-4f07-3f5a09796ce8|1302739200|5  |zombies           |zombies         |1403395200|

In [11]:
# Exibe o Schema do DataFrame
dfIGDB_Keywords.printSchema()

root
 |-- checksum: string (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- slug: string (nullable = true)
 |-- updated_at: timestamp (nullable = true)
 |-- url: string (nullable = true)



In [8]:
dfIGDB_Keywords.count()

4295

#### Converte os campos de data para Unix Timestamp

In [10]:
dfIGDB_Keywords = (dfIGDB_Keywords
                .withColumn("created_at", fn.to_timestamp(fn.from_unixtime("created_at")))
                .withColumn("updated_at", fn.to_timestamp(fn.from_unixtime("updated_at")))
               )

#### Exploração

In [12]:
# TO-DO

# Languages

### Leitura da base

In [6]:
table_name = 'languages'

delta_table_path = bucket_path + table_name + '/delta/'

dfIGDB_languages = DeltaTable.forPath(spark, delta_table_path).toDF()

### Conhecendo a Base

In [14]:
# Exibe o DataFrame resultante
dfIGDB_languages.show(30, truncate=False)

+------------------------------------+-------------------+---+------+---------------------+--------------------+-------------------+
|checksum                            |created_at         |id |locale|name                 |native_name         |updated_at         |
+------------------------------------+-------------------+---+------+---------------------+--------------------+-------------------+
|e0963ddc-1e01-32e2-a959-d5ccda50410c|2022-10-03 13:27:19|1  |ar    |Arabic               |العربية             |2022-10-03 13:27:19|
|ddcc7457-8939-17a3-9826-ef92d0d72371|2022-10-03 13:27:19|2  |zh-CN |Chinese (Simplified) |简体中文            |2022-10-03 13:27:19|
|e7a793e9-d117-bc98-a000-67e444c0893d|2022-10-03 13:27:19|3  |zh-TW |Chinese (Traditional)|繁體中文            |2022-10-03 13:27:19|
|58621751-aa00-f068-dbaf-2369d2e5a015|2022-10-03 13:27:19|4  |cs-CZ |Czech                |čeština             |2022-10-03 13:27:19|
|613fc1f9-9c07-474a-1ad5-7a26b44cc7ea|2022-10-03 13:27:19|5  |da-DK |Danish  

In [13]:
# Exibe o Schema do DataFrame
dfIGDB_languages.printSchema()

root
 |-- checksum: string (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- id: long (nullable = true)
 |-- locale: string (nullable = true)
 |-- name: string (nullable = true)
 |-- native_name: string (nullable = true)
 |-- updated_at: timestamp (nullable = true)



In [9]:
dfIGDB_languages.count()

28

#### Converte os campos de data para Unix Timestamp

In [12]:
dfIGDB_languages = (dfIGDB_languages
                .withColumn("created_at", fn.to_timestamp(fn.from_unixtime("created_at")))
                .withColumn("updated_at", fn.to_timestamp(fn.from_unixtime("updated_at")))
               )

#### Exploração

In [None]:
# TO-DO

# Language Supports

### Leitura da base

In [16]:
table_name = 'language_supports'

delta_table_path = bucket_path + table_name + '/delta/'

dfIGDB_language_supports = DeltaTable.forPath(spark, delta_table_path).toDF()

### Conhecendo a Base

In [22]:
# Exibe o DataFrame resultante
dfIGDB_language_supports.show(truncate=False)

+------------------------------------+-------------------+------+------+--------+---------------------+-------------------+
|checksum                            |created_at         |game  |id    |language|language_support_type|updated_at         |
+------------------------------------+-------------------+------+------+--------+---------------------+-------------------+
|78f54dd1-aaf5-e3ed-7826-a238e500625e|2023-10-01 02:10:54|95210 |808764|21      |1                    |2023-10-03 07:43:14|
|3b0e27ce-1c52-e936-f824-6611afe5bbe2|2023-10-01 02:10:54|95210 |808765|21      |2                    |2023-10-03 07:43:14|
|f12263c0-240a-f85a-1a41-a2c78fc97587|2023-10-01 02:10:54|95210 |808766|22      |1                    |2023-10-03 07:43:14|
|3fa9a31c-bb22-7b55-8e09-e7673b967fcc|2023-10-01 02:10:54|95210 |808767|22      |2                    |2023-10-03 07:43:14|
|7abc16ca-3120-7f8e-fa87-1e38dff9d2bc|2023-10-01 02:11:56|268150|808768|27      |3                    |2023-10-01 15:44:37|
|05aa46c

In [21]:
# Exibe o Schema do DataFrame
dfIGDB_language_supports.printSchema()

root
 |-- checksum: string (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- game: long (nullable = true)
 |-- id: long (nullable = true)
 |-- language: long (nullable = true)
 |-- language_support_type: long (nullable = true)
 |-- updated_at: timestamp (nullable = true)



In [19]:
dfIGDB_language_supports.count()

773558

#### Converte os campos de data para Unix Timestamp

In [20]:
dfIGDB_language_supports = (dfIGDB_language_supports
                .withColumn("created_at", fn.to_timestamp(fn.from_unixtime("created_at")))
                .withColumn("updated_at", fn.to_timestamp(fn.from_unixtime("updated_at")))
               )

#### Exploração

In [None]:
# TO-DO

# Language Support Types

### Leitura da base

In [None]:
table_name = 'language_support_types'

delta_table_path = bucket_path + table_name + '/delta/'

dfIGDB_language_support_types = DeltaTable.forPath(spark, delta_table_path).toDF()

### Conhecendo a Base

In [42]:
# Exibe o DataFrame resultante
dfIGDB_language_support_types.show(truncate=False)

+------------------------------------+-------------------+---+---------+-------------------+
|checksum                            |created_at         |id |name     |updated_at         |
+------------------------------------+-------------------+---+---------+-------------------+
|17300b0f-8e60-5cf7-2b2d-6797e4ccb813|2022-10-03 13:27:19|1  |Audio    |2022-10-03 13:27:19|
|8de2491d-254c-c271-655c-07a9153883dc|2022-10-03 13:27:19|2  |Subtitles|2022-10-03 13:27:19|
|f7b81411-acba-02d2-9131-e8a4ebef763d|2022-10-03 13:27:19|3  |Interface|2022-10-03 13:27:19|
+------------------------------------+-------------------+---+---------+-------------------+



In [None]:
# Exibe o Schema do DataFrame
dfIGDB_language_support_types.printSchema()

root
 |-- checksum: string (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- game: long (nullable = true)
 |-- id: long (nullable = true)
 |-- language: long (nullable = true)
 |-- language_support_type: long (nullable = true)
 |-- updated_at: timestamp (nullable = true)



In [None]:
dfIGDB_language_support_types.count()

773558

#### Converte os campos de data para Unix Timestamp

In [None]:
dfIGDB_language_support_types = (dfIGDB_language_support_types
                .withColumn("created_at", fn.to_timestamp(fn.from_unixtime("created_at")))
                .withColumn("updated_at", fn.to_timestamp(fn.from_unixtime("updated_at")))
               )

#### Exploração

In [None]:
# TO-DO

# Themes

### Leitura da base

In [30]:
table_name = 'themes'

delta_table_path = bucket_path + table_name + '/delta/'

dfIGDB_themes = DeltaTable.forPath(spark, delta_table_path).toDF()

### Conhecendo a Base

In [37]:
# Exibe o DataFrame resultante
dfIGDB_themes.show(30, truncate=False)

+------------------------------------+-------------------+---+----------------------------------------------+-----------------------------------------+-------------------+---------------------------------------------------------------------+
|checksum                            |created_at         |id |name                                          |slug                                     |updated_at         |url                                                                  |
+------------------------------------+-------------------+---+----------------------------------------------+-----------------------------------------+-------------------+---------------------------------------------------------------------+
|c177e0ff-f29a-2a2e-fce5-f945258ceb59|2011-11-29 00:00:00|1  |Action                                        |action                                   |2011-12-07 00:00:00|https://www.igdb.com/themes/action                                   |
|454bdc25-3dbf-5648-1e11-fc67ffc

In [36]:
# Exibe o Schema do DataFrame
dfIGDB_themes.printSchema()

root
 |-- checksum: string (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- slug: string (nullable = true)
 |-- updated_at: timestamp (nullable = true)
 |-- url: string (nullable = true)



In [33]:
dfIGDB_themes.count()

22

#### Converte os campos de data para Unix Timestamp

In [35]:
dfIGDB_themes = (dfIGDB_themes
                .withColumn("created_at", fn.to_timestamp(fn.from_unixtime("created_at")))
                .withColumn("updated_at", fn.to_timestamp(fn.from_unixtime("updated_at")))
               )

#### Exploração

In [None]:
# TO-DO

# Multiplayer Modes

### Leitura da base

In [38]:
table_name = 'multiplayer_modes'

delta_table_path = bucket_path + table_name + '/delta/'

dfIGDB_multiplayer_modes = DeltaTable.forPath(spark, delta_table_path).toDF()

### Conhecendo a Base

In [39]:
# Exibe o DataFrame resultante
dfIGDB_multiplayer_modes.show(truncate=False)

+------------+------------------------------------+------+-----+---+-------+-----------+--------------+----------+----------+-------------+---------+--------+-----------+
|campaigncoop|checksum                            |dropin|game |id |lancoop|offlinecoop|offlinecoopmax|offlinemax|onlinecoop|onlinecoopmax|onlinemax|platform|splitscreen|
+------------+------------------------------------+------+-----+---+-------+-----------+--------------+----------+----------+-------------+---------+--------+-----------+
|false       |03778a0f-3bb3-a0f0-c835-e3bbef146b4d|false |11120|5  |false  |false      |null          |null      |false     |null         |2        |6       |false      |
|false       |dc1f6640-eee4-7ed0-740b-4afb23667bcf|false |11120|6  |false  |false      |null          |null      |false     |null         |2        |14      |false      |
|false       |a41bcf9c-976d-8690-629f-4c2737f65bf1|false |46076|7  |false  |false      |null          |30        |false     |null         |null  

In [40]:
# Exibe o Schema do DataFrame
dfIGDB_multiplayer_modes.printSchema()

root
 |-- campaigncoop: boolean (nullable = true)
 |-- checksum: string (nullable = true)
 |-- dropin: boolean (nullable = true)
 |-- game: long (nullable = true)
 |-- id: long (nullable = true)
 |-- lancoop: boolean (nullable = true)
 |-- offlinecoop: boolean (nullable = true)
 |-- offlinecoopmax: long (nullable = true)
 |-- offlinemax: long (nullable = true)
 |-- onlinecoop: boolean (nullable = true)
 |-- onlinecoopmax: long (nullable = true)
 |-- onlinemax: long (nullable = true)
 |-- platform: long (nullable = true)
 |-- splitscreen: boolean (nullable = true)



In [41]:
dfIGDB_multiplayer_modes.count()

14131

#### Exploração

In [None]:
# TO-DO

# Franchises

### Leitura da base

In [4]:
table_name = 'franchises'

delta_table_path = bucket_path + table_name + '/delta/'

dfIGDB_franchises = DeltaTable.forPath(spark, delta_table_path).toDF()

### Conhecendo a Base

In [5]:
# Exibe o DataFrame resultante
dfIGDB_franchises.show(truncate=False)

+------------------------------------+-------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [6]:
# Exibe o Schema do DataFrame
dfIGDB_franchises.printSchema()

root
 |-- checksum: string (nullable = true)
 |-- created_at: long (nullable = true)
 |-- games: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- slug: string (nullable = true)
 |-- updated_at: long (nullable = true)
 |-- url: string (nullable = true)



In [7]:
dfIGDB_franchises.count()

1855

#### Exploração

In [None]:
# TO-DO

# Collections

### Leitura da base

In [8]:
table_name = 'collections'

delta_table_path = bucket_path + table_name + '/delta/'

dfIGDB_collections = DeltaTable.forPath(spark, delta_table_path).toDF()

### Conhecendo a Base

In [9]:
# Exibe o DataFrame resultante
dfIGDB_collections.show(truncate=False)

+------------------------------------+----------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---+--------------------------------------+-------------------------------------+----------+-------------------------------------------------------------

In [11]:
# Exibe o Schema do DataFrame
dfIGDB_collections.printSchema()

root
 |-- checksum: string (nullable = true)
 |-- created_at: long (nullable = true)
 |-- games: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- slug: string (nullable = true)
 |-- updated_at: long (nullable = true)
 |-- url: string (nullable = true)



In [12]:
dfIGDB_collections.count()

7744

#### Exploração

In [None]:
# TO-DO

# Game Engines

### Leitura da base

In [22]:
table_name = 'game_engines'

delta_table_path = bucket_path + table_name + '/delta/'

dfIGDB_game_engines = DeltaTable.forPath(spark, delta_table_path).toDF()

### Conhecendo a Base

In [23]:
# Exibe o DataFrame resultante
dfIGDB_game_engines.show(truncate=False)

+------------------------------------+----------+----------+---+----+----------------+----------------------------------------------------+----------------+----------+--------------------------------------------------+
|checksum                            |companies |created_at|id |logo|name            |platforms                                           |slug            |updated_at|url                                               |
+------------------------------------+----------+----------+---+----+----------------+----------------------------------------------------+----------------+----------+--------------------------------------------------+
|0451dbf6-af33-bd30-3e67-4d841bf540ce|[94]      |1399505504|2  |0   |Frostbite       |[6, 9, 12, 48, 49]                                  |frostbite       |1689275304|https://www.igdb.com/game_engines/frostbite       |
|819a0785-9dee-dcbb-8666-f2db43f0e26e|[56]      |1399579055|3  |0   |Source          |[3, 6, 9, 11, 12, 14, 34]             

In [24]:
# Exibe o Schema do DataFrame
dfIGDB_game_engines.printSchema()

root
 |-- checksum: string (nullable = true)
 |-- companies: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- created_at: long (nullable = true)
 |-- id: long (nullable = true)
 |-- logo: long (nullable = true)
 |-- name: string (nullable = true)
 |-- platforms: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- slug: string (nullable = true)
 |-- updated_at: long (nullable = true)
 |-- url: string (nullable = true)



In [25]:
dfIGDB_game_engines.count()

1201

#### Exploração

In [None]:
# TO-DO

# Game Engine Logos

### Leitura da base

In [26]:
table_name = 'game_engine_logos'

delta_table_path = bucket_path + table_name + '/delta/'

dfIGDB_game_engine_logos = DeltaTable.forPath(spark, delta_table_path).toDF()

### Conhecendo a Base

In [27]:
# Exibe o DataFrame resultante
dfIGDB_game_engine_logos.show(truncate=False)

+-------------+--------+------------------------------------+------+---+--------------------+--------------------------------------------------------------------+-----+
|alpha_channel|animated|checksum                            |height|id |image_id            |url                                                                 |width|
+-------------+--------+------------------------------------+------+---+--------------------+--------------------------------------------------------------------+-----+
|null         |null    |4a343234-49b9-be6f-56e3-5b37c1569979|256   |7  |yet7ledalhypkqxpdcfz|//images.igdb.com/igdb/image/upload/t_thumb/yet7ledalhypkqxpdcfz.jpg|256  |
|null         |null    |8d1d8a48-280d-452e-8192-4ab34fdea659|204   |9  |rk2pcdi9skhph6e5ysby|//images.igdb.com/igdb/image/upload/t_thumb/rk2pcdi9skhph6e5ysby.jpg|660  |
|null         |null    |8427de38-dd2a-1240-72b4-b6cb96c1c8e3|343   |11 |qqvzsxjdjr6qk310gzne|//images.igdb.com/igdb/image/upload/t_thumb/qqvzsxjdjr6qk310gz

In [28]:
# Exibe o Schema do DataFrame
dfIGDB_game_engine_logos.printSchema()

root
 |-- alpha_channel: boolean (nullable = true)
 |-- animated: boolean (nullable = true)
 |-- checksum: string (nullable = true)
 |-- height: long (nullable = true)
 |-- id: long (nullable = true)
 |-- image_id: string (nullable = true)
 |-- url: string (nullable = true)
 |-- width: long (nullable = true)



In [30]:
dfIGDB_game_engine_logos.count()

169

# Companies

### Leitura da base

In [4]:
table_name = 'companies'

delta_table_path = bucket_path + table_name + '/delta/'

dfIGDB_companies = DeltaTable.forPath(spark, delta_table_path).toDF()

### Conhecendo a Base

In [5]:
# Exibe o DataFrame resultante
dfIGDB_companies.show()

+-----------+--------------------+------------------+--------------------+-------+----------+--------------------+--------------------+---+----+--------------------+------+--------------------+--------------------+----------+-------------------+----------+--------------------+--------+
|change_date|change_date_category|changed_company_id|            checksum|country|created_at|         description|           developed| id|logo|                name|parent|           published|                slug|start_date|start_date_category|updated_at|                 url|websites|
+-----------+--------------------+------------------+--------------------+-------+----------+--------------------+--------------------+---+----+--------------------+------+--------------------+--------------------+----------+-------------------+----------+--------------------+--------+
|       null|                   7|                 5|00a2eed3-db13-e9a...|    840|1297810753|Electronic Arts i...|[1621, 2166, 3011...|  1|

In [14]:
dfIGDB_companies.describe().show()

+-------+--------------------+--------------------+------------------+--------------------+-----------------+--------------------+--------------------+------------------+-----------------+--------------------+------------------+--------------------+--------------------+-------------------+--------------------+--------------------+
|summary|         change_date|change_date_category|changed_company_id|            checksum|          country|          created_at|         description|                id|             logo|                name|            parent|                slug|          start_date|start_date_category|          updated_at|                 url|
+-------+--------------------+--------------------+------------------+--------------------+-----------------+--------------------+--------------------+------------------+-----------------+--------------------+------------------+--------------------+--------------------+-------------------+--------------------+--------------------+
|

In [46]:
(
    dfIGDB_companies
    .select("change_date", "changed_company_id", "id", "created_at", "name", "updated_at")
    .where(fn.col("changed_company_id").isNotNull())
    .orderBy("changed_company_id", ascending=True)
).show(50, False)

+-----------+------------------+-----+----------+-------------------------------------+----------+
|change_date|changed_company_id|id   |created_at|name                                 |updated_at|
+-----------+------------------+-----+----------+-------------------------------------+----------+
|1230681600 |0                 |2846 |1398917131|Mad Doc Software                     |1697735451|
|788832000  |1                 |10337|1468231604|Dro Soft                             |1698053280|
|null       |5                 |1    |1297810753|Electronic Arts                      |1699383317|
|915062400  |5                 |903  |1357165107|Interplay Productions                |1698308445|
|null       |5                 |2621 |1397072659|Intermetrics Entertainment Software  |1397072686|
|null       |6                 |410  |1317629025|Big Blue Box                         |1698840786|
|1262217600 |7                 |11   |1298982339|EA Redwood Shores                    |1697551965|
|662601600

In [49]:
(
    dfIGDB_companies
    .select("change_date", "changed_company_id", "id", "created_at", "name", "updated_at")
    .where((fn.col("changed_company_id") == 190) | (fn.col("id") == 190) | (fn.col("changed_company_id") == 7647)| (fn.col("id") == 7647))
    .orderBy("name", ascending=True)
).show(truncate=False)

+-----------+------------------+-----+----------+----------------------+----------+
|change_date|changed_company_id|id   |created_at|name                  |updated_at|
+-----------+------------------+-----+----------+----------------------+----------+
|457488000  |190               |82   |1300206663|Atari                 |1699373825|
|889747200  |7647              |190  |1301988748|Atari Corporation     |1698850846|
|null       |null              |7647 |1449925673|Atari Interactive     |1698786556|
|null       |190               |2406 |1392893070|Atari, Inc.           |1698879444|
|1051747200 |7647              |29492|1605029691|Infogrames Interactive|1698758031|
+-----------+------------------+-----+----------+----------------------+----------+



In [52]:
(
    dfIGDB_Games
    .select("id", "name", "involved_companies")
    .filter((fn.array_contains("involved_companies", 190)) | 
            (fn.array_contains("involved_companies", 82)) |
            (fn.array_contains("involved_companies", 2406)) |
            (fn.array_contains("involved_companies", 29492)) |
            (fn.array_contains("involved_companies", 7647)))
).show(20, False)

+-----+-------------+-----------------------------+
|id   |name         |involved_companies           |
+-----+-------------+-----------------------------+
|219  |MDK          |[186, 189, 190, 43831, 43832]|
|15495|Jones on Fire|[29492, 133567, 133568]      |
+-----+-------------+-----------------------------+



In [9]:
# Exibe o Schema do DataFrame
dfIGDB_companies.printSchema()

root
 |-- change_date: long (nullable = true)
 |-- change_date_category: long (nullable = true)
 |-- changed_company_id: long (nullable = true)
 |-- checksum: string (nullable = true)
 |-- country: long (nullable = true)
 |-- created_at: long (nullable = true)
 |-- description: string (nullable = true)
 |-- developed: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- id: long (nullable = true)
 |-- logo: long (nullable = true)
 |-- name: string (nullable = true)
 |-- parent: long (nullable = true)
 |-- published: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- slug: string (nullable = true)
 |-- start_date: long (nullable = true)
 |-- start_date_category: long (nullable = true)
 |-- updated_at: long (nullable = true)
 |-- url: string (nullable = true)
 |-- websites: array (nullable = true)
 |    |-- element: long (containsNull = true)



In [10]:
dfIGDB_companies.count()

48401

#### Exploração

In [None]:
# TO-DO

# Involved Companies

### Leitura da base

In [39]:
table_name = 'involved_companies'

delta_table_path = bucket_path + table_name + '/delta/'

dfIGDB_involved_companies = DeltaTable.forPath(spark, delta_table_path).toDF()

### Conhecendo a Base

In [40]:
# Exibe o DataFrame resultante
dfIGDB_involved_companies.show(truncate=False)

+------------------------------------+-------+----------+---------+------+------+-------+---------+----------+----------+
|checksum                            |company|created_at|developer|game  |id    |porting|publisher|supporting|updated_at|
+------------------------------------+-------+----------+---------+------+------+-------+---------+----------+----------+
|98678387-4f43-7c38-4fba-e1b52d04e07c|129    |1664981310|true     |4615  |189796|false  |true     |false     |1664982753|
|9af87b13-5fad-df94-9f0a-08691a15316d|129    |1664981375|true     |213191|189797|false  |true     |false     |1665005743|
|74ea3067-9bb9-124d-a276-b3891f62fa96|1287   |1664981540|true     |214458|189800|false  |false    |false     |1665005760|
|8f72268b-727f-16a8-4883-28ec39b886bc|1809   |1664981540|true     |214458|189801|false  |false    |false     |1665005760|
|13ba02ec-801e-ef46-5086-df6ff40b0fc9|70     |1664981540|false    |214458|189802|false  |true     |false     |1665005760|
|ad8ea895-29be-687b-b46a

In [41]:
# Exibe o Schema do DataFrame
dfIGDB_involved_companies.printSchema()

root
 |-- checksum: string (nullable = true)
 |-- company: long (nullable = true)
 |-- created_at: long (nullable = true)
 |-- developer: boolean (nullable = true)
 |-- game: long (nullable = true)
 |-- id: long (nullable = true)
 |-- porting: boolean (nullable = true)
 |-- publisher: boolean (nullable = true)
 |-- supporting: boolean (nullable = true)
 |-- updated_at: long (nullable = true)



In [42]:
dfIGDB_involved_companies.count()

170891

#### Exploração

In [None]:
# TO-DO

# Websites

### Leitura da base

In [43]:
table_name = 'websites'

delta_table_path = bucket_path + table_name + '/delta/'

dfIGDB_websites = DeltaTable.forPath(spark, delta_table_path).toDF()

### Conhecendo a Base

In [44]:
# Exibe o DataFrame resultante
dfIGDB_websites.show(truncate=False)

+--------+------------------------------------+----+---+-------+--------------------------------------------------------------+
|category|checksum                            |game|id |trusted|url                                                           |
+--------+------------------------------------+----+---+-------+--------------------------------------------------------------+
|1       |1ece8bf8-1741-757e-673f-fcd071398143|115 |11 |false  |http://www.leagueoflegends.com                                |
|1       |83fc76b2-befb-e0ec-e14d-7072bdfaaa65|120 |13 |false  |http://www.diablo3.com                                        |
|1       |632b7585-1af8-3a6e-6a62-7ea3e246cb2c|126 |17 |false  |http://us.blizzard.com/en-us/games/d2/index.html              |
|1       |ec217488-e5c3-69fb-c27f-5472ae523af1|141 |22 |false  |http://www.lucasarts.com/games/swbattlefront/                 |
|1       |a53fe91f-142a-9d74-a135-8d0f2c56aefc|228 |32 |false  |http://www.worldofwarcraft.com/wrath    

In [45]:
# Exibe o Schema do DataFrame
dfIGDB_websites.printSchema()

root
 |-- category: long (nullable = true)
 |-- checksum: string (nullable = true)
 |-- game: long (nullable = true)
 |-- id: long (nullable = true)
 |-- trusted: boolean (nullable = true)
 |-- url: string (nullable = true)



In [46]:
dfIGDB_websites.count()

524679

#### Exploração

In [None]:
# TO-DO