# Exploração de Dados - IGDB

### Configurações Iniciais

In [4]:
# Print the version banner of the Spark distribution in this environment (shell escape via `!`).
!spark-submit --version

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 3.2.1
      /_/
                        
Using Scala version 2.12.15, OpenJDK 64-Bit Server VM, 11.0.15
Branch HEAD
Compiled by user hgao on 2022-01-20T19:26:14Z
Revision 4f25b3f71238a00508a356591553f2dfa89f8290
Url https://github.com/apache/spark
Type --help for more information.


In [5]:
import os

from pyspark.sql.types import *
import pyspark.sql.functions as fn
from pyspark.sql import SparkSession
from IPython.display import HTML, display

# Stop <pre> output from wrapping so wide `DataFrame.show()` tables stay on one line.
display(HTML("<style>pre { white-space: pre !important; }</style>"))

# Local jars required by the S3A connector to talk to MinIO.
S3A_JARS = [
    "/home/jovyan/jars/aws-java-sdk-core-1.11.534.jar",
    "/home/jovyan/jars/aws-java-sdk-dynamodb-1.11.534.jar",
    "/home/jovyan/jars/aws-java-sdk-s3-1.11.534.jar",
    "/home/jovyan/jars/hadoop-aws-3.2.2.jar",
]

# MinIO connection settings are read from the environment so real credentials never
# have to be committed with the notebook; the fallbacks are the values this notebook
# previously hardcoded.
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "aulafia")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "aulafia@123")

# Spark session with the jars and settings needed for Delta Lake on MinIO (S3A).
spark = (SparkSession.builder
         # Comma-separated list without embedded newlines or indentation.
         .config("spark.jars", ",".join(S3A_JARS))
         .config("spark.jars.packages", "io.delta:delta-core_2.12:2.0.0")
         .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
         .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
         .config("spark.hadoop.fs.s3a.endpoint", MINIO_ENDPOINT)
         .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
         .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
         .config("spark.hadoop.fs.s3a.path.style.access", True)
         .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
         .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
         .getOrCreate()
        )

# NOTE(review): kept after the session is created — presumably the `delta` Python
# module only becomes importable once Spark has resolved the io.delta package.
# Confirm before moving this import to the top.
from delta.tables import DeltaTable

In [2]:
# Bucket name
bucket_name = "raw"

# Name of the API the data was extracted from (as JSON files)
api_name = 'igdb'

# Root path of this API's data inside the bucket
bucket_path = f"s3a://{bucket_name}/{api_name}/"

## Games

### Leitura da base

In [3]:
table_name = 'games'

# Delta table location: <bucket_path><table_name>/delta/
delta_table_path = f"{bucket_path}{table_name}/delta/"

# Load the Delta table at that path as a Spark DataFrame
dfIGDB_Games = DeltaTable.forPath(spark, delta_table_path).toDF()

### Conhecendo a Base

In [4]:
# Display the first 20 rows without truncating cell contents
dfIGDB_Games.show(n=20, truncate=False)

+------------------------------------------------+-----------------+-----------------------+--------------------------------------------------------------------+------------------------------------------------------+-----------------------------+--------+------------------------------------+----------+------+----------+----+--------------+----------+---------------------------------------------------------------------------------------+------------------+-------+-----+---------+----------+------------+------------------+----------+--------------------+-----+---+------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [9]:
# Print the DataFrame schema (column names, types, nullability)
dfIGDB_Games.printSchema()

root
 |-- age_ratings: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- aggregated_rating: double (nullable = true)
 |-- aggregated_rating_count: long (nullable = true)
 |-- alternative_names: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- artworks: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- bundles: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- category: long (nullable = true)
 |-- checksum: string (nullable = true)
 |-- collection: long (nullable = true)
 |-- cover: long (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- dlcs: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- expanded_games: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- expansions: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- external_games: array (nullable = true)
 |    |-- element: long (containsNull = 

#### Todos os registros do Dataframe são únicos, considerando a chave ID

In [5]:
# Total number of rows in the games DataFrame
dfIGDB_Games.count()

249891

In [6]:
# Number of distinct ids; it equals the total row count when `id` is a unique key
dfIGDB_Games.select("id").distinct().count()

249891

#### Describe dos campos numéricos e de texto

In [11]:
# Summary statistics per column; the recorded output covers numeric and string columns
dfIGDB_Games.describe().show()

+-------+-----------------+-----------------------+------------------+--------------------+------------------+------------------+------------------+------------------+------------------+------------------+--------+-----------------+------------------+------------------+--------+------------------+--------------------+--------------------+------------------+------------------+--------------------+-----------------+------------------+
|summary|aggregated_rating|aggregated_rating_count|          category|            checksum|        collection|             cover|           follows|         franchise|             hypes|                id|    name|      parent_game|            rating|      rating_count|    slug|            status|           storyline|             summary|      total_rating|total_rating_count|                 url|   version_parent|     version_title|
+-------+-----------------+-----------------------+------------------+--------------------+------------------+----------------

#### Converte os campos de data de Unix Timestamp (epoch) para Timestamp

In [7]:
# Convert the epoch-seconds columns to TimestampType.
# A direct cast replaces the original from_unixtime -> to_timestamp round trip:
# it avoids the intermediate string formatted and re-parsed in the session time
# zone, and it is a no-op on a column that is already a timestamp, so the cell
# can be re-run safely.
# NOTE(review): assumes the default (non-ANSI) cast rules — confirm if
# spark.sql.ansi.enabled is ever turned on.
dfIGDB_Games = (dfIGDB_Games
                .withColumn("created_at", fn.col("created_at").cast("timestamp"))
                .withColumn("first_release_date", fn.col("first_release_date").cast("timestamp"))
                .withColumn("updated_at", fn.col("updated_at").cast("timestamp"))
               )

In [7]:
import pandas as pd

# Collect the whole Spark DataFrame to the driver as a pandas DataFrame.
pandas_df = dfIGDB_Games.toPandas()

In [34]:
pip install openpyxl


Collecting openpyxl
  Downloading openpyxl-3.1.2-py2.py3-none-any.whl (249 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.0/250.0 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting et-xmlfile
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.2
Note: you may need to restart the kernel to use updated packages.


In [11]:
# Drop the free-text `summary` column. errors='ignore' keeps the cell re-runnable:
# without it a second execution raises KeyError because the column is already gone.
pandas_df = pandas_df.drop(columns=['summary'], errors='ignore')

In [10]:
# Drop the free-text `storyline` column. errors='ignore' keeps the cell re-runnable:
# without it a second execution raises KeyError because the column is already gone.
pandas_df = pandas_df.drop(columns=['storyline'], errors='ignore')

In [12]:
# Number of non-null values in each column
pandas_df.count()

age_ratings                 56161
aggregated_rating           14942
aggregated_rating_count     14942
alternative_names           56117
artworks                    92643
bundles                     10890
category                   244906
checksum                   244906
collection                  36584
cover                      181298
created_at                 244906
dlcs                         2536
expanded_games               1119
expansions                    665
external_games             231278
first_release_date         154862
follows                     23321
forks                          48
franchise                    1516
franchises                  18681
game_engines                21096
game_localizations          22475
game_modes                 129052
genres                     196762
hypes                        9550
id                         244906
involved_companies         104535
keywords                    81466
language_supports          111127
multiplayer_mo

In [13]:
# Export the pandas DataFrame to CSV without the index column.
# NOTE(review): the destination is a hardcoded absolute path inside the notebook container.
pandas_df.to_csv("/home/jovyan/notebooks/dfIGDB_Games.csv", index=False)

#### Exploração

In [80]:
# Most recently updated games (relative to the last ingestion before this analysis)
(dfIGDB_Games
 .select("name", "updated_at")
 .sort(fn.desc("updated_at"))
 .show(n=10, truncate=False))

+---------------------------+----------+
|name                       |updated_at|
+---------------------------+----------+
|TheHunter: Call of the Wild|1692054284|
|Paper Planet               |1692053424|
|For Honor                  |1692053377|
|Ghosts of Tabor            |1692053339|
|Battlefield 2042           |1692053337|
|Colossal Cave              |1692053314|
|The Darkside Detective     |1692050900|
|Roguelight                 |1692050898|
|Neon Tail                  |1692050891|
|Dungeon Nightmares         |1692050884|
+---------------------------+----------+
only showing top 10 rows



In [13]:
# Highest-rated games, ordered by `rating` only
(dfIGDB_Games
 .select("id", "name", "rating")
 .sort(fn.desc("rating"))
 .show(n=10, truncate=False))

+------+----------------------------------------+-----------------+
|id    |name                                    |rating           |
+------+----------------------------------------+-----------------+
|8863  |Age of Wonders III: Golden Realms       |100.0            |
|88973 |Goblin Sword                            |99.94224659928568|
|20196 |Metal Gear Solid: The Legacy Collection |99.68039606212281|
|41888 |Battlefield 3: Premium Edition          |99.67350381707095|
|50261 |Angband                                 |99.63776240578409|
|45131 |Grand Theft Auto V: Special Edition     |99.59974553815321|
|204360|Goat Simulator 3                        |99.5558811519708 |
|11226 |Anstoss 3                               |99.53177623237173|
|122661|Gwent: Iron Judgment                    |99.38977451446661|
|164664|Sekiro: Shadows Die Twice - GOTY Edition|99.36728476578347|
+------+----------------------------------------+-----------------+
only showing top 10 rows



## Genres

### Leitura da base

In [64]:
table_name = 'genres'

# Delta table location: <bucket_path><table_name>/delta/
delta_table_path = f"{bucket_path}{table_name}/delta/"

# Load the Delta table at that path as a Spark DataFrame
dfIGDB_Genres = DeltaTable.forPath(spark, delta_table_path).toDF()

### Conhecendo a Base

In [15]:
# Display the DataFrame without truncating cell contents
dfIGDB_Genres.show(truncate=False)

+------------------------------------+----------+---+--------------------------+-------------------------+----------+-----------------------------------------------------+
|checksum                            |created_at|id |name                      |slug                     |updated_at|url                                                  |
+------------------------------------+----------+---+--------------------------+-------------------------+----------+-----------------------------------------------------+
|ef2ff68a-f7bd-d2d0-76cb-c830bd6e3191|1297555200|2  |Point-and-click           |point-and-click          |1323302400|https://www.igdb.com/genres/point-and-click          |
|2ccc6572-bdde-6ed4-8843-25447ea40782|1297555200|4  |Fighting                  |fighting                 |1323216000|https://www.igdb.com/genres/fighting                 |
|bb15fd3f-0f46-e5f3-2b40-d046cf9bd2ef|1297555200|5  |Shooter                   |shooter                  |1323216000|https://www.igdb.com/ge

In [66]:
# Print the DataFrame schema (column names, types, nullability)
dfIGDB_Genres.printSchema()

root
 |-- checksum: string (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- slug: string (nullable = true)
 |-- updated_at: timestamp (nullable = true)
 |-- url: string (nullable = true)



In [17]:
# Total number of rows in the genres DataFrame
dfIGDB_Genres.count()

23

#### Converte os campos de data de Unix Timestamp (epoch) para Timestamp

In [65]:
# Convert the epoch-seconds columns to TimestampType.
# A direct cast replaces the original from_unixtime -> to_timestamp round trip:
# it avoids the intermediate string formatted and re-parsed in the session time
# zone, and it is a no-op on a column that is already a timestamp, so the cell
# can be re-run safely.
# NOTE(review): assumes the default (non-ANSI) cast rules — confirm if
# spark.sql.ansi.enabled is ever turned on.
dfIGDB_Genres = (dfIGDB_Genres
                .withColumn("created_at", fn.col("created_at").cast("timestamp"))
                .withColumn("updated_at", fn.col("updated_at").cast("timestamp"))
               )

#### Exploração

In [67]:
# Number of games per genre. A game with several genres is counted once per genre;
# explode_outer keeps games without genres as a row with a null genre_id.
dfGamesByGenre = (dfIGDB_Games
 .select("id", fn.explode_outer("genres").alias("genre_id"))
 .groupBy("genre_id")
 .agg(fn.count("id").alias("games"))
 .orderBy("genre_id")
)

# Denominator for the percentages: total number of (game, genre) pairs,
# which is larger than the number of distinct games.
total_titles = dfGamesByGenre.agg(fn.sum("games")).first()[0]

(dfGamesByGenre
 .join(dfIGDB_Genres, dfGamesByGenre.genre_id == dfIGDB_Genres.id, how="left")
 # Rows with no matching genre (including the null genre_id) are labelled "Undefined".
 .withColumn("name", fn.coalesce(fn.col("name"), fn.lit("Undefined")))
 # Reference the count column by its exact name ("games") so the expression does
 # not depend on case-insensitive column resolution.
 .withColumn("Percent", fn.round(((fn.col("games") / total_titles) * 100), 2))
 .select(fn.col("name").alias("Genre"), fn.col("games").alias("Games"), "Percent")
 .orderBy(fn.col("Games").desc())
).show(truncate=False)

+--------------------------+-----+-------+
|Genre                     |Games|Percent|
+--------------------------+-----+-------+
|Indie                     |77364|18.56  |
|Adventure                 |63632|15.27  |
|Undefined                 |48187|11.56  |
|Simulator                 |35272|8.46   |
|Strategy                  |32439|7.78   |
|Role-playing (RPG)        |26778|6.42   |
|Puzzle                    |21036|5.05   |
|Shooter                   |17856|4.28   |
|Arcade                    |17344|4.16   |
|Platform                  |14865|3.57   |
|Sport                     |14439|3.46   |
|Racing                    |10149|2.44   |
|Visual Novel              |7596 |1.82   |
|Fighting                  |5160 |1.24   |
|Turn-based strategy (TBS) |3753 |0.9    |
|Point-and-click           |3710 |0.89   |
|Hack and slash/Beat 'em up|3571 |0.86   |
|Music                     |3048 |0.73   |
|Card & Board Game         |2994 |0.72   |
|Tactical                  |2653 |0.64   |
+----------

## Game Modes

### Leitura da base

In [68]:
table_name = 'game_modes'

# Delta table location: <bucket_path><table_name>/delta/
delta_table_path = f"{bucket_path}{table_name}/delta/"

# Load the Delta table at that path as a Spark DataFrame
dfIGDB_GameModes = DeltaTable.forPath(spark, delta_table_path).toDF()

### Conhecendo a Base

In [69]:
# Display the DataFrame without truncating cell contents
dfIGDB_GameModes.show(truncate=False)

+------------------------------------+----------+---+----------------------------------+--------------------------------+----------+----------------------------------------------------------------+
|checksum                            |created_at|id |name                              |slug                            |updated_at|url                                                             |
+------------------------------------+----------+---+----------------------------------+--------------------------------+----------+----------------------------------------------------------------+
|a43b1688-f968-3541-0897-9735ffde1745|1298937600|1  |Single player                     |single-player                   |1323216000|https://www.igdb.com/game_modes/single-player                   |
|288b548c-11e4-d910-f037-21d4e6a33b38|1298937600|2  |Multiplayer                       |multiplayer                     |1323216000|https://www.igdb.com/game_modes/multiplayer                     |
|e8329d55-

In [72]:
# Print the DataFrame schema (column names, types, nullability)
dfIGDB_GameModes.printSchema()

root
 |-- checksum: string (nullable = true)
 |-- created_at: long (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- slug: string (nullable = true)
 |-- updated_at: long (nullable = true)
 |-- url: string (nullable = true)



In [51]:
# Total number of rows in the game modes DataFrame
dfIGDB_GameModes.count()

6

#### Converte os campos de data de Unix Timestamp (epoch) para Timestamp

In [73]:
# Convert the epoch-seconds columns to TimestampType.
# A direct cast replaces the original from_unixtime -> to_timestamp round trip:
# it avoids the intermediate string formatted and re-parsed in the session time
# zone, and it is a no-op on a column that is already a timestamp, so the cell
# can be re-run safely.
# NOTE(review): assumes the default (non-ANSI) cast rules — confirm if
# spark.sql.ansi.enabled is ever turned on.
dfIGDB_GameModes = (dfIGDB_GameModes
                .withColumn("created_at", fn.col("created_at").cast("timestamp"))
                .withColumn("updated_at", fn.col("updated_at").cast("timestamp"))
               )

#### Exploração

In [75]:
# Number of games per game mode. A game with several modes is counted once per mode;
# explode_outer keeps games without game modes as a row with a null gmode_id.
dfGamesByGameMode = (dfIGDB_Games
 .select("id", fn.explode_outer("game_modes").alias("gmode_id"))
 .groupBy("gmode_id")
 .agg(fn.count("id").alias("games"))
 .orderBy("gmode_id")
)

# Denominator for the percentages: total number of (game, game mode) pairs,
# which is larger than the number of distinct games.
total_titles = dfGamesByGameMode.agg(fn.sum("games")).first()[0]

(dfGamesByGameMode
 .join(dfIGDB_GameModes, dfGamesByGameMode.gmode_id == dfIGDB_GameModes.id, how="left")
 # Rows with no matching game mode (including the null gmode_id) are labelled "Undefined".
 .withColumn("name", fn.coalesce(fn.col("name"), fn.lit("Undefined")))
 # Reference the count column by its exact name ("games") so the expression does
 # not depend on case-insensitive column resolution.
 .withColumn("Percent", fn.round(((fn.col("games") / total_titles) * 100), 2))
 .select(fn.col("name").alias("Game Mode"), fn.col("games").alias("Games"), "Percent")
 .orderBy(fn.col("Games").desc())
).show(truncate=False)

+----------------------------------+------+-------+
|Game Mode                         |Games |Percent|
+----------------------------------+------+-------+
|Single player                     |121367|42.54  |
|Undefined                         |115136|40.36  |
|Multiplayer                       |29065 |10.19  |
|Co-operative                      |12109 |4.24   |
|Massively Multiplayer Online (MMO)|3586  |1.26   |
|Split screen                      |3500  |1.23   |
|Battle Royale                     |523   |0.18   |
+----------------------------------+------+-------+



## Player Perspectives

### Leitura da base

In [16]:
table_name = 'player_perspectives'

# Delta table location: <bucket_path><table_name>/delta/
delta_table_path = f"{bucket_path}{table_name}/delta/"

# Load the Delta table at that path as a Spark DataFrame
dfIGDB_PlayerPersp = DeltaTable.forPath(spark, delta_table_path).toDF()

### Conhecendo a Base

In [20]:
# Display the DataFrame without truncating cell contents
dfIGDB_PlayerPersp.show(truncate=False)

+------------------------------------+-------------------+---+---------------------+-------------------------+-------------------+------------------------------------------------------------------+
|checksum                            |created_at         |id |name                 |slug                     |updated_at         |url                                                               |
+------------------------------------+-------------------+---+---------------------+-------------------------+-------------------+------------------------------------------------------------------+
|4e23cb22-7a70-effb-b8e1-151317c6cdbd|2011-03-01 00:00:00|1  |First person         |first-person             |2011-12-07 00:00:00|https://www.igdb.com/player_perspectives/first-person             |
|2788b856-580c-66d0-bef3-d6169034f175|2011-03-01 00:00:00|2  |Third person         |third-person             |2011-12-07 00:00:00|https://www.igdb.com/player_perspectives/third-person             |
|83c59132-

In [19]:
# Print the DataFrame schema (column names, types, nullability)
dfIGDB_PlayerPersp.printSchema()

root
 |-- checksum: string (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- slug: string (nullable = true)
 |-- updated_at: timestamp (nullable = true)
 |-- url: string (nullable = true)



In [11]:
# Total number of rows in the player perspectives DataFrame
dfIGDB_PlayerPersp.count()

7

#### Converte os campos de data de Unix Timestamp (epoch) para Timestamp

In [18]:
# Convert the epoch-seconds columns to TimestampType.
# A direct cast replaces the original from_unixtime -> to_timestamp round trip:
# it avoids the intermediate string formatted and re-parsed in the session time
# zone, and it is a no-op on a column that is already a timestamp, so the cell
# can be re-run safely.
# NOTE(review): assumes the default (non-ANSI) cast rules — confirm if
# spark.sql.ansi.enabled is ever turned on.
dfIGDB_PlayerPersp = (dfIGDB_PlayerPersp
                .withColumn("created_at", fn.col("created_at").cast("timestamp"))
                .withColumn("updated_at", fn.col("updated_at").cast("timestamp"))
               )

#### Exploração

In [98]:
# Number of games per player perspective. A game with several perspectives is counted
# once per perspective; explode_outer keeps games without perspectives as a row with
# a null persp_id.
dfGamesByPersp = (dfIGDB_Games
 .select("id", fn.explode_outer("player_perspectives").alias("persp_id"))
 .groupBy("persp_id")
 .agg(fn.count("id").alias("games"))
 .orderBy("persp_id")
)

# Denominator for the percentages: total number of (game, perspective) pairs,
# which is larger than the number of distinct games.
total_titles = dfGamesByPersp.agg(fn.sum("games")).first()[0]

(dfGamesByPersp
 .join(dfIGDB_PlayerPersp, dfGamesByPersp.persp_id == dfIGDB_PlayerPersp.id, how="left")
 # Rows with no matching perspective (including the null persp_id) are labelled "Undefined".
 .withColumn("name", fn.coalesce(fn.col("name"), fn.lit("Undefined")))
 # Reference the count column by its exact name ("games") so the expression does
 # not depend on case-insensitive column resolution.
 .withColumn("Percent", fn.round(((fn.col("games") / total_titles) * 100), 2))
 .select(fn.col("name").alias("Player Perspective"), fn.col("games").alias("Games"), "Percent")
 .orderBy(fn.col("Games").desc())
).show(truncate=False)

+---------------------+------+-------+
|Player Perspective   |Games |Percent|
+---------------------+------+-------+
|Undefined            |164837|64.69  |
|Bird view / Isometric|22132 |8.69   |
|Side view            |20627 |8.1    |
|Third person         |20148 |7.91   |
|First person         |17486 |6.86   |
|Text                 |6660  |2.61   |
|Virtual Reality      |2293  |0.9    |
|Auditory             |611   |0.24   |
+---------------------+------+-------+



## Platforms

### Leitura da base

In [22]:
table_name = 'platforms'

# Delta table location: <bucket_path><table_name>/delta/
delta_table_path = f"{bucket_path}{table_name}/delta/"

# Load the Delta table at that path as a Spark DataFrame
dfIGDB_Platforms = DeltaTable.forPath(spark, delta_table_path).toDF()

### Conhecendo a Base

In [24]:
# Display the first 20 rows without truncating cell contents
dfIGDB_Platforms.show(n=20, truncate=False)

+------------+--------------------+--------+------------------------------------+----------+----------+---+-----------------------------------+---------------+-------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+--------------------------------------+-----------------------------------------------------------+--------+
|abbreviation|alternative_name    |category|checksum                            |created_at|generation|id |name                               |platform_family|platform_logo|slug   |summary                                                                                                                                                                      

In [41]:
# Platforms that have a summary: show them, then print how many there are
_has_summary = fn.col("summary").isNotNull()
dfIGDB_Platforms.where(_has_summary).show(truncate=False)
print(dfIGDB_Platforms.where(_has_summary).count())

+------------+----------------------+--------+------------------------------------+-------------------+----------+---+-------------+---------------+-------------+---------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [111]:
# Print the DataFrame schema (column names, types, nullability)
dfIGDB_Platforms.printSchema()

root
 |-- abbreviation: string (nullable = true)
 |-- alternative_name: string (nullable = true)
 |-- category: long (nullable = true)
 |-- checksum: string (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- generation: long (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- platform_family: long (nullable = true)
 |-- platform_logo: long (nullable = true)
 |-- slug: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- updated_at: timestamp (nullable = true)
 |-- url: string (nullable = true)
 |-- versions: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- websites: array (nullable = true)
 |    |-- element: long (containsNull = true)



In [26]:
# Total number of rows in the platforms DataFrame
dfIGDB_Platforms.count()

200

#### Converte os campos de data de Unix Timestamp (epoch) para Timestamp

In [25]:
# Convert the epoch-seconds columns to TimestampType.
# A direct cast replaces the original from_unixtime -> to_timestamp round trip:
# it avoids the intermediate string formatted and re-parsed in the session time
# zone, and it is a no-op on a column that is already a timestamp, so the cell
# can be re-run safely.
# NOTE(review): assumes the default (non-ANSI) cast rules — confirm if
# spark.sql.ansi.enabled is ever turned on.
dfIGDB_Platforms = (dfIGDB_Platforms
                .withColumn("created_at", fn.col("created_at").cast("timestamp"))
                .withColumn("updated_at", fn.col("updated_at").cast("timestamp"))
               )

#### Exploração

In [113]:
# Number of games per platform. A game released on several platforms is counted once
# per platform; explode_outer keeps games without platforms as a row with a null
# platform_id.
dfGamesByPlatform = (dfIGDB_Games
 .select("id", fn.explode_outer("platforms").alias("platform_id"))
 .groupBy("platform_id")
 .agg(fn.count("id").alias("games"))
 .orderBy("platform_id")
)

# Denominator for the percentages: total number of (game, platform) pairs,
# which is larger than the number of distinct games.
total_titles = dfGamesByPlatform.agg(fn.sum("games")).first()[0]

(dfGamesByPlatform
 .join(dfIGDB_Platforms, dfGamesByPlatform.platform_id == dfIGDB_Platforms.id, how="left")
 # Rows with no matching platform (including the null platform_id) are labelled "Undefined".
 .withColumn("name", fn.coalesce(fn.col("name"), fn.lit("Undefined")))
 # Reference the count column by its exact name ("games") so the expression does
 # not depend on case-insensitive column resolution.
 .withColumn("Percent", fn.round(((fn.col("games") / total_titles) * 100), 2))
 .select(fn.col("name").alias("Platforms"), fn.col("games").alias("Games"), "Percent")
 .orderBy(fn.col("Games").desc())
).show(truncate=False)

+----------------------+-----+-------+
|Platforms             |Games|Percent|
+----------------------+-----+-------+
|PC (Microsoft Windows)|94060|26.09  |
|Undefined             |81010|22.47  |
|Mac                   |20452|5.67   |
|Nintendo Switch       |14773|4.1    |
|iOS                   |12768|3.54   |
|PlayStation 4         |12453|3.45   |
|Linux                 |12354|3.43   |
|Android               |10328|2.87   |
|Xbox One              |10245|2.84   |
|Web browser           |5719 |1.59   |
|PlayStation 3         |3954 |1.1    |
|PlayStation 5         |3896 |1.08   |
|Arcade                |3687 |1.02   |
|PlayStation 2         |3653 |1.01   |
|Xbox Series X|S       |3543 |0.98   |
|Xbox 360              |3395 |0.94   |
|DOS                   |3362 |0.93   |
|PlayStation           |3084 |0.86   |
|Wii                   |2770 |0.77   |
|Commodore C64/128/MAX |2601 |0.72   |
+----------------------+-----+-------+
only showing top 20 rows



## External Games

### Leitura da base

In [4]:
table_name = 'external_games'

# Delta table location: <bucket_path><table_name>/delta/
delta_table_path = f"{bucket_path}{table_name}/delta/"

# Load the Delta table at that path as a Spark DataFrame
dfIGDB_ExtGames = DeltaTable.forPath(spark, delta_table_path).toDF()

### Conhecendo a Base

In [5]:
# Display the DataFrame without truncating cell contents
dfIGDB_ExtGames.show(truncate=False)

+--------+------------------------------------+----------+-----+---+---------------------------------------------------------------+------+----------+----+----+
|category|checksum                            |created_at|game |id |name                                                           |uid   |updated_at|url |year|
+--------+------------------------------------+----------+-----+---+---------------------------------------------------------------+------+----------+----+----+
|1       |e31731d6-678a-31e0-837b-7749db2f03b0|1494460800|28590|1  |Crappy Day Enhanced Edition                                    |636700|1494547200|null|null|
|1       |2e9851e5-0312-b3ca-abe1-e844a38d1da3|1494493232|28591|2  |Zombie Kill                                                    |636430|1633092888|null|null|
|1       |efbc0b6c-aa73-e43a-f587-11108f326d91|1494493238|28592|3  |Tesla vs Lovecraft                                             |636100|1643444761|null|null|
|1       |e61c5382-0391-5a75-0a1c-

In [9]:
# Print the DataFrame schema (column names, types, nullability)
dfIGDB_ExtGames.printSchema()

root
 |-- category: long (nullable = true)
 |-- checksum: string (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- game: long (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- uid: string (nullable = true)
 |-- updated_at: timestamp (nullable = true)
 |-- url: string (nullable = true)
 |-- year: long (nullable = true)



In [7]:
# Total number of rows in the external games DataFrame
dfIGDB_ExtGames.count()

472539

#### Converte os campos de data de Unix Timestamp (epoch) para Timestamp

In [8]:
# Convert the epoch-seconds columns to TimestampType.
# A direct cast replaces the original from_unixtime -> to_timestamp round trip:
# it avoids the intermediate string formatted and re-parsed in the session time
# zone, and it is a no-op on a column that is already a timestamp, so the cell
# can be re-run safely.
# NOTE(review): assumes the default (non-ANSI) cast rules — confirm if
# spark.sql.ansi.enabled is ever turned on.
dfIGDB_ExtGames = (dfIGDB_ExtGames
                .withColumn("created_at", fn.col("created_at").cast("timestamp"))
                .withColumn("updated_at", fn.col("updated_at").cast("timestamp"))
               )

#### Definição do Enum 'category'

(Os Enums serão criados na camada context, e são fornecidos pelo IGDB na documentação da API. Exemplo utilizado disponível em: https://api-docs.igdb.com/?python#external-game)

In [18]:
# Lookup table for the external-game `category` enum: one (name, value) row per entry.
schema = (StructType()
          .add("name", StringType(), True)
          .add("value", IntegerType(), True))

data = [
    ("steam", 1), ("gog", 5), ("youtube", 10), ("microsoft", 11),
    ("apple", 13), ("twitch", 14), ("android", 15), ("amazon_asin", 20),
    ("amazon_luna", 22), ("amazon_adg", 23), ("epic_game_store", 26), ("oculus", 28),
    ("utomik", 29), ("itch_io", 30), ("xbox_marketplace", 31), ("kartridge", 32),
    ("playstation_store_us", 36), ("focus_entertainment", 37),
    ("xbox_game_pass_ultimate_cloud", 54), ("gamejolt", 55),
]

df_EnumCategory = spark.createDataFrame(data, schema=schema)

In [None]:
# Display the enum lookup table
df_EnumCategory.show()

+--------------------+-----+
|                name|value|
+--------------------+-----+
|               steam|    1|
|                 gog|    5|
|             youtube|   10|
|           microsoft|   11|
|               apple|   13|
|              twitch|   14|
|             android|   15|
|         amazon_asin|   20|
|         amazon_luna|   22|
|          amazon_adg|   23|
|     epic_game_store|   26|
|              oculus|   28|
|              utomik|   29|
|             itch_io|   30|
|    xbox_marketplace|   31|
|           kartridge|   32|
|playstation_store_us|   36|
| focus_entertainment|   37|
|xbox_game_pass_ul...|   54|
|            gamejolt|   55|
+--------------------+-----+



#### Exploração

In [24]:
# Número de Games por Plataforma (considerando multi-classificação)
dfGamesByExtPlatform = (dfIGDB_ExtGames
 .groupBy(fn.col("category"))
 .agg(fn.count("id").alias("Games"))
 .orderBy(fn.col("Games").desc())
)

total_titles = dfGamesByExtPlatform.agg(fn.sum("Games")).collect()[0][0]

(dfGamesByExtPlatform
 .join(df_EnumCategory, dfGamesByExtPlatform.category == df_EnumCategory.value, how="left")
 .withColumn("name", fn.when(fn.col("name").isNull(), "Unknown")
                             .otherwise(fn.col("name")))
 .withColumn("Percent", fn.round(((fn.col("Games") / total_titles) * 100), 2))
 .select(fn.col("name").alias("External Platforms"), fn.col("Games"), "Percent")
 .orderBy(fn.col("Games").desc())
).show(truncate=False)

+-----------------------------+------+-------+
|External Platforms           |Games |Percent|
+-----------------------------+------+-------+
|twitch                       |199858|42.29  |
|steam                        |102246|21.64  |
|Unknown                      |73093 |15.47  |
|amazon_asin                  |32255 |6.83   |
|itch_io                      |11332 |2.4    |
|microsoft                    |10995 |2.33   |
|apple                        |9671  |2.05   |
|playstation_store_us         |7734  |1.64   |
|gog                          |6913  |1.46   |
|epic_game_store              |6239  |1.32   |
|android                      |3016  |0.64   |
|utomik                       |2984  |0.63   |
|xbox_marketplace             |1773  |0.38   |
|youtube                      |1376  |0.29   |
|kartridge                    |925   |0.2    |
|gamejolt                     |674   |0.14   |
|oculus                       |404   |0.09   |
|xbox_game_pass_ultimate_cloud|388   |0.08   |
|amazon_adg  