# Exploração de Dados - HLTB

### Configurações Iniciais

In [1]:
# Print the installed Spark, Scala and JVM versions for reference
!spark-submit --version

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 3.2.1
      /_/
                        
Using Scala version 2.12.15, OpenJDK 64-Bit Server VM, 11.0.15
Branch HEAD
Compiled by user hgao on 2022-01-20T19:26:14Z
Revision 4f25b3f71238a00508a356591553f2dfa89f8290
Url https://github.com/apache/spark
Type --help for more information.


In [1]:
import os

from pyspark.sql.types import *
import pyspark.sql.functions as fn
from pyspark.sql import SparkSession
from IPython.core.display import HTML

# Stop the notebook from wrapping <pre> output so wide DataFrame.show() tables stay on one line.
display(HTML("<style>pre { white-space: pre !important; }</style>"))

# MinIO connection settings. They are read from the environment so credentials do not
# have to live in the notebook; the fallbacks are the values this notebook used before,
# so it still runs unchanged when the variables are not set.
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "aulafia")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "aulafia@123")

# Local jars needed by the S3A connector used to reach MinIO.
# Joined explicitly so the setting is a plain comma-separated list.
S3A_JARS = ",".join([
    "/home/jovyan/jars/aws-java-sdk-core-1.11.534.jar",
    "/home/jovyan/jars/aws-java-sdk-dynamodb-1.11.534.jar",
    "/home/jovyan/jars/aws-java-sdk-s3-1.11.534.jar",
    "/home/jovyan/jars/hadoop-aws-3.2.2.jar",
])

# Spark session configured with the Delta Lake extension/catalog and S3A access to MinIO.
spark = (SparkSession.builder
         .config("spark.jars", S3A_JARS)
         .config("spark.jars.packages", "io.delta:delta-core_2.12:2.0.0")
         .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
         .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
         .config("spark.hadoop.fs.s3a.endpoint", MINIO_ENDPOINT)
         .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
         .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
         .config("spark.hadoop.fs.s3a.path.style.access", True)
         .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
         .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
         .getOrCreate()
        )

# NOTE(review): kept after the session is created, as in the original cell — presumably the
# delta Python module only becomes importable once spark.jars.packages has been resolved.
# Confirm before moving this import to the top.
from delta.tables import DeltaTable

In [2]:
# Source bucket, and the API whose JSON extracts were loaded into it
bucket_name = "raw"
api_name = 'hltb'

# Base S3A path of this API's data inside the bucket
bucket_path = f"s3a://{bucket_name}/{api_name}/"

In [3]:
table_name = 'games'

# Path of the control table stored next to the table's data
control_table_path = f"{bucket_path}{table_name}/control_table/"

# Load the control table (Parquet) and display its rows
df = spark.read.parquet(control_table_path)
df.show()

+-------------------+----------+-----------+---------+----------+--------------+
|          Exec_date| Exec_time|Loaded_date|Read_Rows|Table_Rows|Total_duration|
+-------------------+----------+-----------+---------+----------+--------------+
|2023-10-24 02:40:03|1698126003| 2023-10-23|    66712|     66717|      00:00:39|
|2023-10-07 15:40:31|1696704031| 2023-10-07|    66563|     66567|      00:00:43|
|2023-10-06 13:38:38|1696610318| 2023-10-06|    66566|     66566|      00:00:26|
+-------------------+----------+-----------+---------+----------+--------------+



## Games

### Leitura da base

In [5]:
table_name = 'games'

# Path of the Delta table holding the games data
delta_table_path = f"{bucket_path}{table_name}/delta/"

# Open the Delta table at that path and expose it as a DataFrame
games_delta_table = DeltaTable.forPath(spark, delta_table_path)
dfHLTB_Games = games_delta_table.toDF()

### Conhecendo a Base

In [13]:
# Display the first 20 rows without truncating column values
dfHLTB_Games.show(n=20, truncate=False)

+--------+--------------+--------+--------------+-----------+----------------+-----------+-----------+------------+---------+---------------+---------+---------------+-------------+----------+-------------+-------------+------------+--------------+-------------------+-----------------------------------------+-------+---------------------------------------------+-------------------------------------+--------------+---------+---+-----------+-----------------+-----------+-----------------+----------------------------------------------------------+-------------------------------------------------------------------------------------+---------------+-------------+-------------+------------+
|comp_100|comp_100_count|comp_all|comp_all_count|comp_lvl_co|comp_lvl_combine|comp_lvl_mp|comp_lvl_sp|comp_lvl_spd|comp_main|comp_main_count|comp_plus|comp_plus_count|count_backlog|count_comp|count_playing|count_retired|count_review|count_speedrun|extracted_datetime |game_alias                            

In [12]:
# Print the DataFrame schema (column names, types and nullability)
dfHLTB_Games.printSchema()

root
 |-- comp_100: long (nullable = true)
 |-- comp_100_count: long (nullable = true)
 |-- comp_all: long (nullable = true)
 |-- comp_all_count: long (nullable = true)
 |-- comp_lvl_co: long (nullable = true)
 |-- comp_lvl_combine: long (nullable = true)
 |-- comp_lvl_mp: long (nullable = true)
 |-- comp_lvl_sp: long (nullable = true)
 |-- comp_lvl_spd: long (nullable = true)
 |-- comp_main: long (nullable = true)
 |-- comp_main_count: long (nullable = true)
 |-- comp_plus: long (nullable = true)
 |-- comp_plus_count: long (nullable = true)
 |-- count_backlog: long (nullable = true)
 |-- count_comp: long (nullable = true)
 |-- count_playing: long (nullable = true)
 |-- count_retired: long (nullable = true)
 |-- count_review: long (nullable = true)
 |-- count_speedrun: long (nullable = true)
 |-- extracted_datetime: timestamp (nullable = true)
 |-- game_alias: string (nullable = true)
 |-- game_id: long (nullable = true)
 |-- game_image: string (nullable = true)
 |-- game_name: string 

#### Todos os registros do Dataframe são únicos, considerando a chave ID

In [8]:
# Total number of rows in the games DataFrame
dfHLTB_Games.count()

66717

In [9]:
# Number of distinct "id" values; it matches the total row count when every id is unique
distinct_game_ids = dfHLTB_Games.select("id").distinct()
distinct_game_ids.count()

66717

#### Describe dos campos numéricos

In [10]:
# Show summary statistics for the DataFrame's columns
dfHLTB_Games.describe().show()

+-------+-----------------+-----------------+-----------------+------------------+------------------+-------------------+------------------+-------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+--------------------+----------+------------------+--------------------+-----------------------+-------------------+---------+-----------------+------------------+------------------+-----------------+-------------------+-----------+----------------+------------------+-----------------+-----------------+------------------+
|summary|         comp_100|   comp_100_count|         comp_all|    comp_all_count|       comp_lvl_co|   comp_lvl_combine|       comp_lvl_mp|        comp_lvl_sp|       comp_lvl_spd|         comp_main|   comp_main_count|         comp_plus|   comp_plus_count|     count_backlog|        count_comp|     count_

#### Converte o campo de data de Unix Timestamp (epoch) para Timestamp

In [11]:
# Convert "extracted_datetime" from a Unix time value into a timestamp column.
# Assumes the stored value is epoch seconds, which is what from_unixtime expects — TODO confirm.
#
# The result is assigned back to the same DataFrame name, so this cell is guarded on the
# column's current type: re-running it (or running cells out of order) must not feed an
# already-converted timestamp column back into from_unixtime.
if dict(dfHLTB_Games.dtypes)["extracted_datetime"] != "timestamp":
    dfHLTB_Games = dfHLTB_Games.withColumn(
        "extracted_datetime",
        fn.to_timestamp(fn.from_unixtime("extracted_datetime")),
    )

#### Exploração

In [16]:
# Games atualizados recentemente (considerando a última ingestão até a data da análise) – considera a data de extração
(dfHLTB_Games
 .select("game_name", "extracted_datetime")
 .orderBy(fn.col("extracted_datetime").desc())
).show(10, False)

+--------------------------------+-------------------+
|game_name                       |extracted_datetime |
+--------------------------------+-------------------+
|Thief II: The Metal Age         |2023-10-24 02:00:16|
|Thief: The Dark Project         |2023-10-24 02:00:16|
|Thief: Deadly Shadows           |2023-10-24 02:00:16|
|Thief                           |2023-10-24 02:00:16|
|Baldur's Gate                   |2023-10-24 02:00:16|
|Baldur's Gate II: Shadows of Amn|2023-10-24 02:00:16|
|Jagged Alliance                 |2023-10-24 02:00:16|
|Jagged Alliance: Deadly Games   |2023-10-24 02:00:16|
|Jagged Alliance 2               |2023-10-24 02:00:16|
|Jade Empire: Special Edition    |2023-10-24 02:00:16|
+--------------------------------+-------------------+
only showing top 10 rows



In [18]:
# Games mais bem avaliados
(dfHLTB_Games
 .select("id", "game_name", "review_score")
 .orderBy(fn.col("review_score").desc())
).show(10, False)

+----+------------------------------------------+------------+
|id  |game_name                                 |review_score|
+----+------------------------------------------+------------+
|2500|Nemesis: The Wizardry Adventure           |100         |
|5158|Ski-Doo: Snowmobile Challenge             |100         |
|2811|Cartoon Network Universe: FusionFall      |100         |
|208 |Their Finest Hour: The Battle of Britain  |100         |
|2885|Wizardry IV: The Return of Werdna         |100         |
|1936|Beyond Good & Evil 2                      |100         |
|3311|Barbie Horse Adventures: Wild Horse Rescue|100         |
|4417|Operation Europe: Path to Victory 1939-45 |100         |
|4268|Panzer General                            |100         |
|728 |Grand Prix 2                              |100         |
+----+------------------------------------------+------------+
only showing top 10 rows

