## Criação das partições de competition

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import os
import boto3

In [6]:
spark = SparkSession.builder.getOrCreate()

In [9]:
# read competitions

path_competitions =  os.path.join("s3://","sor","football","data","competitions.json")
df_competition = spark.read.option("multiline", "true").json(path_competitions)

In [10]:
# Write competitions in LandingZone

path_competitions_landing = os.path.join("s3://","landing","competitions") 
print(path_competitions_landing)
df_competition.write.partitionBy("season_id").format("json").mode("overwrite").save(path_competitions_landing)

s3://landing/competitions


                                                                                

## Criação das partições de Matches

In [11]:
# Read all matches
path_all_matches = os.path.join("s3://","sor","football","data","matches","*","*.json")

df_match = spark.read.option("multiline", "true").json(path_all_matches)
print(df_match.count())

[Stage 8:>                                                          (0 + 4) / 4]

2886


                                                                                

In [15]:
df_match.printSchema()

root
 |-- away_score: long (nullable = true)
 |-- away_team: struct (nullable = true)
 |    |-- away_team_gender: string (nullable = true)
 |    |-- away_team_group: string (nullable = true)
 |    |-- away_team_id: long (nullable = true)
 |    |-- away_team_name: string (nullable = true)
 |    |-- country: struct (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |-- managers: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- country: struct (nullable = true)
 |    |    |    |    |-- id: long (nullable = true)
 |    |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- dob: string (nullable = true)
 |    |    |    |-- id: long (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- nickname: string (nullable = true)
 |-- competition: struct (nullable = true)
 |    |-- competition_id: long (nullable = true)
 |    |-- competition_nam

In [17]:
df_match.show()

+----------+--------------------+--------------------+-------------------+----------+--------------------+------------+--------------------+----------------+----------+--------+------------+----------------+----------+-------------+--------------------+---------------+--------------------+
|away_score|           away_team|         competition|  competition_stage|home_score|           home_team|    kick_off|        last_updated|last_updated_360|match_date|match_id|match_status|match_status_360|match_week|     metadata|             referee|         season|             stadium|
+----------+--------------------+--------------------+-------------------+----------+--------------------+------------+--------------------+----------------+----------+--------+------------+----------------+----------+-------------+--------------------+---------------+--------------------+
|         2|{male, null, 322,...|{11, La Liga, Spain}|{1, Regular Season}|         2|{{214, Spain}, ma...|20:00:00.000|2023-02-

In [19]:
df_match.select("season.season_id", "competition.competition_id").show()

+---------+--------------+
|season_id|competition_id|
+---------+--------------+
|       27|            11|
|       27|            11|
|       27|            11|
|       27|            11|
|       27|            11|
|       27|            11|
|       27|            11|
|       27|            11|
|       27|            11|
|       27|            11|
|       27|            11|
|       27|            11|
|       27|            11|
|       27|            11|
|       27|            11|
|       27|            11|
|       27|            11|
|       27|            11|
|       27|            11|
|       27|            11|
+---------+--------------+
only showing top 20 rows



In [28]:
df_match_add_col_competition = df_match.withColumn("competition_id", F.col("competition.competition_id"))
df_match_add_col_season = df_match_add_col_competition.withColumn("season_id", F.col("season.season_id"))
df_match_add_col_season.show()

+----------+--------------------+--------------------+-------------------+----------+--------------------+------------+--------------------+----------------+----------+--------+------------+----------------+----------+-------------+--------------------+---------------+--------------------+--------------+---------+
|away_score|           away_team|         competition|  competition_stage|home_score|           home_team|    kick_off|        last_updated|last_updated_360|match_date|match_id|match_status|match_status_360|match_week|     metadata|             referee|         season|             stadium|competition_id|season_id|
+----------+--------------------+--------------------+-------------------+----------+--------------------+------------+--------------------+----------------+----------+--------+------------+----------------+----------+-------------+--------------------+---------------+--------------------+--------------+---------+
|         2|{male, null, 322,...|{11, La Liga, Spain

In [30]:
path_match_landing = os.path.join("s3://","landing","matches") 

# Escreve na camada landing particionando por competition_id e season_id
df_match_add_col_season.write.partitionBy("competition_id","season_id").format("json").mode("overwrite").save(path_match_landing)

                                                                                

# análise de casos de matches

In [40]:
# Consulta por competicao e temporada

competition_id = "2"
season_id = "27"

df_competition2_season27 = spark.read.json(f"s3://landing/matches/competition_id={competition_id}/season_id={season_id}")
print(df_competition2_season27.count())
df_competition2_season27.select("away_team").orderBy("match_id").show(truncate=False)



380
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
|away_team                                                                                                                                                |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
|{male, 28, AFC Bournemouth, {68, England}, [{{68, England}, 1977-11-29, 38, Eddie Howe, null}]}                                                          |
|{male, 40, West Ham United, {68, England}, [{{56, Croatia}, 1968-09-11, 150, Slaven Bilić, null}]}                                                       |
|{male, 29, Everton, {68, England}, [{{214, Spain}, 1973-07-13, 263, Roberto Martínez Montoliú, Roberto Martínez}]}                                       |
|{male, 28, AFC Bournemouth, {68, England}, [{{68, England},

In [38]:
# Consulta por competicao e temporada usando where
spark.conf.set("spark.sql.repl.eagerEval.enabled", False)

df_matches_landing = spark.read.json(path_match_landing)
df_matches_landing.select("away_team").where(f"season_id = {season_id} and competition_id = {competition_id}").orderBy("match_id").show(truncate=False)
print(df_matches_landing.where(f"season_id = {season_id} and competition_id = {competition_id}").count())

+---------------------------------------------------------------------------------------------------------------------------------------------------------------+
|away_team                                                                                                                                                      |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------+
|{male, null, 28, AFC Bournemouth, {68, England}, [{{68, England}, 1977-11-29, 38, Eddie Howe, null}]}                                                          |
|{male, null, 40, West Ham United, {68, England}, [{{56, Croatia}, 1968-09-11, 150, Slaven Bilić, null}]}                                                       |
|{male, null, 29, Everton, {68, England}, [{{214, Spain}, 1973-07-13, 263, Roberto Martínez Montoliú, Roberto Martínez}]}                                       |
|{male, null, 28, AFC Bourne

In [41]:
# consulta por competicao, exibindo temporadas distintas
df_competition2 = spark.read.json(f"s3://landing/matches/competition_id={competition_id}")
df_competition2.select("season_id").distinct().show()

+---------+
|season_id|
+---------+
|       27|
|       44|
+---------+



## Particao LineUp

In [42]:


path_lineup = os.path.join("s3://","sor","football","data","lineups","*.json")

df_lineup = spark.read.option("multiline", "true").json(path_lineup)
print(df_lineup.count())
df_lineup.show()

                                                                                

6526
+--------------------+-------+--------------------+
|              lineup|team_id|           team_name|
+--------------------+-------+--------------------+
|[{[], {241, Unite...|   1839|       United States|
|[{[], {107, Iran,...|    797|                Iran|
|[{[], {214, Spain...|    863|       Spain Women's|
|[{[], {160, Nethe...|    851| Netherlands Women's|
|[{[{Yellow Card, ...|    941|         Netherlands|
|[{[], {11, Argent...|    779|           Argentina|
|[{[], {121, Korea...|   1211|Korea Republic Wo...|
|[{[], {49, Colomb...|  16802|    Colombia Women's|
|[{[], {31, Brazil...|    217|           Barcelona|
|[{[], {214, Spain...|    209|          Celta Vigo|
|[{[], {112, Italy...|    231|              Chievo|
|[{[], {112, Italy...|    291|           Frosinone|
|[{[{Yellow Card, ...|    771|              France|
|[{[], {11, Argent...|    779|           Argentina|
|[{[], {202, Seneg...|    290|              Empoli|
|[{[], {112, Italy...|   1683|               Carpi|
|[{[], 

In [None]:
# jogar na landing
df_lineup_match = df_lineup.withColumn()
df_lineup.write.json

In [43]:
path_lineup2 = os.path.join("s3://","sor","football","data","lineups")
df_lineup2 = spark.read.json(path_lineup2) 
df_lineup2.show()

                                                                                

AnalysisException: 
Since Spark 2.3, the queries from raw JSON/CSV files are disallowed when the
referenced columns only include the internal corrupt record column
(named _corrupt_record by default). For example:
spark.read.schema(schema).csv(file).filter($"_corrupt_record".isNotNull).count()
and spark.read.schema(schema).csv(file).select("_corrupt_record").show().
Instead, you can cache or save the parsed results and then send the same query.
For example, val df = spark.read.schema(schema).csv(file).cache() and then
df.filter($"_corrupt_record".isNotNull).count().
      