# Windows Partitioning

## Prerrequisites

Install Spark and Java in VM

In [3]:
# install Java8
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# download spark 3.5.0
!wget -q https://apache.osuosl.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz

In [4]:
ls -l # check the .tgz is there

total 391016
drwxr-xr-x 1 root root      4096 Jan 11 17:02 [0m[01;34msample_data[0m/
-rw-r--r-- 1 root root 400395283 Sep  9 02:10 spark-3.5.0-bin-hadoop3.tgz


In [5]:
# unzip it
!tar xf spark-3.5.0-bin-hadoop3.tgz

In [6]:
!pip install -q findspark

Defining the environment

In [7]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.0-bin-hadoop3"
os.environ["PYSPARK_SUBMIT_ARGS"] = "--master local[*] pyspark-shell"

Start Spark Session

---

In [37]:
import findspark
findspark.init("spark-3.5.0-bin-hadoop3")# SPARK_HOME

from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.types import IntegerType

# create the session
spark = SparkSession \
        .builder \
        .appName("Window Partitioning") \
        .master("local[*]") \
        .getOrCreate()

spark.version

'3.5.0'

In [23]:
spark

In [24]:
# For Pandas conversion optimization
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

In [11]:
# Import sql functions
from pyspark.sql.functions import *

##Exercise Fooball Players

Mostramos el DataFrame original

In [41]:
# Cargamos el DataFrame desde un archivo CSV usando Spark
football_df = spark.read.option("header", "true").csv("./Data.csv")

# Imprimimos el esquema del DataFrame para mostrar la estructura de las columnas y sus tipos de datos
football_df.printSchema()

# Mostramos los resultados
football_df.show(5, truncate=False)

root
 |-- Country: string (nullable = true)
 |-- League: string (nullable = true)
 |-- Club: string (nullable = true)
 |-- Player Names: string (nullable = true)
 |-- Matches_Played: string (nullable = true)
 |-- Substitution : string (nullable = true)
 |-- Mins: string (nullable = true)
 |-- Goals: string (nullable = true)
 |-- xG: string (nullable = true)
 |-- xG Per Avg Match: string (nullable = true)
 |-- Shots: string (nullable = true)
 |-- OnTarget: string (nullable = true)
 |-- Shots Per Avg Match: string (nullable = true)
 |-- On Target Per Avg Match: string (nullable = true)
 |-- Year: string (nullable = true)

+-------+-------+-----+-----------------+--------------+-------------+----+-----+-----+----------------+-----+--------+-------------------+-----------------------+----+
|Country|League |Club |Player Names     |Matches_Played|Substitution |Mins|Goals|xG   |xG Per Avg Match|Shots|OnTarget|Shots Per Avg Match|On Target Per Avg Match|Year|
+-------+-------+-----+-----------

Hacemos uso de la funciones select y join

In [42]:
# Dividimos el DataFrame original en dos para demostrar el uso de la función Join
# Usamos la variable "Player Names" para poder unirlas

# Hacemos un el df1
first_part_df = football_df.select(
    "Player Names",
    "Country",
    "League",
    "Club",
    "Matches_Played",
    "Substitution ",
    "Mins",
    "Goals"
)

# Hacemos un el df2
second_part_df = football_df.select(
    "Player Names",
    "xG",
    "xG Per Avg Match",
    "Shots",
    "OnTarget",
    "Shots Per Avg Match",
    "On Target Per Avg Match",
    "Year"
)

# Mostramos los DataFrames creados
first_part_df.show(truncate=False)
second_part_df.show(truncate=False)

# Usamos la función Join para juntar los DataFrames creados en uno solo
joined_df = first_part_df.join(second_part_df, "Player Names", "inner")

# Mostramos el resultado del join
joined_df.show(truncate=False)

+-----------------+-------+-------+------+--------------+-------------+----+-----+
|Player Names     |Country|League |Club  |Matches_Played|Substitution |Mins|Goals|
+-----------------+-------+-------+------+--------------+-------------+----+-----+
|Juanmi Callejon  |Spain  |La Liga|(BET) |19            |16           |1849|11   |
|Antoine Griezmann|Spain  |La Liga|(BAR) |36            |0            |3129|16   |
|Luis Suarez      |Spain  |La Liga|(ATL) |34            |1            |2940|28   |
|Ruben Castro     |Spain  |La Liga|(CAR) |32            |3            |2842|13   |
|Kevin Gameiro    |Spain  |La Liga|(VAL) |21            |10           |1745|13   |
|Cristiano Ronaldo|Spain  |La Liga|(JUV) |29            |0            |2634|25   |
|Karim Benzema    |Spain  |La Liga|(RMA) |23            |6            |1967|11   |
|Neymar           |Spain  |La Liga|(PSG) |30            |0            |2694|13   |
|Iago Aspas       |Spain  |La Liga|(CEL) |25            |7            |2354|19   |
|Ser

Hacemos uso de las funciones withColumn, groupBy y orderBy

In [43]:
# Convertimos la columna "Goals" a IntegerType
football_df = football_df.withColumn("Goals", football_df["Goals"].cast(IntegerType()))

# Agrupamos por nombre del jugador y sumamos los goles
goleadores_por_jugador_df = football_df.groupBy("Player Names").agg(sum("Goals").alias("Total_Goals"))

# Ordenamos el DataFrame en orden descendente por la columna "Total_Goals"
top_goleadores_por_jugador_df = goleadores_por_jugador_df.orderBy(desc("Total_Goals")).limit(10)

# Mostramos los resultados
print("Jugadores con más goles marcados:")
top_goleadores_por_jugador_df.show(truncate=False)

Jugadores con más goles marcados:
+-------------------------+-----------+
|Player Names             |Total_Goals|
+-------------------------+-----------+
|Lionel Messi             |135        |
|Robert Lewandowski       |127        |
|Cristiano Ronaldo        |111        |
|Ciro Immobile            |107        |
|Luis Suarez              |95         |
|Pierre-Emerick Aubameyang|88         |
|Timo Werner              |82         |
|Iago Aspas               |80         |
|Mauro Icardi             |76         |
|Andrea Belotti           |74         |
+-------------------------+-----------+



Hacemos uso de la función Window

In [44]:
# Convertimos la columna "Mins" a IntegerType
football_df = football_df.withColumn("Mins", football_df["Mins"].cast(IntegerType()))

# Definimos la especificación de la ventana sobre la columna 'Player Names'
player_window = Window.partitionBy("Player Names")

# Calculamos el total de minutos por jugador utilizando la función de ventana
total_mins_per_player = sum("Mins").over(player_window)

# Agregamos una nueva columna 'Last Years Total Mins' al DataFrame original
total_mins_football_df = football_df.withColumn("Last Years Total Mins", total_mins_per_player)

# Ordenar el DataFrame según los minutos jugados en los últimos años, de más a menos
total_mins_football_df2 = total_mins_football_df.orderBy(desc("Last Years Total Mins"))

# Mostramos el resultado
total_mins_football_df2.show(truncate=False)

+-------+-------+-----+--------------+--------------+-------------+----+-----+-----+----------------+-----+--------+-------------------+-----------------------+----+---------------------+
|Country|League |Club |Player Names  |Matches_Played|Substitution |Mins|Goals|xG   |xG Per Avg Match|Shots|OnTarget|Shots Per Avg Match|On Target Per Avg Match|Year|Last Years Total Mins|
+-------+-------+-----+--------------+--------------+-------------+----+-----+-----+----------------+-----+--------+-------------------+-----------------------+----+---------------------+
|Italy  |Serie A|(TOR)|Andrea Belotti|29            |3            |2854|10   |11.72|0.39            |86   |28      |2.86               |0.93                   |2017|13596                |
|Italy  |Serie A|(TOR)|Andrea Belotti|34            |1            |3241|26   |19.45|0.57            |130  |78      |3.81               |2.29                   |2016|13596                |
|Italy  |Serie A|(TOR)|Andrea Belotti|37            |0      