##Setup

In [50]:
# install Java8
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# download spark 3.5.0
!wget -q https://apache.osuosl.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz

# unzip it
!tar xf spark-3.5.0-bin-hadoop3.tgz

!pip install -q findspark

!pip install py4j

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.0-bin-hadoop3"
os.environ["PYSPARK_SUBMIT_ARGS"] = "--master local[*] pyspark-shell"

import findspark
findspark.init("spark-3.5.0-bin-hadoop3")# SPARK_HOME

from pyspark.sql import SparkSession

# create the session
spark = SparkSession \
        .builder \
        .appName("Aggregations and Grouping") \
        .master("local[*]") \
        .getOrCreate()

spark.version


# For Pandas conversion optimization
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

# Import sql functions
from pyspark.sql.functions import *



##Ingesta de datos

In [59]:
!mkdir -p dataset
!wget -q https://raw.githubusercontent.com/cmarty02/EDEM_MDA2324/spark/Alumnos/FS/CRISTIAN_MARTY/SPARK/NBA-BoxScores-2023-2024.csv -P /dataset

!ls /dataset

NBA-BoxScores-2023-2024.csv    NBA-BoxScores-2023-2024.csv.2
NBA-BoxScores-2023-2024.csv.1  NBA-BoxScores-2023-2024.csv.3


In [66]:
df_nba = spark.read.option("header", "true").csv("/dataset/NBA-BoxScores-2023-2024.csv")

df_nba = df_nba.withColumn("MIN", col("MIN").substr(1, 4))
df_nba = df_nba.withColumn("MIN", col("MIN").cast("float"))

# Esquema y df
df_nba.printSchema()
df_nba.show()

root
 |-- _c0: string (nullable = true)
 |-- GAME_ID: string (nullable = true)
 |-- TEAM_ID: string (nullable = true)
 |-- TEAM_ABBREVIATION: string (nullable = true)
 |-- TEAM_CITY: string (nullable = true)
 |-- PLAYER_ID: string (nullable = true)
 |-- PLAYER_NAME: string (nullable = true)
 |-- NICKNAME: string (nullable = true)
 |-- START_POSITION: string (nullable = true)
 |-- COMMENT: string (nullable = true)
 |-- MIN: float (nullable = true)
 |-- FGM: string (nullable = true)
 |-- FGA: string (nullable = true)
 |-- FG_PCT: string (nullable = true)
 |-- FG3M: string (nullable = true)
 |-- FG3A: string (nullable = true)
 |-- FG3_PCT: string (nullable = true)
 |-- FTM: string (nullable = true)
 |-- FTA: string (nullable = true)
 |-- FT_PCT: string (nullable = true)
 |-- OREB: string (nullable = true)
 |-- DREB: string (nullable = true)
 |-- REB: string (nullable = true)
 |-- AST: string (nullable = true)
 |-- STL: string (nullable = true)
 |-- BLK: string (nullable = true)
 |-- TO: s

##Seleccion de los jugadores de Denver

In [58]:
filter_denver = df_nba.filter("TEAM_CITY = 'Denver'")
filter_denver.show()

+---+----------+----------+-----------------+---------+---------+--------------------+----------+--------------+--------------------+------------+----+----+------+----+----+-------+----+----+------+----+----+----+----+----+----+----+----+----+----------+
|_c0|   GAME_ID|   TEAM_ID|TEAM_ABBREVIATION|TEAM_CITY|PLAYER_ID|         PLAYER_NAME|  NICKNAME|START_POSITION|             COMMENT|         MIN| FGM| FGA|FG_PCT|FG3M|FG3A|FG3_PCT| FTM| FTA|FT_PCT|OREB|DREB| REB| AST| STL| BLK|  TO|  PF| PTS|PLUS_MINUS|
+---+----------+----------+-----------------+---------+---------+--------------------+----------+--------------+--------------------+------------+----+----+------+----+----+-------+----+----+------+----+----+----+----+----+----+----+----+----+----------+
| 12|0022300061|1610612743|              DEN|   Denver|  1629008|  Michael Porter Jr.|   Michael|             F|                NULL|30.000000:07| 5.0|13.0| 0.385| 2.0| 9.0|  0.222| 0.0| 0.0|   0.0| 2.0|10.0|12.0| 2.0| 2.0| 0.0| 0.0| 1

##Estadisticas de la temporada

In [74]:
puntos = filter_denver.withColumn("PTS", col("PTS").cast("double"))

# Suma total de puntos
suma_total_pts = filter_denver.select(sum("PTS").alias("Suma_Total_Puntos"))
suma_total_pts.show()

+-----------------+
|Suma_Total_Puntos|
+-----------------+
|           4280.0|
+-----------------+



In [73]:
# Total de puntos por jugador
total_pts_por_jugador = puntos.groupBy("PLAYER_ID", "PLAYER_NAME").agg(sum("PTS").alias("Total_Puntos"))
total_pts_por_jugador = total_pts_por_jugador.orderBy("Total_Puntos", ascending=False)
total_pts_por_jugador.show()

+---------+--------------------+------------+
|PLAYER_ID|         PLAYER_NAME|Total_Puntos|
+---------+--------------------+------------+
|   203999|        Nikola Jokic|       938.0|
|  1629008|  Michael Porter Jr.|       601.0|
|  1627750|        Jamal Murray|       458.0|
|   202704|      Reggie Jackson|       445.0|
|   203932|        Aaron Gordon|       436.0|
|   203484|Kentavious Caldwe...|       344.0|
|  1631128|     Christian Braun|       300.0|
|  1631212|       Peyton Watson|       241.0|
|  1631124|    Julian Strawther|       195.0|
|  1630192|          Zeke Nnaji|       105.0|
|   203200|      Justin Holiday|        84.0|
|   201599|      DeAndre Jordan|        67.0|
|  1629618|       Jalen Pickett|        31.0|
|  1631221|    Collin Gillespie|        20.0|
|  1630296|         Braxton Key|        11.0|
|  1630643|            Jay Huff|         2.0|
|  1641816|        Hunter Tyson|         2.0|
+---------+--------------------+------------+



In [72]:
#Premedio de puntos por partido
promedio_pts_por_jugador_por_partido = puntos.groupBy("PLAYER_ID", "PLAYER_NAME").agg(avg("PTS").alias("Promedio_Puntos_Por_Partido"))
promedio_pts_por_jugador_por_partido = promedio_pts_por_jugador_por_partido.orderBy("Promedio_Puntos_Por_Partido", ascending=False)
promedio_pts_por_jugador_por_partido.show()

+---------+--------------------+---------------------------+
|PLAYER_ID|         PLAYER_NAME|Promedio_Puntos_Por_Partido|
+---------+--------------------+---------------------------+
|   203999|        Nikola Jokic|         26.055555555555557|
|  1627750|        Jamal Murray|          19.91304347826087|
|  1629008|  Michael Porter Jr.|         16.243243243243242|
|   203932|        Aaron Gordon|         14.064516129032258|
|   202704|      Reggie Jackson|         12.027027027027026|
|   203484|Kentavious Caldwe...|           9.82857142857143|
|  1631128|     Christian Braun|          8.108108108108109|
|  1631212|       Peyton Watson|          6.885714285714286|
|  1631124|    Julian Strawther|          5.909090909090909|
|   201599|      DeAndre Jordan|          4.785714285714286|
|   203200|      Justin Holiday|                        4.2|
|  1630192|          Zeke Nnaji|         3.3870967741935485|
|  1629618|       Jalen Pickett|          2.066666666666667|
|  1631221|    Collin Gi

In [69]:
minutos = filter_denver.withColumn("MIN", col("MIN").cast("double"))

# Cantidad de partidos jugados y el tiempo total jugado por jugador
estadisticas_jugador = df.groupBy("PLAYER_ID", "PLAYER_NAME").agg(
    count("GAME_ID").alias("Partidos_Jugados"),
    sum("MIN").alias("Minutos_Jugados"))

estadisticas_jugador = estadisticas_jugador.orderBy("Minutos_Jugados", ascending=False)

estadisticas_jugador.show()

+---------+--------------------+----------------+---------------+
|PLAYER_ID|         PLAYER_NAME|Partidos_Jugados|Minutos_Jugados|
+---------+--------------------+----------------+---------------+
|   203999|        Nikola Jokic|              36|         1194.0|
|  1629008|  Michael Porter Jr.|              37|         1133.0|
|   203484|Kentavious Caldwe...|              35|         1108.0|
|   203932|        Aaron Gordon|              31|          978.0|
|   202704|      Reggie Jackson|              37|          849.0|
|  1631128|     Christian Braun|              37|          712.0|
|  1627750|        Jamal Murray|              23|          677.0|
|  1631212|       Peyton Watson|              37|          600.0|
|  1631124|    Julian Strawther|              36|          440.0|
|  1630192|          Zeke Nnaji|              37|          303.0|
|   203200|      Justin Holiday|              37|          294.0|
|   201599|      DeAndre Jordan|              37|          179.0|
|  1629618

In [79]:
df = df.withColumn("FG3M", col("FG3M").cast("double"))
df = df.withColumn("FTA", col("FTA").cast("double"))
df = df.withColumn("PF", col("PF").cast("double"))

# Calcula la suma total de puntos en triple, dobles y simples.
puntos_totales_por_jugador = df.groupBy("PLAYER_ID", "PLAYER_NAME").agg(
    sum("FG3M").alias("Total_tirod_3"),
    sum("FTA").alias("Total_tiros_2"),
    sum("PF").alias("Totla_Tiros_1")
)


puntos_totales_por_jugador = puntos_totales_por_jugador.orderBy("Total_tirod_3", ascending=False)
puntos_totales_por_jugador.show()

+---------+--------------------+-------------+-------------+-------------+
|PLAYER_ID|         PLAYER_NAME|Total_tirod_3|Total_tiros_2|Totla_Tiros_1|
+---------+--------------------+-------------+-------------+-------------+
|  1629008|  Michael Porter Jr.|        104.0|         46.0|         73.0|
|  1627750|        Jamal Murray|         55.0|         73.0|         45.0|
|   202704|      Reggie Jackson|         54.0|         48.0|         70.0|
|   203484|Kentavious Caldwe...|         49.0|         57.0|         67.0|
|   203999|        Nikola Jokic|         39.0|        209.0|         93.0|
|  1631124|    Julian Strawther|         37.0|         23.0|         47.0|
|  1631128|     Christian Braun|         27.0|         69.0|         60.0|
|  1631212|       Peyton Watson|         22.0|         49.0|         62.0|
|   203200|      Justin Holiday|         20.0|          0.0|         26.0|
|   203932|        Aaron Gordon|         18.0|        108.0|         52.0|
|  1629618|       Jalen P