In [None]:
# Installing required packages
# !pip install pyspark
# !pip install findspark

In [None]:
import findspark

# findspark.init() has to run before the first pyspark import: its job is to
# locate the Spark installation and make pyspark importable.
findspark.init()

import pandas as pd
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [None]:
# import the Pandas UDF function
from pyspark.sql.functions import pandas_udf, PandasUDFType

In [None]:
# Create (or reuse) the Spark context. getOrCreate() keeps this cell safe to
# re-run: a bare SparkContext() raises if a context is already active in the
# kernel.
sc = SparkContext.getOrCreate()

# Create (or reuse) the Spark session.
# NOTE(review): "spark.some.config.option" looks like a placeholder setting
# with no real effect — confirm and drop it if so.
spark = (
    SparkSession.builder
    .appName("Python Spark DataFrames basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [None]:
# Evaluate the session object so the notebook echoes it as this cell's output.
spark

Loading the data into a pandas DataFrame, then converting it into a Spark DataFrame

In [None]:
# Read the CSV with pandas first, then hand the resulting frame to Spark.
all_seasons = pd.read_csv(filepath_or_buffer='all_seasons.csv')
sdf = spark.createDataFrame(data=all_seasons)

In [None]:
# Print each column's name, Spark type and nullability.
sdf.printSchema()

root
 |-- Unnamed: 0: long (nullable = true)
 |-- player_name: string (nullable = true)
 |-- team_abbreviation: string (nullable = true)
 |-- age: double (nullable = true)
 |-- player_height: double (nullable = true)
 |-- player_weight: double (nullable = true)
 |-- college: string (nullable = true)
 |-- country: string (nullable = true)
 |-- draft_year: string (nullable = true)
 |-- draft_round: string (nullable = true)
 |-- draft_number: string (nullable = true)
 |-- gp: long (nullable = true)
 |-- pts: double (nullable = true)
 |-- reb: double (nullable = true)
 |-- ast: double (nullable = true)
 |-- net_rating: double (nullable = true)
 |-- oreb_pct: double (nullable = true)
 |-- dreb_pct: double (nullable = true)
 |-- usg_pct: double (nullable = true)
 |-- ts_pct: double (nullable = true)
 |-- ast_pct: double (nullable = true)
 |-- season: string (nullable = true)



In [None]:
# Preview the first 10 rows of the Spark DataFrame.
sdf.show(10)

+----------+-----------------+-----------------+----+-------------+-------------+--------------------+-------+----------+-----------+------------+---+----+----+---+----------+--------+------------------+------------------+------------------+-------+-------+
|Unnamed: 0|      player_name|team_abbreviation| age|player_height|player_weight|             college|country|draft_year|draft_round|draft_number| gp| pts| reb|ast|net_rating|oreb_pct|          dreb_pct|           usg_pct|            ts_pct|ast_pct| season|
+----------+-----------------+-----------------+----+-------------+-------------+--------------------+-------+----------+-----------+------------+---+----+----+---+----------+--------+------------------+------------------+------------------+-------+-------+
|         0|    Dennis Rodman|              CHI|36.0|       198.12|     99.79024|Southeastern Okla...|    USA|      1986|          2|          27| 55| 5.7|16.1|3.1|      16.1|   0.186|0.3229999999999999|               0.1|    

In [None]:
# Register the DataFrame as the temporary view "allseasons" so the
# spark.sql(...) queries in the following cells can refer to it by name.
sdf.createOrReplaceTempView("allseasons")

Remove all undrafted players

In [None]:
# Keep only drafted players. Both draft columns must differ from 'Undrafted':
# joining the two tests with OR would keep a row in which only one of the
# columns is 'Undrafted'.
sdfTemp = spark.sql(
    "SELECT * FROM allseasons "
    "WHERE draft_round != 'Undrafted' AND draft_number != 'Undrafted'"
)
sdfTemp.show(10)

+----------+-----------------+-----------------+----+-------------+-------------+--------------------+-------+----------+-----------+------------+---+----+----+---+----------+--------+------------------+------------------+------------------+-------+-------+
|Unnamed: 0|      player_name|team_abbreviation| age|player_height|player_weight|             college|country|draft_year|draft_round|draft_number| gp| pts| reb|ast|net_rating|oreb_pct|          dreb_pct|           usg_pct|            ts_pct|ast_pct| season|
+----------+-----------------+-----------------+----+-------------+-------------+--------------------+-------+----------+-----------+------------+---+----+----+---+----------+--------+------------------+------------------+------------------+-------+-------+
|         0|    Dennis Rodman|              CHI|36.0|       198.12|     99.79024|Southeastern Okla...|    USA|      1986|          2|          27| 55| 5.7|16.1|3.1|      16.1|   0.186|0.3229999999999999|               0.1|    

Show all players named David

In [None]:
# Rows whose player_name contains the substring "David"; show the first 10.
sdfTemp = spark.sql(
    sqlQuery="SELECT * FROM allseasons where player_name like '%David%'"
)
sdfTemp.show(n=10)

+----------+--------------+-----------------+----+-------------+-------------+-----------+-------+----------+-----------+------------+---+----+----+---+----------+------------------+------------------+------------------+------------------+------------------+-------+
|Unnamed: 0|   player_name|team_abbreviation| age|player_height|player_weight|    college|country|draft_year|draft_round|draft_number| gp| pts| reb|ast|net_rating|          oreb_pct|          dreb_pct|           usg_pct|            ts_pct|           ast_pct| season|
+----------+--------------+-----------------+----+-------------+-------------+-----------+-------+----------+-----------+------------+---+----+----+---+----------+------------------+------------------+------------------+------------------+------------------+-------+
|       110|    David Wood|              MIL|32.0|       205.74|    104.32616|Nevada-Reno|    USA| Undrafted|  Undrafted|   Undrafted| 46| 1.2| 0.6|0.3|       3.9|             0.026|             0.12

Display the information of players that come from Mexico

In [None]:
# Rows whose country column is exactly 'Mexico'; show the first 10.
sdfTemp = spark.sql(
    sqlQuery="SELECT * FROM allseasons where country = 'Mexico'"
)
sdfTemp.show(n=10)

+----------+--------------+-----------------+----+-------------+-------------+--------+-------+----------+-----------+------------+---+---+---+---+----------+--------+--------+-------+------------------+------------------+-------+
|Unnamed: 0|   player_name|team_abbreviation| age|player_height|player_weight| college|country|draft_year|draft_round|draft_number| gp|pts|reb|ast|net_rating|oreb_pct|dreb_pct|usg_pct|            ts_pct|           ast_pct| season|
+----------+--------------+-----------------+----+-------------+-------------+--------+-------+----------+-----------+------------+---+---+---+---+----------+--------+--------+-------+------------------+------------------+-------+
|      1765|Eduardo Najera|              DAL|24.0|        203.2|   106.140528|Oklahoma| Mexico|      2000|          2|          38| 40|3.3|2.4|0.7|      -4.9|   0.107|   0.142|  0.146|             0.522|             0.093|2000-01|
|      2624|Eduardo Najera|              DAL|25.0|        203.2|   106.14052

Show the tallest 3 players, displaying their height in meters

In [None]:
@pandas_udf("float")
def convert_ht(s: pd.Series) -> pd.Series:
    """Divide every value of a height Series by 100.

    Intended as a centimetre-to-metre conversion (assumes the input heights
    are in centimetres — TODO confirm against the dataset description).
    The pandas UDF is declared with a "float" return type.
    """
    metres = s.div(100)
    return metres

# Expose the pandas UDF to Spark SQL under the name "convert_height".
spark.udf.register(name="convert_height", f=convert_ht)

<pyspark.sql.udf.UserDefinedFunction at 0x78b2063bea70>

In [None]:
# The view holds one row per player per season, so collapse it to one row per
# player (using the largest recorded height) before ranking. A plain DISTINCT
# over (name, height) would list a player more than once if their recorded
# height differs between seasons. player_name breaks ties deterministically.
sdfTemp = spark.sql(
    """
    SELECT player_name,
           convert_height(max_height) AS player_height_meters
    FROM (
        SELECT player_name, MAX(player_height) AS max_height
        FROM allseasons
        GROUP BY player_name
    ) AS tallest
    ORDER BY player_height_meters DESC, player_name ASC
    LIMIT 3
    """
)
sdfTemp.show(3)

+----------------+--------------------+
|     player_name|player_height_meters|
+----------------+--------------------+
|Gheorghe Muresan|              2.3114|
|   Shawn Bradley|               2.286|
|        Yao Ming|               2.286|
+----------------+--------------------+
only showing top 3 rows



In [None]:
# spark.catalog.dropTempView("allseasons")
# sdfTemp.createOrReplaceTempView("allseasons")

Calculate the BMI of each player, add it to the result as a new column, then display the players in descending order of BMI

In [None]:
@pandas_udf("float")
def calc_bmi(w: pd.Series,h: pd.Series) -> pd.Series:
    """Compute body-mass index element-wise as (w / h**2) * 10000.

    Presumably ``w`` is weight in kilograms and ``h`` is height in
    centimetres, with the 10000 factor converting cm^2 to m^2 — TODO confirm
    the units against the dataset description.
    The pandas UDF is declared with a "float" return type.
    """
    height_squared = h.pow(2)
    ratio = w.div(height_squared)
    return ratio.mul(10000)

# Expose the pandas UDF to Spark SQL under the name "bmi_cal".
spark.udf.register(name="bmi_cal", f=calc_bmi)

<pyspark.sql.udf.UserDefinedFunction at 0x78b205a2efe0>

In [None]:
# Select every column from the view and show the first two rows.
sdfTemp = spark.sql(sqlQuery='SELECT * FROM allseasons')
sdfTemp.show(n=2)

+----------+-----------------+-----------------+----+-------------+-------------+--------------------+-------+----------+-----------+------------+---+---+----+---+----------+--------+------------------+-------+------+-------+-------+
|Unnamed: 0|      player_name|team_abbreviation| age|player_height|player_weight|             college|country|draft_year|draft_round|draft_number| gp|pts| reb|ast|net_rating|oreb_pct|          dreb_pct|usg_pct|ts_pct|ast_pct| season|
+----------+-----------------+-----------------+----+-------------+-------------+--------------------+-------+----------+-----------+------------+---+---+----+---+----------+--------+------------------+-------+------+-------+-------+
|         0|    Dennis Rodman|              CHI|36.0|       198.12|     99.79024|Southeastern Okla...|    USA|      1986|          2|          27| 55|5.7|16.1|3.1|      16.1|   0.186|0.3229999999999999|    0.1| 0.479|  0.113|1996-97|
|         1|Dwayne Schintzius|              LAC|28.0|        215

In [None]:
# Append a computed "bmi" column to every row and sort from highest to lowest BMI.
# The result lives only in sdfTemp; the "allseasons" view itself is not modified.
sdfTemp = spark.sql(
    sqlQuery="SELECT *, bmi_cal(player_weight, allseasons.player_height) as bmi FROM allseasons ORDER BY bmi DESC"
)
sdfTemp.show(n=10)

+----------+----------------+-----------------+----+-------------+-------------+----------------+-------+----------+-----------+------------+---+----+----+---+----------+------------------+------------------+------------------+------------------+------------------+-------+---------+
|Unnamed: 0|     player_name|team_abbreviation| age|player_height|player_weight|         college|country|draft_year|draft_round|draft_number| gp| pts| reb|ast|net_rating|          oreb_pct|          dreb_pct|           usg_pct|            ts_pct|           ast_pct| season|      bmi|
+----------+----------------+-----------------+----+-------------+-------------+----------------+-------+----------+-----------+------------+---+----+----+---+----------+------------------+------------------+------------------+------------------+------------------+-------+---------+
|       480|   Oliver Miller|              TOR|28.0|       205.74|     147.4174|        Arkansas|    USA|      1992|          1|          22| 64| 6.

Display the number of players from each country, showing the 5 least-represented countries

In [None]:
# The view holds one row per player per season, so count(*) would count
# seasons rather than players. COUNT(DISTINCT player_name) counts each player
# once per country. The secondary sort on country makes the order of tied
# countries deterministic.
sdfTemp = spark.sql(
    "SELECT country, COUNT(DISTINCT player_name) AS num_of_players "
    "FROM allseasons "
    "GROUP BY country "
    "ORDER BY num_of_players ASC, country ASC"
)
sdfTemp.show(5)

+-------------------+--------------+
|            country|num_of_players|
+-------------------+--------------+
|         Sudan (UK)|             1|
|              Ghana|             1|
|              Sudan|             1|
|             Angola|             1|
|Trinidad and Tobago|             1|
+-------------------+--------------+
only showing top 5 rows

