In [None]:
# Installing required packages
# !pip install pyspark
# !pip install findspark
# !pip install pandas

In [None]:
import findspark
findspark.init()

import pandas as pd
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [None]:
# Spark context: reuse the active one if this cell has already been run.
# A bare SparkContext() raises "Cannot run multiple SparkContexts at once"
# on a second execution in the same kernel; getOrCreate() makes the cell
# safe to re-run.
sc = SparkContext.getOrCreate()

# Spark session: the builder's getOrCreate() returns the existing session
# when one is already active, otherwise it builds a new one on top of the
# context above.
# NOTE(review): "spark.some.config.option" looks like a placeholder setting
# with no effect -- confirm whether it can be dropped.
spark = (
    SparkSession.builder
    .appName("Python Spark DataFrames basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [None]:
# Leave the session as the cell's last expression so the notebook renders
# its summary; a quick check that the session was created.
spark

Load the data into a pandas DataFrame, then convert it into a Spark DataFrame

In [None]:
# Step 1: parse the CSV with pandas.
# NOTE(review): the resulting schema contains an "Unnamed: 0" column, which
# presumably is a row index that was written into the CSV -- confirm whether
# it should be dropped (e.g. by reading with index_col=0).
all_seasons = pd.read_csv(filepath_or_buffer='all_seasons.csv')

# Step 2: hand the pandas frame to Spark, which derives the Spark schema
# from the pandas column dtypes.
sdf = spark.createDataFrame(data=all_seasons)

In [None]:
# Print each column's name, Spark type and nullability as a tree.
sdf.printSchema()

root
 |-- Unnamed: 0: long (nullable = true)
 |-- player_name: string (nullable = true)
 |-- team_abbreviation: string (nullable = true)
 |-- age: double (nullable = true)
 |-- player_height: double (nullable = true)
 |-- player_weight: double (nullable = true)
 |-- college: string (nullable = true)
 |-- country: string (nullable = true)
 |-- draft_year: string (nullable = true)
 |-- draft_round: string (nullable = true)
 |-- draft_number: string (nullable = true)
 |-- gp: long (nullable = true)
 |-- pts: double (nullable = true)
 |-- reb: double (nullable = true)
 |-- ast: double (nullable = true)
 |-- net_rating: double (nullable = true)
 |-- oreb_pct: double (nullable = true)
 |-- dreb_pct: double (nullable = true)
 |-- usg_pct: double (nullable = true)
 |-- ts_pct: double (nullable = true)
 |-- ast_pct: double (nullable = true)
 |-- season: string (nullable = true)



In [None]:
# Preview the first ten rows; long string values are truncated by show().
sdf.show(n=10)

+----------+-----------------+-----------------+----+-------------+-------------+--------------------+-------+----------+-----------+------------+---+----+----+---+----------+--------+------------------+------------------+------------------+-------+-------+
|Unnamed: 0|      player_name|team_abbreviation| age|player_height|player_weight|             college|country|draft_year|draft_round|draft_number| gp| pts| reb|ast|net_rating|oreb_pct|          dreb_pct|           usg_pct|            ts_pct|ast_pct| season|
+----------+-----------------+-----------------+----+-------------+-------------+--------------------+-------+----------+-----------+------------+---+----+----+---+----------+--------+------------------+------------------+------------------+-------+-------+
|         0|    Dennis Rodman|              CHI|36.0|       198.12|     99.79024|Southeastern Okla...|    USA|      1986|          2|          27| 55| 5.7|16.1|3.1|      16.1|   0.186|0.3229999999999999|               0.1|    

Remove all undrafted players

In [None]:
# Keep a row only when NEITHER draft column carries the 'Undrafted' marker.
# The two tests must be joined with `&`: with `|` a row that is marked
# 'Undrafted' in just one of the columns would still pass the filter, so
# not every undrafted player would be removed.
# The filtered frame is bound to its own name so it can be reused; `sdf`
# itself is left untouched for the cells that follow.
drafted_players = sdf.filter(
    (sdf.draft_round != 'Undrafted') & (sdf.draft_number != 'Undrafted')
)
drafted_players.show(10)

+----------+-----------------+-----------------+----+-------------+-------------+--------------------+-------+----------+-----------+------------+---+----+----+---+----------+--------+------------------+------------------+------------------+-------+-------+
|Unnamed: 0|      player_name|team_abbreviation| age|player_height|player_weight|             college|country|draft_year|draft_round|draft_number| gp| pts| reb|ast|net_rating|oreb_pct|          dreb_pct|           usg_pct|            ts_pct|ast_pct| season|
+----------+-----------------+-----------------+----+-------------+-------------+--------------------+-------+----------+-----------+------------+---+----+----+---+----------+--------+------------------+------------------+------------------+-------+-------+
|         0|    Dennis Rodman|              CHI|36.0|       198.12|     99.79024|Southeastern Okla...|    USA|      1986|          2|          27| 55| 5.7|16.1|3.1|      16.1|   0.186|0.3229999999999999|               0.1|    

Show all players named David

In [None]:
# SQL LIKE with a wildcard on both sides: matches every row whose
# player_name contains "David" anywhere (first name, surname, or as part
# of a longer word).
sdf.where(sdf["player_name"].like('%David%')).show(n=10)

+----------+--------------+-----------------+----+-------------+-------------+-----------+-------+----------+-----------+------------+---+----+----+---+----------+------------------+------------------+------------------+------------------+------------------+-------+
|Unnamed: 0|   player_name|team_abbreviation| age|player_height|player_weight|    college|country|draft_year|draft_round|draft_number| gp| pts| reb|ast|net_rating|          oreb_pct|          dreb_pct|           usg_pct|            ts_pct|           ast_pct| season|
+----------+--------------+-----------------+----+-------------+-------------+-----------+-------+----------+-----------+------------+---+----+----+---+----------+------------------+------------------+------------------+------------------+------------------+-------+
|       110|    David Wood|              MIL|32.0|       205.74|    104.32616|Nevada-Reno|    USA| Undrafted|  Undrafted|   Undrafted| 46| 1.2| 0.6|0.3|       3.9|             0.026|             0.12

Show the tallest 3 players, displaying their height in meters

In [None]:
# Derive the height in meters by dividing the stored height by 100
# (player_height is presumably in centimetres -- confirm against the
# dataset description). Later cells read this column from `sdf`.
sdf = sdf.withColumn('player_height_meters', sdf['player_height'] / 100)

# One row per distinct (name, height) pair, tallest first, top three only.
(
    sdf.select("player_name", "player_height_meters")
    .distinct()
    .orderBy("player_height_meters", ascending=False)
    .show(n=3)
)

+----------------+--------------------+
|     player_name|player_height_meters|
+----------------+--------------------+
|Gheorghe Muresan|              2.3114|
|   Shawn Bradley|               2.286|
|        Yao Ming|               2.286|
+----------------+--------------------+
only showing top 3 rows



Display the information of players that come from Mexico

In [None]:
# Exact, case-sensitive match on the country column.
sdf.where(sdf["country"] == 'Mexico').show(n=10)

+----------+--------------+-----------------+----+-------------+-------------+--------+-------+----------+-----------+------------+---+---+---+---+----------+--------+--------+-------+------------------+------------------+-------+--------------------+
|Unnamed: 0|   player_name|team_abbreviation| age|player_height|player_weight| college|country|draft_year|draft_round|draft_number| gp|pts|reb|ast|net_rating|oreb_pct|dreb_pct|usg_pct|            ts_pct|           ast_pct| season|player_height_meters|
+----------+--------------+-----------------+----+-------------+-------------+--------+-------+----------+-----------+------------+---+---+---+---+----------+--------+--------+-------+------------------+------------------+-------+--------------------+
|      1765|Eduardo Najera|              DAL|24.0|        203.2|   106.140528|Oklahoma| Mexico|      2000|          2|          38| 40|3.3|2.4|0.7|      -4.9|   0.107|   0.142|  0.146|             0.522|             0.093|2000-01|              

Calculate the BMI of each player, add it to the DataFrame as a new column, then display the players sorted by BMI in descending order

In [None]:
# BMI = weight / height^2, using the player_height_meters column added in
# an earlier cell (player_weight is presumably in kilograms -- confirm).
# The parentheses only make explicit the precedence `**` already has over `/`.
sdf = sdf.withColumn(
    'bmi',
    sdf['player_weight'] / (sdf['player_height_meters'] ** 2),
)

# distinct() works on (name, bmi) pairs, so a player whose BMI differs
# between rows appears once per distinct value. Highest BMI first.
(
    sdf.select("player_name", "bmi")
    .distinct()
    .orderBy("bmi", ascending=False)
    .show(n=5)
)

+----------------+-----------------+
|     player_name|              bmi|
+----------------+-----------------+
|   Oliver Miller|34.82661591142417|
|   Oliver Miller| 33.7550277295342|
|   Oliver Miller|33.21923363858921|
|Shaquille O'Neal|33.08560028884764|
| Zion Williamson|32.81913177896047|
+----------------+-----------------+
only showing top 5 rows



Display the number of players from each country, showing the 5 least-represented countries

In [None]:
# Number of players per country, least-represented countries first.
#
# Two fixes over chaining everything into one expression ending in .show():
#   * show() returns None, so the aggregated frame is bound to
#     `country_counts` first and displayed in a separate statement.
#   * the data holds one row per player per season, so (country, player_name)
#     pairs are de-duplicated before counting; otherwise a player is counted
#     once for every season played.
#     NOTE(review): this treats equal names as the same player -- confirm
#     that is acceptable for this dataset.
country_counts = (
    sdf.select("country", "player_name")
    .distinct()
    .groupby("country")
    .agg({"country": "count"})
    .sort("count(country)", ascending=True)
)
country_counts.show(5)

+-------------------+--------------+
|            country|count(country)|
+-------------------+--------------+
|         Sudan (UK)|             1|
|              Ghana|             1|
|              Sudan|             1|
|             Angola|             1|
|Trinidad and Tobago|             1|
+-------------------+--------------+
only showing top 5 rows

