# Script for CSV Data


Here, the CSV dataset imported from Kaggle is loaded into a Spark DataFrame and then cleaned. The numeric columns, which are initially read as strings, are cast to integer or double values. Duplicates are removed, and the season year is shifted (Season = Year - 1) so that it matches the season years used in the other datasets. Additionally, Michael Jordan had to be handled explicitly: he is listed as "Michael Jordan*", which would have caused him to be dropped by the inner join, so the entry is renamed to "Michael Jordan".

### Convert the CSV Data into a Spark DataFrame

In [1]:
import pymongo as mdb
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import functions as F

In [2]:
# Reuse the active Spark session if one exists, otherwise start a new one.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

# Load the CSV, taking the column names from its first row. No schema is
# supplied or inferred here, so every column is read as a string.
csv_reader = spark.read
spark_df = csv_reader.csv("Seasons_Stats.csv", header=True)

## View Schema of Dataframe

In [3]:
# Print the schema (column names, types, nullability) of the raw DataFrame.
spark_df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Player: string (nullable = true)
 |-- Pos: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Tm: string (nullable = true)
 |-- G: string (nullable = true)
 |-- GS: string (nullable = true)
 |-- MP: string (nullable = true)
 |-- PER: string (nullable = true)
 |-- TS%: string (nullable = true)
 |-- 3PAr: string (nullable = true)
 |-- FTr: string (nullable = true)
 |-- ORB%: string (nullable = true)
 |-- DRB%: string (nullable = true)
 |-- TRB%: string (nullable = true)
 |-- AST%: string (nullable = true)
 |-- STL%: string (nullable = true)
 |-- BLK%: string (nullable = true)
 |-- TOV%: string (nullable = true)
 |-- USG%: string (nullable = true)
 |-- blanl: string (nullable = true)
 |-- OWS: string (nullable = true)
 |-- DWS: string (nullable = true)
 |-- WS: string (nullable = true)
 |-- WS/48: string (nullable = true)
 |-- blank2: string (nullable = true)
 |-- OBPM: string (nullable = true)


### Data Cleaning

In [4]:
# Cast the listed columns from string to integer/double.
# Columns that are not listed here (e.g. "G", "MP") are left as strings.
column_types = {
    "Year": "integer",
    "Age": "integer",
    "GS": "double",
    "PER": "double",
    "TS%": "double",
    "FTr": "double",
    "OWS": "double",
    "DWS": "double",
    "WS": "double",
    "OBPM": "double",
    "DBPM": "double",
    "2P": "double",
    "2PA": "double",
    "2P%": "double",
    "eFG%": "double",
    "USG%": "double",
}

# A loop over the mapping keeps all target types in one place and avoids
# backslash line continuations, which break silently when a line is added
# after a trailing "\".
cleaned_df = spark_df
for column_name, spark_type in column_types.items():
    # withColumn with an existing name replaces that column in place,
    # so the column order of the DataFrame is unchanged.
    cleaned_df = cleaned_df.withColumn(column_name, col(column_name).cast(spark_type))

cleaned_df.show(5)

+---+----+---------------+---+---+---+---+----+----+----+-----+----+-----+----+----+----+----+----+----+----+----+-----+----+----+----+-----+------+----+----+----+----+---+---+-----+----+----+----+-----+-----+-----+-----+---+---+-----+----+----+----+---+----+----+----+---+---+
|_c0|Year|         Player|Pos|Age| Tm|  G|  GS|  MP| PER|  TS%|3PAr|  FTr|ORB%|DRB%|TRB%|AST%|STL%|BLK%|TOV%|USG%|blanl| OWS| DWS|  WS|WS/48|blank2|OBPM|DBPM| BPM|VORP| FG|FGA|  FG%|  3P| 3PA| 3P%|   2P|  2PA|  2P%| eFG%| FT|FTA|  FT%| ORB| DRB| TRB|AST| STL| BLK| TOV| PF|PTS|
+---+----+---------------+---+---+---+---+----+----+----+-----+----+-----+----+----+----+----+----+----+----+----+-----+----+----+----+-----+------+----+----+----+----+---+---+-----+----+----+----+-----+-----+-----+-----+---+---+-----+----+----+----+---+----+----+----+---+---+
|  0|1950|Curly Armstrong|G-F| 31|FTW| 63|null|null|null|0.368|null|0.467|null|null|null|null|null|null|null|null| null|-0.1| 3.6| 3.5| null|  null|null|null|null|nul

In [5]:
# Print the schema again to check which columns now have numeric types.
cleaned_df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Player: string (nullable = true)
 |-- Pos: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tm: string (nullable = true)
 |-- G: string (nullable = true)
 |-- GS: double (nullable = true)
 |-- MP: string (nullable = true)
 |-- PER: double (nullable = true)
 |-- TS%: double (nullable = true)
 |-- 3PAr: string (nullable = true)
 |-- FTr: double (nullable = true)
 |-- ORB%: string (nullable = true)
 |-- DRB%: string (nullable = true)
 |-- TRB%: string (nullable = true)
 |-- AST%: string (nullable = true)
 |-- STL%: string (nullable = true)
 |-- BLK%: string (nullable = true)
 |-- TOV%: string (nullable = true)
 |-- USG%: double (nullable = true)
 |-- blanl: string (nullable = true)
 |-- OWS: double (nullable = true)
 |-- DWS: double (nullable = true)
 |-- WS: double (nullable = true)
 |-- WS/48: string (nullable = true)
 |-- blank2: string (nullable = true)
 |-- OBPM: double (nullable = true

### Remove Duplicates

In [6]:
# Keep a single row per (Player, Year) combination.
# NOTE(review): dropDuplicates does not specify which of the duplicate rows
# is kept — confirm that an arbitrary row per player and year is acceptable.
dedup_keys = ['Player', 'Year']
withoutDuplicates_df = cleaned_df.dropDuplicates(subset=dedup_keys)

In [7]:
# Rename "Player" to "Full_Name", then add a "Season" column computed as
# "Year" minus one ("Year" itself is kept at this point).
updated_df = withoutDuplicates_df.withColumnRenamed("Player", "Full_Name")
season_expr = col("Year") - 1
updated_year_df = updated_df.withColumn("Season", season_expr)
updated_year_df.show(5)

+-----+----+------------+----+----+----+----+----+----+----+-----+-----+-----+----+----+----+----+----+----+----+----+-----+----+----+----+-----+------+----+----+----+----+----+----+-----+----+----+-----+-----+-----+-----+-----+----+----+-----+----+----+----+----+----+----+----+----+----+------+
|  _c0|Year|   Full_Name| Pos| Age|  Tm|   G|  GS|  MP| PER|  TS%| 3PAr|  FTr|ORB%|DRB%|TRB%|AST%|STL%|BLK%|TOV%|USG%|blanl| OWS| DWS|  WS|WS/48|blank2|OBPM|DBPM| BPM|VORP|  FG| FGA|  FG%|  3P| 3PA|  3P%|   2P|  2PA|  2P%| eFG%|  FT| FTA|  FT%| ORB| DRB| TRB| AST| STL| BLK| TOV|  PF| PTS|Season|
+-----+----+------------+----+----+----+----+----+----+----+-----+-----+-----+----+----+----+----+----+----+----+----+-----+----+----+----+-----+------+----+----+----+----+----+----+-----+----+----+-----+-----+-----+-----+-----+----+----+-----+----+----+----+----+----+----+----+----+----+------+
|  312|null|        null|null|null|null|null|null|null|null| null| null| null|null|null|null|null|null|null|n

### Drop Further Columns

In [8]:
# Drop the "Year" column; the derived "Season" column replaces it.
updated_drop_df=updated_year_df.drop("Year")
updated_drop_df.show(5)

+-----+------------+----+----+----+----+----+----+----+-----+-----+-----+----+----+----+----+----+----+----+----+-----+----+----+----+-----+------+----+----+----+----+----+----+-----+----+----+-----+-----+-----+-----+-----+----+----+-----+----+----+----+----+----+----+----+----+----+------+
|  _c0|   Full_Name| Pos| Age|  Tm|   G|  GS|  MP| PER|  TS%| 3PAr|  FTr|ORB%|DRB%|TRB%|AST%|STL%|BLK%|TOV%|USG%|blanl| OWS| DWS|  WS|WS/48|blank2|OBPM|DBPM| BPM|VORP|  FG| FGA|  FG%|  3P| 3PA|  3P%|   2P|  2PA|  2P%| eFG%|  FT| FTA|  FT%| ORB| DRB| TRB| AST| STL| BLK| TOV|  PF| PTS|Season|
+-----+------------+----+----+----+----+----+----+----+-----+-----+-----+----+----+----+----+----+----+----+----+-----+----+----+----+-----+------+----+----+----+----+----+----+-----+----+----+-----+-----+-----+-----+-----+----+----+-----+----+----+----+----+----+----+----+----+----+------+
|  312|        null|null|null|null|null|null|null|null| null| null| null|null|null|null|null|null|null|null|null| null|null|

### Handle Null/NA Values

In [None]:
# Replace null values with 0. A numeric fill value is only applied to
# numeric columns; string columns keep their nulls.
filled_df = updated_drop_df.fillna(value=0)
filled_df.show(5)

### Type Casting

In [10]:
# Make sure the "Season" column is stored as an integer.
season_as_int = col("Season").cast("integer")
integer_df = filled_df.withColumn("Season", season_as_int)
integer_df.show(5)

+-----+------------+----+---+----+----+----+----+----+-----+-----+-----+----+----+----+----+----+----+----+----+-----+---+---+---+-----+------+----+----+----+----+----+----+-----+----+----+-----+-----+-----+-----+-----+----+----+-----+----+----+----+----+----+----+----+----+----+------+
|  _c0|   Full_Name| Pos|Age|  Tm|   G|  GS|  MP| PER|  TS%| 3PAr|  FTr|ORB%|DRB%|TRB%|AST%|STL%|BLK%|TOV%|USG%|blanl|OWS|DWS| WS|WS/48|blank2|OBPM|DBPM| BPM|VORP|  FG| FGA|  FG%|  3P| 3PA|  3P%|   2P|  2PA|  2P%| eFG%|  FT| FTA|  FT%| ORB| DRB| TRB| AST| STL| BLK| TOV|  PF| PTS|Season|
+-----+------------+----+---+----+----+----+----+----+-----+-----+-----+----+----+----+----+----+----+----+----+-----+---+---+---+-----+------+----+----+----+----+----+----+-----+----+----+-----+-----+-----+-----+-----+----+----+-----+----+----+----+----+----+----+----+----+----+------+
|  312|        null|null|  0|null|null| 0.0|null| 0.0|  0.0| null|  0.0|null|null|null|null|null|null|null| 0.0| null|0.0|0.0|0.0| null|

### Special Case

In [11]:
# Rename "Michael Jordan*" to "Michael Jordan" so the name matches in the
# later inner join. Only this exact value is rewritten; every other name is
# passed through unchanged.
is_starred_mj = col('Full_Name') == 'Michael Jordan*'
clean_name = F.when(is_starred_mj, 'Michael Jordan').otherwise(col('Full_Name'))
mj_renaming_df = integer_df.withColumn('Full_Name', clean_name)

In [12]:
# Move "Season" to the second position: first column, then "Season", then
# every remaining column except the last one of integer_df.
first_col, *middle_cols, _last_col = integer_df.columns
reordered_df = (
    mj_renaming_df
    .withColumn("Season", col("Season").cast("integer"))
    .select(first_col, "Season", *middle_cols)
)
reordered_df.show(5)

+-----+------+------------+----+---+----+----+----+----+----+-----+-----+-----+----+----+----+----+----+----+----+----+-----+---+---+---+-----+------+----+----+----+----+----+----+-----+----+----+-----+-----+-----+-----+-----+----+----+-----+----+----+----+----+----+----+----+----+----+
|  _c0|Season|   Full_Name| Pos|Age|  Tm|   G|  GS|  MP| PER|  TS%| 3PAr|  FTr|ORB%|DRB%|TRB%|AST%|STL%|BLK%|TOV%|USG%|blanl|OWS|DWS| WS|WS/48|blank2|OBPM|DBPM| BPM|VORP|  FG| FGA|  FG%|  3P| 3PA|  3P%|   2P|  2PA|  2P%| eFG%|  FT| FTA|  FT%| ORB| DRB| TRB| AST| STL| BLK| TOV|  PF| PTS|
+-----+------+------------+----+---+----+----+----+----+----+-----+-----+-----+----+----+----+----+----+----+----+----+-----+---+---+---+-----+------+----+----+----+----+----+----+-----+----+----+-----+-----+-----+-----+-----+----+----+-----+----+----+----+----+----+----+----+----+----+
|  312|     0|        null|null|  0|null|null| 0.0|null| 0.0|  0.0| null|  0.0|null|null|null|null|null|null|null| 0.0| null|0.0|0.0|0.0

### Further Type Casting

In [13]:
# Cast "Season" to integer once more before the upload steps.
season_int = col("Season").cast("integer")
reordered_df = reordered_df.withColumn("Season", season_int)

## Upload Cleaned Data to MongoDB Collection

In [14]:
# Seasons to keep for the upload: three blocks of three consecutive seasons.
years_seasons = [
    *range(1995, 1998),
    *range(2005, 2008),
    *range(2011, 2014),
]

In [15]:
# Columns that are kept for the MongoDB upload.
selected_columns = [
    'Full_Name', 'Season', 'Pos',
    'GS', 'PER', 'TS%', 'FTr', 'USG%',
    'OWS', 'DWS', 'WS', 'OBPM', 'DBPM',
    '2P', '2PA', '2P%', 'eFG%',
]
selected_df = reordered_df.select(*selected_columns)

# Keep only the rows whose season is one of the seasons of interest.
in_wanted_seasons = selected_df["Season"].isin(years_seasons)
filtered_data = selected_df.filter(in_wanted_seasons)

# Materialize the filtered rows as a pandas DataFrame for the upload step.
pandas_df = filtered_data.toPandas()

### Connect to MongoDB

In [None]:
# Open a client for the MongoDB server and select the target database and
# collection. Nothing is written to MongoDB in this cell.
mongo_uri = "mongodb://pt-n20.p4001.w3.cs.technikum-wien.at:4001"
database_name = "nba_data"
collection_name = "season_stats_csv"

client = mdb.MongoClient(mongo_uri)
db = client[database_name]
collection = db[collection_name]

### Insert Records into MongoDB Collection

In [None]:
# Insert the filtered rows into the MongoDB collection.
# The insert is disabled by default, so running this cell writes nothing;
# set upload_enabled to True to perform the upload.
# NOTE(review): presumably disabled so that re-running the notebook does not
# insert the same documents again — confirm before enabling.
upload_enabled = False

if not upload_enabled:
    # Report the skip explicitly instead of claiming a successful upload.
    print("Upload to MongoDB skipped (upload_enabled is False).")
else:
    records = pandas_df.to_dict("records")
    if records:
        collection.insert_many(records)
        print("Data uploaded to MongoDB successfully.")
    else:
        # insert_many rejects an empty document list, so skip the call.
        print("No records to upload to MongoDB.")

### Close Connection

In [None]:
# Close the MongoDB client created above.
client.close()

In [None]:
# Stop the Spark session now that all processing is done.
spark.stop()