Import needed libraries

In [39]:
%%pyspark

 from pyspark.sql.types import *
 from pyspark.sql.functions import *
 from pyspark.sql import SparkSession
 from pyspark.sql.functions import to_date

Read all datasets into DataFrames

In [40]:
# Load every per-game skater CSV found under the raw player-level folder.
# No schema is passed and schema inference is not enabled, so Spark reads every
# column as a string; types are assigned in a later cell.
df_skatergames = spark.read.csv('abfss://nhlroot@simondemosimondemo.dfs.core.windows.net/nhlrawplayerlevel/*.csv', header=True)
display(df_skatergames.limit(20))

# Check the qualities of the dataframe
num_players = df_skatergames.select("playerID").distinct().count()
print(f"Number of players: {num_players}")
# count() is a Spark action that scans all input files, so it is executed only
# once here (a second, unused count before this section was removed).
num_rows = df_skatergames.count()
print(f"Number of rows: {num_rows}")
# Reading .columns only inspects the schema; it does not trigger a Spark job.
num_columns = len(df_skatergames.columns)
print(f"Number of columns: {num_columns}")

In [41]:
# Team dimension: a single CSV whose first row holds the column names.
df_teams = (
    spark.read
    .option("header", True)
    .csv('abfss://nhlroot@simondemosimondemo.dfs.core.windows.net/teams.csv')
)
display(df_teams)

In [42]:
# Season dimension: a single CSV whose first row holds the column names.
df_seasons = (
    spark.read
    .option("header", True)
    .csv('abfss://nhlroot@simondemosimondemo.dfs.core.windows.net/seasons.csv')
)
display(df_seasons)

Select the columns needed for the games fact table

In [43]:
# Columns kept for the games fact table, listed in the order they should appear.
fact_game_columns = [
    "playerID",
    "name",
    "gameID",
    "gameDate",
    "playerTeam",
    "opposingTeam",
    "season",
    "situation",
    "I_F_points",
    "I_F_goals",
    "penalties",
    "I_F_penalityminutes",
    "faceoffsWon",
    "faceoffsLost",
    "icetime",
]
df_cleanskatergames = df_skatergames.select(*fact_game_columns)
display(df_cleanskatergames.limit(20))

Clean up team IDs

In [44]:

# Dotted team codes found in the raw data, paired with the code that replaces them.
# Any other value is left untouched.
team_code_fixes = [
    ("L.A", "LAK"),
    ("N.J", "NJD"),
    ("S.J", "SJS"),
    ("T.B", "TBL"),
]

# Apply the same replacement rules to both team columns, one column at a time.
for team_column in ("playerTeam", "opposingTeam"):
    first_raw, first_fixed = team_code_fixes[0]
    recoded = when(col(team_column) == first_raw, first_fixed)
    for raw_code, fixed_code in team_code_fixes[1:]:
        recoded = recoded.when(col(team_column) == raw_code, fixed_code)
    df_cleanskatergames = df_cleanskatergames.withColumn(
        team_column,
        recoded.otherwise(col(team_column)),
    )

Cast columns to proper types

In [45]:
# Columns that are converted to integers. Names are compared in lower case so the
# match does not depend on how a column name happens to be capitalised.
integer_columns = {
    name.lower()
    for name in (
        "playerID", "gameID", "season", "I_F_points", "I_F_goals", "penalties",
        "I_F_penalityminutes", "faceoffsWon", "faceoffsLost", "icetime",
    )
}

# Build one typed expression per existing column, keeping column order and the
# exact existing column names (so gameID keeps its spelling instead of being
# re-created as "gameId").
typed_columns = []
for column_name in df_cleanskatergames.columns:
    if column_name.lower() == "gamedate":
        # to_date already returns a date column, so no further cast is needed.
        typed = to_date(col(column_name), "yyyyMMdd")
    elif column_name.lower() in integer_columns:
        typed = col(column_name).cast(IntegerType())
    else:
        # Remaining columns (name, playerTeam, opposingTeam, situation) are strings.
        typed = col(column_name).cast(StringType())
    typed_columns.append(typed.alias(column_name))

# A single select adds one projection to the plan, instead of one projection per
# column as repeated withColumn calls would.
df_cleanskatergames = df_cleanskatergames.select(*typed_columns)

display(df_cleanskatergames)

In [46]:
# Target type of every team column, applied in the order listed.
team_column_types = [
    ("TeamID", StringType()),
    ("Conference", StringType()),
    ("Division", StringType()),
    ("TeamName", StringType()),
    ("City", StringType()),
    ("State", StringType()),
    ("Arena", StringType()),
    ("Capacity", IntegerType()),
    ("Founded", IntegerType()),
    ("Joined", IntegerType()),
    ("General manager", StringType()),
    ("Head coach", StringType()),
    ("Captain", StringType()),
]

# Replace each column with a copy cast to its target type; column positions are kept.
for column_name, target_type in team_column_types:
    df_teams = df_teams.withColumn(column_name, col(column_name).cast(target_type))

In [47]:
# Target type of every season column, applied in the order listed.
# NOTE(review): Start and Finish are cast straight from string to date with no
# explicit format; presumably the CSV stores them as yyyy-MM-dd — confirm, since
# values in another format would not parse.
season_column_types = [
    ("Season", IntegerType()),
    ("NoTeams", IntegerType()),
    ("RegGames", IntegerType()),
    ("Start", DateType()),
    ("Finish", DateType()),
    ("Champion", StringType()),
]

# Replace each column with a copy cast to its target type; column positions are kept.
for column_name, target_type in season_column_types:
    df_seasons = df_seasons.withColumn(column_name, col(column_name).cast(target_type))

Write the DataFrames to the data warehouse

In [48]:
import com.microsoft.spark.sqlanalytics
from com.microsoft.spark.sqlanalytics.Constants import Constants
from pyspark.sql.functions import col

# Each DataFrame is paired with the table it is written to. Tables are written in
# the order listed, and "overwrite" mode replaces any existing table contents.
warehouse_targets = [
    (df_teams, "nhlsqlpool.dbo.dimteamsp"),
    (df_seasons, "nhlsqlpool.dbo.dimseasonsp"),
    (df_cleanskatergames, "nhlsqlpool.dbo.factgamesp"),
]

for frame, table_name in warehouse_targets:
    frame.write.mode("overwrite").synapsesql(table_name)