## Data cleaning

### Import libraries

In [7]:
import os
import sys
import pandas as pd

### Import dataset

In [8]:
# Resolve the project root as the directory two levels above the current
# working directory, then make sure it is on sys.path exactly once so that
# project-local packages (e.g. the `utils` import used further down) resolve.
project_path = os.path.abspath(
    os.path.join(os.getcwd(), "..", "..")
)
if sys.path.count(project_path) == 0:
    sys.path.append(project_path)

In [None]:
# Build the location of the raw CSV under <project_path>/data/raw and load it.
# NOTE: despite its name, `matthews_df` holds the file *path* (a string);
# the loaded DataFrame is `df`.
matthews_df = os.path.join(
    project_path,
    "data",
    "raw",
    "matthews_auston.csv",
)
df = pd.read_csv(filepath_or_buffer=matthews_df)

### Pivot table

In [9]:
from utils.data_cleaning import pivot_df

# Columns handed to pivot_df as the index, and the column to pivot on.
# pivot_df is project-local; presumably it yields one column per
# (stat, situation) pair such as "gameScore_all" -- confirm in utils.
index = [
    "playerId",
    "season",
    "name",
    "gameId",
    "playerTeam",
    "opposingTeam",
    "home_or_away",
    "gameDate",
    "position",
]
pivot = "situation"

# Discard the situation-specific gameScore columns and expose the "all"
# variant under the plain name "gameScore".
df = (
    pivot_df(df, index, pivot)
    .drop(columns=["gameScore_other", "gameScore_5on4", "gameScore_4on5", "gameScore_5on5"])
    .rename(columns={"gameScore_all": "gameScore"})
)

### Drop duplicate columns

In [10]:
# Flag every column whose values are identical to those of an earlier column.
# Transposing turns columns into rows so DataFrame.duplicated() can compare
# them; with its default keep="first", the first occurrence is left unflagged.
# The mask is computed once and reused: transposing a wide frame is expensive,
# and evaluating it twice is redundant.
is_duplicate_col = df.T.duplicated()
duplicate_cols = df.columns[is_duplicate_col]

if len(duplicate_cols) > 0:
    # Keep only the columns that were not flagged as duplicates.
    df = df.loc[:, ~is_duplicate_col]
    print("Removed duplicate columns:", list(duplicate_cols))
else:
    print("No duplicate columns found.")

print(f"Number of features: {df.shape[1]}")

Removed duplicate columns: ['I_F_flurryScoreVenueAdjustedxGoals_4on5', 'I_F_flurryScoreVenueAdjustedxGoals_5on4', 'I_F_flurryScoreVenueAdjustedxGoals_other', 'I_F_giveaways_4on5', 'I_F_highDangerGoals_4on5', 'I_F_lowDangerGoals_4on5', 'I_F_mediumDangerGoals_4on5', 'I_F_penalityMinutes_other', 'I_F_playStopped_4on5', 'I_F_reboundGoals_4on5', 'I_F_reboundxGoals_4on5', 'I_F_secondaryAssists_4on5', 'I_F_shotAttempts_4on5', 'I_F_shotAttempts_5on4', 'I_F_shotAttempts_other', 'I_F_unblockedShotAttempts_4on5', 'I_F_unblockedShotAttempts_5on4', 'I_F_unblockedShotAttempts_other', 'I_F_xGoals_4on5', 'I_F_xGoals_5on4', 'I_F_xGoals_other', 'I_F_xGoals_with_earned_rebounds_scoreAdjusted_4on5', 'I_F_xGoals_with_earned_rebounds_scoreAdjusted_5on4', 'I_F_xGoals_with_earned_rebounds_scoreAdjusted_other', 'OnIce_A_flurryScoreVenueAdjustedxGoals_4on5', 'OnIce_A_flurryScoreVenueAdjustedxGoals_5on4', 'OnIce_A_flurryScoreVenueAdjustedxGoals_other', 'OnIce_A_shotAttempts_4on5', 'OnIce_A_shotAttempts_5on4', 'O

### Save cleaned data

In [11]:
# Write the cleaned frame to <project_path>/data/processed without the index.
output_path = os.path.join(project_path, "data", "processed", "matthews_auston_processed.csv")
# DataFrame.to_csv does not create missing parent directories, so ensure the
# target folder exists (no-op when it is already there).
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)