In [1]:
"""
01_data_cleaning.ipynb

Purpose:
    Load, inspect, and clean all raw NBA data tables.
    This notebook standardizes column types, handles missing values,
    and validates consistency across datasets (e.g., team vs player stats).

Input:
    Raw CSV files from the Kaggle dataset.
Output:
    Cleaned CSVs written to ../data/processed/ (one <name>_cleaned.csv per table).
"""

'\n01_data_cleaning.ipynb\n\nPurpose:\n    Load, inspect, and clean all raw NBA data tables.\n    This notebook standardizes column types, handles missing values,\n    and validates consistency across datasets (e.g., team vs player stats).\n\nInput:\n    Raw CSV files from the Kaggle dataset.\nOutput:\n    Cleaned versions stored under ./processed/\n'

In [2]:
# === Setup ===
import pandas as pd
import numpy as np
import os

# Let pandas print every column of the wide stat tables, using a 120-character line width.
pd.set_option("display.width", 120)
pd.set_option("display.max_columns", None)

# Guarantee the output directory for cleaned tables is present (no error if it already exists).
os.makedirs("../data/processed", exist_ok=True)

In [3]:
# === File paths ===
# Root folders for raw inputs and cleaned outputs (relative to the notebook's directory).
RAW_PATH = "../data/raw"
PROCESSED_PATH = "../data/processed"

# Map each dataset key to the full path of its raw CSV under RAW_PATH.
files = {
    key: os.path.join(RAW_PATH, filename)
    for key, filename in (
        ("games", "Games.csv"),
        ("team_stats", "TeamStatistics.csv"),
        ("player_stats", "PlayerStatistics.csv"),
        ("players", "Players.csv"),
        ("team_histories", "TeamHistories.csv"),
        ("schedule", "LeagueSchedule25_26.csv"),
    )
}

In [4]:
# === Load datasets with dtype consistency ===
def load_csv_safe(path):
    """Read one CSV into a DataFrame, tolerating a missing file.

    Parameters
    ----------
    path : str
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed table, or an empty DataFrame when ``path`` does not exist.
        A one-line status message is printed in either case.
    """
    try:
        # low_memory=False makes pandas read the file in one pass instead of
        # chunk-by-chunk, so each column gets a single inferred dtype.
        frame = pd.read_csv(path, low_memory=False)
    except FileNotFoundError:
        print(f"File not found: {path}")
        return pd.DataFrame()
    else:
        print(f"Loaded {os.path.basename(path)} — shape: {frame.shape}")
        return frame

# Read the six raw tables in a fixed order; each name is bound to the frame
# returned by load_csv_safe for the matching key in `files`.
(
    games,
    team_stats,
    player_stats,
    players,
    team_histories,
    schedule,
) = (
    load_csv_safe(files[key])
    for key in (
        "games",
        "team_stats",
        "player_stats",
        "players",
        "team_histories",
        "schedule",
    )
)

Loaded Games.csv — shape: (72008, 17)
Loaded TeamStatistics.csv — shape: (144016, 48)
Loaded PlayerStatistics.csv — shape: (1631540, 35)
Loaded Players.csv — shape: (6678, 14)
Loaded TeamHistories.csv — shape: (140, 7)
Loaded LeagueSchedule25_26.csv — shape: (1278, 17)


In [5]:
# === Dataset summaries ===
def describe_dataset(name, df):
    """Print a short overview of one table.

    Emits a banner with the upper-cased dataset name, the frame's shape,
    its column list, the condensed ``DataFrame.info`` summary, and the
    first three rows.

    Parameters
    ----------
    name : str
        Label used in the banner line.
    df : pandas.DataFrame
        Table to summarise.

    Returns
    -------
    None
    """
    print(f"\n=== {name.upper()} SUMMARY ===")
    print(f"Shape: {df.shape}")
    print("Columns:", list(df.columns))
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would add a stray "None" line to the output.
    df.info(verbose=False)
    print(df.head(3))

# Registry of every loaded table, keyed by the same names used in `files`.
datasets = dict(
    zip(
        ("games", "team_stats", "player_stats", "players", "team_histories", "schedule"),
        (games, team_stats, player_stats, players, team_histories, schedule),
    )
)

# Print the overview for each table in registry order.
for label in datasets:
    describe_dataset(label, datasets[label])


=== GAMES SUMMARY ===
Shape: (72008, 17)
Columns: ['gameId', 'gameDate', 'hometeamCity', 'hometeamName', 'hometeamId', 'awayteamCity', 'awayteamName', 'awayteamId', 'homeScore', 'awayScore', 'winner', 'gameType', 'attendance', 'arenaId', 'gameLabel', 'gameSubLabel', 'seriesGameNumber']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72008 entries, 0 to 72007
Columns: 17 entries, gameId to seriesGameNumber
dtypes: float64(3), int64(6), object(8)
memory usage: 9.3+ MB
None
     gameId              gameDate   hometeamCity hometeamName  hometeamId awayteamCity awayteamName  awayteamId  \
0  22500127  2025-10-28T23:00:00Z   Golden State     Warriors  1610612744           LA     Clippers  1610612746   
1  22500126  2025-10-28T20:00:00Z  Oklahoma City      Thunder  1610612760   Sacramento        Kings  1610612758   
2  22500125  2025-10-28T20:00:00Z      Milwaukee        Bucks  1610612749     New York       Knicks  1610612752   

   homeScore  awayScore      winner gameType  attendance  ar

In [6]:
# === Standardize and clean dataframes ===
def clean_dataframe(df):
    """Apply table-agnostic cleaning and return a new DataFrame.

    Steps, in order:
      1. Drop fully duplicated rows.
      2. Strip leading/trailing whitespace from column names. Labels that
         are not strings are left untouched.
      3. Replace the literal strings "None", "NaN" and "nan" with ``np.nan``.

    Cell values are not whitespace-trimmed and dtypes are not converted here.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.
    """
    # drop_duplicates() already returns a new frame, so no separate copy() is
    # needed to keep the caller's object intact.
    cleaned = df.drop_duplicates()
    # A callable rename works for any column index (including the default
    # index of an empty frame), where the .str accessor would raise or turn
    # non-string labels into NaN.
    cleaned = cleaned.rename(
        columns=lambda label: label.strip() if isinstance(label, str) else label
    )
    cleaned = cleaned.replace(["None", "NaN", "nan"], np.nan)
    return cleaned

# Swap every registry entry for its cleaned counterpart (keys are unchanged).
datasets.update(
    {table: clean_dataframe(frame) for table, frame in datasets.items()}
)

In [7]:
# === Optimize column dtypes ===
def optimize_dtypes(df):
    """Shrink 64-bit numeric columns to smaller dtypes where pandas allows it.

    ``float64`` columns are passed through ``pd.to_numeric(..., downcast="float")``
    first, then ``int64`` columns through ``downcast="integer"``. Columns of any
    other dtype are not touched.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to shrink. Its columns are reassigned in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    downcast_plan = (("float64", "float"), ("int64", "integer"))
    for source_dtype, target_kind in downcast_plan:
        candidates = df.select_dtypes(include=[source_dtype]).columns
        for column in candidates:
            df[column] = pd.to_numeric(df[column], downcast=target_kind)
    return df

# Downcast every table in the registry; optimize_dtypes hands back the frame it was given.
datasets.update(
    {table: optimize_dtypes(frame) for table, frame in datasets.items()}
)

In [8]:
# === Data integrity validation ===
# Ensure game IDs in the team-level table all exist in the games table, then
# sanity-check summed player points against the recorded team totals.
#
# Validation runs on the cleaned frames stored in `datasets` (the ones that get
# saved). The module-level `games` / `team_stats` / `player_stats` names were
# bound at load time and are not reassigned by the cleaning cells.
games_clean = datasets["games"]
team_stats_clean = datasets["team_stats"]
player_stats_clean = datasets["player_stats"]

# Test membership directly: comparing nunique() counts would pass even when
# team_stats holds IDs that are absent from games.
assert team_stats_clean["gameId"].isin(games_clean["gameId"]).all(), \
    "Mismatch: team_stats contains gameIds not in games.csv"

# Auto-detect team total points column name (case-insensitive match on known aliases)
possible_point_cols = [
    c for c in team_stats_clean.columns
    if c.lower() in ["points", "pts", "teamscore", "score"]
]
if not possible_point_cols:
    raise KeyError("No column found in team_stats matching ['points', 'pts', 'teamScore', 'score']")
points_col = possible_point_cols[0]
print(f"Using team total column: '{points_col}'")

# Sum player points per (game, team) so they can be compared with the team row
agg_points = (
    player_stats_clean.groupby(["gameId", "playerteamName"])["points"]
    .sum()
    .reset_index()
)

# Inner join: only (game, team) pairs present in both tables are compared
merged_points = pd.merge(
    agg_points,
    team_stats_clean[["gameId", "teamName", points_col]],
    left_on=["gameId", "playerteamName"],
    right_on=["gameId", "teamName"],
    how="inner"
)

# Calculate correlation only if numeric and not empty
if not merged_points.empty and np.issubdtype(merged_points[points_col].dtype, np.number):
    correlation = merged_points["points"].corr(merged_points[points_col])
    print(f"Player vs Team total points correlation: {correlation:.3f}")
else:
    print("Unable to compute correlation — check column types or join keys.")

Using team total column: 'teamScore'
Player vs Team total points correlation: 0.985


In [9]:
# === Save cleaned outputs  ===
# Write each table in the registry to PROCESSED_PATH as <key>_cleaned.csv (no index column).
for table_name in datasets:
    cleaned = datasets[table_name]
    output_path = os.path.join(PROCESSED_PATH, f"{table_name}_cleaned.csv")
    cleaned.to_csv(output_path, index=False)
    print(f"Saved: {output_path} — shape {cleaned.shape}")

Saved: ../data/processed/games_cleaned.csv — shape (72008, 17)
Saved: ../data/processed/team_stats_cleaned.csv — shape (144016, 48)
Saved: ../data/processed/player_stats_cleaned.csv — shape (1631540, 35)
Saved: ../data/processed/players_cleaned.csv — shape (6678, 14)
Saved: ../data/processed/team_histories_cleaned.csv — shape (140, 7)
Saved: ../data/processed/schedule_cleaned.csv — shape (1278, 17)
