In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from pathlib import Path

# Notebook-wide plotting defaults: compact figures with a background grid.
plt.rcParams.update({"figure.figsize": (8, 4), "axes.grid": True})

# Folder holding the parquet inputs, resolved relative to the working directory.
parquet_dir = Path("Parquet Output Files")

# Load the injury log and immediately give its date column the name the
# rest of the notebook uses ("injuryDate").
injuries_df = pd.read_parquet(parquet_dir / "injuries_2018_clean.parquet").rename(
    columns={"date": "injuryDate"}
)

# Load the game-event table.
games_df = pd.read_parquet(parquet_dir / "nba_analysis_ready.parquet")

# Show both schemas so the column names used in later cells can be checked.
print("Injuries columns:", injuries_df.columns.tolist())
print("Games columns:", games_df.columns.tolist())


In [None]:

# Parse both date columns to datetime64; values that cannot be parsed become
# NaT (errors="coerce") instead of raising.
for frame, date_col in ((injuries_df, "injuryDate"), (games_df, "gameDate")):
    frame[date_col] = pd.to_datetime(frame[date_col], errors="coerce")

def classify_severity(note: str, injury_type: str) -> str:
    """Bucket an injury record into a coarse severity label by keyword.

    The two free-text fields are joined, lower-cased and matched against
    whole-word keyword lists. Rules are tried from most to least severe and
    the first one that matches wins.

    Args:
        note: Free-text injury note. ``None`` / NaN is treated as empty text.
        injury_type: Free-text injury type. ``None`` / NaN is treated as
            empty text.

    Returns:
        One of ``"HIGH"``, ``"MEDIUM"``, ``"LOW"`` or ``"UNKNOWN"`` (no
        keyword matched).
    """
    # Missing values contribute no text at all, rather than the literal
    # strings "none" / "nan" that naive interpolation would produce.
    parts = [
        "" if value is None or (isinstance(value, float) and value != value) else str(value)
        for value in (injury_type, note)
    ]
    text = " ".join(parts).lower()

    # "partial tear" is a MEDIUM phrase, but it contains the HIGH keyword
    # "tear". Hide that phrase from the HIGH rule so the MEDIUM rule below is
    # reachable; any other HIGH keyword in the text (e.g. "surgery") still wins.
    text_without_partial_tear = re.sub(r"\bpartial tear\b", " ", text)
    if re.search(
        r"\b(surgery|surgical|repair|fracture|broken|rupture|torn|tear)\b",
        text_without_partial_tear,
    ):
        return "HIGH"

    # "grade ii" is accepted as the Roman-numeral spelling of "grade 2".
    # NOTE(review): "grade iii" is kept in MEDIUM exactly as before; confirm
    # whether grade-3 injuries should be HIGH instead.
    if re.search(
        r"\b(grade\s*2|grade\s*ii|grade\s*iii|partial tear|dislocation)\b", text
    ):
        return "MEDIUM"

    if re.search(r"\b(sprain|strain|soreness|contusion|bruise|tightness)\b", text):
        return "LOW"

    return "UNKNOWN"

# Label every injury row. Series.get falls back to "" when a column is absent,
# so a wrong column name would silently make every row "UNKNOWN" --
# check "Notes" / "Injury_Type" against the printed column list.
severity_labels = injuries_df.apply(
    lambda record: classify_severity(
        note=record.get("Notes", ""),
        injury_type=record.get("Injury_Type", ""),
    ),
    axis=1,
)
injuries_df["severity"] = severity_labels

# Quick sanity check of how the labels are distributed.
severity_tally = injuries_df["severity"].value_counts()
print("Severity counts:")
print(severity_tally)


In [None]:

# An injury row is only usable if it has both a player and a parsed date.
required_cols = ["playerNameFormatted", "injuryDate"]
injuries_clean = injuries_df.dropna(subset=required_cols).copy()

# Restrict the analysis to injuries labelled HIGH.
is_high = injuries_clean["severity"].eq("HIGH")
severe_injuries = injuries_clean.loc[is_high].copy()
print("Number of HIGH-severity injuries:", len(severe_injuries))

# Look-back window: from 7 days before the injury to 1 day before it.
one_day = pd.Timedelta(days=1)
severe_injuries["window_start"] = severe_injuries["injuryDate"] - 7 * one_day
severe_injuries["window_end"]   = severe_injuries["injuryDate"] - one_day

# Preview the windows (cell output).
severe_injuries[["playerNameFormatted", "injuryDate", "window_start", "window_end"]].head()


In [None]:

# Keep only game events that can be matched to a player and placed in time.
games_clean = games_df.dropna(subset=["playerNameFormatted", "gameDate"]).copy()

# One window per (player, injury date). Repeated rows for the same injury
# would otherwise multiply every matching game event in the merge below.
injury_windows = severe_injuries[
    ["playerNameFormatted", "injuryDate", "window_start", "window_end"]
].drop_duplicates(subset=["playerNameFormatted", "injuryDate"])

# Pair every event of a player with every severe injury of that player.
# "_event_id" is a temporary positional tag (not added to games_clean itself)
# that lets each original event be recognised again after the merge.
games_with_windows = games_clean.assign(_event_id=range(len(games_clean))).merge(
    injury_windows,
    on="playerNameFormatted",
    how="inner",
    suffixes=("_game", "_injury"),
)

# Keep pairs where the event falls in [window_start, injuryDate).
mask = (
    (games_with_windows["gameDate"] >= games_with_windows["window_start"]) &
    (games_with_windows["gameDate"] <  games_with_windows["injuryDate"])
)

# An event can sit inside the windows of several injuries of the same player
# (e.g. two injury rows a few days apart). Count it once, attached to the
# earliest such injury, so the play-type shares are not inflated.
games_pre_injury = (
    games_with_windows[mask]
    .sort_values(["_event_id", "injuryDate"])
    .drop_duplicates(subset="_event_id", keep="first")
    .drop(columns="_event_id")
    .copy()
)

print("Number of game events in the week before a severe injury:", len(games_pre_injury))
games_pre_injury[["playerNameFormatted", "gameDate", "injuryDate"]].head()


In [None]:


def classify_play(row):
    """Assign a coarse play-type label to one game-event row.

    The row's "description" and "actionType" values are stringified,
    lower-cased and joined with a space; the label of the first rule whose
    keyword occurs as a substring is returned.

    Args:
        row: Any mapping-like object with a ``.get(key, default)`` method
            (e.g. a pandas Series or a dict).

    Returns:
        One of "DUNK", "LAYUP", "DRIVE", "THREE_POINTER", "JUMP_SHOT",
        "REBOUND", "BLOCK", "FOUL", or "OTHER" when nothing matches.
    """
    # Order matters: earlier rules take precedence over later ones.
    rules = (
        ("DUNK", ("dunk",)),
        ("LAYUP", ("layup",)),
        ("DRIVE", ("drive",)),
        ("THREE_POINTER", ("3pt", "3-pt", "three point", "3-point")),
        ("JUMP_SHOT", ("jump shot", "jumper", "pullup")),
        ("REBOUND", ("rebound",)),
        ("BLOCK", ("block",)),
        ("FOUL", ("foul",)),
    )

    description = str(row.get("description", "")).lower()
    action_type = str(row.get("actionType", "")).lower()
    haystack = description + " " + action_type

    for label, keywords in rules:
        if any(keyword in haystack for keyword in keywords):
            return label
    return "OTHER"


# Tag every event in both the full table and the pre-injury subset.
games_clean["play_type"] = games_clean.apply(classify_play, axis=1)
games_pre_injury["play_type"] = games_pre_injury.apply(classify_play, axis=1)

# Share of each play type among the pre-injury-week events (fractions of 1).
pre_counts = games_pre_injury["play_type"].value_counts()
pre_share = pre_counts.div(pre_counts.sum())

print("\nPlay-type distribution in 7 days before severe injuries (%):")
print((pre_share * 100).round(2))

# Bar chart of the shares, largest first.
fig, ax = plt.subplots()
pre_share.sort_values(ascending=False).plot(kind="bar", ax=ax)
ax.set_title("Play Types in Week Before Severe Injuries")
ax.set_ylabel("Share of Events")
fig.tight_layout()
plt.show()


In [None]:


# Players with at least one HIGH-severity injury.
severe_players = severe_injuries["playerNameFormatted"].unique()

# Baseline: every event of those players, not just the pre-injury week.
events_severe_players = games_clean[games_clean["playerNameFormatted"].isin(severe_players)].copy()

# Share of each play type across that baseline (fractions of 1).
season_counts = events_severe_players["play_type"].value_counts()
season_share  = season_counts / season_counts.sum()

print("\nSeason-long play-type distribution for severe-injury players (%):")
print((season_share * 100).round(2))

# Align both distributions on play type; a type absent from one side gets 0.
compare = pd.concat(
    [season_share.rename("Season"), pre_share.rename("Week_Before_Injury")],
    axis=1
).fillna(0)

print("\nCombined distribution (Season vs Week_Before_Injury, %):")
print((compare * 100).round(2))

# Pass an explicit Axes: DataFrame.plot without `ax` opens its own figure, which
# would leave a blank 10x5 figure behind and draw the chart at the default size.
fig, ax = plt.subplots(figsize=(10, 5))
compare.sort_values("Season", ascending=False).plot(kind="bar", ax=ax)
ax.set_title("Play Types: Season vs Week Before Severe Injuries")
ax.set_ylabel("Share of Events")
fig.tight_layout()
plt.show()


In [None]:

# delta > 0: the play type takes a larger share of events in the pre-injury
# week than over the season baseline; delta < 0: a smaller share.
compare["delta"] = compare["Week_Before_Injury"].sub(compare["Season"])

# Rank play types from most over-represented to most under-represented.
compare_sorted = compare.sort_values(by="delta", ascending=False)
compare_sorted
