In [None]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from pathlib import Path


# Notebook-wide plotting defaults: figure size of (8, 4) and grid lines switched on.
plt.rcParams.update({
    "figure.figsize": (8, 4),
    "axes.grid": True,
})


In [None]:


inj_clean_path = Path("injuries_2010-2020_cleaned.csv")
inj_raw_path   = Path("injuries_2010-2020.csv")


def get_injury_type(note: str) -> str:
    """Derive a coarse injury type from a free-text note.

    The note is lower-cased and scanned for a fixed list of keywords using
    substring matching; the first keyword found (in list order) is returned
    title-cased.

    Args:
        note: Free-text injury note. Any value is accepted; it is converted
            with ``str()`` first.

    Returns:
        The matching keyword in title case, or ``"Other"`` when no keyword
        occurs in the note.
    """
    n = str(note).lower()
    keywords = [
        "fracture", "sprain", "tear", "torn", "strain", "break",
        "contusion", "surgery", "dislocation", "bruise",
        "soreness", "procedure", "inflammation", "rupture"
    ]
    for k in keywords:
        if k in n:
            return k.title()
    return "Other"


# Prefer the pre-cleaned file; otherwise load the raw file and apply a minimal
# version of the cleaning steps here.
if inj_clean_path.exists():
    print("Loading cleaned injuries file...")
    inj = pd.read_csv(inj_clean_path)
else:
    print("Cleaned file not found; falling back to raw and doing minimal cleaning...")
    inj = pd.read_csv(inj_raw_path)

    # Drop the "Acquired" column when it is present.
    if "Acquired" in inj.columns:
        inj = inj.drop(columns=["Acquired"])

    # Use "Relinquished" as the player name unless a Player_Name column already exists.
    if "Relinquished" in inj.columns and "Player_Name" not in inj.columns:
        inj = inj.rename(columns={"Relinquished": "Player_Name"})

    # Normalise whitespace and casing of the text columns.
    for col in ["Player_Name", "Team", "Notes"]:
        if col in inj.columns:
            normalized = inj[col].astype(str).str.strip().str.title()
            # astype(str) turns missing values into the literal string "nan"
            # (then "Nan" after title-casing), which would make the dropna()
            # below a no-op. Restore NaN for values that were missing or are
            # blank after stripping.
            is_present = inj[col].notna() & normalized.ne("")
            inj[col] = normalized.where(is_present)

    # Parse dates; unparseable values become NaT, so Year is NaN for those rows.
    inj["Date"] = pd.to_datetime(inj["Date"], errors="coerce")
    inj["Year"] = inj["Date"].dt.year

    if "Injury_Type" not in inj.columns:
        inj["Injury_Type"] = inj["Notes"].apply(get_injury_type)

    # Remove rows without a player or team, then keep one row per player per date.
    inj = inj.dropna(subset=["Player_Name", "Team"])
    inj = inj.drop_duplicates(subset=["Player_Name", "Date"])

print("Injury data shape:", inj.shape)
print("Columns:", list(inj.columns))


In [None]:

# Coerce Year to a numeric column (unparseable values become NaN), then keep
# an independent copy of the rows whose year is 2018.
inj["Year"] = pd.to_numeric(inj["Year"], errors="coerce")
is_2018 = inj["Year"].eq(2018)
inj2018 = inj.loc[is_2018].copy()

print("\nInjuries in 2018:", len(inj2018))

# Ensure Date is a datetime column on the subset; values that cannot be parsed become NaT.
inj2018 = inj2018.assign(Date=pd.to_datetime(inj2018["Date"], errors="coerce"))



# "partial tear" / "partially torn": an incomplete tear, which must not be
# counted as a full tear by the HIGH rule.
_PARTIAL_TEAR_RE = re.compile(r"\bpartial(?:ly)?\s+(?:tear|torn)\b")

# Each rule accepts common inflections ("fractured", "sprained", "tears") in
# addition to the base form.
_HIGH_SEVERITY_RE = re.compile(
    r"\b(?:surger(?:y|ies)|surgical|repair(?:ed|s)?|fractur(?:e|ed|es)|broken"
    r"|ruptur(?:e|ed|es)|torn|tears?)\b"
)
# NOTE(review): "grade iii" is kept as MEDIUM because the original rule listed
# it; confirm whether grade III should instead be HIGH.
_MEDIUM_SEVERITY_RE = re.compile(
    r"\b(?:grade\s*(?:2|ii|iii)|dislocat(?:ion|ions|ed))\b"
)
_LOW_SEVERITY_RE = re.compile(
    r"\b(?:sprain(?:ed|s)?|strain(?:ed|s)?|sore(?:ness)?|contusions?"
    r"|bruis(?:e|ed|es|ing)|tight(?:ness)?)\b"
)


def classify_severity(note: str) -> str:
    """Classify an injury note as HIGH, MEDIUM, LOW or UNKNOWN severity.

    Rules are evaluated in priority order on the lower-cased note:

    1. HIGH   - surgery/repair, fracture, broken, rupture, or a (full) tear.
    2. MEDIUM - partial tear, grade 2/II/III, or dislocation.
    3. LOW    - sprain, strain, soreness, contusion, bruise, tightness.

    A partial tear only yields HIGH when the note also contains another HIGH
    keyword (for example surgery).

    Args:
        note: Free-text injury note. Any value is accepted; it is converted
            with ``str()`` first, so a missing value simply matches no rule.

    Returns:
        One of ``"HIGH"``, ``"MEDIUM"``, ``"LOW"`` or ``"UNKNOWN"``.
    """
    n = str(note).lower()

    has_partial_tear = _PARTIAL_TEAR_RE.search(n) is not None
    # Blank out partial-tear phrases so the generic tear/torn keywords do not
    # promote them to HIGH on their own.
    remainder = _PARTIAL_TEAR_RE.sub(" ", n)

    if _HIGH_SEVERITY_RE.search(remainder):
        return "HIGH"

    if has_partial_tear or _MEDIUM_SEVERITY_RE.search(n):
        return "MEDIUM"

    if _LOW_SEVERITY_RE.search(n):
        return "LOW"

    return "UNKNOWN"

# Label every 2018 injury note with a severity class.
severity_labels = inj2018["Notes"].apply(classify_severity)
inj2018["severity"] = severity_labels

print("\nSeverity breakdown (2018):")
severity_breakdown = inj2018["severity"].value_counts(dropna=False)
print(severity_breakdown)


# Ordered (region, keyword alternatives) rules; the first rule that matches wins.
_REGION_KEYWORDS = (
    ("HEAD", "head|concussion|face|eye|nose|ear|jaw|mouth|tooth|teeth"),
    ("NECK", "neck"),
    ("SHOULDER", "shoulder|collarbone|clavicle"),
    ("ELBOW", "elbow"),
    ("WRIST", "wrist"),
    ("HAND", "hand|finger|thumb"),
    ("CHEST", "chest|rib|sternum"),
    ("BACK", "back|spine|spinal"),
    ("HIP", "hip"),
    ("GROIN", "groin"),
    ("HAMSTRING", "hamstring"),
    ("QUAD", "quad|quadricep"),
    ("CALF", "calf|calves|achilles"),
    ("KNEE", "knee|acl|mcl|meniscus|patella"),
    ("ANKLE", "ankle"),
    ("FOOT", "foot|feet|toe|plantar"),
)

# Whole-word match with an optional trailing "s", so that plural forms such as
# "ribs", "toes" and "quadriceps" are recognised as well.
_REGION_RULES = tuple(
    (region, re.compile(rf"\b(?:{alternatives})s?\b"))
    for region, alternatives in _REGION_KEYWORDS
)


def classify_region(note: str, injury_type: str) -> str:
    """Map an injury description to a body region.

    The injury type and note are concatenated, lower-cased and tested against
    an ordered list of keyword rules; the first rule that matches decides the
    region.

    Args:
        note: Free-text injury note.
        injury_type: Injury type label; it is prepended to the note before matching.

    Returns:
        The upper-case region name (for example ``"KNEE"``), or ``"OTHER"``
        when no rule matches.
    """
    text = f"{injury_type} {note}".lower()

    for region, pattern in _REGION_RULES:
        if pattern.search(text):
            return region
    return "OTHER"

def _region_for_row(row):
    """Classify one injury row; a missing Notes or Injury_Type field is read as an empty string."""
    note = row.get("Notes", "")
    injury_type = row.get("Injury_Type", "")
    return classify_region(note, injury_type)


inj2018["region"] = inj2018.apply(_region_for_row, axis=1)

print("\nTop injury regions (2018):")
top_regions = inj2018["region"].value_counts().head(10)
print(top_regions)


In [None]:


def _show_bar_chart(counts, title, xlabel, ylabel):
    """Draw ``counts`` as a bar chart on a new figure, label it and display it."""
    plt.figure()
    counts.plot(kind="bar")
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.tight_layout()
    plt.show()


# Independent copy of the 2018 injuries that were labelled HIGH.
is_high = inj2018["severity"].eq("HIGH")
severe = inj2018.loc[is_high].copy()
print("\nNumber of HIGH-severity injuries (2018):", len(severe))

# HIGH-severity injuries per body region.
region_counts = severe["region"].value_counts().sort_values(ascending=False)
_show_bar_chart(
    region_counts,
    "High-Severity Injuries by Body Region (2018)",
    "Body Region",
    "Number of Injuries",
)

# Ten teams with the most HIGH-severity injuries (only when a Team column exists).
if "Team" in severe.columns:
    severe_team = severe["Team"].value_counts().head(10)
    _show_bar_chart(
        severe_team,
        "Teams with Most High-Severity Injuries (2018, Top 10)",
        "Team",
        "High-Severity Injuries",
    )

# Monthly counts; rows without a date are left out.
severe_dates = severe.dropna(subset=["Date"]).copy()
if not severe_dates.empty:
    month_of_injury = severe_dates["Date"].dt.to_period("M")
    severe_by_month = severe_dates.groupby(month_of_injury).size()
    # Convert the period index to timestamps before plotting.
    severe_by_month.index = severe_by_month.index.to_timestamp()

    plt.figure()
    severe_by_month.plot(marker="o")
    plt.title("High-Severity Injuries Over Time (2018)")
    plt.xlabel("Month")
    plt.ylabel("Number of High-Severity Injuries")
    plt.tight_layout()
    plt.show()


In [None]:


# Location of the play-by-play CSV; stop immediately when the file is absent.
stats_path = Path("nbastatsv3_2018.csv")
if not stats_path.exists():
    raise FileNotFoundError("nbastatsv3_2018.csv not found in current directory.")

print("\nLoading play-by-play data...")
stats = pd.read_csv(stats_path)

print("Play-by-play shape:", stats.shape)
preview_columns = stats.columns.tolist()[:15]
print("Sample columns:", preview_columns)

# The analysis below needs the "playerName" column.
player_col = None
if "playerName" in stats.columns:
    player_col = "playerName"
if player_col is None:
    raise KeyError("Expected column 'playerName' not found in play-by-play CSV.")

# Keep only the rows that name a player.
events = stats.dropna(subset=[player_col]).copy()

def classify_play(row):
    """Assign a coarse play type to one play-by-play row.

    The ``description``, ``actionType`` and ``subType`` fields (each read with
    ``row.get`` and defaulting to an empty string) are lower-cased and joined
    with spaces. The first rule, in the order listed below, whose keyword
    occurs in that text determines the label.

    Args:
        row: Mapping-like object providing ``get(key, default)``.

    Returns:
        One of ``"DUNK"``, ``"LAYUP"``, ``"THREE_POINTER"``, ``"JUMP_SHOT"``,
        ``"REBOUND"``, ``"BLOCK"``, ``"FOUL"`` or ``"OTHER"``.
    """
    fields = ("description", "actionType", "subType")
    text = " ".join(str(row.get(field, "")).lower() for field in fields)

    # (label, substrings) pairs in priority order.
    rules = (
        ("DUNK", ("dunk",)),
        ("LAYUP", ("layup",)),
        ("THREE_POINTER", ("3pt", "3-pt", "three point", "3-point")),
        ("JUMP_SHOT", ("jump shot", "jumper")),
        ("REBOUND", ("rebound",)),
        ("BLOCK", ("block",)),
        ("FOUL", ("foul",)),
    )
    for label, needles in rules:
        if any(needle in text for needle in needles):
            return label
    return "OTHER"

print("\nClassifying play types...")
play_types = events.apply(classify_play, axis=1)
events["play_type"] = play_types

# Share of each play type across all player events.
league_counts = events["play_type"].value_counts()
league_share = league_counts.div(league_counts.sum())

print("\nLeague-wide play type distribution (%):")
print(league_share.mul(100).round(2))

plt.figure()
league_share.sort_values(ascending=False).plot.bar()
plt.title("League-Wide Distribution of Play Types (2018)")
plt.ylabel("Share of Actions")
plt.tight_layout()
plt.show()


In [None]:

# Distinct, non-missing names of players with a HIGH-severity injury, as a plain list.
severe_players_full = severe["Player_Name"].dropna().drop_duplicates().tolist()

def last_name(full_name: str) -> str:
    """Return the last name from a whitespace-separated full name.

    Trailing generational suffixes ("Jr.", "Sr.", "II", "III", "IV") are
    skipped so that "Larry Nance Jr." yields "Nance" rather than "Jr.".
    A name consisting of a single token is returned unchanged.

    Args:
        full_name: Player name. Any value is accepted; it is converted with
            ``str()`` first.

    Returns:
        The last non-suffix token, or ``""`` when the name contains no tokens.
    """
    suffixes = {"jr", "sr", "ii", "iii", "iv"}
    parts = str(full_name).split()
    # Always keep at least one token, even if it looks like a suffix.
    while len(parts) > 1 and parts[-1].strip(".,").lower() in suffixes:
        parts.pop()
    return parts[-1] if parts else ""

severe_last_names = {last_name(p) for p in severe_players_full if p}

# Report distinct players (full names); several players can share a last name,
# so the size of the last-name set would under-count them.
print("\nNumber of players with at least one HIGH-severity injury:", len(severe_players_full))

# Match on last name ignoring case: the fallback cleaning path title-cases
# player names (e.g. "Mccollum"), which a case-sensitive comparison against
# the play-by-play names would miss.
severe_last_names_folded = {name.casefold() for name in severe_last_names if name}
player_names_folded = events[player_col].astype(str).str.casefold()

# NOTE(review): matching on last name only is approximate; unrelated players
# who share a last name are included in the group as well.
events_severe = events[player_names_folded.isin(severe_last_names_folded)].copy()
print("Events involving (approx) severe-injury players:", len(events_severe))

severe_counts = events_severe["play_type"].value_counts()
severe_share = severe_counts / severe_counts.sum()

print("\nPlay type distribution for severe-injury player group (%):")
print((severe_share * 100).round(2))

# Side-by-side shares; a play type absent from one group counts as 0.
play_compare = pd.concat(
    [
        league_share.rename("League"),
        severe_share.rename("Severe_Group")
    ],
    axis=1
).fillna(0)

print("\nCombined play type distribution (League vs Severe_Group, %):")
print((play_compare * 100).round(2))

# DataFrame.plot opens its own figure unless an Axes is passed, so create the
# figure and axes explicitly and draw on them; otherwise the (10, 5) figure
# stays empty and the chart is drawn at the default size.
fig, ax = plt.subplots(figsize=(10, 5))
play_compare.sort_values("League", ascending=False).plot(kind="bar", ax=ax)
ax.set_title("Play Type Distribution: League vs Players with High-Severity Injuries (2018)")
ax.set_ylabel("Share of Actions")
fig.tight_layout()
plt.show()
