In [None]:
import requests, os, json, time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import torch

# Data Acquisition

## Gathering Data: via filter criteria (pubs) (TBC)

At this point, selected `match_ids` are ready to be queried further for any specific ML task.

## Gathering Data: via tournament ID (pro games)

- Alternative: scrape `match_id`s that meet specific criteria (bracket, hero).

Reference: https://api.opendota.com/api/leagues

In [None]:
def get_match_ids_from_league(league_id=16935, limit=None): # TI14 16899
    """
    Return a list of match_ids for a given league.

    Parameters
    ----------
    league_id : int
        League id interpolated into the OpenDota `/leagues/{id}/matches` URL.
    limit : int or None
        Maximum number of match ids to return; None returns all of them.

    Returns
    -------
    list
        The `match_id` value of every entry that has one, in response order,
        truncated to `limit` when it is given.

    Raises
    ------
    requests.HTTPError
        Raised by `raise_for_status()` when the API reports an error status.
    """
    url = f"https://api.opendota.com/api/leagues/{league_id}/matches"
    # A timeout keeps the cell from hanging forever on a stalled connection.
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    matches = resp.json()

    # Filter first, truncate second: slicing the raw list before filtering
    # would return fewer than `limit` ids whenever an entry lacks "match_id".
    match_ids = [m["match_id"] for m in matches if "match_id" in m]

    return match_ids if limit is None else match_ids[:limit]

In [None]:
# Reuse the cached id list when it exists. On a fresh checkout the file has not
# been written yet, so skip the load instead of raising FileNotFoundError; the
# next cell fetches the ids from the API.
if os.path.exists('ti13_match_ids.json'):
    with open('ti13_match_ids.json') as f:
        match_ids = json.load(f)

In [None]:
# No limit (the default): pull every match id of the default league.
match_ids = get_match_ids_from_league()
len(match_ids)

# Data Extraction

In [None]:
# Persist the id list so later sessions can reload it without calling the API.
with open('ti13_match_ids.json', 'w') as cache_file:
    json.dump(match_ids, fp=cache_file)

In [None]:
# Directory where fetch_and_save_match stores one `<match_id>.json` file per match.
RAW_DIR = "data/raw_matches"

def ensure_dir(path):
    """
    Create directory `path` (and any missing parents) if it is not there yet.

    `exist_ok=True` makes the call idempotent and removes the window between
    an `os.path.exists` check and the `os.makedirs` call in which another
    process could create the directory and make `makedirs` raise.
    """
    os.makedirs(path, exist_ok=True)

def fetch_and_save_match(match_id, overwrite=False, sleep_time=1):
    """
    Fetch a single match JSON from OpenDota and save to disk.
    Skip if already exists unless overwrite=True.

    Parameters
    ----------
    match_id : int or str
        Match id, used both in the request URL and as the file name.
    overwrite : bool
        When False, an existing `<RAW_DIR>/<match_id>.json` is returned as-is
        without any network request.
    sleep_time : float
        Seconds to pause after each network attempt, to throttle consecutive
        requests. Falsy or non-positive values disable the pause. Cache hits
        never sleep.

    Returns
    -------
    str or None
        Path of the saved (or already cached) file, or None when the request
        failed, returned a non-200 status, or the body was not valid JSON.
    """
    ensure_dir(RAW_DIR)
    file_path = os.path.join(RAW_DIR, f"{match_id}.json")

    if os.path.exists(file_path) and not overwrite:
        print(f"✅ Skipping {match_id}, already cached.")
        return file_path

    url = f"https://api.opendota.com/api/matches/{match_id}"
    try:
        # The timeout prevents one stalled connection from blocking the scrape.
        resp = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # Report and carry on, like the non-200 branch below, so a single
        # network hiccup does not abort a long scraping loop.
        print(f"⚠️ Failed to fetch {match_id}: {exc}")
        return None
    finally:
        # Throttle after every network attempt, successful or not.
        if sleep_time and sleep_time > 0:
            time.sleep(sleep_time)

    if resp.status_code != 200:
        print(f"⚠️ Failed to fetch {match_id}: {resp.status_code}")
        return None

    # Decode before opening the file: a bad body must not leave an empty or
    # partial file behind that a later run would mistake for a cached match.
    try:
        payload = resp.json()
    except ValueError as exc:
        print(f"⚠️ Failed to fetch {match_id}: invalid JSON ({exc})")
        return None

    with open(file_path, "w") as f:
        json.dump(payload, f, indent=2)
    print(f"💾 Saved match {match_id} → {file_path}")
    return file_path

def scrape_from_match_list(match_ids, overwrite=False):
    """
    Download every match in `match_ids` via `fetch_and_save_match`.

    Returns the list of file paths for the matches that produced one, in the
    same order as `match_ids`; matches whose fetch returned a falsy value are
    left out.
    """
    # Lazy generator: each fetch runs, in order, as the list below consumes it.
    outcomes = (
        fetch_and_save_match(match_id, overwrite=overwrite)
        for match_id in match_ids
    )
    return [file_path for file_path in outcomes if file_path]

In [None]:
# Download the matches; files already present on disk are kept (no overwrite).
files = scrape_from_match_list(match_ids, overwrite=False)

In [None]:
def extract_json_to_csv(input_dir, output_dir, keys):
    """
    Extract specific keys from local JSON match files and save as separate CSVs.

    Each CSV will be titled `<key>.csv` (e.g. picks_bans.csv). Every extracted
    row gets a `match_id` column taken from the top-level `match_id` of the
    file it came from.

    Parameters
    ----------
    input_dir : str
        Folder with raw JSON files (one per match).
    output_dir : str
        Folder to save CSV outputs. Created if missing.
    keys : list[str]
        Keys inside match JSONs to extract (e.g. ["picks_bans", "draft_timings"]).
        Only values that are lists of dicts are extracted; anything else is
        skipped.

    Returns
    -------
    None
        Results are written to disk; a message is printed per key.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Sorted so CSV row order does not depend on the filesystem's listing order.
    json_names = sorted(
        name for name in os.listdir(input_dir) if name.endswith(".json")
    )

    # Single pass over the files: each one is parsed once, not once per key.
    rows_by_key = {key: [] for key in keys}
    for fname in json_names:
        with open(os.path.join(input_dir, fname), encoding="utf-8") as f:
            data = json.load(f)
        if not isinstance(data, dict):
            continue

        match_id = data.get("match_id")
        for key, rows in rows_by_key.items():
            values = data.get(key)
            if not isinstance(values, list):
                continue
            # Build a new dict per row instead of mutating the parsed data;
            # entries that are not dicts cannot carry a match_id column.
            rows.extend(
                {**row, "match_id": match_id}
                for row in values
                if isinstance(row, dict)
            )

    for key in keys:
        all_rows = rows_by_key[key]
        if all_rows:
            df = pd.DataFrame(all_rows)
            out_file = os.path.join(output_dir, f"{key}.csv")
            df.to_csv(out_file, index=False)
            print(f"💾 Saved {len(df)} rows → {out_file}")
        else:
            print(f"⚠️ No data found for key `{key}` in provided matches.")

In [None]:
# Read from RAW_DIR, the folder fetch_and_save_match writes the match JSONs to.
extract_json_to_csv(input_dir=RAW_DIR, output_dir='data/', keys=['picks_bans', 'draft_timings'])

# Preprocessing

### Preprocess

In [None]:
pb_df = pd.read_csv('data/picks_bans.csv')
dt_df = pd.read_csv('data/draft_timings.csv')
# DataFrame.info() prints its summary and returns None, so call it as a
# statement per frame rather than building a tuple that displays (None, None).
pb_df.info()
dt_df.info()

For simplicity, we will use only `picks_bans`.

In [None]:
# for future

def merge_csvs(csv_paths, merge_keys=["match_id", "order"]):
    """
    Inner-join several CSV files on shared key columns.

    The first file is the starting table; every following file is read and
    joined onto the running result with `how="inner"`, so only key
    combinations present in all files survive.

    Parameters
    ----------
    csv_paths : sequence of str
        Paths of the CSV files to combine, in join order.
    merge_keys : list[str]
        Column names to join on (default: match_id + order).

    Returns
    -------
    pandas.DataFrame
        The joined table.

    Raises
    ------
    ValueError
        If `csv_paths` is empty.
    """
    if not csv_paths:
        raise ValueError("No CSV paths provided")

    remaining = iter(csv_paths)
    combined = pd.read_csv(next(remaining))

    # Read-then-join one file at a time, in the order given.
    for csv_path in remaining:
        combined = pd.merge(
            combined, pd.read_csv(csv_path), how="inner", on=merge_keys
        )

    return combined

# Example usage
# csvs = ["data/picks_bans.csv", "data/draft_timings.csv"]
# df_merged = merge_csvs(csvs)
# display(df_merged.head())
# print(f"Final shape: {df_merged.shape}")

In [None]:
# Read the picks/bans table, order it by match and draft position, and cast
# the two flag columns to plain integers, all in one chain.
pb_df = (
    pd.read_csv("data/picks_bans.csv")
    .sort_values(by=["match_id", "order"])
    .astype({"is_pick": int, "team": int})
)

# Features are the draft context columns; the target is the hero chosen.
X = pb_df[["team", "is_pick", "order"]]
y = pb_df["hero_id"]



In [None]:
encoder = LabelEncoder()
# Fit on the source column rather than on `y` itself: re-running this cell
# would otherwise fit the encoder on already-encoded labels, and
# `encoder.classes_` would no longer map back to hero ids.
y = encoder.fit_transform(pb_df["hero_id"])

# NOTE(review): stratified splitting needs at least two rows per class, so a
# hero that appears only once in the data will make this raise. Confirm
# against the dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# display() renders each frame as a table; a bare tuple falls back to text.
display(X_train.head(), X_test.head())

# Classical ML

In [None]:
def evaluate_model(model, X_test, y_test):
    """
    Score an already-fitted model on held-out data.

    Calls `model.predict(X_test)` and compares the predictions with `y_test`.
    No fitting happens here.

    Returns
    -------
    dict
        Keys "accuracy", "precision", "recall" and "f1"; the last three use
        weighted averaging with `zero_division=0`.
    """
    predictions = model.predict(X_test)

    scores = {"accuracy": accuracy_score(y_test, predictions)}

    weighted = precision_recall_fscore_support(
        y_test, predictions, zero_division=0, average="weighted"
    )
    # The helper returns four values; zip keeps the first three and drops the last.
    scores.update(zip(("precision", "recall", "f1"), weighted))

    return scores

In [None]:
def benchmark_models(models_dict, X_train, y_train_enc, X_test, y_test_enc):
    """
    Train and benchmark multiple models.

    Parameters
    ----------
    models_dict : dict
        Maps a display name to an estimator exposing `fit(X, y)`.
    X_train, y_train_enc
        Training features and labels passed to each estimator's `fit`.
    X_test, y_test_enc
        Held-out features and labels passed to `evaluate_model`.

    Returns
    -------
    pandas.DataFrame
        One row per model, indexed by model name, with the metric columns
        returned by `evaluate_model`.
    """
    results = []

    for name, model in models_dict.items():
        print(f"⚡ Training {name}...")
        # Use the labels passed in by the caller, not the notebook-level
        # `y_train` / `y_test` globals, so the function works on any split.
        model.fit(X_train, y_train_enc)
        metrics = evaluate_model(model, X_test, y_test_enc)
        metrics["model"] = name
        results.append(metrics)

    return pd.DataFrame(results).set_index("model")

In [None]:
# Candidate classifiers for the benchmark. Estimators with random components
# are seeded (same seed as the train/test split) so a re-run gives the same
# scores.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases;
# confirm the installed version before upgrading.
models_dict = {
    "Logistic Regression": LogisticRegression(max_iter=500, multi_class="multinomial"),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "SVM": SVC(),
    "Naive Bayes": MultinomialNB(),
    "XGBoost": XGBClassifier(eval_metric="mlogloss", random_state=42),
    "LightGBM": LGBMClassifier(random_state=42)
}

In [None]:
results_df = benchmark_models(models_dict, X_train, y_train, X_test, y_test)
# Leave the sorted frame as the cell's last expression so the notebook renders
# it as a table instead of printing its plain-text repr.
results_df.sort_values("accuracy", ascending=False)

# Sequence Models

# Evaluation

## Sequence Modelling

### predicting other stuff