In [None]:
import requests, os, json, time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import torch

# Data Acquisition

Parameterize: pro/pubs, tournament ID/bracket, hero

Output: [`match_ids`] in .json.

## Gathering Data: via Filter Criteria (Pubs) (TBC)

Scrape `match_id`s based on specific criteria (bracket, hero).

In [None]:
# placeholder to modify get_match_ids()

## Gathering Data: via Tournament ID (pro games)

ref: https://api.opendota.com/api/leagues


Example: TI13

In [None]:
def get_match_ids_from_league(league_id: int, limit: int | None = None) -> list[int]:
    """
    Fetch match_ids for a given league.

    Args:
        league_id (int): The league ID to query.
        limit (int | None): Optional maximum number of match_ids to return.

    Returns:
        list[int]: List of match IDs.
    """
    url = f"https://api.opendota.com/api/leagues/{league_id}/matches"
    resp = requests.get(url)
    resp.raise_for_status()
    matches = resp.json()

    match_ids = [m["match_id"] for m in matches if "match_id" in m]

    if limit is not None:
        return match_ids[:limit]
    return match_ids

In [None]:
# League to scrape. NOTE(review): presumably 16935 is TI13 (the output file
# written below is named ti13_match_ids.json) and the trailing comment records
# TI14's ID as an alternative — confirm against https://api.opendota.com/api/leagues.
league_id=16935 # TI14 16899

match_ids = get_match_ids_from_league(league_id)
len(match_ids)  # number of match IDs returned for the league

Write `match_ids` to file to limit API calls.

In [None]:
# Persist the IDs as a JSON list so they can be reused without calling the API again.
# NOTE(review): no cell visible in this notebook reads the file back yet.
with open('ti13_match_ids.json', 'w') as f:
    json.dump(match_ids, f)

After this point, selected `match_ids` are used as keys for further queries for any specific ML task.

# Data Extraction

1. Store match data in raw format locally.

2. Selectively extract match keys to be aggregated into .csv files. 

### Stage 1: Store match data

Given `match_id`, save entire .json file to disk.


Parameterize: .json file of `match_ids`

Output: a `data/raw_matches` folder of individual matches, each stored as `<match_id>.json`.


In [None]:
# Directory where raw per-match JSON files are cached (one `<match_id>.json` per match).
RAW_DIR = "data/raw_matches"

def ensure_dir(path):
    """Create directory `path`, including missing parents, if it does not exist yet."""
    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the directory between check and create.
    os.makedirs(path, exist_ok=True)

def fetch_and_save_match(match_id, overwrite=False, sleep_time=1):
    """
    Fetch a single match JSON from OpenDota and save it to disk.

    A match already cached in RAW_DIR is skipped unless overwrite=True.

    Parameters
    ----------
    match_id
        Match ID to fetch; also used as the cache file name (`<match_id>.json`).
    overwrite : bool
        Re-download even if the cache file already exists.
    sleep_time : float
        Seconds to pause after each network request, to throttle consecutive
        API calls. Cache hits do not sleep. Use 0 to disable.

    Returns
    -------
    str | None
        Path of the cached file, or None if the match could not be fetched.
    """
    ensure_dir(RAW_DIR)
    file_path = os.path.join(RAW_DIR, f"{match_id}.json")

    if os.path.exists(file_path) and not overwrite:
        print(f"✅ Skipping {match_id}, already cached.")
        return file_path

    url = f"https://api.opendota.com/api/matches/{match_id}"
    try:
        # requests has no default timeout; bound the wait so one stalled
        # connection cannot hang the whole scrape.
        resp = requests.get(url, timeout=30)
        if resp.status_code != 200:
            print(f"⚠️ Failed to fetch {match_id}: {resp.status_code}")
            return None
        # Decode BEFORE opening the file: a body that is not valid JSON must
        # not leave an empty file behind that later runs would treat as cached.
        payload = resp.json()
    except (requests.RequestException, ValueError) as exc:
        print(f"⚠️ Failed to fetch {match_id}: {exc}")
        return None
    finally:
        # Throttle after every network attempt, successful or not.
        if sleep_time and sleep_time > 0:
            time.sleep(sleep_time)

    with open(file_path, "w") as f:
        json.dump(payload, f, indent=2)
    print(f"💾 Saved match {match_id} → {file_path}")
    return file_path
    
    

def scrape_from_match_list(match_ids, overwrite=False):
    """
    Fetch and cache every match in `match_ids`, in order.

    Returns the file paths of the matches that were saved or already cached;
    matches that could not be fetched are left out.
    """
    # Lazy generator: each match is fetched only as the list below consumes it.
    attempted = (
        fetch_and_save_match(match_id, overwrite=overwrite) for match_id in match_ids
    )
    return [saved_path for saved_path in attempted if saved_path]

In [None]:
# Download (or reuse the cached copy of) every match, then preview two of the saved paths.
files = scrape_from_match_list(match_ids)
files[:2]

### Stage 2: Extract match keys and aggregate into .csv file

In [None]:
def extract_json_to_csv(input_dir, output_dir, keys):
    """
    Extract specific keys from JSON match files and save as separate CSVs.

    Each CSV will be titled `<key>.csv` (e.g. picks_bans.csv). Every row gets a
    `match_id` column taken from the match it came from. Files are processed in
    sorted filename order so the row order of the output is reproducible.

    Parameters
    ----------
    input_dir : str
        Folder with raw JSON files (one per match).
    output_dir : str
        Folder to save CSV outputs. Created if missing.
    keys : list[str]
        Keys inside match JSONs to extract (e.g. ["picks_bans", "draft_timings"]).
        Only keys whose value is a list contribute rows.
    """
    os.makedirs(output_dir, exist_ok=True)

    rows_by_key = {key: [] for key in keys}
    if not rows_by_key:
        return

    # Read each match file once and fan its contents out to every requested
    # key, instead of re-reading the whole directory once per key.
    for fname in sorted(os.listdir(input_dir)):
        if not fname.endswith(".json"):
            continue
        try:
            with open(os.path.join(input_dir, fname)) as f:
                data = json.load(f)
        except json.JSONDecodeError as exc:
            # A truncated or empty cache file should not abort the whole export.
            print(f"⚠️ Skipping unreadable file {fname}: {exc}")
            continue
        if not isinstance(data, dict):
            continue

        match_id = data.get("match_id")
        for key, rows in rows_by_key.items():
            entries = data.get(key)
            if not isinstance(entries, list):
                continue
            # Copy each row while tagging it, leaving the parsed JSON untouched.
            rows.extend(
                {**entry, "match_id": match_id}
                for entry in entries
                if isinstance(entry, dict)
            )

    for key, rows in rows_by_key.items():
        if rows:
            df = pd.DataFrame(rows)
            out_file = os.path.join(output_dir, f"{key}.csv")
            df.to_csv(out_file, index=False)
            print(f"💾 Saved {len(df)} rows → {out_file}")
        else:
            print(f"⚠️ No data found for key `{key}` in provided matches.")

In [None]:
# Flatten the cached match JSONs into one CSV per key under data/
# (a CSV is only written for a key when rows are found).
extract_json_to_csv(input_dir='data/raw_matches/', 
                    output_dir='data/', 
                    keys=['picks_bans', 'draft_timings'])

# Preprocessing

Turn raw data into recognizable inputs for machine learning models.

In [None]:
pb_df = pd.read_csv('data/picks_bans.csv')
dt_df = pd.read_csv('data/draft_timings.csv')

# DataFrame.info() prints its summary and returns None, so call it as a
# statement per frame instead of ending the cell with a `(None, None)` tuple.
pb_df.info()
dt_df.info()

For simplicity, we will use only `picks_bans`.

### `merge_csvs()` 

In [None]:
# for future

def merge_csvs(csv_paths, merge_keys=["match_id", "order"]):
    """
    Inner-join several CSV files on shared key columns.

    The first path is loaded as the base frame and every following file is
    merged into it on `merge_keys` (default: match_id + order), so only rows
    present in all files survive.

    Raises ValueError when `csv_paths` is empty.
    """
    if not csv_paths:
        raise ValueError("No CSV paths provided")

    first_path, *other_paths = csv_paths
    combined = pd.read_csv(first_path)

    # Fold the remaining files into the running result one at a time.
    for extra_path in other_paths:
        combined = combined.merge(pd.read_csv(extra_path), on=merge_keys, how="inner")

    return combined

# Example usage
# csvs = ["data/picks_bans.csv", "data/draft_timings.csv"]
# df_merged = merge_csvs(csvs)
# display(df_merged.head())
# print(f"Final shape: {df_merged.shape}")

## Cleaning, assigning target variable, splitting

In [None]:
# Order rows by match and draft position, then coerce the flag columns to int.
pb_df = pb_df.sort_values(by=["match_id", "order"]).astype(
    {"is_pick": int, "team": int}
)

# X: context features describing the draft slot
X = pb_df[["team", "is_pick", "order"]]

# y: hero_id mapped to consecutive integer class labels
encoder = LabelEncoder()
y = encoder.fit_transform(pb_df["hero_id"])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train.head(), X_test.head()

# ML

In [None]:
def evaluate_model(model, X_test, y_test):
    """
    Score an already-fitted model on a held-out set.

    Predicts on `X_test` and returns a dict with `accuracy` plus the
    weighted-average `precision`, `recall` and `f1` against `y_test`.
    """
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    # zero_division=0 is forwarded unchanged to precision_recall_fscore_support.
    weighted_precision, weighted_recall, weighted_f1, _support = (
        precision_recall_fscore_support(
            y_test, predictions, average="weighted", zero_division=0
        )
    )

    return dict(
        accuracy=accuracy,
        precision=weighted_precision,
        recall=weighted_recall,
        f1=weighted_f1,
    )

In [None]:
def benchmark_models(models_dict, X_train, y_train_enc, X_test, y_test_enc):
    """
    Train and benchmark multiple models.

    Parameters
    ----------
    models_dict : dict
        Maps a display name to an unfitted estimator exposing fit/predict.
    X_train, y_train_enc
        Training features and encoded training labels.
    X_test, y_test_enc
        Test features and encoded test labels.

    Returns
    -------
    pd.DataFrame
        One row per model, indexed by model name, with the metrics returned by
        evaluate_model as columns. Empty (but still indexed by "model") when
        `models_dict` is empty.
    """
    results = []

    for name, model in models_dict.items():
        print(f"⚡ Training {name}...")
        # Use the labels passed in; the previous body read the notebook-level
        # y_train / y_test globals and silently ignored these parameters.
        model.fit(X_train, y_train_enc)
        metrics = evaluate_model(model, X_test, y_test_enc)
        metrics["model"] = name
        results.append(metrics)

    if not results:
        # Without any rows there is no "model" column for set_index to find.
        return pd.DataFrame(columns=["model"]).set_index("model")

    return pd.DataFrame(results).set_index("model")

In [None]:
# Candidate classifiers for the baseline benchmark, keyed by display name.
# Estimators with internal randomness get a fixed random_state (matching the
# seed used for train_test_split) so a re-run reproduces the same scores.
models_dict = {
    # `multi_class` is deprecated in recent scikit-learn releases; the default
    # solver already fits a multinomial model for multiclass targets.
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "SVM": SVC(),
    "Naive Bayes": MultinomialNB(),
    "XGBoost": XGBClassifier(eval_metric="mlogloss"),
    "LightGBM": LGBMClassifier()
}

In [None]:
# Fit every model in models_dict on the training split and score it on the test split.
results_df = benchmark_models(models_dict, X_train, y_train, X_test, y_test)
results_df

In [None]:
# Show the benchmark table again (same frame as the previous cell's output).
results_df

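As expected, the models could not detect the underlying domain structure that dictates the choice of picks and bans: the only inputs are `team`, `is_pick`, and `order`, which say nothing about the heroes already drafted.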

With baseline established, potential experiments include:

**Baseline 2.0**

•	Rerun but incorporate past TIs, Majors etc

**Role prediction**

•	Feature engineering: normalize stats, add per-role averages

•	Train models (role classification)

**Hero prediction conditional on role**

•	Use predicted role as an input

•	Reduce class imbalance

**Sequential models**

•	Reframe input as sequences (e.g., draft order, time-series stats)

•	Try RNN/LSTM/Transformer baselines