# Dota 2 Winner Prediction

This notebook predicts the winner of a Dota 2 match (Radiant vs. Dire) based on game data.
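This baseline uses the pre-computed per-match feature tables plus team-level aggregates derived from them; a helper for reading the raw JSON match logs is defined, but the logs are not parsed here.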

In [None]:
# Libraries
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import datetime
import lightgbm as lgb
from scipy import stats
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, cross_val_score, GridSearchCV, RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler, RobustScaler
import os
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import xgboost as xgb
import lightgbm as lgb
from sklearn import model_selection
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn import metrics
import json
import ast
import time
from sklearn import linear_model
import eli5
from eli5.sklearn import PermutationImportance
import shap
from tqdm import tqdm_notebook
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_selection import GenericUnivariateSelect, SelectPercentile, SelectKBest, f_classif, mutual_info_classif, RFE
import statsmodels.api as sm
import warnings
warnings.filterwarnings('ignore')
from catboost import CatBoostClassifier

from IPython.display import HTML

from plotly import tools
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
warnings.filterwarnings("ignore")
init_notebook_mode(connected=True)

## 1. Data Loading

In [None]:
# Directory that holds the competition's input files
PATH = "/kaggle/input/mlcourse-dota2-win-prediction/"

print("Loading data...")
# The three tables share the same key column, so read them in one pass
# and unpack in the order: train features, test features, train targets.
train, test, targets = (
    pd.read_csv(os.path.join(PATH, csv_name), index_col='match_id_hash')
    for csv_name in ('train_features.csv', 'test_features.csv', 'train_targets.csv')
)

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"Targets shape: {targets.shape}")

## 2. Feature Engineering
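We define a helper for reading the raw JSON match logs, but this baseline builds its extra features from the pre-computed CSV columns: per-team sums of kills, deaths, assists, gold, and XP, plus Radiant-minus-Dire differences.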

In [None]:
# Function to read JSONL files
def read_json_lines(file_path):
    """Lazily yield one parsed JSON value per non-blank line of a JSONL file.

    Parameters
    ----------
    file_path : str or path-like
        Path to a text file holding one JSON document per line.

    Yields
    ------
    object
        The result of ``json.loads`` for each non-blank line, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # record; skip them instead of letting json.loads raise.
            if not line.strip():
                continue
            yield json.loads(line)

# Example: Extracting hero ids and basic stats from JSON (Simplified for baseline)
# In a real scenario, we would iterate through 'train_matches.jsonl' and 'test_matches.jsonl'
# and create a DataFrame of new features.

# For this baseline, we will focus on the pre-computed 'train_features.csv'
# but we'll add some aggregated features.

# Stack train on top of test so every engineered column is computed the same
# way for both; the frame is rebuilt on each run, so the cell is re-runnable.
full_df = pd.concat([train, test], sort=False)

print("Feature engineering...")
# 1. Hero Combinations (One-Hot Encoding or Embeddings)
# The dataset has columns like 'r1_hero_id', 'd1_hero_id', etc.
# NOTE(review): hero_cols is only collected here; nothing in this cell encodes it yet.
hero_cols = [f'{t}{i}_hero_id' for t in ['r', 'd'] for i in range(1, 6)]

# 2. Aggregated Stats (Gold, XP, Kills)
# For each team, sum every per-player stat across its five player slots.
# Column creation order (kills, deaths, assists, gold, xp per team) is kept
# stable because it determines the feature order seen by the model.
for team in ['r', 'd']:
    for stat in ['kills', 'deaths', 'assists', 'gold', 'xp']:
        stat_cols = [f'{team}{i}_{stat}' for i in range(1, 6)]
        full_df[f'{team}_{stat}_sum'] = full_df[stat_cols].sum(axis=1)

# 3. Differences (Radiant - Dire)
for stat in ['kills', 'gold', 'xp']:
    full_df[f'{stat}_diff'] = full_df[f'r_{stat}_sum'] - full_df[f'd_{stat}_sum']

# Split back into train and test
# Split by position rather than by label: pd.concat preserves row order, and a
# label lookup would return extra rows if any match id appeared more than once.
n_train = len(train)
X = full_df.iloc[:n_train].copy()
X_test = full_df.iloc[n_train:].copy()

# Look labels up by match id so y is row-aligned with X even if the targets
# file is ordered differently from the features file (a missing id raises KeyError).
y = targets.loc[X.index, 'radiant_win'].astype(int)

# Guard the invariants the later cells rely on.
assert len(X) == len(train) and len(X_test) == len(test), "train/test split changed row counts"
assert y.index.equals(X.index), "labels are not aligned with the training rows"

print("Data prepared.")

## 3. Model Training (CatBoost)
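CatBoost handles categorical features well and is fairly robust to overfitting. Here we hold out 20% of the training matches (stratified by outcome) for validation and early stopping; note that no columns are declared as categorical in this baseline, so all features are treated as numeric.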

In [None]:
# Local import: only this cell needs the GPU probe.
from catboost.utils import get_gpu_device_count

# Hold out 20% of the training matches, stratified on the label, for
# validation and early stopping.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train on GPU only when one is actually visible; a hard-coded "GPU" makes the
# cell fail on CPU-only sessions.
task_type = "GPU" if get_gpu_device_count() > 0 else "CPU"

model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    eval_metric='AUC',
    random_seed=42,
    verbose=100,
    task_type=task_type
)

# Stop once the validation metric has not improved for 50 rounds.
model.fit(
    X_train, y_train,
    eval_set=(X_val, y_val),
    early_stopping_rounds=50
)

## 4. Evaluation & Interpretation

In [None]:
# Feature Importance
# Pair every training column with the fitted model's importance score and
# keep the 20 highest-scoring rows.
feature_imp = (
    pd.DataFrame({'feature': X.columns, 'importance': model.feature_importances_})
    .sort_values('importance', ascending=False)
    .head(20)
)

# Horizontal bar chart, one bar per retained feature.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='importance', y='feature', data=feature_imp, ax=ax)
ax.set_title('Top 20 Feature Importance')
plt.show()

In [None]:
# SHAP Values
# Build a tree explainer around the fitted model and compute SHAP values for
# the held-out validation rows.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_val)

# Summary plot of the SHAP values against the validation features.
shap.summary_plot(shap_values, X_val)

## 5. Submission

In [None]:
# Second column of predict_proba, i.e. the probability for label 1 (radiant_win)
preds = model.predict_proba(X_test)[:, 1]

# One row per test match: the match id becomes a regular column next to
# its predicted probability.
submission = (
    pd.Series(preds, index=X_test.index, name='radiant_win_prob')
    .rename_axis('match_id_hash')
    .reset_index()
)
submission.to_csv('submission.csv', index=False)

print("Submission saved to submission.csv")
print(submission.head())