# Your Title Here

**Name(s)**: Vincent Cho

**Website Link**: (your website link)

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from scipy.stats import ks_2samp

import plotly.express as px
pd.options.plotting.backend = 'plotly'

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import GridSearchCV

# from dsc80_utils import * # Feel free to uncomment and use this.

## Step 1: Introduction

In [2]:
data = pd.read_csv('2024_LoL_esports_match_data_from_OraclesElixir.csv')

question = 'Does an early CS lead matter more than an early kill lead?'

  data = pd.read_csv('2024_LoL_esports_match_data_from_OraclesElixir.csv')


## Step 2: Data Cleaning and Exploratory Data Analysis

In [None]:
cleaned = data.copy()
# # made into datetime
# cleaned['date'] = pd.to_datetime(cleaned['date'])

# removed all partial columns
cleaned =  cleaned[cleaned['datacompleteness'] != 'partial'] 

# drop all rows that are not participantid that is not 100, since those are the ones that refer to the overall team vs team stats and not individual players
cleaned = cleaned[(cleaned['participantid'] == 100)]

# # drop all rows in which one team had both an advantage in CS and an advantage in kill lead at 10 minutes
# cleaned = cleaned[((cleaned['csdiffat10'] > 0) & (cleaned['killsat10'] < cleaned['opp_killsat10'])) | ((cleaned['csdiffat10'] < 0) & (cleaned['killsat10'] > cleaned['opp_killsat10']))]

# dropped all columns that are not useful to analysis
cleaned = cleaned[['gameid', 'league', 'year', 'split', 'playoffs', 'date', 'game', 'patch', 'teamname', 'teamid', 'result', 'golddiffat10', 'xpdiffat10', 'csdiffat10', 'killsat10', 'opp_killsat10']]

# drop all rows in which team is NaN
cleaned = cleaned[cleaned['teamid'].notna()]

cleaned = cleaned.assign(killdiffat10 = cleaned['killsat10'] - cleaned['opp_killsat10'])

cleaned['results_str'] = cleaned['result'].astype(str).replace(['1', '0'], ['Game Won', 'Game Lost'])

In [4]:
fig = px.histogram(cleaned, x="csdiffat10", nbins=30, title="Distribution of CS Lead at 10 Minutes")

fig.show()

fig.write_html('fig.html', include_plotlyjs='cdn')

fig0 = px.scatter(cleaned, 
                 x='csdiffat10', 
                 y='killdiffat10', 
                 color='results_str', 
                 color_discrete_map={'Game Won': 'blue', 'Game Lost': 'red'}, 
                 labels={'csdiffat10': 'CS Difference at 10 Minutes', 
                         'killdiffat10': 'Kill Difference at 10 Minutes', 
                         'results_str': 'Game Outcome'}, 
                 title="CS vs Kill Difference at 10 Minutes (Win vs Loss)",
                 trendline="ols")
fig0.show()

fig0.write_html('fig0.html', include_plotlyjs='cdn')

fig1 = px.box(cleaned, 
             x="results_str", 
             y="csdiffat10", 
             color="results_str", 
             color_discrete_map={"Game Won": "blue", "Game Lost": "red"},
             labels={"result_str": "Game Outcome", "csdiffat10": "CS Difference at 10 Min"},
             title="CS Difference at 10 Minutes vs. Game Outcome")
fig1.show()

fig1.write_html('fig1.html', include_plotlyjs='cdn')

fig2 = px.box(cleaned, 
             x="results_str", 
             y="killdiffat10", 
             color="results_str", 
             color_discrete_map={"Game Won": "blue", "Game Lost": "red"},
             labels={"result_str": "Game Outcome", "killdiffat10": "Kill Difference at 10 Min"},
             title="Kill Difference at 10 Minutes vs. Game Outcome")

fig2.write_html('fig2.html', include_plotlyjs='cdn')

fig2.show()

In [5]:
corr = cleaned[['csdiffat10', 'killdiffat10', 'result']].corr()

print(corr.to_markdown())

|              |   csdiffat10 |   killdiffat10 |   result |
|:-------------|-------------:|---------------:|---------:|
| csdiffat10   |     1        |       0.206053 | 0.32504  |
| killdiffat10 |     0.206053 |       1        | 0.313346 |
| result       |     0.32504  |       0.313346 | 1        |


Null Hypothesis: CS difference and Kill difference at 10 minutes have the same effect on winning. 

Alternative Hypothesis: CS difference or Kill difference at 10 minutes has a significantly stronger effect on winning. 

Test Statistic: K-S Test Statistic

## Step 3: Assessment of Missingness

In [6]:
missing_col = 'split'

def permutation(df, column, num=1000):
    missing = df[column].isna()
    results = {}

    for col in df.columns:
        if col != column and df[col].dtype in [np.float64, np.int64]:
            observed_diff = df.loc[missing, col].mean() - df.loc[~missing, col].mean()
            perm_diffs = []

            for x in range(num):
                permuted = np.random.permutation(df[col])
                perm_diffs.append(permuted[missing].mean() - permuted[~missing].mean())

            p_value = np.mean(np.abs(perm_diffs) >= np.abs(observed_diff))
            results[col] = p_value

    return results

missingness_results = permutation(cleaned, missing_col)

missingness = pd.DataFrame(list(missingness_results.items()), columns=['Column', 'P-Value'])
missingness

Unnamed: 0,Column,P-Value
0,year,0.0
1,playoffs,0.0
2,game,0.0
3,patch,0.0
4,result,0.137
5,golddiffat10,0.0
6,xpdiffat10,0.0
7,csdiffat10,0.0
8,killsat10,0.0
9,opp_killsat10,0.0


An example of a column that is statistically relevant to the missingness of split is year. 

A column that is not relevant to the missingness of split is result. 

## Step 4: Hypothesis Testing

In [None]:
# Null Hypothesis: CS difference and Kill difference at 10 minutes have the same effect on winning. 

# Alternative Hypothesis: CS difference or Kill difference at 10 minutes has a significantly stronger effect on winning. 

# Test Statistic: Mean Difference 

new_cleaned = cleaned.dropna(subset=['csdiffat10', 'killdiffat10', 'golddiffat10', 'xpdiffat10'])

cs_diff = new_cleaned['csdiffat10'].values
kill_diff = new_cleaned['killdiffat10'].values
win = new_cleaned['result'].values

cs_win_mean = np.mean(cs_diff[win == 1])
cs_lose_mean = np.mean(cs_diff[win == 0])
kill_win_mean = np.mean(kill_diff[win == 1])
kill_lose_mean = np.mean(kill_diff[win == 0])

obs_cs_diff = cs_win_mean - cs_lose_mean
obs_kill_diff = kill_win_mean - kill_lose_mean

n = 10000
perm_diffs = np.zeros(n)

for i in range(n):
    perm_labels = np.random.permutation(win)
    perm_cs_win_mean = np.mean(cs_diff[perm_labels == 1])
    perm_cs_lose_mean = np.mean(cs_diff[perm_labels == 0])
    perm_kill_win_mean = np.mean(kill_diff[perm_labels == 1])
    perm_kill_lose_mean = np.mean(kill_diff[perm_labels == 0])
    
    perm_diffs[i] = abs((perm_cs_win_mean - perm_cs_lose_mean) - (perm_kill_win_mean - perm_kill_lose_mean))

obs_diff = abs(obs_cs_diff - obs_kill_diff)
p_value = np.mean(perm_diffs >= obs_diff)

obs_diff, p_value

perm_df = pd.DataFrame({'Permutation Differences': perm_diffs})

fig = px.histogram(
    perm_df,
    x="Permutation Differences",
    nbins=50,
    histnorm='probability density',
    title="Permutation Test Distribution",
    labels={"Permutation Differences": "Difference in Effect (CS vs. Kill Difference)"},
    opacity=0.7
)

fig.add_vline(x=obs_diff, line_dash="dash", line_color="red", annotation_text="Observed Difference")

fig.show()


## Step 5: Framing a Prediction Problem

In [8]:
prediction_problem = 'Is it possible to predict a win or a loss based on data within the first 10 minutes?'

## Step 6: Baseline Model

In [None]:
X = new_cleaned[['csdiffat10', 'killdiffat10']]  
y = new_cleaned['result']  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1, stratify=y)


baseline_pipeline = Pipeline([
    ('scaler', StandardScaler()),  
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=1)) 
])

baseline_pipeline.fit(X_train, y_train)

y_pred = baseline_pipeline.predict(X_test)
y_pred_proba = baseline_pipeline.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")

Accuracy: 0.6290


## Step 7: Final Model

In [None]:
X = new_cleaned[['csdiffat10', 'killdiffat10', 'xpdiffat10', 'golddiffat10']]  
y = new_cleaned['result']  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1, stratify=y)

new_features = new_cleaned[['golddiffat10', 'xpdiffat10']]

X_train1 = X_train.reset_index().merge(new_features.reset_index(), on='index', how='left').set_index('index')
X_test1 = X_test.reset_index().merge(new_features.reset_index(), on='index', how='left').set_index('index')

final_pipeline = Pipeline([
    ('scaler', StandardScaler()),  
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=1)) 
])

hyperparameters = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [10, 20, 30, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
}

grid_search = GridSearchCV(final_pipeline, hyperparameters, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")

Accuracy: 0.6789


## Step 8: Fairness Analysis

In [26]:
from sklearn.metrics import precision_score

X_test = X_test.copy()
X_test['patch'] = X_test['patch'].astype(str)
old_patch_mask = (X_test['patch'] >= '14.01') & (X_test['patch'] <= '14.12')
new_patch_mask = (X_test['patch'] >= '14.13') & (X_test['patch'] <= '14.23')
y_test_old = y_test[old_patch_mask]
y_test_new = y_test[new_patch_mask]
X_test_old = X_test[old_patch_mask].drop(columns=['patch'])
X_test_new = X_test[new_patch_mask].drop(columns=['patch'])
y_pred_old = final_pipeline.predict(X_test_old)
y_pred_new = final_pipeline.predict(X_test_new)
precision_old = precision_score(y_test_old, y_pred_old)
precision_new = precision_score(y_test_new, y_pred_new)
obs_diff = abs(precision_old - precision_new)
n_permutations = 10000
perm_diffs = np.zeros(n_permutations)
y_test_combined = np.concatenate([y_test_old, y_test_new])
y_pred_combined = np.concatenate([y_pred_old, y_pred_new])
for i in range(n_permutations):
    perm_labels = np.random.permutation(y_test_combined)
    perm_old = precision_score(perm_labels[:len(y_test_old)], y_pred_combined[:len(y_test_old)])
    perm_new = precision_score(perm_labels[len(y_test_old):], y_pred_combined[len(y_test_old):])
    perm_diffs[i] = abs(perm_old - perm_new)
p_value = np.mean(perm_diffs >= obs_diff)

print(f"P-Value: {p_value:.4f}")

P-Value: 0.2072
