# DECISION TREE

## Imports

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

## Load data and initial setup

We will be seeing how well a decision tree regressor can do at predicting final score values based on the stats at the end of each quarter.

By separating the stats per quarter rather than only including a sum total, I hope to capture the effects of momentum. For example, a team that scored a ton in Q1 but is struggling by Q3 is probably less likely to finish with a high score than a team that scored nothing in Q1 but really started to hit its stride in Q2 and Q3.

We are assuming that, given the stats for the entire game (Q1, Q2, Q3, Q4), the decision tree should easily be able to pick the correct number of points. However, this is not guaranteed, which is why I am also fitting a full-game stats model.

In [8]:
# Read the cleaned, compiled game data; the 'points' column is the regression target.
df = pd.read_csv('compiledData/cleanedCompiledData.csv')
y = df['points']

# One cumulative feature frame per stage of the game. Each regex keeps only the
# columns whose names end in one of the listed quarter suffixes (q1 ... q4).
X_q1, X_q1_q2, X_q1_q2_q3, X_q1_q2_q3_q4 = (
    df.filter(regex=quarter_pattern)
    for quarter_pattern in ('q1$', 'q[12]$', 'q[123]$', 'q[1234]$')
)

In [9]:
# Use ONE fixed seed for every split. All four feature frames are column subsets
# of the same DataFrame (same rows, same order), so an identical random_state
# makes train_test_split select the same rows each time. As a result:
#   * every quarter model is trained and evaluated on the same games, so their
#     scores are directly comparable, and
#   * the results are reproducible after Restart Kernel -> Run All.
SPLIT_SEED = 42

X_q1_train, X_q1_test, y_q1_train, y_q1_test = train_test_split(
    X_q1, y, test_size=0.2, shuffle=True, random_state=SPLIT_SEED
)
X_q1_q2_train, X_q1_q2_test, y_q1_q2_train, y_q1_q2_test = train_test_split(
    X_q1_q2, y, test_size=0.2, shuffle=True, random_state=SPLIT_SEED
)
X_q1_q2_q3_train, X_q1_q2_q3_test, y_q1_q2_q3_train, y_q1_q2_q3_test = train_test_split(
    X_q1_q2_q3, y, test_size=0.2, shuffle=True, random_state=SPLIT_SEED
)
X_q1_q2_q3_q4_train, X_q1_q2_q3_q4_test, y_q1_q2_q3_q4_train, y_q1_q2_q3_q4_test = train_test_split(
    X_q1_q2_q3_q4, y, test_size=0.2, shuffle=True, random_state=SPLIT_SEED
)

## Decision Tree Experiments

In [35]:
def run_experiment(max_depth=None, verbose=True, random_state=42):
    """Fit one decision tree per cumulative-quarter feature set and score it.

    The train/test frames are read from the notebook-level variables
    (``X_q1_train``, ``y_q1_train``, ... ``X_q1_q2_q3_q4_test``) at call time,
    so the split cell must have been run first.

    Parameters
    ----------
    max_depth : int or None, default None
        Passed straight to ``DecisionTreeRegressor``. ``None`` lets the tree
        grow without a depth limit and is reported as "Baseline".
    verbose : bool, default True
        When true, print train and test R^2 and MAE for every feature set.
    random_state : int or None, default 42
        Seed passed to every ``DecisionTreeRegressor`` so repeated runs give
        the same trees. Pass ``None`` for unseeded behaviour.

    Returns
    -------
    dict
        Maps each feature-set label ("Q1", "Q1+Q2", "Q1+Q2+Q3",
        "Q1+Q2+Q3+Q4") to a ``(test R^2, test MAE)`` tuple.
    """
    # (label, X_train, X_test, y_train, y_test) for each cumulative feature set.
    datasets = [
        ("Q1", X_q1_train, X_q1_test, y_q1_train, y_q1_test),
        ("Q1+Q2", X_q1_q2_train, X_q1_q2_test, y_q1_q2_train, y_q1_q2_test),
        ("Q1+Q2+Q3", X_q1_q2_q3_train, X_q1_q2_q3_test, y_q1_q2_q3_train, y_q1_q2_q3_test),
        ("Q1+Q2+Q3+Q4", X_q1_q2_q3_q4_train, X_q1_q2_q3_q4_test,
         y_q1_q2_q3_q4_train, y_q1_q2_q3_q4_test),
    ]

    version = "Baseline" if max_depth is None else f"Max Depth {max_depth}"

    test_scores = {}
    for position, (label, X_train, X_test, y_train, y_test) in enumerate(datasets):
        model = DecisionTreeRegressor(max_depth=max_depth, random_state=random_state)
        model.fit(X_train, y_train)

        # Compute each test metric once and reuse it for printing and the return value.
        test_r2 = model.score(X_test, y_test)
        test_mae = mean_absolute_error(y_test, model.predict(X_test))
        test_scores[label] = (test_r2, test_mae)

        if verbose:
            # Train metrics are only needed for the printed report.
            train_r2 = model.score(X_train, y_train)
            train_mae = mean_absolute_error(y_train, model.predict(X_train))

            if position:
                # Two blank lines between consecutive feature-set reports.
                print()
                print()
            # Every test line is labelled "Test" so it cannot be mistaken for a train score.
            print(f"{version} Train {label} R^2: {train_r2}")
            print(f"{version} Test {label} R^2: {test_r2}")
            print()
            print(f"{version} Train {label} MAE: {train_mae}")
            print(f"{version} Test {label} MAE: {test_mae}")

    return test_scores
run_experiment()

Baseline Train Q1 R^2: 1.0
Baseline Test Q1 R^2: -0.8999333339037794

Baseline Train Q1 MAE: 0.0
Baseline Test Q1 MAE: 11.325376884422111


Baseline Train Q1+Q2 R^2: 1.0
Baseline Q1+Q2 R^2: -0.40347814279821437

Baseline Train Q1+Q2 MAE: 0.0
Baseline Q1+Q2 MAE: 9.899497487437186


Baseline Train Q1+Q2+Q3 R^2: 1.0
Baseline Q1+Q2+Q3 R^2: -0.21069882708224008

Baseline Train Q1+Q2+Q3 MAE: 0.0
Baseline Q1+Q2+Q3 MAE: 9.194723618090451


Baseline Train Q1+Q2+Q3+Q4 R^2: 1.0
Baseline Q1+Q2+Q3+Q4 R^2: -0.012159425889279651

Baseline Train Q1+Q2+Q3+Q4 MAE: 0.0
Baseline Q1+Q2+Q3+Q4 MAE: 8.430904522613066


{'Q1': (-0.8999333339037794, np.float64(11.325376884422111)),
 'Q1+Q2': (-0.40347814279821437, np.float64(9.899497487437186)),
 'Q1+Q2+Q3': (-0.21069882708224008, np.float64(9.194723618090451)),
 'Q1+Q2+Q3+Q4': (-0.012159425889279651, np.float64(8.430904522613066))}

### Baseline Analysis

With only default hyperparameters, our models are all SEVERELY overfitting to the training data. In every case, the model gets a perfect R^2 of 1.0 and an MAE of 0.0 on the training set while performing exceptionally badly on the test set: every test R^2 is negative, i.e. worse than always predicting the mean.

The severe overfitting can probably be largely combatted by limiting the depth of the decision tree, which I will explore in the following cells.

In [42]:
# Candidate tree depths to sweep: 1 through 10 inclusive.
potential_max_depths = list(range(1, 11))

# One {feature set: (test R^2, test MAE)} dict per depth, in sweep order.
results = [
    run_experiment(max_depth=depth, verbose=False)
    for depth in potential_max_depths
]

# For every feature set, track the best test R^2 / MAE seen so far and the
# depth that produced it. All trackers start at None ("nothing seen yet").
# NOTE(review): the winner is chosen using test-set scores, so the "best"
# numbers are optimistic estimates; a validation split or cross-validation
# would give an unbiased choice.
feature_sets = ("Q1", "Q1+Q2", "Q1+Q2+Q3", "Q1+Q2+Q3+Q4")
best_max_depth_r2 = dict.fromkeys(feature_sets)
best_max_depth_mae = dict.fromkeys(feature_sets)
best_score = dict.fromkeys(feature_sets)
best_mae = dict.fromkeys(feature_sets)

for depth, scores_by_set in zip(potential_max_depths, results):
    for name, (test_r2, test_mae) in scores_by_set.items():
        # Strict comparisons: on a tie, the earlier (shallower) depth is kept.
        r2_improved = best_score[name] is None or test_r2 > best_score[name]
        if r2_improved:
            best_score[name], best_max_depth_r2[name] = test_r2, depth

        mae_improved = best_mae[name] is None or test_mae < best_mae[name]
        if mae_improved:
            best_mae[name], best_max_depth_mae[name] = test_mae, depth

In [46]:
# For each selection metric, print a heading, the winning depth per feature set,
# and the score reached at that depth. A blank line separates the two reports.
reports = (
    ("Best Max Depth by R^2", best_max_depth_r2, best_score),
    ("Best Max Depth by MAE", best_max_depth_mae, best_mae),
)
for position, (heading, depths, scores) in enumerate(reports):
    if position:
        print()
    print(heading)
    print(depths)
    print(scores)

Best Max Depth by R^2
{'Q1': 3, 'Q1+Q2': 5, 'Q1+Q2+Q3': 5, 'Q1+Q2+Q3+Q4': 4}
{'Q1': 0.1381728883974469, 'Q1+Q2': 0.22639226189613315, 'Q1+Q2+Q3': 0.3208318910359955, 'Q1+Q2+Q3+Q4': 0.3050557642406375}

Best Max Depth by MAE
{'Q1': 3, 'Q1+Q2': 5, 'Q1+Q2+Q3': 5, 'Q1+Q2+Q3+Q4': 4}
{'Q1': np.float64(7.465173120862425), 'Q1+Q2': np.float64(7.209425254406501), 'Q1+Q2+Q3': np.float64(6.883489179721094), 'Q1+Q2+Q3+Q4': np.float64(6.932185380405856)}


### Interpretation

We see here that given different quarter information, we get different best max_depths for test MAE/R2. It looks like early in the game (q1) it is best not to make too many splits. This makes sense, because there is a lot of variability later in the game, and if we fit too closely to the training set, we will almost certainly do poorly when generalizing.

We see that limiting the tree depth dramatically improved the Q1 model's ability to predict the final score, with the test R^2 improving from -0.899 (much worse than just predicting the mean) to 0.138 and the test MAE from 11.3 to 7.5 points. This means that our model can predict the final score to within roughly one touchdown on average after only the first quarter of play! Also note that even given Q1, Q2, Q3, and Q4 stats, the model still has an average error of nearly 7 points, which suggests that, with the stats we are using, even knowing the stats for the entire game hardly improves the model's ability to predict the score. Keep in mind that the best max_depth was selected using the test-set scores themselves, so these best-case numbers are likely somewhat optimistic.