In [32]:
import numpy as np
import pandas as pd

from sklearn import preprocessing, model_selection, metrics
from sklearn import tree as Tree


import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

In [252]:
data = pd.read_csv("citi_field.csv")

#Keep only necessary attributes
data = data[['events','description','type','outs_when_up',
      'hc_x','hc_y','hit_distance_sc','launch_speed','launch_angle','p_throws', 'pfx_x',
        'pfx_z', 'plate_x', 'plate_z',
      'estimated_ba_using_speedangle','release_pos_z','release_pos_x']]
#Drop instances where ball wasn't hit into play
data = data[data['description']=='hit_into_play']

hits = ['single','double','triple','home_run']
#data = data[data['events'].isin(hits)]

data.loc[:,'y'] = data['events'].isin(hits)

feats = ['launch_speed','launch_angle', 'hit_distance_sc']
features = data[feats]

data = data[~features.isna().any(axis=1)]
features = data[feats]
features['bias'] = 1
features = features[['launch_speed','launch_angle']]
features['launch_angle'] = features['launch_angle']**2

X_data = np.matrix(features.to_numpy())

y=np.matrix(np.where(data['y'],1,0)).T

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['bias'] = 1


In [253]:
x_train, x_test, y_train,y_test = model_selection.train_test_split(X_data, y)

scaler = preprocessing.MinMaxScaler(feature_range=(-1,1))

x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

## Task 1

In [343]:
tree = Tree.DecisionTreeClassifier()
tree = tree.fit(x_train, y_train)
yhat = tree.predict(x_test)
cm = metrics.confusion_matrix(y_test, yhat)
stats = metrics.recall_score(y_test, yhat), metrics.accuracy_score(y_test, yhat), metrics.precision_score(y_test, yhat)

In [345]:
cm, stats, metrics.confusion_matrix(tree.predict(x_train),y_train)

(array([[228,  60],
        [ 70,  55]], dtype=int64),
 (0.44, 0.6852300242130751, 0.4782608695652174),
 array([[840,  18],
        [  0, 379]], dtype=int64))

The model is performing very well on the training data and poorly on the test data. It appears we are overfitting the data.

In [346]:
tree2 = Tree.DecisionTreeClassifier(splitter='random')
tree2 = tree2.fit(x_train, y_train)
yhat = tree2.predict(x_test)
cm2 = metrics.confusion_matrix(y_test, yhat)
stats2 = metrics.recall_score(y_test, yhat), metrics.accuracy_score(y_test, yhat), metrics.precision_score(y_test, yhat)

In [347]:
cm2, stats2, metrics.confusion_matrix(tree2.predict(x_train),y_train)

(array([[232,  56],
        [ 58,  67]], dtype=int64),
 (0.536, 0.7239709443099274, 0.5447154471544715),
 array([[840,  18],
        [  0, 379]], dtype=int64))

In [334]:
tree3 = Tree.DecisionTreeClassifier(min_samples_leaf=3, max_depth=6, class_weight='balanced')
tree3 = tree3.fit(x_train, y_train)
yhat = tree3.predict(x_test)
cm3 = metrics.confusion_matrix(y_test, yhat)
stats3 = metrics.recall_score(y_test, yhat), metrics.accuracy_score(y_test, yhat), metrics.precision_score(y_test, yhat)

In [335]:
cm3, stats3

(array([[197,  91],
        [ 28,  97]], dtype=int64),
 (0.776, 0.711864406779661, 0.5159574468085106))

In [339]:
y = tree3.predict(x_train)

In [340]:
metrics.confusion_matrix(y_train,y)

array([[578, 262],
       [ 72, 325]], dtype=int64)

## Bagging

In [None]:
models = []
for i in range(10):
    x_train, x_test, y_train,y_test = model_selection.train_test_split(X_data, y)
    scaler = preprocessing.MinMaxScaler(feature_range=(-1,1))
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)
    
    

In [349]:
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold

In [352]:
cv = RepeatedStratifiedKFold(n_splits=10)
bag = BaggingClassifier(base_estimator=tree3, n_estimators=10)
bag.fit(x_train, y_train)

  return f(*args, **kwargs)


BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight='balanced',
                                                        max_depth=6,
                                                        min_samples_leaf=3))

  return f(*args, **kwargs)


BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight='balanced',
                                                        max_depth=6,
                                                        min_samples_leaf=3))