In [535]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import statistics

from sklearn.model_selection import train_test_split, validation_curve

from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.tree import plot_tree
from sklearn.metrics import plot_confusion_matrix, confusion_matrix

from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [473]:
combine = pd.read_csv('CB_Combine.txt', delimiter = ",") # importing combine dataset
combine = combine.drop(['Year', 'Player', 'Rk', 'AV', 'Pos', 'Unnamed: 7'], axis=1)
combine = combine.drop_duplicates()
combine['Drafted_bool'] = np.where(combine['Drafted (tm/rnd/yr)'].isna() == True, 0, 1) # create boolean drafted or not value

def inches(x): # function to convert height in feet-inches format to inches (5-11 > 71)
    height = (x.split('-'))
    total_inches = int(height[0])*12 + int(height[1])
    return total_inches

def draft_rank(x): # function to get draft rank
    try:
        line = (x.split('/'))
        rank_str = line[2]
        rank = int(re.findall(r'\d+', rank_str)[0])
    except:
        rank = -1
    return rank

combine['Height'] = combine['Height'].apply(lambda x: inches(x)) #covert height from feet and inches to inches
combine['Rank'] = combine['Drafted (tm/rnd/yr)'].apply(lambda x: draft_rank(x)) #get draft rank
combine['School'] = combine['School'].replace({'St\.': 'State'}, regex=True)

In [474]:
combine.head(3)

Unnamed: 0,Age,School,Height,Wt,40YD,Vertical,BenchReps,Broad Jump,3Cone,Shuttle,Drafted (tm/rnd/yr),Drafted_bool,Rank
0,22.0,Florida,71,191,4.37,43.5,26.0,136.0,6.8,4.13,,0,-1
1,21.0,Syracuse,74,208,,36.0,20.0,123.0,,,,0,-1
2,24.0,Oklahoma State,71,189,4.5,36.5,14.0,123.0,6.98,4.15,,0,-1


In [475]:
schoolinfo = pd.read_csv('School_Conferences.txt', delimiter = ",") # import school conference dataset
schoolinfo = schoolinfo.drop(['stadium', 'city', 'capacity', 'built', 'expanded'], axis=1) 
pedigree_list = ['ACC', 'Big Ten', 'Big 12', 'SEC', 'Pac-12']

schoolinfo['Pedigree'] = np.where(schoolinfo['conference'].isin(pedigree_list), 1, 0)

In [476]:
schoolinfo.head(3)

Unnamed: 0,state,team,conference,div,latitude,longitude,Pedigree
0,MI,Michigan,Big Ten,fbs,42.265869,-83.748726,1
1,PA,Penn State,Big Ten,fbs,40.812153,-77.856202,1
2,OH,Ohio State,Big Ten,fbs,40.001686,-83.019728,1


In [477]:
df = combine.merge(schoolinfo, left_on='School', right_on='team', how = 'outer', suffixes=('_left', '_right'))
df = df[df['School'].notna()]

df.head(1)

Unnamed: 0,Age,School,Height,Wt,40YD,Vertical,BenchReps,Broad Jump,3Cone,Shuttle,Drafted (tm/rnd/yr),Drafted_bool,Rank,state,team,conference,div,latitude,longitude,Pedigree
0,22.0,Florida,71.0,191.0,4.37,43.5,26.0,136.0,6.8,4.13,,0.0,-1.0,FL,Florida,SEC,fbs,29.649869,-82.348666,1.0


In [None]:
from sklearn.linear_model import LogisticRegression
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform (X_test)

logisticRegr = LogisticRegression( max_iter=1000, fit_intercept=True)
logisticRegr.fit(X_train_scaled, y_train)

score = logisticRegr.score(X_test_scaled, y_test)
print(score)

logisticRegr.coef_

In [None]:
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.metrics import mean_squared_error

lassoRegr = Lasso(normalize=True,max_iter=10000)
alphas = 10**np.linspace(10,-20,200)*0.5
print(alphas)
train_scores, test_scores = validation_curve(
    lassoRegr, X_train_scaled, y_train, param_name="alpha", param_range=alphas,
    scoring="neg_mean_squared_error", n_jobs=1)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

print(test_scores_mean)
alpha_best = alphas[np.argmax(test_scores_mean)]

#Plot CV error as a function of alpha
plt.title("Validation Curve with Lasso Regression")
plt.xlabel("alpha parameter")
plt.ylabel("Score (negative MSE)")
lw = 2
plt.semilogx(alphas, train_scores_mean, label="Training score",
             color="darkorange", lw=lw)
plt.fill_between(alphas, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2,
                 color="darkorange", lw=lw)
plt.semilogx(alphas, test_scores_mean, label="Cross-validation score",
             color="navy", lw=lw)
plt.fill_between(alphas, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.2,
                 color="navy", lw=lw)
plt.legend(loc="best")
plt.show()
#performance very similar for lower values of parameters. Observe that in this dataset there is very little need for regularization.

In [None]:
lasso = Lasso(alpha=alpha_best,max_iter=10000,normalize=True)
lasso.fit(X_train_scaled, y_train)
MSE_train_lasso=mean_squared_error(y_train, lasso.predict(X_train_scaled))
MSE_test_lasso=mean_squared_error(y_test, lasso.predict(X_test_scaled))
print(MSE_train_lasso)
print(MSE_test_lasso)
print(alpha_best)

In [540]:
# Decision trees with Age, depth 3

y = df['Drafted_bool']

X = df.loc[:, df.columns != 'Drafted_bool']
X = X.drop(['Drafted (tm/rnd/yr)', 'team', 'latitude', 'Rank','longitude', 'School','state', 'conference', 'div'], axis=1)

gini = []
info = []

for i in range(100):
    imp = IterativeImputer(max_iter=10, random_state=0)
    imp.fit(X)

    newX = pd.DataFrame(data=imp.transform(X), columns=X.columns)
    X_train, X_test, y_train, y_test = train_test_split(newX, y, test_size=0.2)
    
    clf_gini = DecisionTreeClassifier(max_depth = 3, criterion = 'gini')
    clf_gini.fit(X_train, y_train)
    gini.append(clf_gini.score(X_test, y_test))

    clf_info = DecisionTreeClassifier(max_depth = 3, criterion = 'entropy')
    clf_info.fit(X_train, y_train)
    info.append(clf_info.score(X_test, y_test))
print(statistics.mean(gini), statistics.mean(info))

0.749375 0.758375


In [None]:
# Decision trees with Age, depth 4

y = df['Drafted_bool']

X = df.loc[:, df.columns != 'Drafted_bool']
X = X.drop(['Drafted (tm/rnd/yr)', 'team', 'latitude', 'Rank','longitude', 'School','state', 'conference', 'div'], axis=1)

gini = []
info = []

for i in range(100):
    imp = IterativeImputer(max_iter=10, random_state=0)
    imp.fit(X)

    newX = pd.DataFrame(data=imp.transform(X), columns=X.columns)
    X_train, X_test, y_train, y_test = train_test_split(newX, y, test_size=0.2)
    
    clf_gini = DecisionTreeClassifier(max_depth = 4, criterion = 'gini')
    clf_gini.fit(X_train, y_train)
    gini.append(clf_gini.score(X_test, y_test))

    clf_info = DecisionTreeClassifier(max_depth = 4, criterion = 'entropy')
    clf_info.fit(X_train, y_train)
    info.append(clf_info.score(X_test, y_test))
    
print(statistics.mean(gini), statistics.mean(info))

In [541]:
# Decision trees without Age, depth 3

y = df['Drafted_bool']

X = df.loc[:, df.columns != 'Drafted_bool']
X = X.drop(['Age', 'Drafted (tm/rnd/yr)', 'team', 'latitude', 'Rank','longitude', 'School','state', 'conference', 'div'], axis=1)

gini = []
info = []

for i in range(100):
    imp = IterativeImputer(max_iter=10, random_state=0)
    imp.fit(X)

    newX = pd.DataFrame(data=imp.transform(X), columns=X.columns)
    X_train, X_test, y_train, y_test = train_test_split(newX, y, test_size=0.2)
    
    clf_gini = DecisionTreeClassifier(max_depth = 3, criterion = 'gini')
    clf_gini.fit(X_train, y_train)
    gini.append(clf_gini.score(X_test, y_test))

    clf_info = DecisionTreeClassifier(max_depth = 3, criterion = 'entropy')
    clf_info.fit(X_train, y_train)
    info.append(clf_info.score(X_test, y_test))
print(statistics.mean(gini), statistics.mean(info))

0.69925 0.697125


In [543]:
# Decision trees without Age, depth 3

y = df['Drafted_bool']

X = df.loc[:, df.columns != 'Drafted_bool']
X = X.drop(['Age', 'Drafted (tm/rnd/yr)', 'team', 'latitude', 'Rank','longitude', 'School','state', 'conference', 'div'], axis=1)

gini = []
info = []

for i in range(100):
    imp = IterativeImputer(max_iter=10, random_state=0)
    imp.fit(X)

    newX = pd.DataFrame(data=imp.transform(X), columns=X.columns)
    X_train, X_test, y_train, y_test = train_test_split(newX, y, test_size=0.2)
    
    clf_gini = DecisionTreeClassifier(max_depth = 4, criterion = 'gini')
    clf_gini.fit(X_train, y_train)
    gini.append(clf_gini.score(X_test, y_test))

    clf_info = DecisionTreeClassifier(max_depth = 4, criterion = 'entropy')
    clf_info.fit(X_train, y_train)
    info.append(clf_info.score(X_test, y_test))
print(statistics.mean(gini), statistics.mean(info))

0.685 0.6905


In [None]:
# from graphviz import Source
# Source(tree.export_graphviz(clf_info, feature_names=X_train.columns))

In [546]:
# Part 2 of report- rank breaking

In [552]:
combine = pd.read_csv('CB_Combine.txt', delimiter = ",") # importing combine dataset
combine = combine.drop(['Player', 'Rk', 'AV', 'Pos', 'Unnamed: 7'], axis=1)
combine = combine.drop_duplicates()
combine['Drafted_bool'] = np.where(combine['Drafted (tm/rnd/yr)'].isna() == True, 0, 1) # create boolean drafted or not value

def inches(x): # function to convert height in feet-inches format to inches (5-11 > 71)
    height = (x.split('-'))
    total_inches = int(height[0])*12 + int(height[1])
    return total_inches

def draft_rank(x): # function to get draft rank
    try:
        line = (x.split('/'))
        rank_str = line[2]
        rank = int(re.findall(r'\d+', rank_str)[0])
    except:
        rank = -1
    return rank

combine['Height'] = combine['Height'].apply(lambda x: inches(x)) #covert height from feet and inches to inches
combine['Rank'] = combine['Drafted (tm/rnd/yr)'].apply(lambda x: draft_rank(x)) #get draft rank
combine['School'] = combine['School'].replace({'St\.': 'State'}, regex=True)

schoolinfo = pd.read_csv('School_Conferences.txt', delimiter = ",") # import school conference dataset
schoolinfo = schoolinfo.drop(['stadium', 'city', 'capacity', 'built', 'expanded'], axis=1) 
pedigree_list = ['ACC', 'Big Ten', 'Big 12', 'SEC', 'Pac-12']

schoolinfo['Pedigree'] = np.where(schoolinfo['conference'].isin(pedigree_list), 1, 0)

df = combine.merge(schoolinfo, left_on='School', right_on='team', how = 'outer', suffixes=('_left', '_right'))
df = df[df['School'].notna()]

df.head(1)

Unnamed: 0,Year,Age,School,Height,Wt,40YD,Vertical,BenchReps,Broad Jump,3Cone,...,Drafted (tm/rnd/yr),Drafted_bool,Rank,state,team,conference,div,latitude,longitude,Pedigree
0,2021.0,22.0,Florida,71.0,191.0,4.37,43.5,26.0,136.0,6.8,...,,0.0,-1.0,FL,Florida,SEC,fbs,29.649869,-82.348666,1.0


In [670]:
from sklearn.cluster import KMeans
year = []
df = df[df['Rank'] != -1]
for i in set(df['Year']):
    ndf= df[df['Year'] == i].sort_values('Rank').set_index('Rank')
    ndf['Relative_rank'] = np.arange(ndf.shape[0])
    # ndf = ndf.set_index('Relative_rank')
    ndf = ndf[['Height', 'Wt', '40YD', 'Vertical',
       'BenchReps', 'Broad Jump', '3Cone', 'Shuttle', 
        'Pedigree', 'Relative_rank']].reset_index()
    y = ndf['Relative_rank']

    X = ndf.loc[:, ndf.columns != 'Relative_rank']
    # X = X.drop(['Drafted (tm/rnd/yr)', 'team', 'latitude', 'Rank','longitude', 'School','state', 'conference', 'div'], axis=1)


    imp = IterativeImputer(max_iter=10, random_state=0)
    imp.fit(ndf)

    newX = pd.DataFrame(data=imp.transform(ndf), columns=ndf.columns)
    
    newX['Height'] =  newX['Height'] 
    newX['BMI'] = 703 * newX['Wt']/ (newX['Height'])**2
    newX['Jump'] = newX['Broad Jump'] + newX['Vertical']
    newX['Agility'] = newX['3Cone'] + newX['Shuttle'] + newX['40YD']

    newX = newX[['BMI', 'Wt', 'Jump','Agility']]

#     X_train, X_test, y_train, y_test = train_test_split(ndf, y, test_size=0.2)
   
    try: 
        kmeans = KMeans(n_clusters=2, random_state=0, max_iter=10).fit(newX)
        clusters = tuple(kmeans.cluster_centers_)
        labels = list(kmeans.labels_)[:10]
        labels2 = list(kmeans.labels_)[10:]
        mode_cb = max(set(labels), key=labels.count)
        other_cb = min(set(labels2), key=labels2.count)
        print(clusters[mode_cb][1], clusters[other_cb][1])
        year.append((i, clusters[mode_cb][0] - clusters[other_cb][0],clusters[mode_cb][1] - clusters[other_cb][1], clusters[mode_cb][2] - clusters[other_cb][2], clusters[mode_cb][3] - clusters[other_cb][3]))
    except:
        pass
    

204.3 204.3
193.9090909090909 212.2
186.4375 203.16666666666666
192.42857142857142 192.42857142857142
202.0 202.0
192.0 207.0
201.75 201.75
187.86666666666667 202.75
190.33333333333334 206.42857142857142
190.13636363636363 205.0
197.5 197.5


In [675]:
print(pd.DataFrame(year, columns=['Year','BMI', 'Wt', 'Jump','Agility']).sort_values('Year').to_latex(float_format="%.2f", index=False))

\begin{tabular}{rrrrr}
\toprule
   Year &   BMI &     Wt &  Jump &  Agility \\
\midrule
2010.00 & -1.66 & -15.00 &  1.34 &     0.00 \\
2011.00 &  0.00 &   0.00 &  0.00 &     0.00 \\
2012.00 & -1.44 & -14.88 & -2.89 &    -0.07 \\
2013.00 & -1.30 & -16.10 & -0.93 &    -0.21 \\
2014.00 & -0.45 & -14.86 & -1.26 &    -0.09 \\
2015.00 &  0.00 &   0.00 &  0.00 &     0.00 \\
2016.00 &  0.00 &   0.00 &  0.00 &     0.00 \\
2017.00 & -0.65 & -18.29 &  7.96 &    -0.12 \\
2018.00 & -2.38 & -16.73 &  1.30 &    -0.37 \\
2019.00 &  0.00 &   0.00 &  0.00 &     0.00 \\
2020.00 &  0.00 &   0.00 &  0.00 &     0.00 \\
\bottomrule
\end{tabular}

