In [5]:
import pandas as pd
# Read the scores-with-averages CSV and discard every row that contains a
# missing value, in one chained expression.
df = pd.read_csv('../datasets/scores_w_avgs.csv').dropna()
# Bare last expression so the notebook renders the cleaned frame.
df

Unnamed: 0,schedule_date,schedule_season,home_id,away_id,score_home,home_avg_yards,home_avg_TO,score_away,away_avg_yards,away_avg_TO,total_score,over_under_line,label
0,1/14/1968,1967,GNB,RAI,33,304.117647,2.529412,14,361.764706,2.352941,47,43.0,over
1,1/12/1969,1968,CLT,NYJ,7,332.529412,2.529412,16,365.470588,1.823529,23,40.0,under
2,1/11/1970,1969,KAN,MIN,23,315.470588,2.588235,7,293.705882,2.235294,30,39.0,under
3,1/17/1971,1970,CLT,DAL,16,301.470588,2.529412,13,306.823529,2.000000,29,36.0,under
4,1/16/1972,1971,DAL,MIA,24,342.000000,2.176471,3,311.176471,1.705882,27,34.0,under
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10391,1/17/2021,2020,KAN,CLE,22,417.117647,1.000000,17,370.882353,0.882353,39,56.0,under
10392,1/17/2021,2020,NOR,TAM,20,378.294118,1.294118,30,391.647059,0.882353,50,53.0,under
10393,1/24/2021,2020,GNB,TAM,26,386.294118,0.764706,31,392.352941,0.941176,57,53.0,over
10394,1/24/2021,2020,KAN,BUF,38,421.235294,1.058824,24,376.235294,1.176471,62,55.0,over


In [6]:
# Cast the three score columns to float64 in a single astype call; every other
# column keeps its current dtype.
score_columns = ['score_home', 'score_away', 'total_score']
df = df.astype({column: 'float64' for column in score_columns})
# Print the schema summary to confirm the resulting dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10332 entries, 0 to 10395
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   schedule_date    10332 non-null  object 
 1   schedule_season  10332 non-null  int64  
 2   home_id          10332 non-null  object 
 3   away_id          10332 non-null  object 
 4   score_home       10332 non-null  float64
 5   home_avg_yards   10332 non-null  float64
 6   home_avg_TO      10332 non-null  float64
 7   score_away       10332 non-null  float64
 8   away_avg_yards   10332 non-null  float64
 9   away_avg_TO      10332 non-null  float64
 10  total_score      10332 non-null  float64
 11  over_under_line  10332 non-null  float64
 12  label            10332 non-null  object 
dtypes: float64(8), int64(1), object(4)
memory usage: 1.1+ MB


In [7]:
# Model inputs: the per-team average yards and turnovers for the home and away
# sides.  Columns are selected by name so that a misspelled or missing column
# raises KeyError immediately; the previous `pd.DataFrame(df, columns=[...])`
# form reindexes and would silently produce an all-NaN column instead.
feature_columns = ['home_avg_yards', 'home_avg_TO', 'away_avg_yards', 'away_avg_TO']
features = df[feature_columns]

# Target: the `label` column, kept as a one-column DataFrame (double brackets)
# so that code expecting a 2-D `labels` object continues to work.
labels = df[['label']]

In [8]:
from sklearn.model_selection import train_test_split
import numpy as np
# Flatten the one-column label frame into a 1-D array, then hold out 30% of the
# rows for evaluation.  The fixed random_state makes the split repeatable.
label_vector = np.ravel(labels)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    label_vector,
    test_size=0.3,
    random_state=109,
)

In [12]:
from sklearn import svm, metrics

# Sweep the SVM regularisation parameter C for a linear kernel and collect the
# held-out metrics for each setting in `a` (one dict per C value).
#
# The grid is derived from integers so every C is an exact tenth: 0.1 .. 1.0.
# The earlier `while c_val < 1.0: ...; c_val += 0.1` loop accumulated
# floating-point error (C values such as 0.30000000000000004 and
# 0.7999999999999999) and only evaluated C ~= 1.0 because the running sum
# happened to land just below 1.0.  The same ten settings are kept here.
#
# NOTE(review): the C values are compared on the test split, so the reported
# numbers are optimistic for whichever C gets picked -- consider a validation
# split or cross-validation for the selection step.
a = []
for c_val in [step / 10 for step in range(1, 11)]:
    clf = svm.SVC(kernel='linear', C=c_val)

    # Train on the training split, predict the held-out split.
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)

    # confusion_matrix orders classes by sorted label value, so this unpacking
    # requires exactly two classes and treats the label that sorts last as the
    # "positive" class ('under', if the labels are 'over' / 'under').
    tn, fp, fn, tp = metrics.confusion_matrix(y_test, predictions).ravel()
    accuracy = (tp + tn) / (tp + fp + fn + tn)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = (2 * (recall * precision)) / (recall + precision)

    a.append({
        'c': c_val,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    })

[{'c': 0.1, 'accuracy': 0.5293548387096774, 'precision': 0.537243947858473, 'recall': 0.7127856701667696, 'f1': 0.6126891425537563}, {'c': 0.2, 'accuracy': 0.5296774193548387, 'precision': 0.5377756921633036, 'recall': 0.7078443483631871, 'f1': 0.6111999999999999}, {'c': 0.30000000000000004, 'accuracy': 0.5309677419354839, 'precision': 0.5379659456971928, 'recall': 0.7220506485484868, 'f1': 0.6165611814345991}, {'c': 0.4, 'accuracy': 0.5319354838709678, 'precision': 0.5385321100917431, 'recall': 0.7251389746757257, 'f1': 0.6180573835219795}, {'c': 0.5, 'accuracy': 0.5312903225806451, 'precision': 0.538003663003663, 'recall': 0.7257566399011736, 'f1': 0.6179332106231922}, {'c': 0.6, 'accuracy': 0.5309677419354839, 'precision': 0.5382475660639777, 'recall': 0.7171093267449042, 'f1': 0.6149364406779662}, {'c': 0.7, 'accuracy': 0.5319354838709678, 'precision': 0.5385321100917431, 'recall': 0.7251389746757257, 'f1': 0.6180573835219795}, {'c': 0.7999999999999999, 'accuracy': 0.53225806451612

In [25]:
# Shared four-decimal number format; later cells reuse `fmt` as well.
fmt = "{0:.4f}"

# Render one summary line per sweep entry first, then emit them in order.
report_lines = [
    f'c: {fmt.format(item["c"])}, accuracy: {fmt.format(item["accuracy"])}, precision: {fmt.format(item["precision"])}, recall: {fmt.format(item["recall"])}, f1: {fmt.format(item["f1"])}'
    for item in a
]
for line in report_lines:
    print(line)

c: 0.1000, accuracy: 0.5294, precision: 0.5372, recall: 0.7128, f1: 0.6127
c: 0.2000, accuracy: 0.5297, precision: 0.5378, recall: 0.7078, f1: 0.6112
c: 0.3000, accuracy: 0.5310, precision: 0.5380, recall: 0.7221, f1: 0.6166
c: 0.4000, accuracy: 0.5319, precision: 0.5385, recall: 0.7251, f1: 0.6181
c: 0.5000, accuracy: 0.5313, precision: 0.5380, recall: 0.7258, f1: 0.6179
c: 0.6000, accuracy: 0.5310, precision: 0.5382, recall: 0.7171, f1: 0.6149
c: 0.7000, accuracy: 0.5319, precision: 0.5385, recall: 0.7251, f1: 0.6181
c: 0.8000, accuracy: 0.5323, precision: 0.5393, recall: 0.7165, f1: 0.6154
c: 0.9000, accuracy: 0.5313, precision: 0.5387, recall: 0.7134, f1: 0.6139
c: 1.0000, accuracy: 0.5290, precision: 0.5371, recall: 0.7103, f1: 0.6117


In [26]:
# Refit the linear-kernel SVM at C=0.4 for a closer look.  In the sweep output
# above, C=0.4 ties with C=0.7 for the best F1 (0.6181) and for the second-best
# accuracy (0.5319); C=0.8 is marginally ahead on accuracy and precision, and
# C=0.5 on recall.  The settings differ only in the second or third decimal
# place, so the choice of C matters little here.

# fit() returns the estimator itself, so construction and training are chained.
clf = svm.SVC(kernel='linear', C=0.4).fit(X_train, y_train)

# Predict the held-out split with the refitted model.
predictions = clf.predict(X_test)

Accuracy:  0.5319
Precision: 0.5385
Recall:    0.7251
F1 Score:  0.6181


In [27]:
# Score the current `predictions` against the held-out labels.  The four-way
# unpacking requires a 2x2 confusion matrix (exactly two classes).
cm = metrics.confusion_matrix(y_test, predictions)
tn, fp, fn, tp = cm.ravel()

# Derive the summary metrics from the raw counts.
accuracy = (tn + tp) / cm.sum()
precision = tp / (fp + tp)
recall = tp / (fn + tp)
f1 = (2 * (precision * recall)) / (precision + recall)

# Print each metric to four decimals using the shared `fmt` template.
print(f'Accuracy:  {fmt.format(accuracy)}')
print(f'Precision: {fmt.format(precision)}')
print(f'Recall:    {fmt.format(recall)}')
print(f'F1 Score:  {fmt.format(f1)}')

Accuracy:  0.5319
Precision: 0.5385
Recall:    0.7251
F1 Score:  0.6181


In [30]:
# Repeat the C sweep with a polynomial kernel and collect the held-out metrics
# for each setting in `a` (one dict per C value), then print them.
#
# Fixes relative to the previous version of this cell:
#   * the trailing `for item in a:` had no body, which is a SyntaxError and
#     stopped the whole cell from running; it now prints one line per setting
#     (assumed to be the intent -- confirm);
#   * the grid is derived from integers (exact tenths 0.1 .. 1.0) instead of
#     repeatedly adding 0.1 to a float, which drifts (0.30000000000000004, ...);
#   * the model comment no longer says "Linear Kernel" for a poly kernel.
a = []
for c_val in [step / 10 for step in range(1, 11)]:
    clf = svm.SVC(kernel='poly', C=c_val)  # polynomial kernel

    # Train on the training split, predict the held-out split.
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)

    # confusion_matrix orders classes by sorted label value, so this unpacking
    # requires exactly two classes and treats the label that sorts last as the
    # "positive" class.
    tn, fp, fn, tp = metrics.confusion_matrix(y_test, predictions).ravel()
    accuracy = (tp + tn) / (tp + fp + fn + tn)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = (2 * (recall * precision)) / (recall + precision)

    a.append({
        'c': c_val,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    })

# One summary line per C value, four decimals each.  The format spec is inlined
# so this cell does not depend on a `fmt` variable defined in another cell.
for item in a:
    print(
        f'c: {item["c"]:.4f}, accuracy: {item["accuracy"]:.4f}, '
        f'precision: {item["precision"]:.4f}, recall: {item["recall"]:.4f}, '
        f'f1: {item["f1"]:.4f}'
    )

In [31]:
# Score whatever `predictions` currently holds against the held-out labels.
# NOTE(review): when run straight after a C sweep, `predictions` comes from the
# sweep's final iteration (its last C value), not from a separately chosen C.
tn, fp, fn, tp = metrics.confusion_matrix(y_test, predictions).ravel()

# Name the intermediate counts, then derive the metrics from them.
correct = tp + tn
total = tp + fp + fn + tn
predicted_positive = tp + fp
actual_positive = tp + fn

accuracy = correct / total
precision = tp / predicted_positive
recall = tp / actual_positive
f1 = (2 * (recall * precision)) / (recall + precision)

# Report each metric to four decimals using the shared `fmt` template.
print(f'Accuracy:  {fmt.format(accuracy)}')
print(f'Precision: {fmt.format(precision)}')
print(f'Recall:    {fmt.format(recall)}')
print(f'F1 Score:  {fmt.format(f1)}')

Accuracy:  0.5332
Precision: 0.5339
Recall:    0.8363
F1 Score:  0.6517
