In [16]:
import pandas as pd
import numpy as np

features = pd.read_csv('team_features.csv')
matches = pd.read_csv('MNCAATourneyCompactResults.csv')
matches.drop(columns = ['DayNum', 'WScore', 'LScore', 'WLoc', 'NumOT'])
matches = pd.merge(matches[['Season', 'WTeamID', 'LTeamID']], features[['TeamID', 'Season', 'weighted_past_seed',
       '3pt_success_rate', 'freethrow_success_rate', 'field_success_rate']], left_on=['Season', 'WTeamID'], right_on = ['Season', 'TeamID'])

matches = matches.drop(columns = ['TeamID'])
matches = matches.rename(columns = {'weighted_past_seed': 'weighted_past_seed_W', 
                                    '3pt_success_rate': '3pt_success_rate_W', 
                                    'freethrow_success_rate': 'freethrow_success_rate_W',
                                    'freethrow_success_rate': 'freethrow_success_rate_W',
                                    'field_success_rate': 'field_success_rate_W'
                                    })
matches = pd.merge(matches[['Season', 'WTeamID', 'LTeamID', 'weighted_past_seed_W',
       '3pt_success_rate_W', 'freethrow_success_rate_W',
       'field_success_rate_W']], features[['TeamID', 'Season', 'weighted_past_seed',
       '3pt_success_rate', 'freethrow_success_rate', 'field_success_rate']], left_on=['Season', 'LTeamID'], right_on = ['Season', 'TeamID'])

matches = matches.drop(columns=['TeamID'])
matches = matches.rename(columns = {'weighted_past_seed': 'weighted_past_seed_L', 
                                    '3pt_success_rate': '3pt_success_rate_L', 
                                    'freethrow_success_rate': 'freethrow_success_rate_L',
                                    'field_success_rate': 'field_success_rate_L'
                                    })


cols = list(matches.columns)
all_cols = {'weighted_past_seed', '3pt_success_rate', 'freethrow_success_rate', 'field_success_rate'}
for col in all_cols:
    i, j = cols.index(col+'_W'), cols.index(col+'_L')
    cols[i], cols[j] = cols[j], cols[i]
    matches_negated = matches[cols]

matches['label'] = np.full(len(matches), 1)
matches_negated['label'] = np.full(len(matches_negated), 0)

data = pd.concat([matches, matches_negated])
data = data.drop(columns = ['Season', 'WTeamID', 'LTeamID'])

In [17]:
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = data.drop(columns='label').values.astype('float32')
Y = data['label'].values.astype('float32')
X = scaler.fit_transform(X)

X_train, X_val, Y_train, Y_val = train_test_split(X, Y, train_size=0.8, random_state=42)

X_train_tensor = torch.tensor(X_train)
X_val_tensor = torch.tensor(X_val)
Y_train_tensor = torch.tensor(Y_train).unsqueeze(1)
Y_val_tensor = torch.tensor(Y_val).unsqueeze(1)

train_dataset = TensorDataset(X_train_tensor, Y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_dataset = TensorDataset(X_val_tensor, Y_val_tensor)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=True)

from torch import nn
class FCNN(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 16),
            nn.ReLU(),
            nn.Linear(16, 1))       #nn.Sequential takes in batchsize x input_dim
    def forward(self, x):
        return self.layers(x)
    
model = FCNN(X_train.shape[1])
criterion = nn.BCEWithLogitsLoss()
opt = torch.optim.Adam(model.parameters(), lr = 0.001)
#learning rate too high will cause exploding weights -> NaNs in y_pred

max_epoch = 100
for epoch in range(max_epoch):
    model.train()
    avg_loss = 0
    for x, y in train_loader:
        y_pred = model(x)
        loss = criterion(y_pred, y)
        opt.zero_grad()
        loss.backward()
        opt.step()

        avg_loss += loss.item()/len(train_loader)
    if (epoch + 1) % 10 == 0:
        print(f"epoch {epoch+1}: Loss {avg_loss: .4f}")

model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for x, y in val_loader:
        y_pred = model(x)
        pred = y_pred >= 0.5
        correct += (pred == y).sum().item()
        total += len(x)
print(f"validation accuracy: {correct/total: .2%}")

epoch 10: Loss  0.6929
epoch 20: Loss  0.6925
epoch 30: Loss  0.6917
epoch 40: Loss  0.6910
epoch 50: Loss  0.6902
epoch 60: Loss  0.6893
epoch 70: Loss  0.6886
epoch 80: Loss  0.6881
epoch 90: Loss  0.6869
epoch 100: Loss  0.6859
validation accuracy:  46.33%


In [18]:
X_train.shape[1]

8