In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler

def load_and_process_data(csv_path='career_stats.csv', test_size=0.2, random_state=42):
    """Load the career-stats CSV and build standardized train/validation splits.

    The 'HOF' column is used as the label. 'QBrec', 'HOF' and 'Name' are
    removed from the feature matrix, and missing feature values are replaced
    with 0 before splitting.

    Args:
        csv_path: Path of the CSV file to read. Defaults to the file the
            notebook originally hard-coded.
        test_size: Fraction of rows held out for validation.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            repeatable.

    Returns:
        A 7-tuple ``(X_train_scaled, y_train, X_val_scaled, y_val, n_features,
        train_names, val_names)``. The scaled matrices are whatever
        ``StandardScaler`` returns; ``y_*`` and ``*_names`` are pandas Series
        that share the index of their split, so names line up row-for-row
        with the corresponding samples.
    """
    df = pd.read_csv(csv_path)
    target = 'HOF'

    features_to_drop = ['QBrec', 'HOF', 'Name']

    X = df.drop(columns=features_to_drop)
    y = df[target]

    # Missing values are treated as zero.
    X = X.fillna(0)

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # The split carries df's index *labels*, so the lookup must be label-based
    # (.loc). Positional .iloc only gives the same rows while df happens to
    # have a default 0..n-1 index.
    train_names = df['Name'].loc[y_train.index]
    val_names = df['Name'].loc[y_val.index]

    # Fit the scaler on the training split only, then reuse its statistics for
    # validation so no validation information leaks into the scaling.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    return X_train_scaled, y_train, X_val_scaled, y_val, len(X.columns), train_names, val_names

class QBDataSet(Dataset):
    """In-memory dataset that pairs a feature matrix with its labels.

    Attributes are exposed as ``X`` (float32 tensor) and ``y`` (int64 tensor).
    """

    def __init__(self, X, y):
        """Convert the given features and labels to tensors once, up front."""
        feature_tensor = torch.tensor(X, dtype=torch.float32)
        label_tensor = torch.tensor(y, dtype=torch.long)
        self.X, self.y = feature_tensor, label_tensor

    def __len__(self):
        """Return the number of samples (length of the first feature axis)."""
        n_samples = len(self.X)
        return n_samples

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        sample = (self.X[idx], self.y[idx])
        return sample

class HOFPredictor(nn.Module):
    """Small feed-forward classifier: input -> 64 -> 32 -> num_classes.

    ReLU is applied after each of the two hidden layers; the final layer
    returns raw, un-normalized scores (no softmax).
    """

    def __init__(self, input_dim, num_classes):
        """Create the three linear layers and the shared ReLU activation."""
        super().__init__()
        first_width, second_width = 64, 32
        # Sub-module names and creation order are kept stable: they determine
        # the state_dict keys and the order of model.parameters().
        self.layer1 = nn.Linear(input_dim, first_width)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(first_width, second_width)
        self.output = nn.Linear(second_width, num_classes)

    def forward(self, x):
        """Run a batch through both hidden layers and return the class scores."""
        hidden = self.layer1(x)
        hidden = self.relu(hidden)
        hidden = self.layer2(hidden)
        hidden = self.relu(hidden)
        logits = self.output(hidden)
        return logits

# --- Configuration ---------------------------------------------------------
BATCH_SIZE = 32
LEARNING_RATE = 0.001
EPOCHS = 20
NUM_CLASSES = 2
SEED = 42

# Seed torch's global RNG before anything random is created. Layer weight
# initialization and the shuffle order of train_loader both draw from it, so
# without this a fresh "Restart & Run All" gives different metrics every time.
torch.manual_seed(SEED)

# --- Data ------------------------------------------------------------------
X_train, y_train, X_val, y_val, input_dim, train_names, val_names = load_and_process_data()

# The labels come back as pandas Series; convert them to plain arrays so the
# dataset can build tensors from them.
train_dataset = QBDataSet(X_train, np.array(y_train))
val_dataset = QBDataSet(X_val, np.array(y_val))

# Only the training batches are shuffled; validation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# --- Model, loss and optimizer ---------------------------------------------
model = HOFPredictor(input_dim, NUM_CLASSES)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

for epoch in range(EPOCHS):
    # ---- Training pass: one optimizer step per mini-batch ----
    model.train()
    running_loss = 0.0

    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        loss = criterion(model(X_batch), y_batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # ---- Validation pass: count correct predictions, no gradients ----
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            outputs = model(X_batch)
            # The predicted class is the index of the largest score per row.
            predicted = outputs.argmax(dim=1)

            total += len(y_batch)
            correct += torch.eq(predicted, y_batch).sum().item()

    # The loss is averaged over batches (not samples); accuracy is over all
    # validation samples.
    avg_loss = running_loss / len(train_loader)
    val_acc = correct / total

    print(f"Epoch {epoch+1}/{EPOCHS} | Loss: {avg_loss:.4f} | Val Accuracy: {val_acc:.4f}")

Epoch 1/20 | Loss: 0.6883 | Val Accuracy: 0.7027
Epoch 2/20 | Loss: 0.6345 | Val Accuracy: 0.7568
Epoch 3/20 | Loss: 0.5851 | Val Accuracy: 0.8108
Epoch 4/20 | Loss: 0.5610 | Val Accuracy: 0.8108
Epoch 5/20 | Loss: 0.5158 | Val Accuracy: 0.8108
Epoch 6/20 | Loss: 0.4806 | Val Accuracy: 0.8108
Epoch 7/20 | Loss: 0.4535 | Val Accuracy: 0.8108
Epoch 8/20 | Loss: 0.4310 | Val Accuracy: 0.8108
Epoch 9/20 | Loss: 0.3933 | Val Accuracy: 0.8108
Epoch 10/20 | Loss: 0.3632 | Val Accuracy: 0.8108
Epoch 11/20 | Loss: 0.3461 | Val Accuracy: 0.8108
Epoch 12/20 | Loss: 0.3251 | Val Accuracy: 0.8108
Epoch 13/20 | Loss: 0.3044 | Val Accuracy: 0.8108
Epoch 14/20 | Loss: 0.2794 | Val Accuracy: 0.8108
Epoch 15/20 | Loss: 0.2674 | Val Accuracy: 0.8108
Epoch 16/20 | Loss: 0.2435 | Val Accuracy: 0.8378
Epoch 17/20 | Loss: 0.2241 | Val Accuracy: 0.8378
Epoch 18/20 | Loss: 0.2169 | Val Accuracy: 0.8649
Epoch 19/20 | Loss: 0.2010 | Val Accuracy: 0.8378
Epoch 20/20 | Loss: 0.1994 | Val Accuracy: 0.8378


In [15]:
import torch
from torch.utils.data import DataLoader
from pydvl.influence.torch import DirectInfluence

# Build the influence model around the trained network and fit it on the
# training batches.
infl_model = DirectInfluence(
    model,
    nn.CrossEntropyLoss(),
    regularization=0.01,
).fit(train_loader)

# Influence values between the validation samples and the training samples,
# moved to CPU and converted to a NumPy array for export.
# NOTE(review): presumably rows index validation samples and columns index
# training samples -- confirm against the pyDVL documentation.
influences = infl_model.influences(
    val_dataset.X, val_dataset.y, train_dataset.X, train_dataset.y
).cpu().numpy()
np.savetxt("influences.csv", influences, delimiter=",")

# save train_names and val_names, one name per line, in the same order as the
# samples of the corresponding dataset
for _names_path, _names in (("train_names.csv", train_names), ("val_names.csv", val_names)):
    np.savetxt(_names_path, _names, delimiter=",", fmt="%s")
