In [45]:
import pandas as pd

df = pd.read_csv("rookie_stats_final_clean.csv")
if "BAR" in df.columns:
    df["BAR"] = df["BAR"].str.rstrip('%').astype(float)
    
df["SeasonStart"] = df["Season"].str.extract(r"(\d{4})").astype(int)
df.dropna(inplace=True)

train_df = df[df["SeasonStart"] <= 2022]
test_df = df[df["SeasonStart"] >= 2023]

train_df.to_csv("rookie_train_pre2023.csv", index=False)
test_df.to_csv("rookie_test_2023on.csv", index=False)

print("Cleaned, filtered, and split data saved:")
print(" - rookie_train_pre2023.csv")
print(" - rookie_test_2023on.csv")

Cleaned, filtered, and split data saved:
 - rookie_train_pre2023.csv
 - rookie_test_2023on.csv


In [46]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

df = pd.read_csv("rookie_train_pre2023.csv")

player_names = df["Player"].values

features_df = df.drop(columns=["Player", "Season", "SeasonStart"], errors="ignore")

scaler = StandardScaler()
X_scaled = scaler.fit_transform(features_df.astype(np.float32))

X_tensor = torch.tensor(X_scaled, dtype=torch.float32)

class Autoencoder(nn.Module):
    def __init__(self, input_dim, encoding_dim=8):
        super().__init__()
        self.encoder = nn.Sequential(
        nn.Linear(input_dim, 128),
        nn.Tanh(),
        nn.Linear(128, 64),
        nn.Tanh(),
        nn.Linear(64, encoding_dim)
    )

        self.decoder = nn.Sequential(
        nn.Linear(encoding_dim, 64),
        nn.ReLU(),
        nn.Linear(64, 128),
        nn.ReLU(),
        nn.Linear(128, input_dim)
    )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

input_dim = X_tensor.shape[1]
encoding_dim = 8
model = Autoencoder(input_dim, encoding_dim)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = nn.SmoothL1Loss()

epochs = 200
i = 0
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    output = model(X_tensor)
    loss = loss_fn(output, X_tensor)
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 50 == 0:
        print(f"Epoch {epoch+1}/{epochs} - Loss: {loss.item():.6f}")
    print("Epoch " ,  i , "complete")
    i += 1

model.eval()
with torch.no_grad():
    encoded_data = model.encoder(X_tensor).numpy()

knn = NearestNeighbors(n_neighbors=6, metric="euclidean")
knn.fit(encoded_data)

def find_similar(player_name, top_n=5):
    if player_name not in player_names:
        print(f"Player '{player_name}' not found in training data.")
        return

    idx = np.where(player_names == player_name)[0][0]
    query_vec = encoded_data[idx].reshape(1, -1)
    distances, indices = knn.kneighbors(query_vec, n_neighbors=top_n + 1)

    print(f"\n Top {top_n} most similar rookies to {player_name}:")
    for rank, (i, dist) in enumerate(zip(indices[0][1:], distances[0][1:]), start=1):
        print(f"{rank}. {player_names[i]} (distance: {dist:.4f})")

Epoch  0 complete
Epoch  1 complete
Epoch  2 complete
Epoch  3 complete
Epoch  4 complete
Epoch  5 complete
Epoch  6 complete
Epoch  7 complete
Epoch  8 complete
Epoch  9 complete
Epoch  10 complete
Epoch  11 complete
Epoch  12 complete
Epoch  13 complete
Epoch  14 complete
Epoch  15 complete
Epoch  16 complete
Epoch  17 complete
Epoch  18 complete
Epoch  19 complete
Epoch  20 complete
Epoch  21 complete
Epoch  22 complete
Epoch  23 complete
Epoch  24 complete
Epoch  25 complete
Epoch  26 complete
Epoch  27 complete
Epoch  28 complete
Epoch  29 complete
Epoch  30 complete
Epoch  31 complete
Epoch  32 complete
Epoch  33 complete
Epoch  34 complete
Epoch  35 complete
Epoch  36 complete
Epoch  37 complete
Epoch  38 complete
Epoch  39 complete
Epoch  40 complete
Epoch  41 complete
Epoch  42 complete
Epoch  43 complete
Epoch  44 complete
Epoch  45 complete
Epoch  46 complete
Epoch  47 complete
Epoch  48 complete
Epoch 50/200 - Loss: 0.095527
Epoch  49 complete
Epoch  50 complete
Epoch  51 c

In [47]:
def find_similar_post2023_player(player_name, test_csv="rookie_test_2023on.csv", top_n=5):
    test_df = pd.read_csv(test_csv)
    test_df.dropna(inplace=True)

    row = test_df[test_df["Player"] == player_name]
    if row.empty:
        print(f" Player '{player_name}' not found in {test_csv}")
        return

    row_features = row.drop(columns=["Player", "Season", "SeasonStart"], errors="ignore")

    row_scaled = scaler.transform(row_features.astype(np.float32))
    row_tensor = torch.tensor(row_scaled, dtype=torch.float32)

    model.eval()
    with torch.no_grad():
        row_encoded = model.encoder(row_tensor).numpy()

    distances, indices = knn.kneighbors(row_encoded, n_neighbors=top_n)

    print(f"\n Top {top_n} similar rookies to {player_name}:")
    for rank, (j, dist) in enumerate(zip(indices[0], distances[0]), 1):
        print(f"{rank}. {player_names[j]} (distance: {dist:.4f})")

In [49]:
find_similar_post2023_player("Cody Williams", top_n=5)


 Top 5 similar rookies to Cody Williams:
1. Joshua Primo (distance: 1.2044)
2. Xavier Henry (distance: 1.4260)
3. Bryce McGowens (distance: 1.5694)
4. Antoine Wright (distance: 1.7309)
5. Josh Hall (distance: 1.7946)
