In [1]:
import sys
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
import matplotlib.pyplot as plt
sys.path.append(os.path.abspath(os.path.join('..', 'src')))

In [2]:
%load_ext autoreload
%autoreload 2
from data.api_fetcher import ApiFetcher
from model.team_embeddings import EmbeddingFetcher
from model.team_embeddings import TeamEmbeddingsModel

In [3]:
api = ApiFetcher(starting_year=2019, ending_year=2025)
df = api.get_dataframe(numeric=False, date=True, time_coeff=False, ids=True)

In [6]:
trainer = TeamEmbeddingsModel(df)
test_mse, trained_model = trainer.train()
print("Test MSE:", test_mse)

  return bound(*args, **kwds)


Test MSE: 18.10117064622039


In [7]:
home_embeddings = trained_model.home_embedding.weight.detach().cpu().numpy()
away_embeddings = trained_model.away_embedding.weight.detach().cpu().numpy()
embeddings_fetcher = EmbeddingFetcher(home_embeddings, away_embeddings)


In [8]:
def prep_df(df1, target_cols=['home_pts', 'away_pts'], scaler=None):
    team_id_cols = ['home_team_id', 'away_team_id']
    exclude_cols = target_cols + team_id_cols + ['date', 'home_team', 'away_team']
    numeric_cols = [col for col in df1.columns if col not in exclude_cols]


    X_numeric_raw = df1[numeric_cols].values
    X_team_ids = df1[team_id_cols].astype(int).values
    y = df1[target_cols].sum(axis=1).values
    if scaler is None:
        scaler = StandardScaler()
        X_numeric = scaler.fit_transform(X_numeric_raw)
    else:
        X_numeric = scaler.transform(X_numeric_raw)

    return X_numeric, X_team_ids, y, scaler, numeric_cols

In [9]:
class NBAEmbeddingDataset(Dataset):
    def __init__(self, X_numeric, X_team_ids, y, fetcher):
        self.X_numeric = torch.tensor(X_numeric, dtype=torch.float32)
        self.X_team_ids = torch.tensor(X_team_ids, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.float32)
        self.fetcher = fetcher
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        numeric_features = self.X_numeric[idx]
        home_id, away_id = self.X_team_ids[idx]
        # Fetch embeddings
        home_emb = torch.tensor(self.fetcher.get_home_embedding(home_id), dtype=torch.float32)
        away_emb = torch.tensor(self.fetcher.get_away_embedding(away_id), dtype=torch.float32)
        return numeric_features, home_emb, away_emb, self.y[idx]

In [10]:
class MiniNN(nn.Module):
    def __init__(self, num_numeric_features, embedding_dim):
        super(MiniNN, self).__init__()
        input_size = num_numeric_features + embedding_dim*2
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.out = nn.Linear(32, 1)
        
    def forward(self, numeric_features, home_emb, away_emb):
        x = torch.cat([numeric_features, home_emb, away_emb], dim=1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        return self.out(x).squeeze(1)

In [11]:
def train_model(df, fetcher, embedding_dim, num_epochs, lr, batch_size=64,):
    # Sort and split
    df_sorted = df.sort_values("date")
    train_df, val_df, test_df = np.split(
        df_sorted, 
        [int(0.7*len(df_sorted)), int(0.85*len(df_sorted))]
    )
    
    X_train_num, X_train_ids, y_train, scaler, numeric_cols = prep_df(train_df)
    X_val_num, X_val_ids, y_val, _, _ = prep_df(val_df, scaler=scaler)
    X_test_num, X_test_ids, y_test, _, _ = prep_df(test_df, scaler=scaler)
    
    # Datasets
    train_dataset = NBAEmbeddingDataset(X_train_num, X_train_ids, y_train, fetcher)
    val_dataset = NBAEmbeddingDataset(X_val_num, X_val_ids, y_val, fetcher)
    test_dataset = NBAEmbeddingDataset(X_test_num, X_test_ids, y_test, fetcher)
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
    # Model
    num_numeric_features = X_train_num.shape[1]
    model = MiniNN(num_numeric_features, embedding_dim)
    
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    
    # Training loop
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        for X_num, home_emb, away_emb, y in train_loader:
            optimizer.zero_grad()
            pred = model(X_num, home_emb, away_emb)
            loss = criterion(pred, y)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * X_num.size(0)
        train_loss /= len(train_loader.dataset)
        
        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for X_num, home_emb, away_emb, y in val_loader:
                pred = model(X_num, home_emb, away_emb)
                val_loss += criterion(pred, y).item() * X_num.size(0)
        val_loss /= len(val_loader.dataset)
        
        if (epoch+1) % 10 == 0:
            print(f"Epoch {epoch+1}/{num_epochs} | Train MSE: {train_loss:.4f} | Val MSE: {val_loss:.4f}")
    
    # Test MSE
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for X_num, home_emb, away_emb, y in test_loader:
            pred = model(X_num, home_emb, away_emb)
            test_loss += criterion(pred, y).item() * X_num.size(0)
    test_loss /= len(test_loader.dataset)
    print(f"Test MSE: {test_loss:.4f}")
    
    return model, scaler, numeric_cols
    

In [12]:
embedding_dim = home_embeddings.shape[1]
model, scaler, numeric_cols = train_model(df, embeddings_fetcher, embedding_dim, num_epochs=270, lr=0.0008991)

  return bound(*args, **kwds)


Epoch 10/270 | Train MSE: 94.7155 | Val MSE: 104.5811
Epoch 20/270 | Train MSE: 52.1111 | Val MSE: 61.6737
Epoch 30/270 | Train MSE: 40.3554 | Val MSE: 49.8350
Epoch 40/270 | Train MSE: 35.5401 | Val MSE: 46.2402
Epoch 50/270 | Train MSE: 33.5514 | Val MSE: 46.2981
Epoch 60/270 | Train MSE: 31.9704 | Val MSE: 45.3142
Epoch 70/270 | Train MSE: 29.5589 | Val MSE: 42.8878
Epoch 80/270 | Train MSE: 27.8731 | Val MSE: 48.1384
Epoch 90/270 | Train MSE: 27.5819 | Val MSE: 41.9689
Epoch 100/270 | Train MSE: 25.8668 | Val MSE: 43.2711
Epoch 110/270 | Train MSE: 26.0287 | Val MSE: 42.3025
Epoch 120/270 | Train MSE: 24.7638 | Val MSE: 40.8628
Epoch 130/270 | Train MSE: 24.7923 | Val MSE: 54.5419
Epoch 140/270 | Train MSE: 23.1098 | Val MSE: 44.1836
Epoch 150/270 | Train MSE: 22.8135 | Val MSE: 43.0347
Epoch 160/270 | Train MSE: 23.1952 | Val MSE: 42.0540
Epoch 170/270 | Train MSE: 22.7479 | Val MSE: 43.4998
Epoch 180/270 | Train MSE: 22.1200 | Val MSE: 43.3121
Epoch 190/270 | Train MSE: 20.6644 |

In [49]:
import sys
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
import matplotlib.pyplot as plt

sys.path.append(os.path.abspath(os.path.join('..', 'src')))
%load_ext autoreload
%autoreload 2

from data.api_fetcher import ApiFetcher
from model.team_embeddings import EmbeddingFetcher
from model.team_embeddings import TeamEmbeddingsModel

api = ApiFetcher(starting_year=2019, ending_year=2025)
df = api.get_dataframe(numeric=False, date=True, time_coeff=False, ids=True)

home_embeddings = trained_model.home_embedding.weight.detach().cpu().numpy()
away_embeddings = trained_model.away_embedding.weight.detach().cpu().numpy()
embeddings_fetcher = EmbeddingFetcher(home_embeddings, away_embeddings)

def prep_df(df1, target_cols=['home_pts', 'away_pts'], scaler=None):
    team_id_cols = ['home_team_id', 'away_team_id']
    exclude_cols = target_cols + team_id_cols + ['date', 'home_team', 'away_team']
    numeric_cols = [col for col in df1.columns if col not in exclude_cols]
    
    X_numeric_raw = df1[numeric_cols].values
    X_team_ids = df1[team_id_cols].astype(int).values
    y = df1[target_cols].values  # Shape: (n_samples, 2)
    
    if scaler is None:
        scaler = StandardScaler()
        X_numeric = scaler.fit_transform(X_numeric_raw)
    else:
        X_numeric = scaler.transform(X_numeric_raw)
    
    return X_numeric, X_team_ids, y, scaler, numeric_cols

# ------------------ SEKWENCYJNY DATASET ------------------
class NBASeqDataset(Dataset):
    def __init__(self, X_numeric, X_team_ids, y, fetcher, seq_len=3):
        self.X_numeric = torch.tensor(X_numeric, dtype=torch.float32)
        self.X_team_ids = torch.tensor(X_team_ids, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.float32)
        self.fetcher = fetcher
        self.seq_len = seq_len

    def __len__(self):
        return len(self.y) - self.seq_len

    def __getitem__(self, idx):
        # Pobranie sekwencji
        seq_numeric = self.X_numeric[idx:idx+self.seq_len].flatten()
        seq_home_emb = torch.cat([
            torch.tensor(self.fetcher.get_home_embedding(int(self.X_team_ids[i,0])), dtype=torch.float32)
            for i in range(idx, idx+self.seq_len)
        ])
        seq_away_emb = torch.cat([
            torch.tensor(self.fetcher.get_away_embedding(int(self.X_team_ids[i,1])), dtype=torch.float32)
            for i in range(idx, idx+self.seq_len)
        ])
        X_seq = torch.cat([seq_numeric, seq_home_emb, seq_away_emb])
        y_target = self.y[idx+self.seq_len]  # przewidujemy wektor w chwili t
        return X_seq, y_target

# ------------------ SEKWENCYJNY NN ------------------
class MiniNNSeq(nn.Module):
    def __init__(self, input_dim):
        super(MiniNNSeq, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 2)
        )

    def forward(self, x):
        return self.model(x)

# ------------------ FUNKCJA TRENINGU ------------------
def train_model_seq(df, fetcher, embedding_dim, num_epochs=250, lr=0.001, batch_size=64, seq_len=3):
    # Sort i split
    df_sorted = df.sort_values("date")
    train_df, val_df, test_df = np.split(
        df_sorted,
        [int(0.7*len(df_sorted)), int(0.85*len(df_sorted))]
    )

    X_train_num, X_train_ids, y_train, scaler, numeric_cols = prep_df(train_df)
    X_val_num, X_val_ids, y_val, _, _ = prep_df(val_df, scaler=scaler)
    X_test_num, X_test_ids, y_test, _, _ = prep_df(test_df, scaler=scaler)

    # Datasets
    train_dataset = NBASeqDataset(X_train_num, X_train_ids, y_train, fetcher, seq_len)
    val_dataset = NBASeqDataset(X_val_num, X_val_ids, y_val, fetcher, seq_len)
    test_dataset = NBASeqDataset(X_test_num, X_test_ids, y_test, fetcher, seq_len)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Model
    num_numeric_features = X_train_num.shape[1]
    input_dim = seq_len * (num_numeric_features + 2*embedding_dim)
    model = MiniNNSeq(input_dim)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Trening
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        for X_seq, y_target in train_loader:
            optimizer.zero_grad()
            pred = model(X_seq)
            loss = criterion(pred, y_target)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * X_seq.size(0)
        train_loss /= len(train_loader.dataset)

        # Walidacja
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for X_seq, y_target in val_loader:
                pred = model(X_seq)
                val_loss += criterion(pred, y_target).item() * X_seq.size(0)
        val_loss /= len(val_loader.dataset)

        if (epoch+1) % 10 == 0:
            print(f"Epoch {epoch+1}/{num_epochs} | Train MSE: {train_loss:.4f} | Val MSE: {val_loss:.4f}")

    # Test
    model.eval()
    test_loss = 0
    home_loss = 0
    away_loss = 0
    total_loss = 0

    with torch.no_grad():
        for X_seq, y_target in test_loader:
            pred = model(X_seq)
            test_loss += criterion(pred, y_target).item() * X_seq.size(0)
            home_pred, away_pred = pred[:,0], pred[:,1]
            home_true, away_true = y_target[:,0], y_target[:,1]
            home_loss += F.mse_loss(home_pred, home_true).item() * X_seq.size(0)
            away_loss += F.mse_loss(away_pred, away_true).item() * X_seq.size(0)
            total_pred = home_pred + away_pred
            total_true = home_true + away_true
            total_loss += F.mse_loss(total_pred, total_true).item() * X_seq.size(0)

    test_size = len(test_loader.dataset)
    test_loss /= test_size
    home_loss /= test_size
    away_loss /= test_size
    total_loss /= test_size

    print(f"Test Results:")
    print(f"  Overall MSE (avg of home/away): {test_loss:.4f}")
    print(f"  Home Points MSE: {home_loss:.4f}")
    print(f"  Away Points MSE: {away_loss:.4f}")
    print(f"  Total Points MSE (sum): {total_loss:.4f}")

    return model, scaler, numeric_cols, {
        'overall_mse': test_loss,
        'home_mse': home_loss,
        'away_mse': away_loss,
        'total_mse': total_loss
    }

# ------------------ RUN ------------------
embedding_dim = home_embeddings.shape[1]
seq_len = 3  # liczba poprzednich wektorów
model, scaler, numeric_cols, results = train_model_seq(
    df, embeddings_fetcher, embedding_dim, 
    num_epochs=250, lr=0.001, seq_len=seq_len
)

# ------------------ Funkcja do predykcji ------------------
def get_predictions_seq(model, df, fetcher, scaler, numeric_cols, seq_len=3):
    X_numeric, X_team_ids, _, _, _ = prep_df(df, scaler=scaler)
    dataset = NBASeqDataset(X_numeric, X_team_ids, np.zeros((len(df),2)), fetcher, seq_len)

    model.eval()
    predictions = []

    with torch.no_grad():
        for X_seq, _ in DataLoader(dataset, batch_size=64, shuffle=False):
            pred = model(X_seq)
            predictions.append(pred.numpy())

    predictions = np.vstack(predictions)
    home_preds = predictions[:,0]
    away_preds = predictions[:,1]
    total_preds = home_preds + away_preds

    return home_preds, away_preds, total_preds


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Epoch 10/250 | Train MSE: 201.0823 | Val MSE: 223.0521
Epoch 20/250 | Train MSE: 170.4271 | Val MSE: 204.7904
Epoch 30/250 | Train MSE: 144.9442 | Val MSE: 194.1804
Epoch 40/250 | Train MSE: 127.8007 | Val MSE: 193.4398
Epoch 50/250 | Train MSE: 114.6544 | Val MSE: 194.5228
Epoch 60/250 | Train MSE: 102.3096 | Val MSE: 199.7999
Epoch 70/250 | Train MSE: 94.2005 | Val MSE: 207.7082
Epoch 80/250 | Train MSE: 86.6635 | Val MSE: 210.0368
Epoch 90/250 | Train MSE: 81.5413 | Val MSE: 217.1168
Epoch 100/250 | Train MSE: 77.3566 | Val MSE: 223.1875
Epoch 110/250 | Train MSE: 73.4958 | Val MSE: 225.7424
Epoch 120/250 | Train MSE: 69.1060 | Val MSE: 230.6674
Epoch 130/250 | Train MSE: 65.8082 | Val MSE: 239.0515
Epoch 140/250 | Train MSE: 63.3244 | Val MSE: 240.6530
Epoch 150/250 | Train MSE: 60.0522 | Val MSE: 243.6011
Epoch 160/250 | Train MSE: 57.6055 | Val MSE: 250.0422
Epoch 170/250 | Train MSE: 55.0881 

In [46]:
print(df.columns)

Index(['home_fga', 'away_fga', 'home_fg_pct', 'away_fg_pct', 'home_fg3a',
       'away_fg3a', 'home_fg3_pct', 'away_fg3_pct', 'home_oreb', 'away_oreb',
       'home_dreb', 'away_dreb', 'home_ast', 'away_ast', 'home_stl',
       'away_stl', 'home_blk', 'away_blk', 'home_tov', 'away_tov', 'home_pf',
       'away_pf', 'home_pts', 'away_pts', 'home_team', 'away_team', 'date',
       'home_team_id', 'away_team_id'],
      dtype='object')


In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader

def create_sequences(df, sequence_length=5, target_cols=['home_pts', 'away_pts']):
    """
    Create sequences where we use the last N games to predict the next game.
    This eliminates data leakage by only using historical data.
    """
    # Sort by date first
    df_sorted = df.sort_values('date').reset_index(drop=True)
    
    # Separate teams to create sequences per team
    sequences = []
    targets = []
    team_info = []
    
    # Get unique teams
    all_teams = pd.concat([df_sorted['home_team_id'], df_sorted['away_team_id']]).unique()
    
    for team_id in all_teams:
        # Get all games for this team (both home and away)
        team_games = df_sorted[
            (df_sorted['home_team_id'] == team_id) | 
            (df_sorted['away_team_id'] == team_id)
        ].copy()
        
        if len(team_games) < sequence_length + 1:
            continue
            
        # For each game, create features based on whether team was home or away
        team_features = []
        team_targets = []
        
        for _, game in team_games.iterrows():
            if game['home_team_id'] == team_id:
                # Team was playing at home
                features = [
                    game['home_fga'], game['home_fg_pct'], game['home_fg3a'], 
                    game['home_fg3_pct'], game['home_oreb'], game['home_dreb'],
                    game['home_ast'], game['home_stl'], game['home_blk'], 
                    game['home_tov'], game['home_pf'],
                    # Opponent features
                    game['away_fga'], game['away_fg_pct'], game['away_fg3a'],
                    game['away_fg3_pct'], game['away_oreb'], game['away_dreb'],
                    game['away_ast'], game['away_stl'], game['away_blk'],
                    game['away_tov'], game['away_pf'],
                    1.0  # home indicator
                ]
                target = game['home_pts']
            else:
                # Team was playing away
                features = [
                    game['away_fga'], game['away_fg_pct'], game['away_fg3a'],
                    game['away_fg3_pct'], game['away_oreb'], game['away_dreb'],
                    game['away_ast'], game['away_stl'], game['away_blk'],
                    game['away_tov'], game['away_pf'],
                    # Opponent features  
                    game['home_fga'], game['home_fg_pct'], game['home_fg3a'],
                    game['home_fg3_pct'], game['home_oreb'], game['home_dreb'],
                    game['home_ast'], game['home_stl'], game['home_blk'],
                    game['home_tov'], game['home_pf'],
                    0.0  # away indicator
                ]
                target = game['away_pts']
            
            team_features.append(features)
            team_targets.append(target)
        
        # Create sequences for this team
        for i in range(len(team_features) - sequence_length):
            # Use games i to i+sequence_length-1 to predict game i+sequence_length
            seq_features = team_features[i:i+sequence_length]
            seq_target = team_targets[i+sequence_length]
            
            sequences.append(seq_features)
            targets.append(seq_target)
            team_info.append({
                'team_id': team_id,
                'game_index': i+sequence_length,
                'date': team_games.iloc[i+sequence_length]['date']
            })
    
    return np.array(sequences), np.array(targets), team_info

class SequentialNBADataset(Dataset):
    def __init__(self, sequences, targets):
        self.sequences = torch.tensor(sequences, dtype=torch.float32)
        self.targets = torch.tensor(targets, dtype=torch.float32)
    
    def __len__(self):
        return len(self.targets)
    
    def __getitem__(self, idx):
        return self.sequences[idx], self.targets[idx]

class TrueSequentialNBAModel(nn.Module):
    def __init__(self, input_features, hidden_size=64, num_layers=2, dropout=0.2):
        super(TrueSequentialNBAModel, self).__init__()
        
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        # LSTM to process the sequence of games
        self.lstm = nn.LSTM(
            input_size=input_features,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )
        
        # Output layers
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, hidden_size // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size // 2, 1)
        )
    
    def forward(self, x):
        # x shape: (batch_size, sequence_length, input_features)
        lstm_out, (hidden, cell) = self.lstm(x)
        
        # Use the last output from the sequence
        last_output = lstm_out[:, -1, :]  # Shape: (batch_size, hidden_size)
        
        # Predict the score
        output = self.fc(last_output)
        return output.squeeze(-1)  # Shape: (batch_size,)

def train_sequential_model(df, sequence_length=5, num_epochs=100, lr=0.001, batch_size=32):
    """
    Train the sequential model without data leakage
    """
    # Create sequences
    print("Creating sequences...")
    sequences, targets, team_info = create_sequences(df, sequence_length)
    
    print(f"Created {len(sequences)} sequences")
    print(f"Sequence shape: {sequences.shape}")
    print(f"Target shape: {targets.shape}")
    
    # Split data temporally (important for time series)
    # Use first 70% for training, next 15% for validation, last 15% for testing
    n_samples = len(sequences)
    train_idx = int(0.7 * n_samples)
    val_idx = int(0.85 * n_samples)
    
    train_sequences = sequences[:train_idx]
    train_targets = targets[:train_idx]
    
    val_sequences = sequences[train_idx:val_idx]
    val_targets = targets[train_idx:val_idx]
    
    test_sequences = sequences[val_idx:]
    test_targets = targets[val_idx:]
    
    # Create datasets and loaders
    train_dataset = SequentialNBADataset(train_sequences, train_targets)
    val_dataset = SequentialNBADataset(val_sequences, val_targets)
    test_dataset = SequentialNBADataset(test_sequences, test_targets)
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
    # Initialize model
    input_features = sequences.shape[2]  # Number of features per game
    model = TrueSequentialNBAModel(input_features)
    
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    
    # Training loop
    train_losses = []
    val_losses = []
    
    for epoch in range(num_epochs):
        # Training
        model.train()
        train_loss = 0
        for sequences_batch, targets_batch in train_loader:
            optimizer.zero_grad()
            predictions = model(sequences_batch)
            loss = criterion(predictions, targets_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * sequences_batch.size(0)
        
        train_loss /= len(train_dataset)
        train_losses.append(train_loss)
        
        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for sequences_batch, targets_batch in val_loader:
                predictions = model(sequences_batch)
                val_loss += criterion(predictions, targets_batch).item() * sequences_batch.size(0)
        
        val_loss /= len(val_dataset)
        val_losses.append(val_loss)
        
        if (epoch + 1) % 20 == 0:
            print(f"Epoch {epoch+1}/{num_epochs} | Train MSE: {train_loss:.4f} | Val MSE: {val_loss:.4f}")
    
    # Test evaluation
    model.eval()
    test_loss = 0
    all_predictions = []
    all_targets = []
    
    with torch.no_grad():
        for sequences_batch, targets_batch in test_loader:
            predictions = model(sequences_batch)
            test_loss += criterion(predictions, targets_batch).item() * sequences_batch.size(0)
            all_predictions.append(predictions.numpy())
            all_targets.append(targets_batch.numpy())
    
    test_loss /= len(test_dataset)
    all_predictions = np.concatenate(all_predictions)
    all_targets = np.concatenate(all_targets)
    
    # Calculate additional metrics
    mae = np.mean(np.abs(all_predictions - all_targets))
    
    print(f"\nTest Results:")
    print(f"  Test MSE: {test_loss:.4f}")
    print(f"  Test MAE: {mae:.4f}")
    print(f"  Test RMSE: {np.sqrt(test_loss):.4f}")
    
    return model, train_losses, val_losses, test_loss

# Usage example:
model, train_losses, val_losses, test_loss = train_sequential_model(df, sequence_length=5)

Creating sequences...
Created 13956 sequences
Sequence shape: (13956, 5, 23)
Target shape: (13956,)
Epoch 20/100 | Train MSE: 410.0315 | Val MSE: 158.8855
Epoch 40/100 | Train MSE: 389.5156 | Val MSE: 153.7588
Epoch 60/100 | Train MSE: 390.7374 | Val MSE: 160.3071
Epoch 80/100 | Train MSE: 376.3036 | Val MSE: 158.5018
Epoch 100/100 | Train MSE: 357.7414 | Val MSE: 156.8236

Test Results:
  Test MSE: 175.7238
  Test MAE: 10.5372
  Test RMSE: 13.2561


In [48]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
import numpy as np

# ---- Przygotowanie sekwencyjnego datasetu ----
class NBASeqDataset(Dataset):
    def __init__(self, X_numeric, X_team_ids, y, fetcher, seq_len=3):
        """
        X_numeric: (num_samples, num_features)
        X_team_ids: (num_samples, 2) -> home_id, away_id
        y: (num_samples, 2) -> home_pts, away_pts
        fetcher: EmbeddingFetcher
        seq_len: ile poprzednich wektorów użyć
        """
        self.X_numeric = torch.tensor(X_numeric, dtype=torch.float32)
        self.X_team_ids = torch.tensor(X_team_ids, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.float32)
        self.fetcher = fetcher
        self.seq_len = seq_len

    def __len__(self):
        return len(self.y) - self.seq_len  # bo używamy poprzednich seq_len wektorów

    def __getitem__(self, idx):
        # pobierz sekwencję
        seq_numeric = self.X_numeric[idx:idx+self.seq_len].flatten()  # spłaszczamy wszystkie wektory
        seq_home_emb = torch.cat([
            torch.tensor(self.fetcher.get_home_embedding(int(self.X_team_ids[i,0])), dtype=torch.float32)
            for i in range(idx, idx+self.seq_len)
        ])
        seq_away_emb = torch.cat([
            torch.tensor(self.fetcher.get_away_embedding(int(self.X_team_ids[i,1])), dtype=torch.float32)
            for i in range(idx, idx+self.seq_len)
        ])
        X_seq = torch.cat([seq_numeric, seq_home_emb, seq_away_emb])
        y_target = self.y[idx+self.seq_len]  # przewidujemy wektor w momencie t
        return X_seq, y_target

# ---- Prosty NN dla sekwencji ----
class MiniNNSeq(nn.Module):
    def __init__(self, input_dim):
        super(MiniNNSeq, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 2)  # przewidujemy home i away points
        )

    def forward(self, x):
        return self.model(x)

# ---- Funkcja do trenowania ----
def train_seq_model(X_numeric, X_team_ids, y, fetcher, seq_len=3, num_epochs=100, lr=0.001, batch_size=64):
    dataset = NBASeqDataset(X_numeric, X_team_ids, y, fetcher, seq_len)
    train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    
    # input_dim = seq_len * (num_numeric_features + 2*embedding_dim)
    num_numeric = X_numeric.shape[1]
    embedding_dim = fetcher.home_embeddings.shape[1]
    input_dim = seq_len * (num_numeric + 2*embedding_dim)
    
    model = MiniNNSeq(input_dim)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()
    
    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0
        for X_seq, y_target in train_loader:
            optimizer.zero_grad()
            pred = model(X_seq)
            loss = criterion(pred, y_target)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item() * X_seq.size(0)
        epoch_loss /= len(dataset)
        if (epoch+1) % 10 == 0:
            print(f"Epoch {epoch+1}/{num_epochs} | Train MSE: {epoch_loss:.4f}")
    return model

# ---- Użycie ----
# Zakładam, że masz przygotowane:
# X_numeric, X_team_ids, y, fetcher
seq_len = 3
model = train_seq_model(X_numeric, X_team_ids, y, embeddings_fetcher, seq_len=seq_len)


NameError: name 'X_numeric' is not defined

In [50]:
import sys
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn

def create_pregame_features_per_game(df):
    """
    Create pre-game features for each game that would be available before tipoff
    """
    df_sorted = df.sort_values("date").reset_index(drop=True)
    
    features_list = []
    targets_list = []
    valid_indices = []
    
    for i in range(len(df_sorted)):
        current_game = df_sorted.iloc[i]
        home_team = current_game['home_team_id']
        away_team = current_game['away_team_id']
        
        # Get historical data (games before this one)
        historical = df_sorted.iloc[:i]
        
        if len(historical) < 10:  # Need minimum historical data
            continue
        
        # Calculate pre-game features
        home_history = historical[
            (historical['home_team_id'] == home_team) | 
            (historical['away_team_id'] == home_team)
        ]
        away_history = historical[
            (historical['home_team_id'] == away_team) | 
            (historical['away_team_id'] == away_team)
        ]
        
        if len(home_history) < 3 or len(away_history) < 3:
            continue
        
        # Home team stats (from completed games)
        home_total_pts = 0
        home_total_allowed = 0
        home_wins = 0
        home_games = len(home_history)
        home_home_wins = 0
        home_home_games = 0
        
        for _, game in home_history.iterrows():
            if game['home_team_id'] == home_team:  # Was home
                home_total_pts += game['home_pts']
                home_total_allowed += game['away_pts']
                home_home_games += 1
                if game['home_pts'] > game['away_pts']:
                    home_wins += 1
                    home_home_wins += 1
            else:  # Was away
                home_total_pts += game['away_pts']
                home_total_allowed += game['home_pts']
                if game['away_pts'] > game['home_pts']:
                    home_wins += 1
        
        # Away team stats
        away_total_pts = 0
        away_total_allowed = 0
        away_wins = 0
        away_games = len(away_history)
        away_away_wins = 0
        away_away_games = 0
        
        for _, game in away_history.iterrows():
            if game['home_team_id'] == away_team:  # Was home
                away_total_pts += game['home_pts']
                away_total_allowed += game['away_pts']
                if game['home_pts'] > game['away_pts']:
                    away_wins += 1
            else:  # Was away
                away_total_pts += game['away_pts']
                away_total_allowed += game['home_pts']
                away_away_games += 1
                if game['away_pts'] > game['home_pts']:
                    away_wins += 1
                    away_away_wins += 1
        
        # Head to head
        h2h = historical[
            ((historical['home_team_id'] == home_team) & (historical['away_team_id'] == away_team)) |
            ((historical['home_team_id'] == away_team) & (historical['away_team_id'] == home_team))
        ]
        
        h2h_games = len(h2h)
        h2h_home_wins = 0
        if h2h_games > 0:
            for _, game in h2h.iterrows():
                if ((game['home_team_id'] == home_team) and (game['home_pts'] > game['away_pts'])) or \
                   ((game['away_team_id'] == home_team) and (game['away_pts'] > game['home_pts'])):
                    h2h_home_wins += 1
        
        # Recent form (last 3 games)
        home_recent = home_history.tail(3)
        away_recent = away_history.tail(3)
        
        home_recent_wins = 0
        home_recent_pts = 0
        for _, game in home_recent.iterrows():
            if game['home_team_id'] == home_team:
                home_recent_pts += game['home_pts']
                if game['home_pts'] > game['away_pts']:
                    home_recent_wins += 1
            else:
                home_recent_pts += game['away_pts']
                if game['away_pts'] > game['home_pts']:
                    home_recent_wins += 1
        
        away_recent_wins = 0
        away_recent_pts = 0
        for _, game in away_recent.iterrows():
            if game['home_team_id'] == away_team:
                away_recent_pts += game['home_pts']
                if game['home_pts'] > game['away_pts']:
                    away_recent_wins += 1
            else:
                away_recent_pts += game['away_pts']
                if game['away_pts'] > game['home_pts']:
                    away_recent_wins += 1
        
        # Compile features
        game_features = [
            # Home team strength
            home_wins / home_games,  # Overall win rate
            home_total_pts / home_games,  # PPG
            home_total_allowed / home_games,  # Opponent PPG
            home_home_wins / home_home_games if home_home_games > 0 else 0.5,  # Home win rate
            home_games,  # Experience
            
            # Away team strength
            away_wins / away_games,  # Overall win rate
            away_total_pts / away_games,  # PPG
            away_total_allowed / away_games,  # Opponent PPG
            away_away_wins / away_away_games if away_away_games > 0 else 0.5,  # Away win rate
            away_games,  # Experience
            
            # Head to head
            h2h_games,
            h2h_home_wins / h2h_games if h2h_games > 0 else 0.5,
            
            # Recent form
            home_recent_wins / len(home_recent) if len(home_recent) > 0 else 0.5,
            home_recent_pts / len(home_recent) if len(home_recent) > 0 else 110,
            away_recent_wins / len(away_recent) if len(away_recent) > 0 else 0.5,
            away_recent_pts / len(away_recent) if len(away_recent) > 0 else 110,
            
            # Context
            1.0,  # Home court advantage
        ]
        
        features_list.append(game_features)
        targets_list.append([current_game['home_pts'], current_game['away_pts']])
        valid_indices.append(i)
    
    return np.array(features_list), np.array(targets_list), valid_indices

def prep_pregame_df(df, target_cols=['home_pts', 'away_pts'], scaler=None):
    """
    Prepare dataframe with pre-game features instead of box score stats
    """
    X_features, y, valid_indices = create_pregame_features_per_game(df)
    
    # Get team IDs for the valid games
    df_valid = df.iloc[valid_indices]
    X_team_ids = df_valid[['home_team_id', 'away_team_id']].astype(int).values
    
    if scaler is None:
        scaler = StandardScaler()
        X_numeric = scaler.fit_transform(X_features)
    else:
        X_numeric = scaler.transform(X_features)
    
    feature_names = [
        'home_win_rate', 'home_ppg', 'home_opp_ppg', 'home_home_win_rate', 'home_games',
        'away_win_rate', 'away_ppg', 'away_opp_ppg', 'away_away_win_rate', 'away_games',
        'h2h_games', 'h2h_home_win_rate',
        'home_recent_win_rate', 'home_recent_ppg', 'away_recent_win_rate', 'away_recent_ppg',
        'home_court_advantage'
    ]
    
    return X_numeric, X_team_ids, y, scaler, feature_names

# Modified dataset class
class NBASeqDatasetFixed(Dataset):
    def __init__(self, X_numeric, X_team_ids, y, fetcher, seq_len=3):
        self.X_numeric = torch.tensor(X_numeric, dtype=torch.float32)
        self.X_team_ids = torch.tensor(X_team_ids, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.float32)
        self.fetcher = fetcher
        self.seq_len = seq_len

    def __len__(self):
        return max(0, len(self.y) - self.seq_len)

    def __getitem__(self, idx):
        # Create sequence of pre-game features
        seq_numeric = self.X_numeric[idx:idx+self.seq_len].flatten()
        
        # Get team embeddings for sequence
        seq_home_emb = torch.cat([
            torch.tensor(self.fetcher.get_home_embedding(int(self.X_team_ids[i,0])), dtype=torch.float32)
            for i in range(idx, idx+self.seq_len)
        ])
        seq_away_emb = torch.cat([
            torch.tensor(self.fetcher.get_away_embedding(int(self.X_team_ids[i,1])), dtype=torch.float32)
            for i in range(idx, idx+self.seq_len)
        ])
        
        # Combine all features
        X_seq = torch.cat([seq_numeric, seq_home_emb, seq_away_emb])
        y_target = self.y[idx+self.seq_len]  # Predict the next game
        
        return X_seq, y_target

# Same model class
class MiniNNSeq(nn.Module):
    def __init__(self, input_dim):
        super(MiniNNSeq, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 2)
        )

    def forward(self, x):
        return self.model(x)

# Modified training function
def train_model_seq_fixed(df, fetcher, embedding_dim, num_epochs=250, lr=0.001, batch_size=64, seq_len=3):
    print("Creating pre-game features...")
    
    # Sort by date
    df_sorted = df.sort_values("date")
    
    # Create pre-game features for all data first
    X_all, X_team_ids_all, y_all, scaler, feature_names = prep_pregame_df(df_sorted)
    
    print(f"Created {len(X_all)} samples with {len(feature_names)} pre-game features each")
    
    if len(X_all) < 100:
        print("Not enough samples for training!")
        return None, None, None, None
    
    # Split data temporally
    train_idx = int(0.7 * len(X_all))
    val_idx = int(0.85 * len(X_all))
    
    X_train_num = X_all[:train_idx]
    X_train_ids = X_team_ids_all[:train_idx]
    y_train = y_all[:train_idx]
    
    X_val_num = X_all[train_idx:val_idx]
    X_val_ids = X_team_ids_all[train_idx:val_idx]
    y_val = y_all[train_idx:val_idx]
    
    X_test_num = X_all[val_idx:]
    X_test_ids = X_team_ids_all[val_idx:]
    y_test = y_all[val_idx:]

    # Create datasets
    train_dataset = NBASeqDatasetFixed(X_train_num, X_train_ids, y_train, fetcher, seq_len)
    val_dataset = NBASeqDatasetFixed(X_val_num, X_val_ids, y_val, fetcher, seq_len)
    test_dataset = NBASeqDatasetFixed(X_test_num, X_test_ids, y_test, fetcher, seq_len)

    if len(train_dataset) < 10:
        print("Not enough sequential samples for training!")
        return None, None, None, None

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Model
    num_features = X_train_num.shape[1]
    input_dim = seq_len * (num_features + 2*embedding_dim)
    model = MiniNNSeq(input_dim)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Training
    print("Starting training...")
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        for X_seq, y_target in train_loader:
            optimizer.zero_grad()
            pred = model(X_seq)
            loss = criterion(pred, y_target)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * X_seq.size(0)
        train_loss /= len(train_loader.dataset)

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for X_seq, y_target in val_loader:
                pred = model(X_seq)
                val_loss += criterion(pred, y_target).item() * X_seq.size(0)
        val_loss /= len(val_loader.dataset)

        if (epoch+1) % 25 == 0:
            print(f"Epoch {epoch+1}/{num_epochs} | Train MSE: {train_loss:.4f} | Val MSE: {val_loss:.4f}")

    # Test evaluation
    print("Evaluating on test set...")
    model.eval()
    test_loss = 0
    home_loss = 0
    away_loss = 0
    total_loss = 0
    all_preds = []
    all_targets = []

    with torch.no_grad():
        for X_seq, y_target in test_loader:
            pred = model(X_seq)
            test_loss += criterion(pred, y_target).item() * X_seq.size(0)
            
            home_pred, away_pred = pred[:,0], pred[:,1]
            home_true, away_true = y_target[:,0], y_target[:,1]
            
            home_loss += F.mse_loss(home_pred, home_true).item() * X_seq.size(0)
            away_loss += F.mse_loss(away_pred, away_true).item() * X_seq.size(0)
            
            total_pred = home_pred + away_pred
            total_true = home_true + away_true
            total_loss += F.mse_loss(total_pred, total_true).item() * X_seq.size(0)
            
            all_preds.append(pred.numpy())
            all_targets.append(y_target.numpy())

    test_size = len(test_loader.dataset)
    test_loss /= test_size
    home_loss /= test_size
    away_loss /= test_size
    total_loss /= test_size
    
    # Calculate MAE
    all_preds = np.vstack(all_preds)
    all_targets = np.vstack(all_targets)
    mae = np.mean(np.abs(all_preds - all_targets))

    print(f"\nTest Results:")
    print(f"  Overall MSE: {test_loss:.4f}")
    print(f"  Home Points MSE: {home_loss:.4f}")
    print(f"  Away Points MSE: {away_loss:.4f}")
    print(f"  Total Points MSE: {total_loss:.4f}")
    print(f"  Overall MAE: {mae:.4f}")
    print(f"  RMSE: {np.sqrt(test_loss):.4f}")

    return model, scaler, feature_names, {
        'overall_mse': test_loss,
        'home_mse': home_loss,
        'away_mse': away_loss,
        'total_mse': total_loss,
        'mae': mae
    }

# Usage:
model, scaler, feature_names, results = train_model_seq_fixed(
     df, embeddings_fetcher, embedding_dim, 
     num_epochs=250, lr=0.001, seq_len=3
 )

Creating pre-game features...
Created 7005 samples with 17 pre-game features each
Starting training...
Epoch 25/250 | Train MSE: 224.6755 | Val MSE: 194.2977
Epoch 50/250 | Train MSE: 208.5151 | Val MSE: 200.5405
Epoch 75/250 | Train MSE: 199.0594 | Val MSE: 226.9470
Epoch 100/250 | Train MSE: 193.4310 | Val MSE: 226.8276


KeyboardInterrupt: 