In [None]:
!pip install tensorflow



In [None]:
import logging
import numpy as np
import pandas as pd
import os
import pickle
from sklearn.metrics import mean_squared_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Flatten, Concatenate, Dropout



# Constants

In [None]:
rolling_window = 10

dfs_cats = ['reb', 'pts', 'ast', 'stl', 'blk', 'tov']
same_game_cols = ['minutes_played', 'fgm', 'fga', 'fg_pct', 'fg3m', 'fg3a', 'fg3_pct', 'ftm', 'fta', 'ft_pct', 'oreb',
                   'dreb', 'reb', 'ast', 'tov', 'stl', 'blk', 'blka', 'pf', 'pfd', 'pts', 'plus_minus',
                   'nba_fantasy_pts', 'dd2', 'td3', 'wnba_fantasy_pts', 'available_flag', 'e_off_rating', 'off_rating',
                   'sp_work_off_rating', 'e_def_rating', 'def_rating', 'sp_work_def_rating', 'e_net_rating',
                   'net_rating', 'sp_work_net_rating', 'ast_pct', 'ast_to', 'ast_ratio', 'oreb_pct', 'dreb_pct',
                   'reb_pct', 'tm_tov_pct', 'e_tov_pct', 'efg_pct', 'ts_pct', 'usg_pct_x', 'e_usg_pct', 'e_pace',
                   'pace', 'pace_per40', 'sp_work_pace', 'pie', 'poss', 'fgm_pg', 'fga_pg', 'pct_fga_2pt',
                   'pct_fga_3pt', 'pct_pts_2pt', 'pct_pts_2pt_mr', 'pct_pts_3pt', 'pct_pts_fb', 'pct_pts_ft',
                   'pct_pts_off_tov', 'pct_pts_paint', 'pct_ast_2pm', 'pct_uast_2pm', 'pct_ast_3pm', 'pct_uast_3pm',
                   'pct_ast_fgm', 'pct_uast_fgm', 'pct_fgm', 'pct_fga', 'pct_fg3m', 'pct_fg3a', 'pct_ftm', 'pct_fta',
                   'pct_oreb', 'pct_dreb', 'pct_reb', 'pct_ast', 'pct_tov', 'pct_stl', 'pct_blk', 'pct_blka', 'pct_pf',
                   'pct_pfd', 'pct_pts', 'usage_rate', 'fp_draftkings', 'fp_fanduel',
                   'fp_yahoo']


In [None]:
def assign_league_weeks(df):
    df['week'] = df['game_date'].dt.isocalendar().week
    df['season_start'] = df.groupby('season_year')['game_date'].transform('min')
    df['season_week'] = ((df['game_date'] - df['season_start']).dt.days // 7) + 1
    df = df.drop(columns=['season_start', 'week'])
    df = df.rename(columns={'season_week': 'league_week'})
    return df

In [None]:
def create_sequences(X, y, time_steps):
    Xs, ys = [], []
    for i in range(len(X) - time_steps + 1):
        Xs.append(X.iloc[i:(i + time_steps)].values)
        ys.append(y.iloc[i + time_steps - 1])  # Adjusted this line
    return np.array(Xs), np.array(ys)


In [None]:
def build_rnn_model(input_shape):
    model = Sequential()
    model.add(LSTM(50, return_sequences=True, input_shape=input_shape))
    model.add(LSTM(50, return_sequences=False))
    model.add(Dense(25))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model


In [None]:
def rolling_train_test_rnn(X, y, df, num_weeks_for_training=4, time_steps=4, save_model=False, model_dir='models'):
    os.makedirs(model_dir, exist_ok=True)

    # Initialize lists to store predictions and true values
    all_predictions = []
    all_true_values = []
    all_game_ids = []
    all_game_dates = []
    all_player_ids = []
    all_fanduel_salaries = []
    all_draftkings_salaries = []
    all_yahoo_salaries = []
    all_fanduel_positions = []
    all_draftkings_positions = []
    all_yahoo_positions = []

    scaler = MinMaxScaler(feature_range=(0, 1))
    unique_weeks = df['league_week'].unique()

    for current_week in unique_weeks:
        start_week = current_week - num_weeks_for_training
        training_weeks = list(range(start_week, current_week))

        # Select training data (previous 4 weeks)
        X_train = X[X['league_week'].isin(training_weeks)]
        y_train = y.loc[X_train.index]

        # Select test data (current week)
        X_test = X[X['league_week'] == current_week]
        y_test = y.loc[X_test.index]

        if X_train.empty or X_test.empty:
            continue

        identifying_test_data = X_test[['player_name', 'game_date', 'game_id']]
        X_train = X_train.drop(columns=['game_date', 'game_id'])
        X_test = X_test.drop(columns=['game_date', 'game_id'])

        # Scale data
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Create sequences for training
        X_train_seq, y_train_seq = create_sequences(X_train_scaled, y_train, time_steps)
        X_test_seq, y_test_seq = create_sequences(X_test_scaled, y_test, time_steps)

        # Build and train the RNN model
        model = build_rnn_model((X_train_seq.shape[1], X_train_seq.shape[2]))
        model.fit(X_train_seq, y_train_seq, epochs=10, batch_size=32, verbose=0)

        # Make predictions
        y_pred = model.predict(X_test_seq)

        # Store the predictions and true values
        all_predictions.extend(y_pred.flatten())
        all_true_values.extend(y_test_seq)
        all_game_ids.extend(list(identifying_test_data['game_id'])[time_steps:])
        all_game_dates.extend(list(identifying_test_data['game_date'])[time_steps:])
        all_player_ids.extend(list(identifying_test_data['player_name'])[time_steps:])
        all_fanduel_salaries.extend(X_test['salary-draftkings'].values[time_steps:])
        all_draftkings_salaries.extend(X_test['salary-draftkings'].values[time_steps:])
        all_yahoo_salaries.extend(X_test['salary-yahoo'].values[time_steps:])
        all_fanduel_positions.extend(X_test['pos-draftkings'].values[time_steps:])
        all_draftkings_positions.extend(X_test['pos-draftkings'].values[time_steps:])
        all_yahoo_positions.extend(X_test['pos-yahoo'].values[time_steps:])

        if save_model:
            model_filename = f'{model_dir}/rnn_model_week_{current_week}_trained_on_{start_week}_to_{current_week - 1}.h5'
            model.save(model_filename)

        mse = mean_squared_error(y_test_seq, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test_seq, y_pred)

        print(f'Training weeks: {training_weeks}')
        print(f'Test week: {current_week}')
        print(f'Mean Squared Error (MSE): {mse:.2f}')
        print(f'Root Mean Squared Error (RMSE): {rmse:.2f}')
        print(f'R-squared (R²): {r2:.2f}')
        print('')

    results_df = pd.DataFrame({
        'player_name': all_player_ids,
        'game_id': all_game_ids,
        'game_date': all_game_dates,
        'y': all_true_values,
        'y_pred': all_predictions,
        'fanduel_salary': all_fanduel_salaries,
        'draftkings_salary': all_draftkings_salaries,
        'yahoo_salary': all_yahoo_salaries,
        'fanduel_position': all_fanduel_positions,
        'draftkings_position': all_draftkings_positions,
        'yahoo_position': all_yahoo_positions,
    })

    return results_df


In [None]:
def predict_fp_rnn(df, rolling_window=rolling_window):
    df = df.drop('Unnamed: 0', axis=1)
    df['game_date'] = pd.to_datetime(df['game_date'])

    cat_cols = ['team_abbreviation', 'player_name', 'opponent', 'pos-draftkings', 'pos-fanduel', 'pos-yahoo']
    df[cat_cols] = df[cat_cols].astype('category')

    df = df.sort_values(['game_date'], ascending=True)
    df = assign_league_weeks(df)
    df = clean_numeric_columns(df, same_game_cols)
    df = add_time_dependent_features(df, rolling_window=rolling_window)

    all_seasons_results = []

    for season in df['season_year'].unique():
        season_df = df[df['season_year'] == season]
        season_df = season_df.drop('season_year', axis=1)
        season_results = pd.DataFrame()

        for cat in dfs_cats:
            target = cat
            target_related_cols = same_game_cols
            features = season_df.columns.difference(target_related_cols).tolist()

            X = season_df[features]
            y = season_df[target]

            print(f'Training RNN models for {cat}')
            print('---------------------------------')
            cat_results = rolling_train_test_rnn(X=X, y=y, df=season_df)
            cat_results.rename(columns={'y': cat, 'y_pred': f'{cat}_pred'}, inplace=True)
            if len(season_results) == 0:
                season_results = cat_results
            else:
                season_results = pd.merge(
                    season_results,
                    cat_results,
                    on=['player_name', 'game_date', 'game_id', 'fanduel_salary', 'draftkings_salary', 'yahoo_salary', 'draftkings_position', 'fanduel_position', 'yahoo_position'],
                    suffixes=('', f'_{season_df.columns.name}'))
            cat_results.to_csv(f'output_csv/{cat}_{season}_rnn_results.csv', index=False)

        all_seasons_results.append(season_results)

    combined_df = pd.concat(all_seasons_results, ignore_index=True)
    combined_df['fp_fanduel'] = combined_df.apply(lambda row: calculate_fp_fanduel(row), axis=1)
    combined_df['fp_fanduel_pred'] = combined_df.apply(lambda row: calculate_fp_fanduel(row, pred_mode=True), axis=1)

    combined_df['fp_yahoo'] = combined_df.apply(calculate_fp_yahoo, axis=1)
    combined_df['fp_yahoo_pred'] = combined_df.apply(lambda row: calculate_fp_yahoo(row, pred_mode=True), axis=1)

    combined_df['fp_draftkings'] = combined_df.apply(calculate_fp_draftkings, axis=1)
    combined_df['fp_draftkings_pred'] = combined_df.apply(lambda row: calculate_fp_draftkings(row, pred_mode=True),
                                                          axis=1)
    return combined_df



In [None]:
def calculate_fp_fanduel(row, pred_mode=False):
    pred = '_pred' if pred_mode else ''
    return (row[f'pts{pred}'] +
            row[f'reb{pred}'] * 1.2 +
            row[f'ast{pred}'] * 1.5 +
            row[f'stl{pred}'] * 3 +
            row[f'blk{pred}'] * 3 -
            row[f'tov{pred}'] * 1)


def calculate_fp_yahoo(row, pred_mode=False):
    pred = '_pred' if pred_mode else ''
    return (row[f'pts{pred}'] +
            row[f'reb{pred}'] * 1.2 +
            row[f'ast{pred}'] * 1.5 +
            row[f'stl{pred}'] * 3 +
            row[f'blk{pred}'] * 3 -
            row[f'tov{pred}'] * 1)


def calculate_fp_draftkings(row, pred_mode=False):
    pred = '_pred' if pred_mode else ''
    fp = (row[f'pts{pred}'] +
          row[f'reb{pred}'] * 1.25 +
          row[f'ast{pred}'] * 1.5 +
          row[f'stl{pred}'] * 2 +
          row[f'blk{pred}'] * 2 -
          row[f'tov{pred}'] * 0.5)

    # Calculate Double-Double and Triple-Double bonuses
    stats = [row[f'pts{pred}'], row[f'reb{pred}'], row[f'ast{pred}'], row[f'stl{pred}'], row[f'blk{pred}']]
    double_double = sum([1 for stat in stats if stat >= 10]) >= 2
    triple_double = sum([1 for stat in stats if stat >= 10]) >= 3

    if double_double:
        fp += 1.5
    if triple_double:
        fp += 3

    return fp

def clean_numeric_columns(df, columns):
    """
    Convert columns to numeric, forcing errors to NaN, and handle specific non-numeric values.
    """
    for col in columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert non-numeric to NaN
    return df

def add_time_dependent_features(df, rolling_window):
    for col in same_game_cols:
        logging.info(f"Adding features to {col}")
        gb = df.groupby('player_name')[col]
        df[f'{col}_rolling_{rolling_window}_day_avg'] = gb.transform(
            lambda x: x.rolling(rolling_window, min_periods=1).mean())
        df[f'{col}_rolling_{rolling_window}_day_std'] = gb.transform(
            lambda x: x.rolling(rolling_window, min_periods=1).std())
        df[f'{col}_lag_1'] = gb.shift(1)
        df[f'{col}_lag_2'] = gb.shift(2)
        df[f'{col}_lag_3'] = gb.shift(3)
        df[f'{col}_diff_1'] = gb.diff(1)
        df[f'{col}_diff_2'] = gb.diff(2)
        df[f'{col}_diff_3'] = gb.diff(3)
    return df

# Load

In [None]:
df = pd.read_csv('gamelogs_salaries_2018-19_merged.csv')
df = df.drop('Unnamed: 0', axis=1)

In [None]:
df

Unnamed: 0,season_year,player_name,team_abbreviation,game_id,game_date,minutes_played,fgm,fga,fg_pct,fg3m,...,salary-fanduel,salary-yahoo,fp_draftkings,fp_fanduel,fp_yahoo,starter,venue,is_playoff,is_wl,days_rest_int
0,2018-19,Nemanja Bjelica,SAC,21801230,2019-04-10T00:00:00,11.910000,3,4,0.750,2,...,4100.0,15.0,12.50,11.5,11.5,1,0,0,0,2
1,2018-19,Corey Brewer,SAC,21801230,2019-04-10T00:00:00,23.450000,0,0,0.000,0,...,3500.0,10.0,8.25,8.7,8.7,0,0,0,0,2
2,2018-19,Yuta Watanabe,MEM,21801225,2019-04-10T00:00:00,19.166667,2,8,0.250,0,...,3500.0,10.0,9.00,8.8,8.8,0,1,0,0,4
3,2018-19,Lou Williams,LAC,21801229,2019-04-10T00:00:00,23.316667,5,17,0.294,1,...,6700.0,26.0,27.75,25.5,25.5,0,1,0,0,2
4,2018-19,Jamal Murray,DEN,21801228,2019-04-10T00:00:00,34.583333,6,12,0.500,3,...,7000.0,25.0,40.00,38.1,38.1,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25607,2018-19,Aron Baynes,BOS,21800001,2018-10-16T00:00:00,19.233333,3,7,0.429,2,...,3500.0,11.0,18.00,16.3,16.3,0,1,0,0,3
25608,2018-19,Jerami Grant,OKC,21800002,2018-10-16T00:00:00,29.103333,2,7,0.286,1,...,4700.0,10.0,19.00,21.4,21.4,0,0,0,0,3
25609,2018-19,Raymond Felton,OKC,21800002,2018-10-16T00:00:00,13.916667,1,5,0.200,0,...,3600.0,10.0,10.25,9.1,9.1,0,0,0,0,3
25610,2018-19,Amir Johnson,PHI,21800001,2018-10-16T00:00:00,11.183333,1,1,1.000,0,...,3500.0,10.0,10.25,10.1,10.1,0,0,0,0,3


In [None]:
# prompt: find columns with missing values

# Check for missing values in each column
missing_values = df.isnull().sum()

# Filter for columns with missing values
columns_with_missing_values = missing_values[missing_values > 0]

print(columns_with_missing_values)


pos-draftkings       717
pos-fanduel          121
pos-yahoo             47
salary-draftkings    717
salary-fanduel       121
salary-yahoo          47
dtype: int64


In [None]:
df['game_date'] = pd.to_datetime(df['game_date'])

In [None]:
from sklearn.preprocessing import LabelEncoder

cat_cols = ['team_abbreviation', 'player_name', 'opponent', 'pos-draftkings', 'pos-fanduel', 'pos-yahoo']

# Create a label encoder for each categorical column
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le


In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Flatten, Concatenate

# Define input layers for each categorical feature
inputs = []
embeddings = []
for col in cat_cols:
    input_dim = df[col].nunique()  # number of unique categories
    output_dim = min(50, input_dim // 2)  # embedding size, you can adjust this
    input_layer = Input(shape=(1,))
    embedding_layer = Embedding(input_dim=input_dim, output_dim=output_dim, input_length=1)(input_layer)
    embedding_layer = Flatten()(embedding_layer)
    inputs.append(input_layer)
    embeddings.append(embedding_layer)

# Combine all embeddings
combined_embeddings = Concatenate()(embeddings)




In [None]:
combined_embeddings

<KerasTensor shape=(None, 91), dtype=float32, sparse=False, name=keras_tensor_29>

In [None]:
df = df.sort_values(['game_date'], ascending=True)
df = assign_league_weeks(df)
df = clean_numeric_columns(df, same_game_cols)
df = add_time_dependent_features(df, rolling_window=rolling_window)


  gb = df.groupby('player_name')[col]
  df[f'{col}_lag_2'] = gb.shift(2)
  df[f'{col}_lag_3'] = gb.shift(3)
  df[f'{col}_diff_1'] = gb.diff(1)
  df[f'{col}_diff_2'] = gb.diff(2)
  df[f'{col}_diff_3'] = gb.diff(3)
  gb = df.groupby('player_name')[col]
  df[f'{col}_rolling_{rolling_window}_day_avg'] = gb.transform(
  df[f'{col}_rolling_{rolling_window}_day_std'] = gb.transform(
  df[f'{col}_lag_1'] = gb.shift(1)
  df[f'{col}_lag_2'] = gb.shift(2)
  df[f'{col}_lag_3'] = gb.shift(3)
  df[f'{col}_diff_1'] = gb.diff(1)
  df[f'{col}_diff_2'] = gb.diff(2)
  df[f'{col}_diff_3'] = gb.diff(3)
  gb = df.groupby('player_name')[col]
  df[f'{col}_rolling_{rolling_window}_day_avg'] = gb.transform(
  df[f'{col}_rolling_{rolling_window}_day_std'] = gb.transform(
  df[f'{col}_lag_1'] = gb.shift(1)
  df[f'{col}_lag_2'] = gb.shift(2)
  df[f'{col}_lag_3'] = gb.shift(3)
  df[f'{col}_diff_1'] = gb.diff(1)
  df[f'{col}_diff_2'] = gb.diff(2)
  df[f'{col}_diff_3'] = gb.diff(3)
  gb = df.groupby('player_name')[co

In [None]:
# prompt: show me all columns with missing values and their count

# Check for missing values in each column
missing_values = df.isnull().sum()

# Filter for columns with missing values
columns_with_missing_values = missing_values[missing_values > 0]

print(columns_with_missing_values)


pos-draftkings        717
pos-fanduel           121
pos-yahoo              47
salary-draftkings     717
salary-fanduel        121
                     ... 
fp_yahoo_lag_2       1029
fp_yahoo_lag_3       1530
fp_yahoo_diff_1       521
fp_yahoo_diff_2      1029
fp_yahoo_diff_3      1530
Length: 650, dtype: int64


In [None]:
all_predictions = []
all_true_values = []
all_game_ids = []
all_game_dates = []
all_player_ids = []
all_fanduel_salaries = []
all_draftkings_salaries = []
all_yahoo_salaries = []
all_fanduel_positions = []
all_draftkings_positions = []
all_yahoo_positions = []

scaler = MinMaxScaler(feature_range=(0, 1))
unique_weeks = df['league_week'].unique()


In [None]:
target = 'reb'
target_related_cols = same_game_cols
features = df.columns.difference(target_related_cols).tolist()

X = df[features]
y = df[target]

In [None]:
current_week = 8
num_weeks_for_training = 4
start_week = current_week - num_weeks_for_training
training_weeks = list(range(start_week, current_week))
time_steps = 4

In [None]:
X_train = X[X['league_week'].isin(training_weeks)]
y_train = y.loc[X_train.index]

X_test = X[X['league_week'] == current_week]
y_test = y.loc[X_test.index]

identifying_test_data = X_test[['player_name', 'game_date', 'game_id']]

X_train = X_train.drop(columns=['game_date', 'game_id'])
X_test = X_test.drop(columns=['game_date', 'game_id'])


In [None]:
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)


In [None]:
# Ensure y_train and y_test are reset
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Create sequences
X_train_seq, y_train_seq = create_sequences(X_train, y_train, time_steps)
X_test_seq, y_test_seq = create_sequences(X_test, y_test, time_steps)


In [None]:
model = build_rnn_model((X_train_seq.shape[1], X_train_seq.shape[2]))

  super().__init__(**kwargs)


In [None]:
# prompt: i want to know all the different type for all features that are in X_train_seq

# Get the feature names from the original DataFrame (X_train)
feature_names = X_train.columns

# Iterate through the features and print their data types
for feature_name in feature_names:
  feature_data = X_train_seq[:,:, X_train.columns.get_loc(feature_name)]
  unique_types = set(type(element) for element in feature_data.flatten())
  print(f"Feature '{feature_name}' has the following data types: {unique_types}")


Feature 'ast_diff_1' has the following data types: {<class 'float'>}
Feature 'ast_diff_2' has the following data types: {<class 'float'>}
Feature 'ast_diff_3' has the following data types: {<class 'float'>}
Feature 'ast_lag_1' has the following data types: {<class 'float'>}
Feature 'ast_lag_2' has the following data types: {<class 'float'>}
Feature 'ast_lag_3' has the following data types: {<class 'float'>}
Feature 'ast_pct_diff_1' has the following data types: {<class 'float'>}
Feature 'ast_pct_diff_2' has the following data types: {<class 'float'>}
Feature 'ast_pct_diff_3' has the following data types: {<class 'float'>}
Feature 'ast_pct_lag_1' has the following data types: {<class 'float'>}
Feature 'ast_pct_lag_2' has the following data types: {<class 'float'>}
Feature 'ast_pct_lag_3' has the following data types: {<class 'float'>}
Feature 'ast_pct_rolling_10_day_avg' has the following data types: {<class 'float'>}
Feature 'ast_pct_rolling_10_day_std' has the following data types: {<

In [None]:
# print(X_train_seq.dtype)
# print(y_train_seq.dtype)
X_train_seq = X_train_seq.astype(np.float32)

X_train_seq.dtype

ValueError: could not convert string to float: 'Brooklyn'

In [None]:
model.fit(X_train_seq, y_train_seq, epochs=10, batch_size=32, verbose=0)


ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type float).

In [None]:

    for current_week in unique_weeks:
        start_week = current_week - num_weeks_for_training
        training_weeks = list(range(start_week, current_week))

        # Select training data (previous 4 weeks)
        X_train = X[X['league_week'].isin(training_weeks)]
        y_train = y.loc[X_train.index]

        # Select test data (current week)
        X_test = X[X['league_week'] == current_week]
        y_test = y.loc[X_test.index]

        if X_train.empty or X_test.empty:
            continue

        identifying_test_data = X_test[['player_name', 'game_date', 'game_id']]
        X_train = X_train.drop(columns=['game_date', 'game_id'])
        X_test = X_test.drop(columns=['game_date', 'game_id'])

        # Scale data
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Create sequences for training
        X_train_seq, y_train_seq = create_sequences(X_train_scaled, y_train, time_steps)
        X_test_seq, y_test_seq = create_sequences(X_test_scaled, y_test, time_steps)

        # Build and train the RNN model
        model = build_rnn_model((X_train_seq.shape[1], X_train_seq.shape[2]))
        model.fit(X_train_seq, y_train_seq, epochs=10, batch_size=32, verbose=0)

        # Make predictions
        y_pred = model.predict(X_test_seq)

        # Store the predictions and true values
        all_predictions.extend(y_pred.flatten())
        all_true_values.extend(y_test_seq)
        all_game_ids.extend(list(identifying_test_data['game_id'])[time_steps:])
        all_game_dates.extend(list(identifying_test_data['game_date'])[time_steps:])
        all_player_ids.extend(list(identifying_test_data['player_name'])[time_steps:])
        all_fanduel_salaries.extend(X_test['salary-draftkings'].values[time_steps:])
        all_draftkings_salaries.extend(X_test['salary-draftkings'].values[time_steps:])
        all_yahoo_salaries.extend(X_test['salary-yahoo'].values[time_steps:])
        all_fanduel_positions.extend(X_test['pos-draftkings'].values[time_steps:])
        all_draftkings_positions.extend(X_test['pos-draftkings'].values[time_steps:])
        all_yahoo_positions.extend(X_test['pos-yahoo'].values[time_steps:])

        if save_model:
            model_filename = f'{model_dir}/rnn_model_week_{current_week}_trained_on_{start_week}_to_{current_week - 1}.h5'
            model.save(model_filename)

        mse = mean_squared_error(y_test_seq, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test_seq, y_pred)

        print(f'Training weeks: {training_weeks}')
        print(f'Test week: {current_week}')
        print(f'Mean Squared Error (MSE): {mse:.2f}')
        print(f'Root Mean Squared Error (RMSE): {rmse:.2f}')
        print(f'R-squared (R²): {r2:.2f}')
        print('')

    results_df = pd.DataFrame({
        'player_name': all_player_ids,
        'game_id': all_game_ids,
        'game_date': all_game_dates,
        'y': all_true_values,
        'y_pred': all_predictions,
        'fanduel_salary': all_fanduel_salaries,
        'draftkings_salary': all_draftkings_salaries,
        'yahoo_salary': all_yahoo_salaries,
        'fanduel_position': all_fanduel_positions,
        'draftkings_position': all_draftkings_positions,
        'yahoo_position': all_yahoo_positions,
    })



In [None]:
]

            print(f'Training RNN models for {cat}')
            print('---------------------------------')
            cat_results = rolling_train_test_rnn(X=X, y=y, df=df)
            cat_results.rename(columns={'y': cat, 'y_pred': f'{cat}_pred'}, inplace=True)
            if len(season_results) == 0:
                season_results = cat_results
            else:
                season_results = pd.merge(
                    season_results,
                    cat_results,
                    on=['player_name', 'game_date', 'game_id', 'fanduel_salary', 'draftkings_salary', 'yahoo_salary', 'draftkings_position', 'fanduel_position', 'yahoo_position'],
                    suffixes=('', f'_{df.columns.name}'))
            cat_results.to_csv(f'output_csv/{cat}_{season}_rnn_results.csv', index=False)

        all_seasons_results.append(season_results)

    combined_df = pd.concat(all_seasons_results, ignore_index=True)
    combined_df['fp_fanduel'] = combined_df.apply(lambda row: calculate_fp_fanduel(row), axis=1)
    combined_df['fp_fanduel_pred'] = combined_df.apply(lambda row: calculate_fp_fanduel(row, pred_mode=True), axis=1)

    combined_df['fp_yahoo'] = combined_df.apply(calculate_fp_yahoo, axis=1)
    combined_df['fp_yahoo_pred'] = combined_df.apply(lambda row: calculate_fp_yahoo(row, pred_mode=True), axis=1)

    combined_df['fp_draftkings'] = combined_df.apply(calculate_fp_draftkings, axis=1)
    combined_df['fp_draftkings_pred'] = combined_df.apply(lambda row: calculate_fp_draftkings(row, pred_mode=True),
                                                          axis=1)
    return combined_df


In [None]:
xt

# Take 2

In [None]:
df = pd.read_csv('gamelogs_salaries_2018-19_merged.csv')
df = df.drop('Unnamed: 0', axis=1)

In [None]:
df = df.sort_values(['game_date'], ascending=True)
df = assign_league_weeks(df)
df = clean_numeric_columns(df, same_game_cols)
df = add_time_dependent_features(df, rolling_window=rolling_window)


  df[f'{col}_lag_2'] = gb.shift(2)
  df[f'{col}_lag_3'] = gb.shift(3)
  df[f'{col}_diff_1'] = gb.diff(1)
  df[f'{col}_diff_2'] = gb.diff(2)
  df[f'{col}_diff_3'] = gb.diff(3)
  df[f'{col}_rolling_{rolling_window}_day_avg'] = gb.transform(
  df[f'{col}_rolling_{rolling_window}_day_std'] = gb.transform(
  df[f'{col}_lag_1'] = gb.shift(1)
  df[f'{col}_lag_2'] = gb.shift(2)
  df[f'{col}_lag_3'] = gb.shift(3)
  df[f'{col}_diff_1'] = gb.diff(1)
  df[f'{col}_diff_2'] = gb.diff(2)
  df[f'{col}_diff_3'] = gb.diff(3)
  df[f'{col}_rolling_{rolling_window}_day_avg'] = gb.transform(
  df[f'{col}_rolling_{rolling_window}_day_std'] = gb.transform(
  df[f'{col}_lag_1'] = gb.shift(1)
  df[f'{col}_lag_2'] = gb.shift(2)
  df[f'{col}_lag_3'] = gb.shift(3)
  df[f'{col}_diff_1'] = gb.diff(1)
  df[f'{col}_diff_2'] = gb.diff(2)
  df[f'{col}_diff_3'] = gb.diff(3)
  df[f'{col}_rolling_{rolling_window}_day_avg'] = gb.transform(
  df[f'{col}_rolling_{rolling_window}_day_std'] = gb.transform(
  df[f'{col}_lag_1'] 

In [None]:
# Convert game_date to datetime
df['game_date'] = pd.to_datetime(df['game_date'])

# Define categorical columns
cat_cols = ['team_abbreviation', 'player_name', 'opponent', 'pos-draftkings', 'pos-fanduel', 'pos-yahoo']

# Label encode categorical variables
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le


In [None]:
current_week = 8
num_weeks_for_training = 4
start_week = current_week - num_weeks_for_training
training_weeks = list(range(start_week, current_week))
time_steps = 4

# Split the data into training and testing sets
X_train = df[df['league_week'].isin(training_weeks)]
y_train = df[df['league_week'].isin(training_weeks)][target]
X_test = df[df['league_week'] == current_week]
y_test = df[df['league_week'] == current_week][target]

# Reset index for y_train and y_test
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Drop non-feature columns
X_train = X_train.drop(columns=['game_date', 'game_id'])
X_test = X_test.drop(columns=['game_date', 'game_id'])


In [None]:
X_train_seq, y_train_seq = create_sequences(X_train, y_train, time_steps)
X_test_seq, y_test_seq = create_sequences(X_test, y_test, time_steps)


In [None]:
inputs = []
embeddings = []
for col in cat_cols:
    input_dim = df[col].nunique()  # number of unique categories
    output_dim = min(50, input_dim // 2)  # embedding size, you can adjust this
    input_layer = Input(shape=(1,), name=col)
    embedding_layer = Embedding(input_dim=input_dim, output_dim=output_dim, input_length=1)(input_layer)
    embedding_layer = Flatten()(embedding_layer)
    inputs.append(input_layer)
    embeddings.append(embedding_layer)

# Combine all embeddings
combined_embeddings = Concatenate()(embeddings)




In [None]:
rnn_input = Input(shape=(time_steps, X_train_seq.shape[2] - len(cat_cols)), name='rnn_input')
x = LSTM(64, return_sequences=True)(rnn_input)
x = LSTM(32)(x)
x = Dense(32, activation='relu')(x)


In [None]:
x = Concatenate()([x, combined_embeddings])


In [None]:
x = Dense(64, activation='relu')(x)
x = Dropout(0.2)(x)
output = Dense(1, activation='linear')(x)


In [None]:
# Define the model
model = Model(inputs=[rnn_input] + inputs, outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Model summary
model.summary()


In [None]:
X_train_inputs = {col: X_train[col].values for col in cat_cols}
X_test_inputs = {col: X_test[col].values for col in cat_cols}

In [None]:
X_train_inputs['rnn_input'] = X_train_seq
X_test_inputs['rnn_input'] = X_test_seq


In [None]:
# Train the model
model.fit(X_train_inputs, y_train_seq, epochs=10, batch_size=32, verbose=1)

# Evaluate the model
model.evaluate(X_test_inputs, y_test_seq)


ValueError: Data cardinality is ambiguous. Make sure all arrays contain the same number of samples.'x' sizes: 4171, 4171, 4171, 4171, 4171, 4167, 4171
'y' sizes: 4167
