In [1]:
import pandas as pd
import numpy as np

def preprocess_fantasy_data(df):
    # Step 1: Create HOME_GAME from MATCHUP before dropping it
    df["HOME_GAME"] = df["MATCHUP"].apply(lambda x: 1 if "vs." in x else 0)

    # Step 2: Drop unneeded columns
    drop_cols = [
        "Unnamed: 0", "W", "L", "MIN", "FORM",
        "TEAM_ID", "GAME_ID", "PLAYER_ID", "OPPONENT_ID", "MATCHUP", "OPPONENT_ABBR"
    ]
    df = df.drop(columns=drop_cols, errors='ignore')

    # Step 3: Convert HEIGHT from string to inches
    def height_to_inches(h):
        try:
            feet, inches = map(int, h.split('-'))
            return feet * 12 + inches
        except:
            return np.nan

    df["HEIGHT_IN"] = df["HEIGHT"].apply(height_to_inches)

    # Step 4: Encode POSITION as categorical code
    df["POSITION"] = df["POSITION"].astype("category").cat.codes

    # Step 5: Drop original HEIGHT column
    df = df.drop(columns=["HEIGHT"], errors='ignore')

    # Step 6: Drop rows with any remaining missing values (if any)
    df = df.dropna()

    return df


In [2]:
df = pd.read_csv("data/all_players_stats.csv")
df_future = pd.read_csv("data/all_players_stats2025.csv")

# Extract all GAME_IDs from the 2024–25 dataset
future_game_ids = df_future["GAME_ID"].unique()

# Remove rows from the main dataset where GAME_ID is in the future set
df_main_filtered = df[~df["GAME_ID"].isin(future_game_ids)].copy()
cleaned_df = preprocess_fantasy_data(df_main_filtered)


In [3]:
df_position = df["POSITION"].unique()
df_position

array(['G', 'G-F', 'F', 'C-F', 'F-G', 'C', 'F-C'], dtype=object)

In [4]:
from sklearn.model_selection import train_test_split

X = cleaned_df.drop(columns=["NBA_FANTASY_PTS"])
y = cleaned_df["NBA_FANTASY_PTS"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_val)

print("MAE:", mean_absolute_error(y_val, y_pred))
print("R²:", r2_score(y_val, y_pred))


MAE: 7.617190276411412
R²: 0.5332614313798828


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, r2_score
from scipy.stats import randint

# Define hyperparameter space
param_dist = {
    "n_estimators": randint(50, 150),
    "max_depth": [None] + list(range(10, 51, 10)),
    "max_features": ["sqrt", "log2", 0.5, 0.8],
    "min_samples_split": randint(2, 10),
    "min_samples_leaf": randint(1, 5),
    "bootstrap": [True, False]
}

# Use a small subset for tuning
# X_sample = X_train.sample(n=25000, random_state=42)
X_sample = X_train.sample(n=10000, random_state=42)

y_sample = y_train.loc[X_sample.index]

# Set up search
rf_base = RandomForestRegressor(random_state=42, n_jobs=-1)
# random_search = RandomizedSearchCV(
#     estimator=rf_base,
#     param_distributions=param_dist,
#     n_iter=20,
#     cv=3,
#     scoring="neg_mean_absolute_error",
#     verbose=2,
#     n_jobs=-1,
#     random_state=42
# )
random_search = RandomizedSearchCV(
    estimator=rf_base,
    param_distributions=param_dist,
    n_iter=10,              # ⏩ Try just 10 combinations
    cv=2,                   # ⏩ 2-fold CV instead of 3
    scoring="neg_mean_absolute_error",
    verbose=1,
    n_jobs=-1,
    random_state=42
)


# Fit the model
random_search.fit(X_sample, y_sample)

# Evaluate on validation set
best_rf = random_search.best_estimator_
y_pred = best_rf.predict(X_val)

print("MAE:", mean_absolute_error(y_val, y_pred))
print("R²:", r2_score(y_val, y_pred))
print("Best Parameters:", random_search.best_params_)


Fitting 2 folds for each of 10 candidates, totalling 20 fits
MAE: 7.64483353818653
R²: 0.5287554237347603
Best Parameters: {'bootstrap': False, 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 138}


In [7]:
!pip install lightgbm

Collecting lightgbm
  Obtaining dependency information for lightgbm from https://files.pythonhosted.org/packages/5e/23/f8b28ca248bb629b9e08f877dd2965d1994e1674a03d67cd10c5246da248/lightgbm-4.6.0-py3-none-win_amd64.whl.metadata
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   - -------------------------------------- 0.0/1.5 MB 393.8 kB/s eta 0:00:04
   -- ------------------------------------- 0.1/1.5 MB 651.6 kB/s eta 0:00:03
   ---- ----------------------------------- 0.2/1.5 MB 893.0 kB/s eta 0:00:02
   ------- -------------------------------- 0.3/1.5 MB 1.1 MB/s eta 0:00:02
   ------------ --------------------------- 0.5/1.5 MB 1.8 MB/s eta 0:00:01
   ------------------ --------------------- 0.7/1.5 MB 2.1 MB/s eta 0:00:01
   --------------------------- ------------ 1

In [8]:
import lightgbm as lgb

In [9]:
X = cleaned_df.drop(columns=["NBA_FANTASY_PTS"])
y = cleaned_df["NBA_FANTASY_PTS"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

lgb_model = lgb.LGBMRegressor(n_estimators=100, random_state=42, n_jobs=-1)
lgb_model.fit(X_train, y_train)
y_pred = lgb_model.predict(X_val)

print("MAE:", mean_absolute_error(y_val, y_pred))
print("R²:", r2_score(y_val, y_pred))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005067 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2870
[LightGBM] [Info] Number of data points in the train set: 252084, number of used features: 19
[LightGBM] [Info] Start training from score 21.153602
MAE: 7.505963643119989
R²: 0.5448201404505459


In [12]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error, r2_score

# Scale + generate polynomial features (degree 2)
poly_model = make_pipeline(
    StandardScaler(),
    PolynomialFeatures(degree=2, include_bias=False),
    Ridge(alpha=10, random_state=42)
)

# Fit and predict
poly_model.fit(X_train, y_train)
y_pred_poly = poly_model.predict(X_val)

# Evaluate
print("MAE:", mean_absolute_error(y_val, y_pred_poly))
print("R²:", r2_score(y_val, y_pred_poly))


MAE: 7.5342202640075655
R²: 0.5409208340855014


In [13]:

from sklearn.metrics import classification_report, confusion_matrix
from lightgbm import LGBMClassifier

def classify_performance(row):
    if row["NBA_FANTASY_PTS"] > row["LAST_30_GAMES_AVG_FORM"] + 5:
        return "over"
    elif row["NBA_FANTASY_PTS"] < row["LAST_30_GAMES_AVG_FORM"] - 5:
        return "under"
    else:
        return "expected"

cleaned_df["PERFORMANCE_CLASS"] = cleaned_df.apply(classify_performance, axis=1)

# Prepare features and target
X = cleaned_df.drop(columns=["NBA_FANTASY_PTS", "PERFORMANCE_CLASS"])
y = cleaned_df["PERFORMANCE_CLASS"]

# Split
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Model
clf = LGBMClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Evaluate
y_pred = clf.predict(X_val)
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015724 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2871
[LightGBM] [Info] Number of data points in the train set: 252084, number of used features: 19
[LightGBM] [Info] Start training from score -0.902078
[LightGBM] [Info] Start training from score -0.984321
[LightGBM] [Info] Start training from score -1.511489
[[17416  5987  2167]
 [11718 10335  1498]
 [ 8977  2338  2586]]
              precision    recall  f1-score   support

    expected       0.46      0.68      0.55     25570
        over       0.55      0.44      0.49     23551
       under       0.41      0.19      0.26     13901

    accuracy                           0.48     63022
   macro avg       0.47      0.44      0.43     63022
weighted avg       0.48      0.48      0.46     63022



In [14]:
from sklearn.model_selection import RandomizedSearchCV
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Reduce to a 15k sample for faster search
X_sample = X_train.sample(n=15000, random_state=42)
y_sample = y_train.loc[X_sample.index]

# Hyperparameter space
param_dist = {
    "n_estimators": [100, 200, 300],
    "max_depth": [6, 10, 15],
    "learning_rate": [0.01, 0.05, 0.1],
    "num_leaves": [20, 40, 60],
    "class_weight": [None, "balanced"]
}

# Model
lgb_base = LGBMClassifier(random_state=42, n_jobs=-1)

# Randomized search
search = RandomizedSearchCV(
    lgb_base,
    param_distributions=param_dist,
    n_iter=15,
    scoring='f1_weighted',
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

# Fit
search.fit(X_sample, y_sample)

# Use best model
best_lgb = search.best_estimator_
y_pred = best_lgb.predict(X_val)

# Evaluation
print("Best Params:", search.best_params_)
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))


Fitting 3 folds for each of 15 candidates, totalling 45 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002023 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2858
[LightGBM] [Info] Number of data points in the train set: 15000, number of used features: 19
[LightGBM] [Info] Start training from score -0.899106
[LightGBM] [Info] Start training from score -0.974450
[LightGBM] [Info] Start training from score -1.534021
Best Params: {'num_leaves': 20, 'n_estimators': 300, 'max_depth': 15, 'learning_rate': 0.05, 'class_weight': None}
[[16339  6711  2520]
 [11315 10470  1766]
 [ 8511  2857  2533]]
              precision    recall  f1-score   support

    expected       0.45      0.64      0.53     25570
        over       0.52      0.44      0.48     23551
       under       0.37      0.18      0.24     13901

    accuracy                           0.47     63022
   macro avg       0.45      0.42    

In [17]:
import numpy as np

# Predict probabilities
probs = best_lgb.predict_proba(X_val)
classes = best_lgb.classes_

# Threshold-based prediction logic
threshold = 0.8
y_pred_thresh = []

for p in probs:
    prob_dict = dict(zip(classes, p))
    if prob_dict["over"] >= threshold:
        y_pred_thresh.append("over")
    elif prob_dict["under"] >= threshold:
        y_pred_thresh.append("under")
    else:
        y_pred_thresh.append("expected")

# Evaluate
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_val, y_pred_thresh))
print(classification_report(y_val, y_pred_thresh))


[[25334   235     1]
 [22062  1489     0]
 [13879    16     6]]
              precision    recall  f1-score   support

    expected       0.41      0.99      0.58     25570
        over       0.86      0.06      0.12     23551
       under       0.86      0.00      0.00     13901

    accuracy                           0.43     63022
   macro avg       0.71      0.35      0.23     63022
weighted avg       0.68      0.43      0.28     63022



In [18]:
thresholds = [0.5, 0.6, 0.7, 0.8]
from sklearn.metrics import classification_report

for thresh in thresholds:
    y_pred_thresh = []
    for p in probs:
        prob_dict = dict(zip(classes, p))
        if prob_dict["over"] >= thresh:
            y_pred_thresh.append("over")
        elif prob_dict["under"] >= thresh:
            y_pred_thresh.append("under")
        else:
            y_pred_thresh.append("expected")
    
    print(f"\nThreshold = {thresh}")
    print(classification_report(y_val, y_pred_thresh))



Threshold = 0.5
              precision    recall  f1-score   support

    expected       0.43      0.85      0.57     25570
        over       0.63      0.30      0.40     23551
       under       0.45      0.04      0.07     13901

    accuracy                           0.47     63022
   macro avg       0.50      0.40      0.35     63022
weighted avg       0.51      0.47      0.40     63022


Threshold = 0.6
              precision    recall  f1-score   support

    expected       0.42      0.93      0.58     25570
        over       0.70      0.20      0.31     23551
       under       0.53      0.01      0.02     13901

    accuracy                           0.45     63022
   macro avg       0.55      0.38      0.30     63022
weighted avg       0.55      0.45      0.36     63022


Threshold = 0.7
              precision    recall  f1-score   support

    expected       0.42      0.97      0.59     25570
        over       0.78      0.12      0.21     23551
       under       0.53 