In [1]:
# Step 1: Import Libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from tqdm import tqdm
from sklearn.base import BaseEstimator, RegressorMixin
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
df1=pd.read_csv('/kaggle/input/ddd7000/match_data 2.csv/match_data 2.csv')
df2=pd.read_csv('/kaggle/input/ddd7000/match_data.csv/match_data.csv')
df3=pd.read_csv('/kaggle/input/ddd7000/match_data1000/match_data.csv')
df4=pd.read_csv('/kaggle/input/ddd7000/match_data10000/match_data.csv')
df5=pd.read_csv('/kaggle/input/ddd7000/match_data12000/match_data.csv')
df6=pd.read_csv('/kaggle/input/ddd7000/match_data15000/match_data.csv')
df7=pd.read_csv('/kaggle/input/ddd7000/match_data5000/match_data.csv')

In [3]:
df = pd.concat([df1, df2, df3,df4,df5,df6,df7], ignore_index=True)
# df.info()

In [4]:
# Process Date
df['Date'] = pd.to_datetime(df['Date'])
reference_date = pd.to_datetime('2000-01-01')
df['Days Passed'] = (df['Date'] - reference_date).dt.days
df = df.drop(columns=['Date'])

# Label encode high-cardinality columns
le = LabelEncoder()
df['Player Name Encoded'] = le.fit_transform(df['Player Name'])
df = df.drop(columns=['Player Name'])

# Frequency encode Venue
venue_counts = df['Venue'].value_counts()
df['Venue Frequency'] = df['Venue'].map(venue_counts)
df = df.drop(columns=['Venue'])

# Target encode 'Team For' and 'Team Against'
team_for_mean = df.groupby('Team For')['Fantasy Points'].mean()
df['Team For Encoded'] = df['Team For'].map(team_for_mean)

team_against_mean = df.groupby('Team Against')['Fantasy Points'].mean()
df['Team Against Encoded'] = df['Team Against'].map(team_against_mean)
df = df.drop(columns=['Team For', 'Team Against'])

# One-hot encode 'type' (low-cardinality)
df = pd.get_dummies(df, columns=['type'], prefix='type', drop_first=True)

# Display reduced dataframe
# print(df.head())


In [5]:
# Step 2: Prepare Data
X = df.drop(columns=['Fantasy Points'])  # Features
y = df['Fantasy Points']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Define Base Models (estimators for stacking)
base_models = [
    ('rf', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('gb', GradientBoostingRegressor(n_estimators=100, random_state=42)),
    ('knn', KNeighborsRegressor(n_neighbors=5)),
    ('xgb', xgb.XGBRegressor(n_estimators=100, random_state=42, tree_method='auto'))
]

# Step 4: Define Meta-Model (typically a linear model)
meta_model = Ridge(alpha=1.0)

# Custom wrapper to add a progress bar to model training
class TQDMRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, model, total_epochs=None):
        self.model = model
        self.total_epochs = total_epochs if total_epochs else 1  # Default to 1 if not provided

    def fit(self, X, y):
        # Wrap the training process with tqdm to show the progress bar
        with tqdm(total=self.total_epochs, desc=str(self.model)) as pbar:
            self.model.fit(X, y)
            pbar.update(self.total_epochs)
        return self

    def predict(self, X):
        return self.model.predict(X)

# Step 5: Create Stacking Regressor
# Wrap the base models with TQDMRegressor to track progress
base_models_with_progress = [
    (name, TQDMRegressor(model, total_epochs=2)) for name, model in base_models
]

stacking_model = StackingRegressor(estimators=base_models_with_progress, final_estimator=meta_model)


In [6]:
# Print the stacking model details
print("\nStacking Regressor Model:")
print(stacking_model)


Stacking Regressor Model:
StackingRegressor(estimators=[('rf',
                               TQDMRegressor(model=RandomForestRegressor(random_state=42),
                                             total_epochs=2)),
                              ('gb',
                               TQDMRegressor(model=GradientBoostingRegressor(random_state=42),
                                             total_epochs=2)),
                              ('knn',
                               TQDMRegressor(model=KNeighborsRegressor(),
                                             total_epochs=2)),
                              ('xgb',
                               TQDMRegressor(model=XGBRegressor(base_score=None,
                                                                booster=None,
                                                                callbacks=None,
                                                                co...
                                                                i

In [7]:
# Step 6: Train the Stacking Model
stacking_model.fit(X_train, y_train)

# Step 7: Make Predictions and Evaluate the Model
y_pred = stacking_model.predict(X_test)

# Calculate the Mean Squared Error (MSE) to evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error of Stacking Regressor: {mse}')


RandomForestRegressor(random_state=42): 100%|██████████| 2/2 [04:04<00:00, 122.10s/it]
GradientBoostingRegressor(random_state=42): 100%|██████████| 2/2 [00:41<00:00, 20.80s/it]
KNeighborsRegressor(): 100%|██████████| 2/2 [00:00<00:00, 20.99it/s]
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=100, n_jobs=None,
XGBRegressor(base_score=None, booster=None, callbacks=None,
         

Mean Squared Error of Stacking Regressor: 2326.202113987771


In [8]:
# Optionally, print the first 5 predictions and the actual values
print("Predictions:", y_pred[:200])
print("Actual values:", y_test[:200].values)

Predictions: [ 42.1545287   43.88685898  37.80870504  43.57696398  38.71524496
  50.81945041  40.42372899  41.70886178  49.5619124   20.92708629
  39.6505852   46.07074881  37.63921672  42.08346389  42.34151406
  30.07425976  31.74824018  29.54790564  30.81719382  43.064434
  24.40482154  33.63533321  33.05099585  34.61717808  38.97458982
  38.3673953   32.2635252   34.97820728  81.17138581  47.04056255
  27.45126729  48.27987965  25.29553495  54.48127317 163.78616495
 136.92570456  58.94425735  57.45199353  48.0309116  139.00435435
  50.67655965  44.50730802  34.28586904  78.05226628  22.88076924
  95.5365773   69.92209976  30.99673382  34.47631977  44.01968828
  29.62377859  77.44195842  32.04884015  30.11778188  54.68896656
  44.51647082  37.84811467  41.01702208  30.89083559  32.70235311
  46.19500837  26.75216312  28.0103609   45.79403069  39.95228925
 118.86335558  43.52137473  40.34753682  42.00561621  48.48231919
 118.92711406  14.49251652  37.42036748  29.28886578  35.12000772

In [9]:
import joblib

# Save the trained stacking model
joblib.dump(stacking_model, 'stacking_model.pkl')
print("Model saved successfully!")

Model saved successfully!
