# Hackathon Challenge: Predicting Restaurant Annual Turnover

This notebook outlines the process of building a machine learning model to predict the annual turnover of restaurants across India based on various features provided in the dataset.

## Import Necessary Libraries

In [46]:
# Import necessary libraries
import pandas as pd
import optuna
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from math import sqrt
from lightgbm import LGBMRegressor
import numpy as np  



# Load and preprocess datasets
train_df = pd.read_csv('../data/Train_dataset_(1).csv')
test_df = pd.read_csv('../data/Test_dataset_(1).csv')


# Adjusted Feature Engineering
def feature_engineering(df):
    df['Opening Day of Restaurant'] = pd.to_datetime(df['Opening Day of Restaurant'], errors='coerce', dayfirst=True)
    df['Year Opened'] = df['Opening Day of Restaurant'].dt.year
    current_year = datetime.now().year
    df['Restaurant Age'] = current_year - df['Year Opened']
    df['Cuisine Count'] = df['Cuisine'].apply(lambda x: len(x.split(',')) if pd.notnull(x) else 0)
    df.drop(['Opening Day of Restaurant', 'Cuisine'], axis=1, inplace=True)
    return df



train_df = feature_engineering(train_df)
test_df = feature_engineering(test_df)

X = train_df.drop(['Annual Turnover', 'Registration Number'], axis=1)
y = train_df['Annual Turnover']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=59)

# Defining preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# Fit the preprocessor and transform the training data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_val_preprocessed = preprocessor.transform(X_val)

# Train an initial LightGBM model
initial_model = LGBMRegressor()
initial_model.fit(X_train_preprocessed, y_train)
importances = initial_model.feature_importances_
# This assumes that feature importances are directly usable for selecting top features. 
# Adjust this logic as needed for your specific feature selection criteria.
threshold = np.sort(importances)[-20]  # Example: select top 20 features; adjust as needed
selected_indices = np.where(importances >= threshold)[0]
# Assuming the process to select features based on importances is correct
# and you have `selected_features` correctly identified

def objective(trial):
    # Suggest parameters for the LightGBM model
    param = {
        'objective': 'regression',
        'metric': 'rmse',
        'verbosity': -1,
        'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt']),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.05, log=True),
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 31, 128),  # Reduce for smaller datasets
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 0.9),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 0.9),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),  # Increase for overfitting control
        'max_depth': trial.suggest_int('max_depth', 5, 15),  # Limiting depth for control overfitting
        'min_split_gain': trial.suggest_float('min_split_gain', 0, 0.1),  # Additional control for splitting
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),  # Similar to bagging_fraction but can be tested separately
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),  # Control overfitting
    }

    model = LGBMRegressor(**param)
    # Fit using only selected features
    model.fit(X_train_preprocessed[:, selected_indices], y_train, 
              eval_set=[(X_val_preprocessed[:, selected_indices], y_val)], 
              early_stopping_rounds=50, verbose=False)
    
    preds = model.predict(X_val_preprocessed[:, selected_indices])
    rmse = sqrt(mean_squared_error(y_val, preds))
    return rmse

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# After optimization, you can train your final model with the best parameters found:
best_params = study.best_trial.params
final_model = LGBMRegressor(**best_params)
final_model.fit(X_train_preprocessed[:, selected_indices], y_train)


# Prepare test data and make predictions
X_test_preprocessed = preprocessor.transform(test_df.drop(['Registration Number'], axis=1))
test_predictions = final_model.predict(X_test_preprocessed[:, selected_indices])

# Create submission DataFrame
submission_df = pd.DataFrame({
    'Registration Number': test_df['Registration Number'],
    'Annual Turnover': test_predictions
})
submission_path = '../data/submission_lgb_v5.csv'
submission_df.to_csv(submission_path, index=False)


[I 2024-02-09 21:24:48,878] A new study created in memory with name: no-name-35163e4d-8f49-4ad2-9a02-b1b4390f8013
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
[W 2024-02-09 21:24:48,882] Trial 0 failed with parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.008290912690272073, 'lambda_l1': 1.4446862689901869e-06, 'lambda_l2': 0.1372681856936005, 'num_leaves': 123, 'feature_fraction': 0.6053731050630098, 'bagging_fraction': 0.663801726432367, 'bagging_freq': 6, 'min_child_samples': 48, 'max_depth': 14, 'min_split_gain': 0.02919476309699186, 'subsample': 0.737105285857293, 'colsample_bytree': 0.8341828677635825} because of the following error: TypeError("LGBMRegressor.fit() got an unexpected keyword argument 'early_stopping_rounds'").
Traceback (most recent call last):
  File "/home/codespace/.python/current/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    valu

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000355 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 747
[LightGBM] [Info] Number of data points in the train set: 2445, number of used features: 54
[LightGBM] [Info] Start training from score 30833496.932515


TypeError: LGBMRegressor.fit() got an unexpected keyword argument 'early_stopping_rounds'