# Hackathon Challenge: Predicting Restaurant Annual Turnover

This notebook outlines the process of building a machine learning model to predict the annual turnover of restaurants across India based on various features provided in the dataset.

## Import Necessary Libraries

In [20]:
# Import necessary libraries
import pandas as pd
import optuna
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from math import sqrt
from lightgbm import LGBMRegressor

# Read the training and test CSV files into DataFrames (train first, then test)
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/Train_dataset_(1).csv', '../data/Test_dataset_(1).csv')
)

# Adjusted Feature Engineering
def feature_engineering(df):
    """Return a copy of ``df`` with date and cuisine columns turned into numeric features.

    Adds three columns and removes the two raw columns they are derived from:

    * ``Year Opened``    - year parsed from ``Opening Day of Restaurant``
      (day-first parsing; unparseable values become missing).
    * ``Restaurant Age`` - current calendar year minus ``Year Opened``.
    * ``Cuisine Count``  - number of comma-separated entries in ``Cuisine``
      (0 when the value is missing).

    Args:
        df: DataFrame containing the ``Opening Day of Restaurant`` and
            ``Cuisine`` columns.

    Returns:
        A new DataFrame; the frame passed in is not modified.

    Raises:
        KeyError: If either required column is absent.
    """
    # Work on a copy so the caller's frame is never mutated; this keeps the
    # function safe to call on a frame that is still needed in its raw form.
    df = df.copy()

    # errors='coerce' turns unparseable dates into NaT instead of raising.
    opening_day = pd.to_datetime(
        df['Opening Day of Restaurant'], errors='coerce', dayfirst=True
    )
    df['Year Opened'] = opening_day.dt.year
    df['Restaurant Age'] = datetime.now().year - df['Year Opened']
    df['Cuisine Count'] = df['Cuisine'].apply(
        lambda x: len(x.split(',')) if pd.notnull(x) else 0
    )

    # The raw columns are fully represented by the derived features above.
    return df.drop(columns=['Opening Day of Restaurant', 'Cuisine'])

# Apply the engineered features to both datasets (train first, then test)
train_df, test_df = feature_engineering(train_df), feature_engineering(test_df)

# Target vector, and predictors without the target and the registration ID
y = train_df['Annual Turnover']
X = train_df.drop(columns=['Annual Turnover', 'Registration Number'])

# Hold out 30% of the rows for validation with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=59, test_size=0.3
)

categorical_features = [
    'City',
    'Restaurant Location',
    'Endorsed By',
    'Restaurant Type',
    'Restaurant Theme',
]
# Every remaining predictor column is treated as numerical
numerical_features = [col for col in X.columns if col not in categorical_features]

# Mean-impute the numerical columns and one-hot encode the categorical ones;
# categories unseen during fit are ignored at transform time
preprocessor = ColumnTransformer([
    ('num', SimpleImputer(strategy='mean'), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

def objective(trial):
    """Optuna objective: validation RMSE of one sampled LightGBM configuration.

    Samples hyperparameters from ``trial``, fits the module-level
    ``preprocessor`` on ``X_train``, trains an ``LGBMRegressor`` on the
    transformed training data and scores it on the transformed validation
    split (``X_val`` / ``y_val``).

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        Root mean squared error of the model's predictions on the
        validation split (lower is better).
    """
    # Suggest parameters for the LightGBM model
    param = {
        'objective': 'regression',
        'metric': 'rmse',
        'verbosity': -1,
        'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt']),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.05, log=True),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        # and matches how learning_rate is sampled above.
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 31, 128),  # Reduce for smaller datasets
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 0.9),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 0.9),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),  # Increase for overfitting control
        'max_depth': trial.suggest_int('max_depth', 5, 15),  # Limiting depth to control overfitting
        'min_split_gain': trial.suggest_float('min_split_gain', 0, 0.1),  # Additional control for splitting
        # NOTE(review): LightGBM documents `subsample` as an alias of
        # `bagging_fraction` and `colsample_bytree` as an alias of
        # `feature_fraction`, so these two probably do not add independent
        # search dimensions. They are still sampled so the trial's parameter
        # keys stay unchanged - confirm and consider dropping them.
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
    }

    # Preprocess the data: fit on the training split only, then apply the
    # fitted transformer to the validation split.
    X_train_preprocessed = preprocessor.fit_transform(X_train)
    X_val_preprocessed = preprocessor.transform(X_val)

    # Fit the model. No early-stopping argument or callback is passed here,
    # so eval_set is only used to evaluate the metric during training.
    model = LGBMRegressor(**param)
    model.fit(
        X_train_preprocessed,
        y_train,
        eval_set=[(X_val_preprocessed, y_val)]
    )

    # NOTE(review): best_iteration_ is only meaningful when early stopping is
    # configured; presumably predict falls back to all trees here - confirm.
    preds = model.predict(X_val_preprocessed, num_iteration=model.best_iteration_)
    rmse = sqrt(mean_squared_error(y_val, preds))
    return rmse


# Search for the hyperparameters that minimise the objective's validation RMSE
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# Take the best trial's parameters, leaving out keys that belong to fit()
# rather than to the estimator constructor
best_params = {
    name: value
    for name, value in study.best_trial.params.items()
    if name not in ('early_stopping_rounds', 'eval_metric', 'eval_set')
}

# Chain preprocessing and the tuned regressor, then fit on the training split
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LGBMRegressor(**best_params)),
])
model.fit(X_train, y_train)

# Report the RMSE of the fitted pipeline on the validation split
val_predictions = model.predict(X_val)
rmse_val = sqrt(mean_squared_error(y_val, val_predictions))
print(f"Validation RMSE: {rmse_val}")

# Predict on the test set (without the ID column) and write the submission
# file with one row per registration number
test_predictions = model.predict(test_df.drop(columns=['Registration Number']))
submission_df = test_df[['Registration Number']].assign(
    **{'Annual Turnover': test_predictions}
)
submission_path = '../data/submission_lgb_v5.csv'
submission_df.to_csv(submission_path, index=False)


[I 2024-02-09 21:18:55,498] A new study created in memory with name: no-name-55178fea-96a1-40e9-a3fe-798198a8ed57
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
[I 2024-02-09 21:18:55,607] Trial 0 finished with value: 19230923.21457975 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.009108734594498165, 'lambda_l1': 0.07587625226343363, 'lambda_l2': 4.640687509698562, 'num_leaves': 37, 'feature_fraction': 0.7729788647408471, 'bagging_fraction': 0.5185399928756116, 'bagging_freq': 5, 'min_child_samples': 19, 'max_depth': 14, 'min_split_gain': 0.012572151206218997, 'subsample': 0.8754632465851877, 'colsample_bytree': 0.8241641844018247}. Best is trial 0 with value: 19230923.21457975.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000361 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1276
[LightGBM] [Info] Number of data points in the train set: 2445, number of used features: 59
[LightGBM] [Info] Start training from score 30833496.932515


  'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
[I 2024-02-09 21:18:55,717] Trial 1 finished with value: 18959435.723572865 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.01465497011504406, 'lambda_l1': 0.00016137375415387233, 'lambda_l2': 5.083727087343212e-06, 'num_leaves': 127, 'feature_fraction': 0.816258508227192, 'bagging_fraction': 0.6115003833681139, 'bagging_freq': 5, 'min_child_samples': 40, 'max_depth': 10, 'min_split_gain': 0.019531877711114743, 'subsample': 0.7731158081727677, 'colsample_bytree': 0.7859411570380794}. Best is trial 1 with value: 18959435.723572865.
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
[I 2024-02-09 21:18:55,832] Trial 2 finished with value: 18996249.409414258 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.014152512876648473, 'lambda_l1': 9.86331659689876

ValueError: Specifying the columns using strings is only supported for pandas DataFrames