# Hackathon Challenge: Predicting Restaurant Annual Turnover

This notebook outlines the process of building a machine learning model to predict the annual turnover of restaurants across India based on various features provided in the dataset.

## Import Necessary Libraries

In [4]:
# Import necessary libraries
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from math import sqrt
import lightgbm as lgb

# Read the training and test CSVs (paths are relative to the notebook's
# working directory) into two DataFrames in a single pass.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/Train_dataset_(1).csv', '../data/Test_dataset_(1).csv')
)

# Feature Engineering Function
def feature_engineering(df, reference_date=None):
    """Derive model features from the raw restaurant columns, in place.

    The frame passed in is modified and also returned:

    * ``'Opening Day of Restaurant'`` is replaced by ``'Restaurant Age'``
      (whole days elapsed up to the reference date, divided by 365).
      Unparseable or missing dates yield NaN.
    * ``'Cuisine'`` is replaced by ``'Cuisine Count'`` (number of
      comma-separated entries). Missing cuisines yield NaN instead of
      raising, so a later imputation step can handle them.
    * The five rating columns have missing values filled with that column's
      median, computed on ``df`` itself, and are stored as floats.

    Args:
        df: DataFrame containing 'Opening Day of Restaurant', 'Cuisine' and
            the five rating columns listed in the body.
        reference_date: Optional date (anything ``pd.Timestamp`` accepts)
            against which restaurant age is measured. Defaults to the
            current local time, which makes the age feature depend on when
            the code runs; pass a fixed date for reproducible features.

    Returns:
        The same DataFrame object, with the engineered columns.

    Raises:
        KeyError: If any of the required columns is absent.
    """
    rating_columns = ['Overall Restaurant Rating', 'Live Music Rating', 'Comedy Gigs Rating',
                      'Value Deals Rating', 'Live Sports Rating']

    # Convert "Opening Day of Restaurant" into restaurant age.
    # NOTE(review): the recorded notebook output echoes this `pd.to_datetime`
    # line as the source of a pandas warning; presumably the date format is
    # ambiguous -- confirm the CSV's format and pass `format=` explicitly.
    as_of = datetime.now() if reference_date is None else pd.Timestamp(reference_date)
    opened = pd.to_datetime(df['Opening Day of Restaurant'], errors='coerce')
    df['Restaurant Age'] = (as_of - opened).dt.days / 365
    df.drop('Opening Day of Restaurant', axis=1, inplace=True)

    # Encode "Cuisine" by the number of cuisines offered. Non-string values
    # (NaN for a missing cell) have no `.split`, so map them to NaN rather
    # than letting the whole transformation fail.
    df['Cuisine Count'] = df['Cuisine'].map(
        lambda value: len(value.split(',')) if isinstance(value, str) else float('nan')
    )
    df.drop('Cuisine', axis=1, inplace=True)

    # Median-impute the ratings. A column with no observed values has no
    # median and is left as NaN instead of breaking the assignment.
    ratings = df[rating_columns].astype(float)
    df[rating_columns] = ratings.fillna(ratings.median()).to_numpy()

    return df

# Run the shared feature engineering over the training and test frames.
train_df = feature_engineering(train_df)
test_df = feature_engineering(test_df)

# Separate the target from the predictors; the registration number is only
# an identifier, so it is excluded from the model inputs as well.
target_column = 'Annual Turnover'
y = train_df[target_column]
X = train_df.drop(columns=[target_column, 'Registration Number'])

# Hold out 20% of the rows for validation, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Inspect the dtypes of the training features (shown by the notebook only
# when this is the last expression of a cell).
X_train.dtypes


# Partition the model inputs into categorical and numerical feature lists.
# An earlier draft built these lists from a misspelled column name
# ('Endoresed By') and was immediately overwritten; only the corrected
# definitions are kept.
all_features = set(X_train.columns)
categorical_features = ['City', 'Restaurant Location', 'Endorsed By', 'Restaurant Type', 'Restaurant Theme']

# Fail early, with the offending names, if a categorical column is absent
# from the data (e.g. because of a typo in the list above).
missing_categorical = [col for col in categorical_features if col not in all_features]
if missing_categorical:
    raise KeyError(f"Categorical features not found in X_train: {missing_categorical}")

# Everything that is not categorical is treated as numerical, preserving the
# column order of X_train.
numerical_features = [col for col in X_train.columns if col not in categorical_features]


# Numerical columns: fill gaps with the column mean, then standardise.
numerical_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(numerical_steps)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fitting are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(categorical_steps)

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])




# End-to-end estimator: preprocessing followed by a LightGBM regressor.
regressor = lgb.LGBMRegressor(objective='regression')
model = Pipeline([
    ('preprocessor', preprocessor),
    ('model', regressor),
])

# Hyper-parameter grid; the 'model__' prefix targets the regressor step.
param_grid = {
    'model__num_leaves': [31, 127, 200],
    'model__max_depth': [5, 10, 15],
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__n_estimators': [100, 200, 500],
    'model__colsample_bytree': [0.5, 0.75, 1.0],  # feature fraction per tree
    'model__reg_alpha': [0.0, 0.1, 1.0],  # L1 regularization
    'model__reg_lambda': [0.0, 0.1, 1.0],  # L2 regularization
}

# Exhaustive 3-fold search over the grid, scored by (negated) RMSE.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=3,
)
grid_search.fit(X_train, y_train)

# Keep the best pipeline found by the search.
model = grid_search.best_estimator_

# Score the chosen pipeline on the held-out validation rows.
val_predictions = model.predict(X_val)
val_mse = mean_squared_error(y_val, val_predictions)
rmse_val = sqrt(val_mse)
print(f"Validation RMSE: {rmse_val}")

def add_advanced_features(df):
    """Add interaction features to ``df`` in place and return it.

    Currently adds 'Rating_Popularity_Interaction', the element-wise product
    of 'Overall Restaurant Rating' and 'Social Media Popularity'.
    """
    rating = df['Overall Restaurant Rating']
    popularity = df['Social Media Popularity']
    # Interaction between rating and popularity; more features can follow.
    df['Rating_Popularity_Interaction'] = rating * popularity
    return df

# Add the interaction feature to both datasets.
train_df = add_advanced_features(train_df)
test_df = add_advanced_features(test_df)

# X, the train/validation split and the preprocessor's column lists were all
# derived before the new column existed, so rebuild them; otherwise the
# model never sees the new feature. The same random_state keeps the same
# rows in each split.
X = train_df.drop(['Annual Turnover', 'Registration Number'], axis=1)
y = train_df['Annual Turnover']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

numerical_features = [col for col in X_train.columns if col not in categorical_features]
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('model', lgb.LGBMRegressor(objective='regression'))])

# Search the same `param_grid` on the expanded feature set.
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_root_mean_squared_error', verbose=1)
grid_search.fit(X_train, y_train)

# Adopt the best estimator from this search; without this assignment the
# search result is discarded and later predictions use the previous model.
model = grid_search.best_estimator_

# Report validation RMSE for the re-fitted model.
val_predictions = model.predict(X_val)
rmse_val = sqrt(mean_squared_error(y_val, val_predictions))
print(f"Validation RMSE: {rmse_val}")

# Predict on the test set; the registration number is an identifier, not a
# model input, so it is removed before calling the pipeline.
id_column = 'Registration Number'
test_features = test_df.drop(columns=[id_column])
test_predictions = model.predict(test_features)

# Pair each registration number with its predicted turnover.
submission_df = pd.DataFrame({
    'Registration Number': test_df[id_column],
    'Annual Turnover': test_predictions,
})

# Write the submission CSV without the index column.
submission_path = '../data/submission8.csv'
submission_df.to_csv(submission_path, index=False)


  df['Opening Day of Restaurant'] = pd.to_datetime(df['Opening Day of Restaurant'], errors='coerce')
  df['Opening Day of Restaurant'] = pd.to_datetime(df['Opening Day of Restaurant'], errors='coerce')


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000611 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 977
[LightGBM] [Info] Number of data points in the train set: 1862, number of used features: 51
[LightGBM] [Info] Start training from score 30160848.549946
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000197 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 976
[LightGBM] [Info] Number of data points in the train set: 1863, number of used features: 50
[LightGBM] [Info] Start training from score 30972893.183038
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000573 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 975
[LightGBM] [Info] Number of data points in the 