# Hackathon Challenge: Predicting Restaurant Annual Turnover

This notebook outlines the process of building a machine learning model to predict the annual turnover of restaurants across India based on various features provided in the dataset.

## Import Necessary Libraries

In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from math import sqrt
import lightgbm as lgb

# Load the Datasets
train_df = pd.read_csv('../data/Train_dataset_(1).csv')
test_df = pd.read_csv('../data/Test_dataset_(1).csv')

# Feature Engineering Function
def feature_engineering(df):
    # Existing feature engineering steps
    df['Opening Day of Restaurant'] = pd.to_datetime(df['Opening Day of Restaurant'], errors='coerce')
    df['Restaurant Age'] = (datetime.now() - df['Opening Day of Restaurant']).dt.days / 365
    df.drop('Opening Day of Restaurant', axis=1, inplace=True)
    df['Cuisine Count'] = df['Cuisine'].apply(lambda x: len(x.split(',')))
    df.drop('Cuisine', axis=1, inplace=True)
    ratings_columns = ['Overall Restaurant Rating', 'Live Music Rating', 'Comedy Gigs Rating', 
                       'Value Deals Rating', 'Live Sports Rating']
    df[ratings_columns] = SimpleImputer(strategy='median').fit_transform(df[ratings_columns])
    
    # New feature engineering steps
    # Example: Interaction between 'Facebook Popularity Quotient' and 'Instagram Popularity Quotient'
    df['Social Media Popularity'] = df['Facebook Popularity Quotient'] * df['Instagram Popularity Quotient']
    
    return df

train_df = feature_engineering(train_df)
test_df = feature_engineering(test_df)

X = train_df.drop(['Annual Turnover', 'Registration Number'], axis=1)
y = train_df['Annual Turnover']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Update preprocessing for numerical data to include polynomial features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('polynomial', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler())
])

# Update the pipeline to use an advanced model tuning strategy
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('model', lgb.LGBMRegressor(objective='regression'))])

# New hyperparameter search space for RandomizedSearchCV
param_distributions = {
    'model__num_leaves': np.arange(20, 200),
    'model__max_depth': [-1, 5, 10, 15, 20],
    'model__learning_rate': [0.001, 0.01, 0.05, 0.1, 0.2],
    'model__n_estimators': np.arange(100, 1000, 100),
    'model__colsample_bytree': [0.6, 0.8, 1.0],
    'model__reg_alpha': [0, 0.1, 0.5, 1.0],
    'model__reg_lambda': [0, 0.1, 0.5, 1.0]
}

# Using RandomizedSearchCV for model tuning
random_search = RandomizedSearchCV(model, param_distributions, n_iter=50, cv=3, scoring='neg_root_mean_squared_error', random_state=42)
random_search.fit(X_train, y_train)

model = random_search.best_estimator_

# Prediction and RMSE Calculation
val_predictions = model.predict(X_val)
rmse_val = sqrt(mean_squared_error(y_val, val_predictions))
print(f"Validation RMSE: {rmse_val}")

# Predictions for test dataset
test_predictions = model.predict(test_df.drop(['Registration Number'], axis=1))

# Generate and save submission dataframe
submission_df = pd.DataFrame({
    'Registration Number': test_df['Registration Number'],
    'Annual Turnover': test_predictions
})
submission_path = '../data/submission_advanced.csv'
submission_df.to_csv(submission_path, index=False)


  df['Opening Day of Restaurant'] = pd.to_datetime(df['Opening Day of Restaurant'], errors='coerce')
  df['Opening Day of Restaurant'] = pd.to_datetime(df['Opening Day of Restaurant'], errors='coerce')


NameError: name 'preprocessor' is not defined