## Training Regression Model for Restaurant Revenue Prediction

In [6]:
import os
import boto3
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import joblib
import logging

# Configure the root logger so every function in this notebook logs to app.log.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

# Re-running this cell (or re-importing the module) must not stack another
# FileHandler on the root logger; otherwise every record is written to
# app.log once per execution. Reuse the handler already bound to app.log.
_log_file_path = os.path.abspath('app.log')
file_handler = next(
    (
        handler
        for handler in logger.handlers
        if isinstance(handler, logging.FileHandler)
        and handler.baseFilename == _log_file_path
    ),
    None,
)
if file_handler is None:
    file_handler = logging.FileHandler('app.log')
    logger.addHandler(file_handler)
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(formatter)

logger.info('This is an info message')

# Kaggle and S3 configuration
KAGGLE_DATASET = 'anthonytherrien/restaurant-revenue-prediction-dataset'
S3_BUCKET = 'restaurant-revenue'
S3_PREFIX = 'restaurant-revenue-prediction'


### Load data from Kaggle

In [7]:

# Function to download dataset from Kaggle
def download_kaggle_dataset(dataset_name, download_path):
    """Download and unzip a Kaggle dataset using the ``kaggle`` command-line tool.

    Args:
        dataset_name: Kaggle dataset identifier passed to ``-d``.
        download_path: Directory passed to ``-p``; a leading ``~`` is expanded.

    Returns:
        None. The outcome is reported through the module logger: a success
        message only when the command exits with status 0, an error message
        when the CLI is missing or exits with a non-zero status.
    """
    import subprocess  # local import keeps this function self-contained

    logger.info("Downloading dataset from Kaggle...")
    # An argument list (no shell) means the dataset name and path cannot be
    # interpreted as shell syntax. The shell used to expand "~", so do that
    # explicitly to keep such paths working.
    command = [
        'kaggle', 'datasets', 'download',
        '-d', str(dataset_name),
        '-p', os.path.expanduser(str(download_path)),
        '--unzip',
    ]
    try:
        # check=True raises CalledProcessError on a non-zero exit status,
        # so a failed download is no longer logged as a success.
        subprocess.run(command, check=True)
    except (OSError, subprocess.SubprocessError) as e:
        logger.error(f"Failed to download dataset from Kaggle: {e}")
    else:
        logger.info("Dataset downloaded successfully.")


In [8]:

# Function to load dataset
def load_dataset(file_path):
    """Read a CSV file into a pandas DataFrame.

    Args:
        file_path: Location of the CSV file handed to ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when reading fails (the error is
        logged rather than raised).
    """
    logger.info("Loading dataset...")
    try:
        frame = pd.read_csv(file_path)
    except Exception as err:
        logger.error(f"Failed to load dataset: {err}")
        return None
    else:
        logger.info("Dataset loaded successfully.")
        return frame


### Data cleaning and preprocessing

In [9]:

# Function to preprocess and clean data
def preprocess_data(df, target_column=None, is_training=True, preprocessor=None):
    """Clean a restaurant DataFrame and turn it into a model-ready feature matrix.

    The "Name" column is dropped (if present) and rows containing missing
    values are removed. The caller's DataFrame is not modified.

    Args:
        df: Input DataFrame with the restaurant feature columns.
        target_column: Name of the label column; required when
            ``is_training`` is True.
        is_training: When True, a new ``ColumnTransformer`` is fitted on the
            features. When False, the supplied ``preprocessor`` is applied.
        preprocessor: Fitted transformer to apply in prediction mode.

    Returns:
        Training mode: ``(X, y, preprocessor)``.
        Prediction mode: ``(X, preprocessor)``.
        On failure the error is logged and a tuple of ``None`` values with
        the same length as the successful result for that mode is returned,
        so callers can unpack the result unconditionally.
    """
    logger.info("Preprocessing data...")
    numeric_features = [
        'Rating', 'Seating Capacity', 'Average Meal Price', 'Marketing Budget',
        'Social Media Followers', 'Chef Experience Years', 'Number of Reviews',
        'Avg Review Length', 'Ambience Score', 'Service Quality Score',
        'Weekend Reservations', 'Weekday Reservations',
    ]
    categorical_features = ['Location', 'Cuisine', 'Parking Availability']
    try:
        # errors="ignore": prediction inputs may legitimately omit "Name".
        df = df.drop(columns=["Name"], errors="ignore")
        df = df.dropna()

        if is_training:
            X = df.drop(columns=[target_column])
            y = df[target_column]
            preprocessor = ColumnTransformer(
                transformers=[
                    ('num', StandardScaler(), numeric_features),
                    ('cat', OneHotEncoder(), categorical_features),
                ],
                remainder='passthrough'
            )
            X = preprocessor.fit_transform(X)
            logger.info("Data preprocessed successfully (training).")
            return X, y, preprocessor

        X = preprocessor.transform(df)
        logger.info("Data preprocessed successfully (prediction).")
        return X, preprocessor
    except Exception as e:
        logger.error(f"Failed to preprocess data: {e}")
        # Match the arity of the success path for the requested mode;
        # a 3-tuple in prediction mode breaks two-name unpacking at the caller.
        if is_training:
            return None, None, None
        return None, None


### Train Test Split

In [10]:

# Function to split data
def split_data(X, y):
    """Split features and target into train and test partitions.

    Uses an 80/20 split with a fixed ``random_state`` of 42.

    Args:
        X: Feature matrix.
        y: Target values aligned with ``X``.

    Returns:
        ``(X_train, X_test, y_train, y_test)``, or four ``None`` values when
        the split fails (the error is logged rather than raised).
    """
    logger.info("Splitting data...")
    try:
        features_train, features_test, target_train, target_test = train_test_split(
            X,
            y,
            test_size=0.2,
            random_state=42,
        )
    except Exception as err:
        logger.error(f"Failed to split data: {err}")
        return None, None, None, None
    else:
        logger.info("Data split successfully.")
        return features_train, features_test, target_train, target_test


In [11]:

# Function to train models
def train_models(X_train, y_train):
    """Fit a random forest and an XGBoost regressor on the training data.

    Both estimators are created with ``random_state=42``.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.

    Returns:
        ``(rf_model, xgb_model)`` once both are fitted, or ``(None, None)``
        when construction or fitting fails (the error is logged).
    """
    logger.info("Training models...")
    try:
        forest = RandomForestRegressor(random_state=42)
        booster = XGBRegressor(random_state=42)

        # Fit in the same order as the return value: forest first, then booster.
        for estimator in (forest, booster):
            estimator.fit(X_train, y_train)
    except Exception as err:
        logger.error(f"Failed to train models: {err}")
        return None, None
    else:
        logger.info("Models trained successfully.")
        return forest, booster


In [12]:

# Function to evaluate models
def evaluate_models(models, X_test, y_test):
    """Log the mean squared error of each model on the test set.

    Args:
        models: Mapping of display name to fitted estimator.
        X_test: Test feature matrix.
        y_test: True target values for ``X_test``.

    Returns:
        None. Scores are written to the log. If any model fails, the error
        is logged and the remaining models are not evaluated.
    """
    logger.info("Evaluating models...")
    try:
        for label, estimator in models.items():
            predicted = estimator.predict(X_test)
            error = mean_squared_error(y_test, predicted)
            logger.info(f"{label} Model Mean Squared Error: {error}")
    except Exception as err:
        logger.error(f"Failed to evaluate models: {err}")


In [13]:

# Function to save model to S3
def save_model_to_s3(model, model_name, bucket, prefix):
    """Serialize a model with joblib and upload it to S3.

    The object key is ``{prefix}/{model_name}.joblib``.

    Args:
        model: Fitted estimator to persist.
        model_name: Base name of the uploaded artifact (without extension).
        bucket: Destination S3 bucket name.
        prefix: Key prefix inside the bucket.

    Returns:
        None. Success or failure is reported through the module logger.
    """
    import tempfile  # local import keeps this function self-contained

    logger.info(f"Saving model {model_name} to S3...")
    try:
        # A private temporary directory replaces the fixed /tmp path: it is
        # portable, cannot collide with a concurrent run, and is removed
        # once the upload has finished or failed.
        with tempfile.TemporaryDirectory() as tmp_dir:
            model_path = os.path.join(tmp_dir, f"{model_name}.joblib")
            joblib.dump(model, model_path)
            s3_client = boto3.client('s3')
            s3_client.upload_file(model_path, bucket, f"{prefix}/{model_name}.joblib")
        logger.info(f"Model {model_name} saved to S3 successfully.")
    except Exception as e:
        logger.error(f"Failed to save model to S3: {e}")


In [14]:

# Function to save preprocessor to S3
def save_preprocessor_to_s3(preprocessor, preprocessor_name, bucket, prefix):
    """Serialize a fitted preprocessor with joblib and upload it to S3.

    The object key is ``{prefix}/{preprocessor_name}.joblib``.

    Args:
        preprocessor: Fitted transformer to persist.
        preprocessor_name: Base name of the uploaded artifact (without extension).
        bucket: Destination S3 bucket name.
        prefix: Key prefix inside the bucket.

    Returns:
        None. Success or failure is reported through the module logger.
    """
    import tempfile  # local import keeps this function self-contained

    logger.info(f"Saving preprocessor {preprocessor_name} to S3...")
    try:
        # A private temporary directory replaces the fixed /tmp path: it is
        # portable, cannot collide with a concurrent run, and is removed
        # once the upload has finished or failed.
        with tempfile.TemporaryDirectory() as tmp_dir:
            preprocessor_path = os.path.join(tmp_dir, f"{preprocessor_name}.joblib")
            joblib.dump(preprocessor, preprocessor_path)
            s3_client = boto3.client('s3')
            s3_client.upload_file(preprocessor_path, bucket, f"{prefix}/{preprocessor_name}.joblib")
        logger.info(f"Preprocessor {preprocessor_name} saved to S3 successfully.")
    except Exception as e:
        logger.error(f"Failed to save preprocessor to S3: {e}")


In [15]:

# Function to load model from S3
def load_model_from_s3(model_name, bucket, prefix):
    """Download a joblib-serialized model from S3 and deserialize it.

    The object key is ``{prefix}/{model_name}.joblib``.

    Args:
        model_name: Base name of the stored artifact (without extension).
        bucket: Source S3 bucket name.
        prefix: Key prefix inside the bucket.

    Returns:
        The deserialized model, or ``None`` when the download or load fails
        (the error is logged rather than raised).
    """
    import tempfile  # local import keeps this function self-contained

    logger.info(f"Loading model {model_name} from S3...")
    try:
        s3_client = boto3.client('s3')
        # A private temporary directory replaces the fixed /tmp path: it is
        # portable, cannot collide with a concurrent run, and the downloaded
        # file is removed once the model is in memory.
        with tempfile.TemporaryDirectory() as tmp_dir:
            model_path = os.path.join(tmp_dir, f"{model_name}.joblib")
            s3_client.download_file(bucket, f"{prefix}/{model_name}.joblib", model_path)
            # NOTE(security): joblib.load unpickles the file and can execute
            # arbitrary code; only load artifacts from a bucket you trust.
            model = joblib.load(model_path)
        logger.info(f"Model {model_name} loaded successfully.")
        return model
    except Exception as e:
        logger.error(f"Failed to load model from S3: {e}")
        return None


### Load preprocessor for feature consistency 

In [16]:

# Function to load preprocessor from S3
def load_preprocessor_from_s3(preprocessor_name, bucket, prefix):
    """Download a joblib-serialized preprocessor from S3 and deserialize it.

    The object key is ``{prefix}/{preprocessor_name}.joblib``.

    Args:
        preprocessor_name: Base name of the stored artifact (without extension).
        bucket: Source S3 bucket name.
        prefix: Key prefix inside the bucket.

    Returns:
        The deserialized preprocessor, or ``None`` when the download or load
        fails (the error is logged rather than raised).
    """
    import tempfile  # local import keeps this function self-contained

    logger.info(f"Loading preprocessor {preprocessor_name} from S3...")
    try:
        s3_client = boto3.client('s3')
        # A private temporary directory replaces the fixed /tmp path: it is
        # portable, cannot collide with a concurrent run, and the downloaded
        # file is removed once the preprocessor is in memory.
        with tempfile.TemporaryDirectory() as tmp_dir:
            preprocessor_path = os.path.join(tmp_dir, f"{preprocessor_name}.joblib")
            s3_client.download_file(bucket, f"{prefix}/{preprocessor_name}.joblib", preprocessor_path)
            # NOTE(security): joblib.load unpickles the file and can execute
            # arbitrary code; only load artifacts from a bucket you trust.
            preprocessor = joblib.load(preprocessor_path)
        logger.info(f"Preprocessor {preprocessor_name} loaded successfully.")
        return preprocessor
    except Exception as e:
        logger.error(f"Failed to load preprocessor from S3: {e}")
        return None


In [17]:

if __name__ == "__main__":
    # Training pipeline: download -> load -> preprocess -> split -> train ->
    # evaluate -> upload artifacts. Each stage runs only if the previous one
    # produced a usable result; failures are logged by the stage itself.

    # Set AWS credentials as environment variables
    # os.environ['AWS_ACCESS_KEY_ID'] = 'your-access-key-id'
    # os.environ['AWS_SECRET_ACCESS_KEY'] = 'your-secret-access-key'
    # os.environ['AWS_DEFAULT_REGION'] = 'your-default-region'

    # Download dataset from Kaggle
    download_path = './'
    download_kaggle_dataset(KAGGLE_DATASET, download_path)

    # Load dataset
    file_path = os.path.join(download_path, 'restaurant_data.csv')
    data = load_dataset(file_path)

    # Bind the names up front so a failed load neither raises NameError below
    # nor silently reuses values left over from an earlier notebook run.
    X = y = preprocessor = None

    # Preprocess data
    if data is not None:
        X, y, preprocessor = preprocess_data(data, target_column='Revenue', is_training=True)

    # Split data
    if X is not None and y is not None:
        X_train, X_test, y_train, y_test = split_data(X, y)

        # Train models
        if X_train is not None and y_train is not None:
            rf_model, xgb_model = train_models(X_train, y_train)

            # Evaluate models
            if rf_model is not None and xgb_model is not None:
                models = {'Random Forest': rf_model, 'XGBoost': xgb_model}
                evaluate_models(models, X_test, y_test)

                # Save models and preprocessor to S3
                save_model_to_s3(rf_model, 'random_forest_model', S3_BUCKET, S3_PREFIX)
                save_model_to_s3(xgb_model, 'xgboost_model', S3_BUCKET, S3_PREFIX)
                save_preprocessor_to_s3(preprocessor, 'preprocessor', S3_BUCKET, S3_PREFIX)


Dataset URL: https://www.kaggle.com/datasets/anthonytherrien/restaurant-revenue-prediction-dataset
License(s): CC-BY-SA-4.0
Downloading restaurant-revenue-prediction-dataset.zip to .


100%|██████████| 339k/339k [00:00<00:00, 749kB/s]





## Example code to load the models and run a prediction

In [18]:

if __name__ == "__main__":
    # Prediction example: fetch the saved models and preprocessor from S3,
    # transform one hand-written restaurant record, and print both predictions.

    # Set AWS credentials as environment variables
    # os.environ['AWS_ACCESS_KEY_ID'] = 'your-access-key-id'
    # os.environ['AWS_SECRET_ACCESS_KEY'] = 'your-secret-access-key'
    # os.environ['AWS_DEFAULT_REGION'] = 'your-default-region'

    # Load the models and preprocessor from S3
    rf_model = load_model_from_s3('random_forest_model', S3_BUCKET, S3_PREFIX)
    xgb_model = load_model_from_s3('xgboost_model', S3_BUCKET, S3_PREFIX)
    preprocessor = load_preprocessor_from_s3('preprocessor', S3_BUCKET, S3_PREFIX)

    # The loaders signal failure with None, so test for that explicitly
    # rather than relying on the truthiness of estimator objects.
    if rf_model is not None and xgb_model is not None and preprocessor is not None:
        # Prepare new data for prediction (Example data)
        new_data = pd.DataFrame({
            'Name': ['Example Restaurant'],
            'Location': ['Downtown'],
            'Cuisine': ['Italian'],
            'Rating': [4.5],
            'Seating Capacity': [50],
            'Average Meal Price': [20.0],
            'Marketing Budget': [5000],
            'Social Media Followers': [1000],
            'Chef Experience Years': [15],
            'Number of Reviews': [150],
            'Avg Review Length': [200],
            'Ambience Score': [8.5],
            'Service Quality Score': [9.0],
            'Parking Availability': ['Yes'],
            'Weekend Reservations': [30],
            'Weekday Reservations': [20]
        })

        # Preprocess the new data. Only the first element (the feature matrix)
        # is needed; indexing instead of unpacking keeps this working
        # regardless of how many values preprocess_data returns on failure.
        preprocess_result = preprocess_data(new_data, is_training=False, preprocessor=preprocessor)
        preprocessed_data = preprocess_result[0]

        if preprocessed_data is not None:
            # Make predictions with the loaded models
            rf_prediction = rf_model.predict(preprocessed_data)
            xgb_prediction = xgb_model.predict(preprocessed_data)

            logger.info(f"Random Forest Prediction: {rf_prediction}")
            logger.info(f"XGBoost Prediction: {xgb_prediction}")

            # Print predictions
            print(f"Random Forest Prediction: {rf_prediction}")
            print(f"XGBoost Prediction: {xgb_prediction}")


Random Forest Prediction: [310169.3862]
XGBoost Prediction: [312052.03]
