In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

print("hello")

hello


In [2]:
# Your custom column names
column_names = ['index', 'restaurant_id', 'list_position', 'total_available_restaurants', 'estimate_delivery_time', 'menu_category', 'star_rating', 'purchasers']

# Load the DataFrame, skipping the first row and directly assigning your custom column names
df = pd.read_csv('data_train.csv', header=0)

# Assign your custom column names
df.columns = column_names

In [3]:
df.shape

(25668, 8)

In [4]:
# top 5 rows 
df.head()

Unnamed: 0,index,restaurant_id,list_position,total_available_restaurants,estimate_delivery_time,menu_category,star_rating,purchasers
0,19499,68,19,26,35,indian,1.0,42
1,5515,899,9,29,20,italian,,49
2,5461,2964,2,11,20,american,4.0,40
3,2868,1993,10,14,25,indian,,45
4,26403,25,49,50,45,indian,,54


In [5]:
# drop star_rating, too many missing values for it to be valuable 
df = df.drop(columns='index')
# no missing values now 
df.isnull().sum()

restaurant_id                  0
list_position                  0
total_available_restaurants    0
estimate_delivery_time         0
menu_category                  0
star_rating                    0
purchasers                     0
dtype: int64

In [6]:
df['restaurant_id'] = df['restaurant_id'].astype('category')
df.dtypes

restaurant_id                  category
list_position                     int64
total_available_restaurants       int64
estimate_delivery_time            int64
menu_category                    object
star_rating                      object
purchasers                        int64
dtype: object

In [7]:
df['star_rating'].unique()

array(['          1 ', '            ', '          4 ', '          3 ',
       '          5 ', '          2 '], dtype=object)

In [8]:
df['star_rating'] = df['star_rating'].str.strip()  # Remove leading and trailing whitespace
df['star_rating'] = df['star_rating'].replace('', np.nan)  # Replace empty strings with NaN
df['star_rating'] = df['star_rating'].astype(float)  # Convert to float
df['star_rating'] = df['star_rating'].astype('category')  # Convert to categorical

# Display the first few rows of the DataFrame to verify
print(df['star_rating'].unique())

[1.0, NaN, 4.0, 3.0, 5.0, 2.0]
Categories (5, float64): [1.0, 2.0, 3.0, 4.0, 5.0]


In [9]:
# Encode categorical variables
label_encoder = LabelEncoder()
df['menu_category'] = label_encoder.fit_transform(df['menu_category'])
df['star_rating'] = label_encoder.fit_transform(df['star_rating'])


In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

# Define the models to be used
models = {
    'LinearRegression': LinearRegression(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
    'SVR': SVR(),
    'KNeighborsRegressor': KNeighborsRegressor()
}

# Function to calculate RMSE
def calculate_rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Multi-Armed Bandit Approach
def multi_armed_bandit(data, target, models):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)
    
    # Initialize a dictionary to store the RMSE of each model
    model_rmse = {model: [] for model in models}
    
    # Perform n_rounds of evaluation
    for model_name, model in models.items():
        # Fit the model
        model.fit(X_train, y_train)
        # Predict on the test set
        y_pred = model.predict(X_test)
        # Calculate RMSE and store it
        rmse = calculate_rmse(y_test, y_pred)
        model_rmse[model_name].append(rmse)
    
    # Calculate the average RMSE for each model
    avg_rmse = {model: np.mean(rmses) for model, rmses in model_rmse.items()}
    
    # Create a DataFrame to display the results
    results_df = pd.DataFrame(list(avg_rmse.items()), columns=['Model', 'Average RMSE'])
    return results_df

# Load your dataset
# Assuming your dataset is in a CSV file called 'data.csv' and the target column is 'target'
data = df
target = data.pop('purchasers')

# Run the Multi-Armed Bandit approach
results_df = multi_armed_bandit(data, target, models)

# Display the results
print(results_df)


                   Model  Average RMSE
0       LinearRegression     34.047931
1  DecisionTreeRegressor     47.976598
2  RandomForestRegressor     35.972551
3                    SVR     35.610104
4    KNeighborsRegressor     35.499754


In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load your dataset
data = pd.read_csv('data_train.csv')
# Your custom column names
column_names = ['index', 'restaurant_id', 'list_position', 'total_available_restaurants', 'estimate_delivery_time', 'menu_category', 'star_rating', 'purchasers']
# Assign your custom column names
data.columns = column_names

# Define the target and features
target = data['purchasers']
features = data.drop(columns=['purchasers'])

# Preprocess the categorical features using OneHotEncoder
categorical_features = ['restaurant_id', 'menu_category', 'star_rating']
numeric_features = ['list_position', 'total_available_restaurants', 'estimate_delivery_time']

# Column transformer to apply preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', 'passthrough', numeric_features)
    ])

# Define the model
model = DecisionTreeRegressor(random_state=42)

# Create a pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'model__max_depth': [5, 10, 20, None],
    'model__min_samples_split': [2, 10, 20],
    'model__min_samples_leaf': [1, 5, 10],
    'model__max_features': [None, 'sqrt', 'log2']
}

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Predict on the test set
y_pred = best_model.predict(X_test)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Display the results
print(f'Best Parameters: {grid_search.best_params_}')
print(f'RMSE: {rmse}')

Best Parameters: {'model__max_depth': None, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 5, 'model__min_samples_split': 20}
RMSE: 33.854491242692184


In [12]:
# import numpy as np
# import pandas as pd
# from sklearn.model_selection import train_test_split, GridSearchCV
# from sklearn.neighbors import KNeighborsRegressor
# from sklearn.metrics import mean_squared_error
# from sklearn.preprocessing import OneHotEncoder, StandardScaler
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# 
# # Load your dataset
# data = pd.read_csv('data_train.csv')
# # Your custom column names
# column_names = ['index', 'restaurant_id', 'list_position', 'total_available_restaurants', 'estimate_delivery_time', 'menu_category', 'star_rating', 'purchasers']
# # Assign your custom column names
# data.columns = column_names
# 
# # Define the target and features
# target = data['purchasers']
# features = data.drop(columns=['purchasers'])
# 
# # Preprocess the categorical features using OneHotEncoder
# categorical_features = ['restaurant_id', 'menu_category', 'star_rating']
# numeric_features = ['list_position', 'total_available_restaurants', 'estimate_delivery_time']
# 
# # Column transformer to apply preprocessing steps
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
#         ('num', StandardScaler(), numeric_features)
#     ])

# Define the K-Neighbors Regressor model
model = KNeighborsRegressor()

# Create a pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'model__n_neighbors': [3, 5, 7],
    'model__weights': ['uniform', 'distance'],
    'model__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Use a smaller subset for initial tuning
X_train_sub, _, y_train_sub, _ = train_test_split(X_train, y_train, test_size=0.8, random_state=42)

# GridSearchCV for hyperparameter tuning with reduced number of folds
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the model
grid_search.fit(X_train_sub, y_train_sub)

# Get the best model
best_model = grid_search.best_estimator_

# Predict on the test set
y_pred = best_model.predict(X_test)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Display the results
print(f'Best Parameters: {grid_search.best_params_}')
print(f'RMSE: {rmse}')

Best Parameters: {'model__algorithm': 'auto', 'model__n_neighbors': 7, 'model__weights': 'uniform'}
RMSE: 36.01849398465335


In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Define the Ridge Regression model
model = Ridge()

# Create a pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'model__alpha': [0.01, 0.1, 1.0, 10.0, 100.0]
}

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Predict on the test set
y_pred = best_model.predict(X_test)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Display the results
print(f'Best Parameters: {grid_search.best_params_}')
print(f'RMSE: {rmse}')


Best Parameters: {'model__alpha': 10.0}
RMSE: 32.2028266761723


In [18]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression


# Load your dataset
data = pd.read_csv('data_train.csv')
# Your custom column names
column_names = ['index', 'restaurant_id', 'list_position', 'total_available_restaurants', 'estimate_delivery_time', 'menu_category', 'star_rating', 'purchasers']
# Assign your custom column names
data.columns = column_names

# Define the target and features
target = data['purchasers']
features = data.drop(columns=['purchasers'])
# Preprocess the categorical features using OneHotEncoder and numeric features using StandardScaler and PolynomialFeatures
categorical_features = ['restaurant_id', 'menu_category', 'star_rating']
numeric_features = ['list_position', 'total_available_restaurants', 'estimate_delivery_time']

# Column transformer to apply preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', Pipeline([
            ('scaler', StandardScaler()),
            ('poly', PolynomialFeatures(degree=2, include_bias=False))
        ]), numeric_features)
    ])

# Define the Ridge Regression model
model = Ridge()

# Create a pipeline with feature selection
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_regression, k='all')),
    ('model', model)
])

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'feature_selection__k': [10, 20, 'all'],
    'model__alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
}

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Predict on the test set
y_pred = best_model.predict(X_test)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Display the results
print(f'Best Parameters: {grid_search.best_params_}')
print(f'RMSE: {rmse}')

Best Parameters: {'feature_selection__k': 'all', 'model__alpha': 10.0}
RMSE: 32.17273114631744


In [31]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression


# Load your dataset
data = pd.read_csv('data_train.csv')
# Your custom column names
column_names = ['index', 'restaurant_id', 'list_position', 'total_available_restaurants', 'estimate_delivery_time', 'menu_category', 'star_rating', 'purchasers']
# Assign your custom column names
data.columns = column_names

# Feature Engineering
data['delivery_efficiency'] = data['estimate_delivery_time'] / (data['total_available_restaurants'] + 1)
data['position_ratio'] = data['list_position'] / (data['total_available_restaurants'] + 1)
# data['rating_position_interaction'] = data['star_rating'].astype(str) + "_" + data['list_position'].astype(str)
# data['category_position_interaction'] = data['menu_category'].astype(str) + "_" + data['list_position'].astype(str)

# Define the target and features
target = data['purchasers']
features = data.drop(columns=['purchasers'])

# Preprocess the categorical features using OneHotEncoder and numeric features using StandardScaler and PolynomialFeatures
categorical_features = ['restaurant_id', 'menu_category', 'star_rating']
numeric_features = ['list_position', 'total_available_restaurants', 'estimate_delivery_time', 'delivery_efficiency', 'position_ratio']

# Column transformer to apply preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', Pipeline([
            ('scaler', StandardScaler()),
            ('poly', PolynomialFeatures(degree=2, include_bias=False))
        ]), numeric_features)
    ])

# Define the Ridge Regression model
model = Ridge()

# Create a pipeline with feature selection
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_regression, k='all')),
    ('model', model)
])

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'feature_selection__k': [10, 20, 'all'],
    'model__alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
}

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=45)

# GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Perform cross-validation
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

# Predict on the test set
y_pred = best_model.predict(X_test)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Calculate average RMSE from cross-validation
cv_rmse = np.sqrt(-cv_scores).mean()

# Display the results
print(f'Best Parameters: {grid_search.best_params_}')
print(f'RMSE on test set: {rmse}')
print(f'Cross-validated RMSE: {cv_rmse}')

Best Parameters: {'feature_selection__k': 'all', 'model__alpha': 10.0}
RMSE on test set: 31.719745298852118
Cross-validated RMSE: 32.48199999820724


In [33]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression

# Load your dataset
data = pd.read_csv('data_train.csv')

# Your custom column names
column_names = ['index', 'restaurant_id', 'list_position', 'total_available_restaurants', 'estimate_delivery_time', 'menu_category', 'star_rating', 'purchasers']
data.columns = column_names

# Feature Engineering
data['delivery_efficiency'] = data['estimate_delivery_time'] / (data['total_available_restaurants'] + 1)
data['position_ratio'] = data['list_position'] / (data['total_available_restaurants'] + 1)

# Define the target and features
target = data['purchasers']
features = data.drop(columns=['purchasers'])

# Preprocess the categorical features using OneHotEncoder and numeric features using StandardScaler and PolynomialFeatures
categorical_features = ['restaurant_id', 'menu_category', 'star_rating']
numeric_features = ['list_position', 'total_available_restaurants', 'estimate_delivery_time', 'delivery_efficiency', 'position_ratio']

# Column transformer to apply preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', Pipeline([
            ('scaler', StandardScaler()),
            ('poly', PolynomialFeatures(degree=2, include_bias=False))
        ]), numeric_features)
    ])

# Define the Linear Regression model
model = LinearRegression()

# Create a pipeline with feature selection
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_regression, k='all')),
    ('model', model)
])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=45)

# Perform cross-validation to evaluate the model
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the model on the training data
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Calculate average RMSE from cross-validation
cv_rmse = np.sqrt(-cv_scores).mean()

# Display the results
print(f'RMSE on test set: {rmse}')
print(f'Cross-validated RMSE: {cv_rmse}')


RMSE on test set: 32.10657353439663
Cross-validated RMSE: 32.835520221096665
