In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

# Load the raw training data from the CSV file in the working directory.
df = pd.read_csv('data_train.csv')

# The column names in the file appear to be wrong, so they are overwritten
# positionally. The list must hold exactly one name per column, in file order;
# pandas raises a ValueError if the lengths do not match.
df.columns = [
    'index',
    'restaurant_id',
    'list_position',
    'total_available_restaurants',
    'estimate_delivery_time',
    'menu_category',
    'star_rating',
    'purchasers',
]

# These columns are left out of the model purely for simplicity; they may
# still carry useful signal and are worth revisiting.
# The result of drop() is reassigned rather than using inplace=True, which
# pandas discourages and which makes notebook state harder to reason about.
df = df.drop(columns=['index', 'restaurant_id', 'star_rating'])

# If I weren't lazy, I would probably do some feature engineering here ...


# Separate the 'purchasers' target from the remaining predictor columns.
y = df['purchasers']
X = df.drop(columns='purchasers')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 'menu_category' is categorical rather than numerical, so it is one-hot
# encoded before reaching the regressor. Categories that were not seen while
# fitting are ignored instead of raising an error at prediction time.
categorical_features = ['menu_category']
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Every column not listed in categorical_features is passed through unchanged.
preprocessor = ColumnTransformer(
    remainder='passthrough',
    transformers=[('cat', categorical_transformer, categorical_features)],
)

# Bundle preprocessing and the linear regressor so they are fitted and applied
# together as a single estimator.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression()),
    ]
)

# Fit the full pipeline (encoding + regression) on the training split.
model.fit(X_train, y_train)

# Predict on the held-out rows and report the root mean squared error
# between those predictions and the true targets.
y_pred = model.predict(X_test)
rmse = root_mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Root Mean Squared Error (RMSE): {rmse}')

Root Mean Squared Error (RMSE): 34.11682777022838
