In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from ipywidgets import interact, widgets

In [None]:
# Location of the Swiggy CSV. Set the SWIGGY_CSV environment variable to point at
# your own copy of the file; when it is unset, the original author's local path
# is used so existing behaviour is unchanged.
DATA_PATH = os.environ.get('SWIGGY_CSV', r'C:\Users\susha\Downloads\archive\swiggy.csv')
df = pd.read_csv(DATA_PATH)
df.head(10)

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
df.info()


In [None]:
# Summary statistics (pandas' default selection of columns).
df.describe()

In [None]:
# Number of distinct restaurant names (missing names are not counted)
restaurant_names = df['Restaurant']
unique_restaurant = restaurant_names.nunique()
unique_restaurant

In [None]:
# Break every comma-separated 'Food type' string into single cuisines
# (one per row, surrounding whitespace removed) and tally each cuisine.
food_type_lists = df['Food type'].str.split(',')
cuisines = food_type_lists.explode().str.strip()
number_of_cuisines = cuisines.value_counts()
number_of_cuisines

In [None]:
# Number of restaurants per cities
num_per_city = df.groupby('City')['City'].count()
num_per_city

In [None]:
num_per_city.idxmax()

In [None]:
# Highest rated restaurants: the ten rows with the largest 'Avg ratings'

rest_by_rating = df.sort_values('Avg ratings', ascending=False)
top_rated_columns = ['Restaurant', 'City', 'Avg ratings']
rest_by_rating[top_rated_columns].head(10)

In [None]:
# Cost of two person across cities
city_wise = df.groupby('City')['Price'].mean().reset_index()
city_wise



In [None]:
X_city = city_wise['City']
y_price = city_wise['Price']*2

plt.figure()
plt.grid()
plt.plot(X_city, y_price)
plt.scatter(X_city, y_price, c='red')
plt.xticks(rotation=90)
plt.xlabel('City')
plt.ylabel('Price for 2')
plt.show()

In [None]:
# Interactive (hover / zoom) version of the per-city price chart, via plotly express
city_wise_sorted = city_wise.sort_values('City')
fig = px.line(
    city_wise_sorted,
    x='City',
    y='Price',
    markers=True,
    title='Cost of 2 per city',
    labels={'City': 'City', 'Price': 'Price'},
)
fig.show()



In [None]:
# Distribution of restaurant ratings
# Plain matplotlib histogram of the 'Avg ratings' column with 30 bins.
ratings = df['Avg ratings']
plt.hist(ratings, bins=30, color='green', edgecolor='red')
plt.show()

In [None]:
# Same distribution drawn with seaborn, with a kernel density estimate overlaid (kde=True).
sns.histplot(data=df, x='Avg ratings', bins=30, kde=True)

In [None]:
# Rows ordered by 'Price' ascending; the bare `pr` displays the sorted frame.
pr = df.sort_values(by='Price')
pr

In [None]:
# Correlation between Price and rating
plt.scatter(df['Price'], df['Avg ratings'])
plt.show()

In [None]:
corr_mat = df[['Price','Avg ratings']].corr()

sns.heatmap(corr_mat, annot=True, cmap='coolwarm')
plt.show()

In [None]:
# Common cuisines in top rated restaurants
# cuisine_rating = df1.groupby('Cuisines')['Avg ratings'].mean().sort_values(ascending=False).head(20).reset_index()
# cuisine_rating

In [None]:
# df1 = df.copy()
# df1['Cuisines'] = df1['Food type'].str.split(',')
# df1 = df1.explode('Cuisines')
# df1['Cuisines'] = df1['Cuisines'].str.strip().str.title()
# df1
# #df1.groupby('Cuisines')['Price'].mean().sort_values(ascending=False).head(20)

In [None]:
# Expensive cuisines: build df1 with one row per (restaurant, cuisine),
# then rank cuisines by their mean price.
import re

# 'Food type' entries are separated by commas or by runs of two or more
# whitespace characters.
cuisine_lists = df['Food type'].str.split(r',|\s{2,}')

# assign() works on a copy, so df itself is left untouched; explode() repeats
# the original row (and its index label) once per cuisine.
df1 = df.assign(Cuisines=cuisine_lists).explode('Cuisines')
df1['Cuisines'] = df1['Cuisines'].str.strip().str.title()

mean_price_by_cuisine = df1.groupby('Cuisines')['Price'].mean()
mean_price_by_cuisine.sort_values(ascending=False).head(20)

In [None]:
# Top rated Cuisines everywhere
df1.groupby('Cuisines')['Avg ratings'].mean().sort_values(ascending=False).head(20)

In [None]:
# Most common cuisines in top rated restaurants

df_rating = df.sort_values(by='Avg ratings', ascending=False)
df_rating[['Restaurant','Avg ratings','Food type']].head(20)

In [None]:
# Areas having the most restaurants
df.groupby('Area')['Area'].count().idxmax()

In [None]:
num_rest = df.groupby('Area')['Area'].count().reset_index(name='count')
num_rest.sort_values(by='count', ascending=False).head(10)

In [None]:
# Average price andd rating by location
avg_price_rating = df.groupby('Area')[['Price','Avg ratings']].mean()
avg_price_rating

In [None]:
# Check all the restaurants with price equals to zero
df[df['Price']==0.0]

In [None]:
# Since there are only 5 such entries, it is better to drop them
df = df[df['Price'] !=0]

In [None]:
# Visualization
# Average restaurant rating per area (shown as a table)
mean_rating_per_area = df.groupby('Area')['Avg ratings'].mean()
avg_rating_by_area = mean_rating_per_area.reset_index(name='Ratings')
avg_rating_by_area


In [None]:
avg_rating_by_city = df.groupby('City')['Avg ratings'].mean().reset_index(name='Ratings')
sns.barplot(x=avg_rating_by_city['City'], y= avg_rating_by_city['Ratings'])
plt.xticks(rotation=90)

In [None]:
plt.plot(avg_rating_by_city['City'], avg_rating_by_city['Ratings'])
plt.scatter(avg_rating_by_city['City'], avg_rating_by_city['Ratings'],c='red')
plt.xticks(rotation=90)
plt.grid()

In [None]:
#Delivery Time vs Price by City
sns.scatterplot(x='Price', y='Delivery time', data=df, hue='City', palette='viridis')
plt.title('Delivery Time vs. Price by City')
plt.xlabel('Price')
plt.ylabel('Delivery Time')
plt.show()

In [None]:
# Pairplot
sns.pairplot(df[['Price', 'Avg ratings', 'Delivery time']])
plt.show()

In [None]:
# Word cloud of cuisines
from wordcloud import WordCloud

# NOTE(review): only the *distinct* cuisine names are joined, so each name
# contributes once regardless of how many restaurants serve it — confirm that
# is intended; df1['Cuisines'].value_counts() would be needed to size words by
# popularity.
distinct_cuisines = df1['Cuisines'].dropna().unique()
all_cuisines = ' '.join(distinct_cuisines)

cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(all_cuisines)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Share of restaurants in each city, as a pie chart
city_counts = df['City'].value_counts()
city_counts.plot.pie(autopct='%1.1f%%', figsize=(8, 8), startangle=90)
plt.ylabel('')  # hide the default y-axis label pandas adds to pie charts
plt.title('Restaurant Distribution by City')
plt.show()

In [None]:
import plotly.express as px

# Mean price per cuisine; the 20 most expensive are charted.
cuisine_price = df1.groupby('Cuisines')['Price'].mean().reset_index()
priciest_cuisines = cuisine_price.sort_values('Price', ascending=False).head(20)

fig = px.bar(
    priciest_cuisines,
    x='Cuisines',
    y='Price',
    color='Price',
    hover_data=['Price'],
    title='Average Price by Cuisine',
)
fig.update_layout(xaxis_tickangle=-45)
fig.show()

In [None]:
restaurant_counts = df.groupby(['City', 'Area']).size().reset_index(name='Count')

import plotly.express as px
fig = px.treemap(restaurant_counts, path=['City', 'Area'], values='Count',
                 title='Restaurant Distribution by City and Area')
fig.show()

In [None]:
restaurant_counts = df.groupby(['City', 'Area']).size().reset_index(name='Count')

import plotly.express as px
fig = px.sunburst(restaurant_counts, path=['City', 'Area'], values='Count',
                  title='Sunburst of Restaurants by Location')
fig.show()

In [None]:
# Box plot of 'Price' for each value of df1['Cuisines']; points='all' also draws
# every individual observation next to its box.
fig = px.box(df1, x='Cuisines', y='Price', points='all')
fig.update_layout(title='Price Distribution by Cuisine')
fig.show()

In [None]:
# Per (Area, Restaurant) in Bangalore: mean rating and summed rating count
bangalore_rows = df[df['City'] == 'Bangalore']
rest_in_Bangalore = bangalore_rows.groupby(['Area', 'Restaurant'], as_index=False).agg(
    {'Avg ratings': 'mean', 'Total ratings': 'sum'}
)

# Best-rated first within every area, then keep the top three per area
rest_in_Bangalore = (
    rest_in_Bangalore
    .sort_values(['Area', 'Avg ratings'], ascending=[True, False])
    .groupby('Area')
    .head(3)
)

fig = px.bar(
    rest_in_Bangalore,
    x='Avg ratings',
    y='Restaurant',
    color='Area',
    orientation='h',
    hover_data=['Total ratings'],
    title='Top Restaurants by Area in Bangalore',
)
fig.update_layout(height=800)
fig.show()

In [None]:


city_list = sorted(df['City'].dropna().unique().tolist())

def plot_top_restaurants_by_city(selected_city):
    """Show the three best-rated restaurants of every area in one city.

    Rows of the notebook-level ``df`` whose 'City' equals ``selected_city`` are
    grouped by (Area, Restaurant), aggregating the mean of 'Avg ratings' and
    the sum of 'Total ratings'. Within each area the three highest mean ratings
    are kept and drawn as a horizontal plotly bar chart coloured by area.

    Parameters
    ----------
    selected_city : value compared against the 'City' column.

    Returns
    -------
    None. The figure is displayed via ``fig.show()``.
    """
    city_rows = df[df['City'] == selected_city]
    per_restaurant = city_rows.groupby(['Area', 'Restaurant'], as_index=False).agg(
        {'Avg ratings': 'mean', 'Total ratings': 'sum'}
    )

    # Highest rating first inside each area, then the first three rows per area.
    top_three = (
        per_restaurant
        .sort_values(['Area', 'Avg ratings'], ascending=[True, False])
        .groupby('Area')
        .head(3)
    )

    fig = px.bar(
        top_three,
        x='Avg ratings',
        y='Restaurant',
        color='Area',
        orientation='h',
        hover_data=['Total ratings'],
        title=f'Top Restaurants by Area in {selected_city}',
    )
    fig.update_layout(height=600)

    fig.show()

# Dropdown of all cities; choosing one re-runs the plotting function.
city_dropdown = widgets.Dropdown(options=city_list, description='City:')
interact(plot_top_restaurants_by_city, selected_city=city_dropdown)


In [None]:
#df1

In [None]:
# a = df1.groupby('Cuisines')['Cuisines'].count().reset_index(name='count')
# top_30_food = a.sort_values('count', ascending=False).head(30)

a = df1['Cuisines'].value_counts().reset_index()
a.columns = ['Cuisines', 'count']
top_30_food_list = a.head(30)['Cuisines'].tolist()

In [None]:
df1['Cuisines'] = df1['Cuisines'].apply(lambda x: x if x in top_30_food_list else 'Other')

In [None]:
top_areas_per_city = df1.groupby('City')['Area'].value_counts().reset_index(name='count')

In [None]:
top_10_areas = top_areas_per_city.groupby('City')[['City','Area']].head(10)

In [None]:
top_pairs = set([tuple(x) for x in top_10_areas.to_numpy()])
                

In [None]:
df1['Area'] = df1.apply(lambda x: x['Area'] if (x['City'], x['Area']) in top_pairs else 'Other',axis=1)

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# After explode(), every row of df1['Cuisines'] holds ONE cuisine as a plain
# string. MultiLabelBinarizer expects an iterable of labels per sample, so a
# bare string would be iterated character by character and produce columns
# such as 'Cuisine_a', 'Cuisine_ '. Wrapping each value in a one-element list
# makes the whole cuisine name the label.
cuisine_label_lists = [[cuisine] for cuisine in df1['Cuisines']]

mlb = MultiLabelBinarizer()
cuisine_encoded = pd.DataFrame(
    mlb.fit_transform(cuisine_label_lists),
    columns=['Cuisine_' + c for c in mlb.classes_],
    index=df1.index  # same (repeated) index labels as df1 so the later concat lines up
)
# NOTE(review): df1 still has one row per (restaurant, cuisine), so rows of the
# same restaurant can end up on both sides of a train/test split — consider
# aggregating back to one row per restaurant before modelling.


In [None]:
area_dummies = pd.get_dummies(df1['Area'], prefix='Area')
area_updated = area_dummies.astype(int)


In [None]:
df_encoded = pd.concat([df1, area_updated, cuisine_encoded], axis=1)

In [None]:
#df_encoded

In [None]:
city_encoded = pd.get_dummies(df_encoded['City'], prefix='City')


In [None]:
city_updated = city_encoded.astype(int)
df_encoded = pd.concat([df_encoded, city_updated], axis=1)

In [None]:
df_encoded = df_encoded.drop(['ID','Area','Restaurant','Food type','Address','Cuisines','City'],axis=1)

In [None]:
dff = df_encoded.copy()

In [None]:
dff['is_highly_rated'] = (dff['Avg ratings'] > 4.0).astype(int)
dff

In [None]:
from sklearn.model_selection import train_test_split

X = dff.drop(columns=['is_highly_rated', 'Avg ratings']) 
y = dff['is_highly_rated'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

model = LogisticRegression(class_weight='balanced',max_iter=5000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

In [None]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print("Accuracy :", accuracy_score(y_test, y_pred))          # 0.654 (already known)
print("Precision:", precision_score(y_test, y_pred))         # TP / (TP + FP)
print("Recall   :", recall_score(y_test, y_pred))            # TP / (TP + FN)
print("F1 Score :", f1_score(y_test, y_pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

rf = RandomForestClassifier(class_weight='balanced', random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

print(classification_report(y_test, y_pred_rf))

In [None]:
print(confusion_matrix(y_test, y_pred_rf))

In [None]:
print("Accuracy :", accuracy_score(y_test, y_pred_rf))          # 0.654 (already known)
print("Precision:", precision_score(y_test, y_pred_rf))         # TP / (TP + FP)
print("Recall   :", recall_score(y_test, y_pred_rf))            # TP / (TP + FN)
print("F1 Score :", f1_score(y_test, y_pred_rf))

In [None]:
# Feature importance of the fitted random forest, largest first

feature_importances = rf.feature_importances_
features = X.columns
importance_df = (
    pd.DataFrame({'Feature': features, 'Importance': feature_importances})
    .sort_values('Importance', ascending=False)
)

top_20_importances = importance_df.head(20)
plt.figure(figsize=(12, 6))
sns.barplot(data=top_20_importances, x='Importance', y='Feature')
plt.title('Top 20 Important Features in Random Forest')
plt.tight_layout()
plt.show()

In [None]:
# Drop every feature whose importance is below 0.01 and refit the forest.
# Note that this overwrites rf, y_train, y_test and y_pred from earlier cells.
is_low = importance_df['Importance'] < 0.01
low_importance = importance_df.loc[is_low, 'Feature'].tolist()
X_reduced = X.drop(columns=low_importance)

X_train_reduced, X_test_reduced, y_train, y_test = train_test_split(
    X_reduced, y, test_size=0.2, random_state=42
)
rf.fit(X_train_reduced, y_train)
y_pred = rf.predict(X_test_reduced)

from sklearn.metrics import accuracy_score
print("New Accuracy:", accuracy_score(y_test, y_pred))

In [None]:
# Confusion matrix of the reduced-feature forest. y_pred holds its predictions;
# y_pred_rf belongs to the earlier full-feature model and would just repeat the
# matrix already printed above.
print(confusion_matrix(y_test, y_pred))

In [None]:
print("Accuracy :", accuracy_score(y_test, y_pred))          # 0.654 (already known)
print("Precision:", precision_score(y_test, y_pred))         # TP / (TP + FP)
print("Recall   :", recall_score(y_test, y_pred))            # TP / (TP + FN)
print("F1 Score :", f1_score(y_test, y_pred))

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# ROC/AUC need a continuous score. Hard 0/1 predictions collapse the curve to a
# single operating point and misstate the AUC, so the positive-class
# probability is used instead. rf was last fitted on X_train_reduced, hence the
# reduced test matrix.
y_score = rf.predict_proba(X_test_reduced)[:, 1]

auc = roc_auc_score(y_test, y_score)
fpr, tpr, _ = roc_curve(y_test, y_score)
plt.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

In [None]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation of the random forest on the reduced feature set; no
# `scoring` is passed, so the estimator's default score is reported.
# NOTE(review): X_reduced was selected using importances from a model fitted on
# part of this same data, so these scores may be somewhat optimistic.
scores = cross_val_score(rf, X_reduced, y, cv=5)
print("Cross-validated accuracy:", scores.mean())

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Score with positive-class probabilities rather than hard 0/1 labels, which
# would reduce the ROC curve to one operating point. rf is still the forest
# fitted on X_train_reduced (cross_val_score works on clones).
y_score = rf.predict_proba(X_test_reduced)[:, 1]

auc = roc_auc_score(y_test, y_score)
fpr, tpr, _ = roc_curve(y_test, y_score)
plt.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

In [None]:
import xgboost as xgb

# Wrap the train/test splits in XGBoost's DMatrix container.
# X_train / X_test are still the full-feature splits: the reduced-feature cell
# stored its matrices under X_train_reduced / X_test_reduced.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Booster hyper-parameters for the native xgb.train API.
params = {
    'objective': 'binary:logistic',  # For binary classification
    'eval_metric': 'logloss',        # Evaluation metric for binary classification
    'max_depth': 6,                  # Maximum depth of trees
    'eta': 0.1,                      # Learning rate
    'subsample': 0.8,                # Fraction of samples used for training trees
    'colsample_bytree': 0.8,         # Fraction of features used for training trees
    'scale_pos_weight': 1,           # Balance the positive class weight (adjust if needed)
    'n_jobs': -1                     # Use all CPU cores
}

# Note: this rebinds `model`, replacing the logistic regression fitted earlier.
model = xgb.train(params, dtrain, num_boost_round=100)

In [None]:
# The booster returns probabilities; 0.5 is the cut-off for the positive class
y_pred = model.predict(dtest)
y_pred_binary = (y_pred > 0.5).astype(int)

# Overall accuracy
accuracy = accuracy_score(y_test, y_pred_binary)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Per-class precision / recall / F1
print("\nClassification Report:", classification_report(y_test, y_pred_binary), sep="\n")

# Raw counts of correct and incorrect predictions
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred_binary), sep="\n")

In [None]:

# Refit using only four hand-picked features
important_features = ['Total ratings', 'Price', 'Delivery time', 'Area_Other']

X_train_filtered = X_train[important_features]
X_test_filtered = X_test[important_features]

# NOTE(review): use_label_encoder is reported as deprecated by recent xgboost
# releases — confirm against the installed version.
xgb_settings = dict(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
model = xgb.XGBClassifier(**xgb_settings)

model.fit(X_train_filtered, y_train)
y_pred = model.predict(X_test_filtered)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")


In [None]:
# Can we classify restaurants as expensive or affordable based on price ranges?

# Target: 1 when 'Price' is strictly above 700, else 0
dff['is_expensive'] = dff['Price'].gt(700.0).astype(int)

# 'Price' is removed from the features because the target is derived from it
y = dff['is_expensive']
X = dff.drop(columns=['Price', 'is_expensive'])



In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(class_weight='balanced',max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

In [None]:
importance = pd.Series(model.coef_[0], index=X_train.columns)

# Sort by absolute value
importance_sorted = importance.abs().sort_values(ascending=False)

# Display top features
print("Top important features:")
print(importance_sorted.head(10))

In [None]:
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

In [None]:
rf = RandomForestClassifier(class_weight='balanced', random_state=42)

rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

print(classification_report(y_test, y_pred_rf))

In [None]:
feature_importances = rf.feature_importances_
features = X.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(data=importance_df.head(20), x='Importance', y='Feature')
plt.title('Top 20 Important Features in Random Forest')
plt.tight_layout()
plt.show()

In [None]:
low_importance = importance_df[importance_df['Importance'] < 0.01]['Feature'].tolist()
X_reduced = X.drop(columns=low_importance)

X_train_reduced, X_test_reduced, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42)
rf.fit(X_train_reduced, y_train)
y_pred = rf.predict(X_test_reduced)

In [None]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

In [None]:
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# ROC/AUC are computed from the positive-class probability. Feeding hard 0/1
# predictions would reduce the curve to a single operating point and misstate
# the AUC. rf was last fitted on X_train_reduced, hence the reduced test matrix.
y_score = rf.predict_proba(X_test_reduced)[:, 1]

auc = roc_auc_score(y_test, y_score)
fpr, tpr, _ = roc_curve(y_test, y_score)
plt.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

In [None]:
import xgboost as xgb

# Wrap the splits in DMatrix containers. X_train / X_test are the full-feature
# splits for the is_expensive target (the reduced matrices live under
# X_train_reduced / X_test_reduced).
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Booster hyper-parameters for the native xgb.train API.
params = {
    'objective': 'binary:logistic',  # For binary classification
    'eval_metric': 'logloss',        # Evaluation metric for binary classification
    'max_depth': 6,                  # Maximum depth of trees
    'eta': 0.1,                      # Learning rate
    'subsample': 0.8,                # Fraction of samples used for training trees
    'colsample_bytree': 0.8,         # Fraction of features used for training trees
    'scale_pos_weight': 1,           # Balance the positive class weight (adjust if needed)
    'n_jobs': -1                     # Use all CPU cores
}

model = xgb.train(params, dtrain, num_boost_round=100)

# y_pred holds probabilities; y_pred_binary applies a 0.27 cut-off.
# NOTE(review): the 0.27 threshold is not derived anywhere in this notebook —
# document how it was chosen; if it was tuned on the test set, the metrics
# below are optimistic.
y_pred = model.predict(dtest)
y_pred_binary = (y_pred > 0.27).astype(int)  # Convert probabilities to binary labels

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_binary)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_binary))

# Confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_binary))

In [None]:
# ROC/AUC of the XGBoost booster. y_pred holds the predicted probabilities from
# the previous cell; the thresholded y_pred_binary would collapse the curve to
# one operating point and tie the AUC to the 0.27 cut-off.
auc = roc_auc_score(y_test, y_pred)
fpr, tpr, _ = roc_curve(y_test, y_pred)
plt.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

In [None]:
# LightGBM classifier for the expensive / affordable target

import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Hyper-parameters collected in one place
lgb_settings = dict(
    objective='binary',        # Binary classification
    metric='binary_logloss',   # Evaluation metric
    num_leaves=31,             # Number of leaves in one tree
    learning_rate=0.1,         # Learning rate
    n_estimators=100,          # Number of boosting iterations
    subsample=0.8,             # Fraction of data to use for training
    colsample_bytree=0.8,      # Fraction of features to use for each tree
    random_state=42,
)
lgb_model = lgb.LGBMClassifier(**lgb_settings)

# Fit on the training split and predict the held-out rows
lgb_model.fit(X_train, y_train)
y_pred_lgb = lgb_model.predict(X_test)

# Accuracy
accuracy_lgb = accuracy_score(y_test, y_pred_lgb)
print(f'Accuracy: {accuracy_lgb * 100:.2f}%')

# Per-class report
print("Classification Report:", classification_report(y_test, y_pred_lgb), sep="\n")

# Confusion matrix
print("Confusion Matrix:", confusion_matrix(y_test, y_pred_lgb), sep="\n")


In [None]:
import lightgbm as lgb
import pandas as pd

# Step 1: fit a default LightGBM model on every feature to rank them.
model = lgb.LGBMClassifier()
model.fit(X_train, y_train)

# Gain-based importance, one value per training column
importance = model.booster_.feature_importance(importance_type='gain')

# Pair each feature name with its importance, largest first
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': importance
})
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

# Step 2: keep the ten highest-gain features
top_features = feature_importance.head(10)['feature'].tolist()

X_train_filtered = X_train[top_features]
X_test_filtered = X_test[top_features]

# Step 3: refit the same estimator on the selected features only
model.fit(X_train_filtered, y_train)

# Predictions of the refitted, reduced-feature model
y_pred = model.predict(X_test_filtered)


# Evaluate THIS model's predictions (y_pred). The earlier y_pred_lgb belongs to
# the full-feature lgb_model and would simply repeat the previous cell's numbers.
accuracy_lgb = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy_lgb * 100:.2f}%')

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


In [None]:
# Remove the two classification targets before the regression section.
# errors='ignore' keeps the cell idempotent: re-running it after the columns are
# already gone no longer raises KeyError.
dff = dff.drop(columns=['is_expensive', 'is_highly_rated'], errors='ignore')

In [None]:
# Regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


X = dff.drop('Avg ratings', axis=1)
y = dff['Avg ratings']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae:.3f}")
print(f"Mean Squared Error (MSE): {mse:.3f}")
print(f"R² Score: {r2:.3f}")

In [None]:
# Importances of the fitted regressor, largest first
importances = model.feature_importances_
features = X.columns

feature_importance = (
    pd.DataFrame({'feature': features, 'importance': importances})
    .sort_values('importance', ascending=False)
)

# Horizontal bars for the ten most important features
top_10_importance = feature_importance.head(10)

plt.figure(figsize=(8, 5))
plt.barh(top_10_importance['feature'], top_10_importance['importance'])
plt.gca().invert_yaxis()  # Highest importance on top
plt.title('Top 10 Feature Importances')
plt.xlabel('Feature Importance')
plt.show()

In [None]:
X = dff[['Total ratings','Delivery time','Price','Area_Other']]
y = dff['Avg ratings']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae:.3f}")
print(f"Mean Squared Error (MSE): {mse:.3f}")
print(f"R² Score: {r2:.3f}")

In [None]:
import matplotlib.pyplot as plt

# Residual = actual minus predicted rating
residuals = y_test - y_pred
rating_range = [y_test.min(), y_test.max()]

plt.figure(figsize=(12, 5))

# Left panel: predictions against the true ratings, with the y = x reference line
plt.subplot(1, 2, 1)
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot(rating_range, rating_range, 'r--', lw=2)
plt.title('Actual vs Predicted Ratings')
plt.xlabel('Actual Ratings')
plt.ylabel('Predicted Ratings')

# Right panel: residuals against predictions, with a zero line
plt.subplot(1, 2, 2)
plt.scatter(y_pred, residuals, alpha=0.5)
plt.axhline(y=0, color='r', linestyle='--')
plt.title('Residuals Plot')
plt.xlabel('Predicted Ratings')
plt.ylabel('Residuals (Actual - Predicted)')

plt.tight_layout()
plt.show()


In [None]:
X = dff[['Total ratings','Delivery time','Price','Area_Other']]
y = dff['Avg ratings']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize XGBoost Regressor
model = xgb.XGBRegressor(
    objective='reg:squarederror',  # regression task
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# Train model
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)

# Evaluate
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae:.3f}")
print(f"Mean Squared Error (MSE): {mse:.3f}")
print(f"R² Score: {r2:.3f}")

In [None]:
# Residual = actual minus predicted rating
residuals = y_test - y_pred
rating_range = [y_test.min(), y_test.max()]

plt.figure(figsize=(12, 5))

# Left panel: predictions against the true ratings, with the y = x reference line
plt.subplot(1, 2, 1)
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot(rating_range, rating_range, 'r--', lw=2)
plt.title('Actual vs Predicted Ratings')
plt.xlabel('Actual Ratings')
plt.ylabel('Predicted Ratings')

# Right panel: residuals against predictions, with a zero line
plt.subplot(1, 2, 2)
plt.scatter(y_pred, residuals, alpha=0.5)
plt.axhline(y=0, color='r', linestyle='--')
plt.title('Residuals Plot')
plt.xlabel('Predicted Ratings')
plt.ylabel('Residuals (Actual - Predicted)')

plt.tight_layout()
plt.show()

In [None]:
# The RandomForest regressor performed better than the XGBoost regressor