In [None]:
# *******************************************************************
# Zomato Bangalore Restaurants Dataset - Project by Wajiha
# *******************************************************************

# *******************************************************************
# Importing relevant libraries

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score, mean_absolute_error

# *******************************************************************

# *******************************************************************
# Loading the Zomato dataset + basic info & cleanup
# Reads the raw CSV into `df` and prints a first look: the leading rows,
# the column/dtype report, and the per-column count of missing values.

df = pd.read_csv("zomato.csv", encoding='latin-1')

print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.isnull().sum())

# *******************************************************************

# *******************************************************************
# Cleaning of 'Ratings' column
# Steps: keep the text before the first '/', convert it to a number (anything
# unparseable becomes NaN), then fill the NaNs with the mean of the parsed values.
# NOTE(review): 'rate' is used as the regression target further down, so the
# mean-filled rows enter training with a synthetic label - confirm this is intended.

rate_text = df['rate'].astype(str).str.split('/').str[0]
rate_numeric = pd.to_numeric(rate_text, errors='coerce')
df['rate'] = rate_numeric.fillna(rate_numeric.mean())

# *******************************************************************
# Cleaning of 'Cost' column
# Steps: cast to text, strip the ',' characters, convert to a number (anything
# unparseable becomes NaN), then fill the NaNs with the median of the parsed values.

cost_col = 'approx_cost(for two people)'
cost_text = df[cost_col].astype(str).str.replace(',', '', regex=False)
cost_numeric = pd.to_numeric(cost_text, errors='coerce')
df[cost_col] = cost_numeric.fillna(cost_numeric.median())

# *******************************************************************
# Cleaning of 'Votes' column
# Convert to a number (anything unparseable becomes NaN) and fill the NaNs
# with the median of the parsed values.

votes_numeric = pd.to_numeric(df['votes'], errors='coerce')
df['votes'] = votes_numeric.fillna(votes_numeric.median())

# *******************************************************************
# Handling duplicates
# Remove rows that are exact copies of an earlier row (all columns compared),
# then report the resulting (rows, columns) shape.

df = df.drop_duplicates()
print(df.shape)

# *******************************************************************

# *******************************************************************
# EXPLORATORY DATA ANALYSIS (EDA)

# Distribution of ratings
# Histogram of the cleaned 'rate' column with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(data=df, x='rate', kde=True, ax=ax)
ax.set_title("Distribution of Ratings")
plt.show()

# Top 10 locations
# Bar chart of the ten most frequent values in 'location' (row counts per location).
top_locations = df['location'].value_counts().head(10)
fig, ax = plt.subplots(figsize=(10, 5))
top_locations.plot(kind='bar', ax=ax)
ax.set_title("Top 10 Locations with Most Restaurants")
ax.set_xlabel("location")
ax.set_ylabel("count")
plt.show()

# Average Rating by location
# Mean 'rate' per location, sorted from highest to lowest; the first ten
# entries are kept and drawn as a bar chart.
location_rating = (
    df.groupby('location')['rate']
    .mean()
    .sort_values(ascending=False)
    .head(10)
)
fig, ax = plt.subplots(figsize=(10, 5))
location_rating.plot(kind='bar', ax=ax)
ax.set_title("Top 10 Locations with Highest Average Rating")
ax.set_ylabel("Average Rating")
plt.show()

# Cost vs Rating visualisation
# Scatter plot with the cost-for-two column on the x axis and 'rate' on the y axis.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=df, x='approx_cost(for two people)', y='rate', ax=ax)
ax.set_title("Cost vs Rating")
plt.show()

# *******************************************************************

# *******************************************************************
# Encoding categorical features + preparing data for ML
#
# The encoding is applied to a copy (model_df), never to df itself: df is
# written to "zomato_cleaned.csv" later in this cell, and the cells that read
# that file compute the most common location and print the unique locations,
# which is only meaningful if 'location' still holds its original values
# rather than integer codes. Working on a copy also makes this step safe to
# re-run, since the source columns are never overwritten.

feature_cols = ['online_order', 'book_table', 'votes',
                'approx_cost(for two people)', 'location']
categorical_cols = ['online_order', 'book_table', 'location']

model_df = df[feature_cols + ['rate']].copy()

# One encoder per column so every label -> code mapping stays available
# (a single shared encoder only remembers the last column it was fitted on).
encoders = {}
for col in categorical_cols:
    encoders[col] = LabelEncoder()
    model_df[col] = encoders[col].fit_transform(model_df[col])

# Kept for backward compatibility: `le` previously ended up fitted on 'location'.
le = encoders['location']

# Features are every selected column except the target 'rate'.
X = model_df.drop('rate', axis=1)
y = model_df['rate']

# 80/20 split with a fixed seed so the reported metrics are repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate on the held-out 20 %.
y_pred = model.predict(X_test)

print("R2 Score:", r2_score(y_test, y_pred))
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))

# *******************************************************************

# *******************************************************************
# Save cleaned dataset for GUI implementation
# Writes df, as it stands at this point of the cell, to disk without the
# index column, then prints a confirmation message.

cleaned_csv_path = "zomato_cleaned.csv"
df.to_csv(cleaned_csv_path, index=False)
print("Zomato's cleaned dataset saved successfully!")

# *******************************************************************

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
import pickle

# Load cleaned dataset
df = pd.read_csv("zomato_cleaned.csv")

# Summary for dashboard: row count, column count, mean of 'rate' rounded to
# two decimals, and the first mode of 'location'.
n_rows, n_cols = df.shape
summary = dict(
    total_records=n_rows,
    num_features=n_cols,
    mean_rating=round(df['rate'].mean(), 2),
    most_common_location=df['location'].mode()[0],
)

# Label Encoding
# One encoder per column so each label -> code mapping is retained in
# `encoders`; a single shared encoder would only remember the last column
# it was fitted on.
# NOTE(review): only the model is pickled below. If the consumer of model.pkl
# must turn raw labels into codes, these mappings need to be persisted as well
# - confirm against the GUI code.
encoders = {}
for col in ('online_order', 'book_table', 'location'):
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

# Kept for backward compatibility: `le` previously ended up fitted on 'location'.
le = encoders['location']

# Prepare ML data
model_df = df[['online_order', 'book_table', 'votes',
               'approx_cost(for two people)', 'location', 'rate']]

# Features are every selected column except the target 'rate'.
X = model_df.drop('rate', axis=1)
y = model_df['rate']

# Train-test split (80/20, fixed seed)
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train model
# random_state pins the forest's bootstrap/feature sampling so that re-running
# this cell reproduces the same model (and therefore the same model.pkl).
model = RandomForestRegressor(random_state=42)
model.fit(X_train, Y_train)

# Use the held-out split that was created above: score() returns R^2 for
# regressors, giving a quick sanity check before the model is saved.
print("Held-out R2 Score:", model.score(X_test, Y_test))

# Save model
# Serialise the fitted estimator to model.pkl (binary mode) and confirm.
# NOTE(review): pickle files must only be loaded from trusted sources.
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

print("model.pkl created successfully!")


In [None]:
import pandas as pd

# Reload the cleaned CSV and print the distinct values of three columns,
# one labelled line per column.
df = pd.read_csv("zomato_cleaned.csv")

labelled_columns = (
    ("Locations:", 'location'),
    ("Cuisines:", 'cuisines'),
    ("Rest_type:", 'rest_type'),
)
for label, column in labelled_columns:
    print(label, df[column].unique())


In [None]:
import pandas as pd

# Reload the cleaned CSV and show every distinct 'cuisines' value as a plain
# Python list (the last expression is the cell's displayed output).
df = pd.read_csv("zomato_cleaned.csv")

cuisine_values = df["cuisines"].unique()
list(cuisine_values)
