In [13]:
import pandas as pd

# Read the raw Swiggy restaurant listing into a DataFrame
df = pd.read_csv(filepath_or_buffer="swiggy.csv")

# Display the first five rows as a quick sanity check
df.head(n=5)



Unnamed: 0,id,name,city,rating,rating_count,cost,cuisine,lic_no,link,address,menu
0,567335,AB FOODS POINT,Abohar,--,Too Few Ratings,₹ 200,"Beverages,Pizzas",22122652000138,https://www.swiggy.com/restaurants/ab-foods-po...,"AB FOODS POINT, NEAR RISHI NARANG DENTAL CLINI...",Menu/567335.json
1,531342,Janta Sweet House,Abohar,4.4,50+ ratings,₹ 200,"Sweets,Bakery",12117201000112,https://www.swiggy.com/restaurants/janta-sweet...,"Janta Sweet House, Bazar No.9, Circullar Road,...",Menu/531342.json
2,158203,theka coffee desi,Abohar,3.8,100+ ratings,₹ 100,Beverages,22121652000190,https://www.swiggy.com/restaurants/theka-coffe...,"theka coffee desi, sahtiya sadan road city",Menu/158203.json
3,187912,Singh Hut,Abohar,3.7,20+ ratings,₹ 250,"Fast Food,Indian",22119652000167,https://www.swiggy.com/restaurants/singh-hut-n...,"Singh Hut, CIRCULAR ROAD NEAR NEHRU PARK ABOHAR",Menu/187912.json
4,543530,GRILL MASTERS,Abohar,--,Too Few Ratings,₹ 250,"Italian-American,Fast Food",12122201000053,https://www.swiggy.com/restaurants/grill-maste...,"GRILL MASTERS, ADA Heights, Abohar - Hanumanga...",Menu/543530.json


In [10]:
# Print each column's name, non-null count and dtype, plus overall memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148541 entries, 0 to 148540
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            148541 non-null  int64 
 1   name          148455 non-null  object
 2   city          148541 non-null  object
 3   rating        148455 non-null  object
 4   rating_count  148455 non-null  object
 5   cost          148410 non-null  object
 6   cuisine       148442 non-null  object
 7   lic_no        148312 non-null  object
 8   link          148541 non-null  object
 9   address       148455 non-null  object
 10  menu          148541 non-null  object
dtypes: int64(1), object(10)
memory usage: 12.5+ MB


In [11]:
# Report how many rows repeat an earlier row across every column
print("Duplicate rows:", df.duplicated(keep="first").sum())

# Keep only the first occurrence of each repeated row
df = df.drop_duplicates(keep="first")


Duplicate rows: 0


In [15]:
import numpy as np

# '--' marks rows without a numeric rating: map it to NaN, then cast to float
rating_with_nan = df['rating'].replace({'--': np.nan})
df['rating'] = rating_with_nan.astype('float64')



In [16]:
# Report (rows, columns) before rows with missing values are dropped
print("Original shape:", df.shape)

Original shape: (148541, 11)


In [17]:
# Convert bucketed labels such as "50+ ratings" into a numeric lower bound.
# The optional second group captures a thousands suffix, so a label like
# "1K+ ratings" becomes 1000 instead of being truncated to 1.
rating_count_parts = df['rating_count'].str.extract(r'(\d+(?:\.\d+)?)\s*([Kk])?')

# Labels without digits ("Too Few Ratings") and missing values match nothing
# and come out as NaN, so no separate replace step is needed.
rating_count_base = rating_count_parts[0].astype(float)
rating_count_scale = np.where(rating_count_parts[1].notna(), 1000.0, 1.0)
df['rating_count'] = rating_count_base * rating_count_scale


In [18]:
# Remove the rupee sign, then cast the remaining text to float
cost_text = df['cost'].str.replace('₹', '', regex=False)
df['cost'] = cost_text.astype('float64')

In [19]:
# Keep only rows in which every column has a value, then report the new size
df = df.dropna(axis=0, how='any')
print("After dropping missing values:", df.shape)


After dropping missing values: (61343, 11)


In [20]:
# Persist the cleaned table without the DataFrame index column
df.to_csv(path_or_buf="cleaned_data.csv", index=False)
print("Cleaned data saved as cleaned_data.csv")

Cleaned data saved as cleaned_data.csv


In [3]:
# Print the installed scikit-learn version
import sklearn
print(sklearn.__version__)


1.6.1


In [4]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import pickle

# Read the cleaned dataset from disk
cleaned_data = pd.read_csv(filepath_or_buffer="cleaned_data.csv")

# Categorical columns get one-hot encoded; they and the other listed columns
# are removed from the table that is kept as-is
categorical = ['city', 'cuisine']
drop_cols = [*categorical, 'name', 'lic_no', 'link', 'address', 'menu']

# Fit a sparse one-hot encoder configured to ignore unknown categories
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=True)
encoder.fit(cleaned_data[categorical])
encoded_sparse = encoder.transform(cleaned_data[categorical])

# Wrap the sparse matrix in a DataFrame that shares the source row index
encoded_df = pd.DataFrame.sparse.from_spmatrix(
    data=encoded_sparse,
    index=cleaned_data.index,
    columns=encoder.get_feature_names_out(categorical),
)

# Place the remaining columns and the one-hot columns side by side
numeric_df = cleaned_data.drop(drop_cols, axis=1)
final_df = pd.concat(objs=[numeric_df, encoded_df], axis="columns")

# Persist the fitted encoder and the encoded dataset
with open("encoder.pkl", mode="wb") as encoder_file:
    pickle.dump(encoder, encoder_file)

final_df.to_csv(path_or_buf="encoded_data.csv", index=False)




In [6]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import pickle

# Load cleaned data from disk (rebinds `df`)
df = pd.read_csv("cleaned_data.csv")


In [7]:
# Learn one integer label per distinct restaurant name, then store it per row
name_encoder = LabelEncoder().fit(df['name'])
df['name_encoded'] = name_encoder.transform(df['name'])


In [8]:
# Serialize the fitted name encoder to disk
with open("name_encoder.pkl", mode="wb") as name_encoder_file:
    pickle.dump(name_encoder, name_encoder_file)


In [9]:
# Append the name labels to the one-hot table (values are matched on row index)
encoded_df = pd.read_csv(filepath_or_buffer="encoded_data.csv").assign(
    name_encoded=df['name_encoded']
)

# Write the combined table to its own file
encoded_df.to_csv(path_or_buf="encoded_data_with_name.csv", index=False)


In [1]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import pandas as pd


In [1]:
import pandas as pd

# Table that carries name_encoded alongside the numeric features
name_encoded_df = pd.read_csv(filepath_or_buffer="encoded_data_with_name.csv")

# Table saved earlier from the sparse one-hot encoding of city and cuisine
onehot_df = pd.read_csv(filepath_or_buffer="encoded_data.csv")


In [2]:
# Both tables must have the same number of rows before they are combined
assert len(name_encoded_df) == len(onehot_df), "Mismatch in number of rows"


In [3]:
# Identifier, name label and numeric columns come from the name-encoded table;
# the rest comes from the one-hot table, minus the columns both tables share
base_columns = ["id", "name_encoded", "rating", "rating_count", "cost"]
shared_columns = ["id", "rating", "rating_count", "cost"]
final_df = pd.concat(
    objs=[name_encoded_df[base_columns], onehot_df.drop(shared_columns, axis=1)],
    axis="columns",
)


In [4]:
# Write final_df to disk without the index; this overwrites the file of the same name saved earlier
final_df.to_csv("encoded_data_with_name.csv", index=False)


In [5]:
# Read the combined encoded table back from disk (rebinds `df`)
df = pd.read_csv("encoded_data_with_name.csv")


In [6]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Load the final dataset (same file as the preceding cell; rebinds `df`)
df = pd.read_csv("encoded_data_with_name.csv")



In [7]:
# Feature matrix for clustering: every column except the row identifier.
# errors='ignore' makes the drop a no-op when 'id' is absent, matching the
# original "if 'id' exists" intent; without it a missing column raises KeyError.
# NOTE(review): name_encoded (arbitrary integer labels from LabelEncoder) stays
# in X as a feature — confirm this is intended.
X = df.drop(columns=['id'], errors='ignore')


In [8]:
# Fit the scaler on X, then apply it to X to get the scaled feature matrix
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


In [9]:
# Fit a 5-cluster KMeans with a fixed seed and read back each row's cluster label
kmeans = KMeans(n_clusters=5, random_state=42).fit(X_scaled)
clusters = kmeans.labels_


In [10]:
# Attach each row's cluster label and persist the result without the index
df = df.assign(cluster=clusters)
df.to_csv(path_or_buf="clustered_data.csv", index=False)


In [1]:
import pandas as pd

# Reload the cleaned dataset and display its column labels
df = pd.read_csv(filepath_or_buffer="cleaned_data.csv")
df.columns


Index(['id', 'name', 'city', 'rating', 'rating_count', 'cost', 'cuisine',
       'lic_no', 'link', 'address', 'menu'],
      dtype='object')