In [None]:
# Basic Data Preprocessing on Zomato Dataset using Python

This notebook demonstrates how to perform basic data cleaning, handling missing values, 
label encoding, and train-test splitting on a restaurant dataset 
from Zomato using Python libraries like Pandas and scikit-learn.

In [2]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Load the Zomato dataset.
# The CSV location can be overridden through the ZOMATO_CSV environment variable so
# the notebook also runs on machines other than the author's; when the variable is
# not set, the original path is used unchanged.
DATA_PATH = os.environ.get(
    "ZOMATO_CSV",
    "/Users/tanujajagtap/Desktop/Projects/MLE/zomato.csv",
)
dataset = pd.read_csv(DATA_PATH)

# Preview the first rows of the dataset
dataset.head()

Unnamed: 0,name,online_order,book_table,rate,votes,approx_cost(for two people),listed_in(type)
0,Jalsa,Yes,Yes,4.1/5,775.0,800.0,Buffet
1,Spice Elephant,Yes,No,4.1/5,787.0,800.0,Buffet
2,San Churro Cafe,Yes,No,3.8/5,918.0,,Buffet
3,Addhuri Udupi Bhojana,No,No,3.7/5,88.0,300.0,Buffet
4,Grand Village,No,No,3.8/5,166.0,600.0,Buffet


In [4]:
# Split the frame into independent features and the target variable.
# Features: every column except the last one.
X = dataset.iloc[:, :-1].values
# Target: the last column, i.e. exactly the column excluded from X, so the target
# never appears among the features (a fixed position such as 3 falls inside X).
y = dataset.iloc[:, -1].values

In [5]:
from sklearn.impute import SimpleImputer

# Replace missing entries in the numeric feature columns with the column mean.
# Positions 4 and 5 are assumed to be numeric (they look like the votes and
# approximate-cost columns in the preview -- confirm against the CSV layout).
numeric_cols = slice(4, 6)
imputer = SimpleImputer(strategy='mean')
# fit_transform learns the per-column means and fills the gaps in a single step;
# `imputer` stays fitted afterwards, exactly as with a separate fit/transform.
X[:, numeric_cols] = imputer.fit_transform(X[:, numeric_cols])


In [6]:
from sklearn.preprocessing import LabelEncoder

# Convert the text features in X to numbers.
# Column positions follow the preview of the dataset (0 = name, 1 = online_order,
# 2 = book_table, 3 = rate) -- adjust these constants if the CSV layout differs.
CATEGORICAL_COLS = (0, 1, 2)
RATE_COL = 3

# One encoder per column so every mapping can be inverted later on.
# Missing entries are skipped and left as they are, because LabelEncoder cannot
# handle a column that mixes strings with NaN.
feature_encoders = {}
for col in CATEGORICAL_COLS:
    present = ~pd.isna(X[:, col])
    encoder = LabelEncoder()
    X[present, col] = encoder.fit_transform(X[present, col])
    feature_encoders[col] = encoder

# Kept under its original name for any later cell that uses it: the encoder of column 0.
labelencoder_X = feature_encoders[0]

# Ratings are text such as "4.1/5" (format assumed from the preview): keep the part
# before the slash as a float. Values that do not parse, including missing ones,
# become NaN instead of raising.
rate_text = pd.Series(X[:, RATE_COL]).astype(str).str.split("/").str[0].str.strip()
X[:, RATE_COL] = pd.to_numeric(rate_text, errors="coerce").to_numpy()

# Encode the target labels as consecutive integers.
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)


In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
splits = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = splits

# Report the size of every partition.
for caption, part in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(caption, part.shape)


X_train shape: (118, 6)
X_test shape: (30, 6)
y_train shape: (118,)
y_test shape: (30,)


In [None]:
# Summary

- Loaded and cleaned the Zomato dataset
- Handled missing values in numeric features
- Encoded categorical variables for model compatibility
- Performed an 80-20 train-test split for further modeling

This dataset is now ready for EDA or machine learning model training.
