# Preprocessing

This notebook preprocesses the data before moving on to the Level 3 tasks, which include modelling.

In [11]:
import pandas as pd
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None
# Load the raw dataset; the file name as written contains a space before ".csv".
df = pd.read_csv(filepath_or_buffer="Dataset .csv")

In [12]:
# Peek at one randomly chosen row. The fixed seed keeps the displayed row
# identical on every Restart & Run All, so the saved output is reproducible.
df.sample(n=1, random_state=42)

Unnamed: 0,Restaurant ID,Restaurant Name,Country Code,City,Address,Locality,Locality Verbose,Longitude,Latitude,Cuisines,Average Cost for two,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Rating color,Rating text,Votes
276,17342771,Fiesta Cancun,216,Dubuque,"2515 NW Arterial, Dubuque, IA 52002",Dubuque,"Dubuque, Dubuque",-90.740213,42.49092,Mexican,10,Dollar($),No,No,No,No,1,3.6,Yellow,Good,156


# Remove missing values

In [13]:
# Count the missing values in each column.
df.isna().sum()

Restaurant ID           0
Restaurant Name         0
Country Code            0
City                    0
Address                 0
Locality                0
Locality Verbose        0
Longitude               0
Latitude                0
Cuisines                9
Average Cost for two    0
Currency                0
Has Table booking       0
Has Online delivery     0
Is delivering now       0
Switch to order menu    0
Price range             0
Aggregate rating        0
Rating color            0
Rating text             0
Votes                   0
dtype: int64

In [14]:
# Discard every row that has at least one missing value, then re-count the
# missing values per column to confirm that none remain.
df = df.dropna(axis=0, how="any")
df.isna().sum()

Restaurant ID           0
Restaurant Name         0
Country Code            0
City                    0
Address                 0
Locality                0
Locality Verbose        0
Longitude               0
Latitude                0
Cuisines                0
Average Cost for two    0
Currency                0
Has Table booking       0
Has Online delivery     0
Is delivering now       0
Switch to order menu    0
Price range             0
Aggregate rating        0
Rating color            0
Rating text             0
Votes                   0
dtype: int64

# Removing Unnecessary Columns

In [15]:
# List the column labels currently present in the frame.
df.columns

Index(['Restaurant ID', 'Restaurant Name', 'Country Code', 'City', 'Address',
       'Locality', 'Locality Verbose', 'Longitude', 'Latitude', 'Cuisines',
       'Average Cost for two', 'Currency', 'Has Table booking',
       'Has Online delivery', 'Is delivering now', 'Switch to order menu',
       'Price range', 'Aggregate rating', 'Rating color', 'Rating text',
       'Votes'],
      dtype='object')

In [16]:
# Drop the ID, name and address columns together with the two rating-label columns.
df = df.drop(
    columns=['Restaurant ID', 'Restaurant Name', 'Address', 'Rating color', 'Rating text']
)

# Restaurant ID and Restaurant Name do not relate to the rating of a restaurant.
# Address is not related to the customer rating; any location effect is already captured by City and Locality.
# Rating color and Rating text are both derived from Aggregate rating, so keeping them would leak the target into the inputs.


# Converting all object data to numeric equivalents

In [17]:
# Show the dtype of every column; the `object` columns still need a numeric encoding.
df.dtypes

Country Code              int64
City                     object
Locality                 object
Locality Verbose         object
Longitude               float64
Latitude                float64
Cuisines                 object
Average Cost for two      int64
Currency                 object
Has Table booking        object
Has Online delivery      object
Is delivering now        object
Switch to order menu     object
Price range               int64
Aggregate rating        float64
Votes                     int64
dtype: object

In [18]:
# Separate the target ("Aggregate rating") from the remaining feature columns.
target_column = "Aggregate rating"
y = df[target_column]
X = df.drop(columns=[target_column])

In [19]:
# Collect the feature columns whose dtype is `object`, then print one name per line.
object_feature_names = [name for name in X.columns if df[name].dtype == 'O']
for name in object_feature_names:
    print(name)

City
Locality
Locality Verbose
Cuisines
Currency
Has Table booking
Has Online delivery
Is delivering now
Switch to order menu


# Column City

In [20]:
# NOTE(review): LabelEncoder is used here, which replaces each distinct city
# with an integer code — this is integer (label) encoding, not dummy/one-hot
# encoding. The integer codes imply an ordering between cities; confirm this
# is acceptable for linear models, which typically expect dummy encoding.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['City'] = label_encoder.fit_transform(df['City'])

In [21]:
# Preview the first five rows to check the integer-coded "City" column.
df.head(n=5)

Unnamed: 0,Country Code,City,Locality,Locality Verbose,Longitude,Latitude,Cuisines,Average Cost for two,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Votes
0,162,73,"Century City Mall, Poblacion, Makati City","Century City Mall, Poblacion, Makati City, Mak...",121.027535,14.565443,"French, Japanese, Desserts",1100,Botswana Pula(P),Yes,No,No,No,3,4.8,314
1,162,73,"Little Tokyo, Legaspi Village, Makati City","Little Tokyo, Legaspi Village, Makati City, Ma...",121.014101,14.553708,Japanese,1200,Botswana Pula(P),Yes,No,No,No,3,4.5,591
2,162,75,"Edsa Shangri-La, Ortigas, Mandaluyong City","Edsa Shangri-La, Ortigas, Mandaluyong City, Ma...",121.056831,14.581404,"Seafood, Asian, Filipino, Indian",4000,Botswana Pula(P),Yes,No,No,No,4,4.4,270
3,162,75,"SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.056475,14.585318,"Japanese, Sushi",1500,Botswana Pula(P),No,No,No,No,4,4.9,365
4,162,75,"SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.057508,14.58445,"Japanese, Korean",1500,Botswana Pula(P),Yes,No,No,No,4,4.8,229


# Column Locality

In [22]:
# Compare the two locality columns side by side. "Locality Verbose" carries
# more information than "Locality", so "Locality" is the one to remove.
df.loc[:, ["Locality", "Locality Verbose"]]

Unnamed: 0,Locality,Locality Verbose
0,"Century City Mall, Poblacion, Makati City","Century City Mall, Poblacion, Makati City, Mak..."
1,"Little Tokyo, Legaspi Village, Makati City","Little Tokyo, Legaspi Village, Makati City, Ma..."
2,"Edsa Shangri-La, Ortigas, Mandaluyong City","Edsa Shangri-La, Ortigas, Mandaluyong City, Ma..."
3,"SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal..."
4,"SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal..."
...,...,...
9546,Karak�_y,"Karak�_y, ��stanbul"
9547,Ko��uyolu,"Ko��uyolu, ��stanbul"
9548,Kuru�_e��me,"Kuru�_e��me, ��stanbul"
9549,Kuru�_e��me,"Kuru�_e��me, ��stanbul"


In [23]:
# Remove the "Locality" column; "Locality Verbose" is kept.
df = df.drop(columns=['Locality'])

# Column Locality Verbose

In [24]:
# Replace each distinct "Locality Verbose" string with an integer code.
locality_codes = label_encoder.fit_transform(df['Locality Verbose'])
df['Locality Verbose'] = locality_codes

In [25]:
# Show five randomly chosen rows. The fixed seed keeps the displayed rows
# identical on every Restart & Run All, so the saved output is reproducible.
df.sample(n=5, random_state=42)

Unnamed: 0,Country Code,City,Locality Verbose,Longitude,Latitude,Cuisines,Average Cost for two,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Votes
3675,1,87,382,77.234184,28.551072,"Bakery, Desserts",400,Indian Rupees(Rs.),No,No,No,No,1,3.8,116
3099,1,87,222,77.220846,28.630954,"Continental, North Indian, Chinese, Mediterranean",1800,Indian Rupees(Rs.),Yes,No,No,No,3,4.1,799
7521,1,87,1124,77.188975,28.57939,"French, Italian",5000,Indian Rupees(Rs.),Yes,No,No,No,4,3.8,199
3349,1,87,291,77.206518,28.573356,Bihari,500,Indian Rupees(Rs.),No,No,No,No,2,3.5,28
5431,1,87,616,77.145736,28.493668,"Mughlai, North Indian",350,Indian Rupees(Rs.),No,No,No,No,1,0.0,0


# Column Cuisines

In [26]:
# Display the raw comma-separated cuisine strings.
df.loc[:, "Cuisines"]

0             French, Japanese, Desserts
1                               Japanese
2       Seafood, Asian, Filipino, Indian
3                        Japanese, Sushi
4                       Japanese, Korean
                      ...               
9546                             Turkish
9547     World Cuisine, Patisserie, Cafe
9548              Italian, World Cuisine
9549                     Restaurant Cafe
9550                                Cafe
Name: Cuisines, Length: 9542, dtype: object

In [27]:
# Normalise the cuisine text: turn line breaks into spaces, trim both ends,
# then collapse every run of whitespace into a single space.
cuisine_text = df['Cuisines'].astype(str)
cuisine_text = cuisine_text.str.replace(r'[\n\r]', ' ', regex=True)
cuisine_text = cuisine_text.str.strip()
df['Cuisines'] = cuisine_text
df['Cuisines'] = cuisine_text.str.replace(r'\s+', ' ', regex=True)

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Build unnormalised (norm=None) TF-IDF weights from the cuisine strings.
# NOTE(review): with the default token pattern the text is split into
# individual words, so a multi-word cuisine such as "World Cuisine" is
# counted as two separate terms — confirm this is intended.
tfidf = TfidfVectorizer(norm=None)
cuisine_documents = df['Cuisines']
tfidf_matrix = tfidf.fit_transform(cuisine_documents)

In [29]:
# Store each row of the dense TF-IDF matrix as one array-valued cell.
dense_tfidf = tfidf_matrix.toarray()
df['Cuisines_Vector'] = [row_vector for row_vector in dense_tfidf]

In [30]:
import numpy as np
# Collapse every TF-IDF vector to a single number: its Euclidean (L2) length.
df['Cuisines_L2_Norm'] = df['Cuisines_Vector'].map(np.linalg.norm)

In [31]:
# Preview the first five rows, now including the two cuisine-derived columns.
df.head(n=5)

Unnamed: 0,Country Code,City,Locality Verbose,Longitude,Latitude,Cuisines,Average Cost for two,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Votes,Cuisines_Vector,Cuisines_L2_Norm
0,162,73,172,121.027535,14.565443,"French, Japanese, Desserts",1100,Botswana Pula(P),Yes,No,No,No,3,4.8,314,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",9.319194
1,162,73,600,121.014101,14.553708,Japanese,1200,Botswana Pula(P),Yes,No,No,No,3,4.5,591,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",5.250908
2,162,75,314,121.056831,14.581404,"Seafood, Asian, Filipino, Indian",4000,Botswana Pula(P),Yes,No,No,No,4,4.4,270,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",10.520663
3,162,75,873,121.056475,14.585318,"Japanese, Sushi",1500,Botswana Pula(P),No,No,No,No,4,4.9,365,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",7.848181
4,162,75,873,121.057508,14.58445,"Japanese, Korean",1500,Botswana Pula(P),Yes,No,No,No,4,4.8,229,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",8.808665


In [32]:
# The raw cuisine text is no longer needed once its vector and norm exist.
df = df.drop(columns=["Cuisines"])
df.head(n=5)

Unnamed: 0,Country Code,City,Locality Verbose,Longitude,Latitude,Average Cost for two,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Votes,Cuisines_Vector,Cuisines_L2_Norm
0,162,73,172,121.027535,14.565443,1100,Botswana Pula(P),Yes,No,No,No,3,4.8,314,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",9.319194
1,162,73,600,121.014101,14.553708,1200,Botswana Pula(P),Yes,No,No,No,3,4.5,591,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",5.250908
2,162,75,314,121.056831,14.581404,4000,Botswana Pula(P),Yes,No,No,No,4,4.4,270,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",10.520663
3,162,75,873,121.056475,14.585318,1500,Botswana Pula(P),No,No,No,No,4,4.9,365,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",7.848181
4,162,75,873,121.057508,14.58445,1500,Botswana Pula(P),Yes,No,No,No,4,4.8,229,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",8.808665


# Column Currency

In [33]:
# List the distinct currency labels before encoding.
df.loc[:, "Currency"].unique()

array(['Botswana Pula(P)', 'Brazilian Real(R$)', 'Dollar($)',
       'Emirati Diram(AED)', 'Indian Rupees(Rs.)',
       'Indonesian Rupiah(IDR)', 'NewZealand($)', 'Pounds(��)',
       'Qatari Rial(QR)', 'Rand(R)', 'Sri Lankan Rupee(LKR)',
       'Turkish Lira(TL)'], dtype=object)

In [34]:
# Replace each distinct currency label with an integer code.
currency_codes = label_encoder.fit_transform(df["Currency"])
df["Currency"] = currency_codes

In [35]:
# List the distinct currency codes after encoding.
df.loc[:, "Currency"].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [36]:
# Preview the first five rows to check the integer-coded "Currency" column.
df.head(n=5)

Unnamed: 0,Country Code,City,Locality Verbose,Longitude,Latitude,Average Cost for two,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Votes,Cuisines_Vector,Cuisines_L2_Norm
0,162,73,172,121.027535,14.565443,1100,0,Yes,No,No,No,3,4.8,314,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",9.319194
1,162,73,600,121.014101,14.553708,1200,0,Yes,No,No,No,3,4.5,591,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",5.250908
2,162,75,314,121.056831,14.581404,4000,0,Yes,No,No,No,4,4.4,270,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",10.520663
3,162,75,873,121.056475,14.585318,1500,0,No,No,No,No,4,4.9,365,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",7.848181
4,162,75,873,121.057508,14.58445,1500,0,Yes,No,No,No,4,4.8,229,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",8.808665


# Has Table booking

In [37]:
# Integer-code the "Has Table booking" flag; LabelEncoder numbers classes in
# sorted order, so with Yes/No values "No" becomes 0 and "Yes" becomes 1.
table_booking_codes = label_encoder.fit_transform(df["Has Table booking"])
df = df.assign(**{"Has Table booking": table_booking_codes})

In [38]:
# Preview the first five rows to check the integer-coded "Has Table booking" column.
df.head(n=5)

Unnamed: 0,Country Code,City,Locality Verbose,Longitude,Latitude,Average Cost for two,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Votes,Cuisines_Vector,Cuisines_L2_Norm
0,162,73,172,121.027535,14.565443,1100,0,1,No,No,No,3,4.8,314,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",9.319194
1,162,73,600,121.014101,14.553708,1200,0,1,No,No,No,3,4.5,591,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",5.250908
2,162,75,314,121.056831,14.581404,4000,0,1,No,No,No,4,4.4,270,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",10.520663
3,162,75,873,121.056475,14.585318,1500,0,0,No,No,No,4,4.9,365,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",7.848181
4,162,75,873,121.057508,14.58445,1500,0,1,No,No,No,4,4.8,229,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",8.808665


# Has Online delivery

In [39]:
# Integer-code the "Has Online delivery" flag (classes are numbered in sorted order).
online_delivery_codes = label_encoder.fit_transform(df["Has Online delivery"])
df["Has Online delivery"] = online_delivery_codes

In [40]:
# Preview the first five rows to check the integer-coded "Has Online delivery" column.
df.head(n=5)

Unnamed: 0,Country Code,City,Locality Verbose,Longitude,Latitude,Average Cost for two,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Votes,Cuisines_Vector,Cuisines_L2_Norm
0,162,73,172,121.027535,14.565443,1100,0,1,0,No,No,3,4.8,314,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",9.319194
1,162,73,600,121.014101,14.553708,1200,0,1,0,No,No,3,4.5,591,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",5.250908
2,162,75,314,121.056831,14.581404,4000,0,1,0,No,No,4,4.4,270,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",10.520663
3,162,75,873,121.056475,14.585318,1500,0,0,0,No,No,4,4.9,365,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",7.848181
4,162,75,873,121.057508,14.58445,1500,0,1,0,No,No,4,4.8,229,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",8.808665


# Is delivering now

In [41]:
# Integer-code the "Is delivering now" flag (classes are numbered in sorted order).
delivering_now_codes = label_encoder.fit_transform(df["Is delivering now"])
df = df.assign(**{"Is delivering now": delivering_now_codes})

In [42]:
# Preview the first five rows to check the integer-coded "Is delivering now" column.
df.head(n=5)

Unnamed: 0,Country Code,City,Locality Verbose,Longitude,Latitude,Average Cost for two,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Votes,Cuisines_Vector,Cuisines_L2_Norm
0,162,73,172,121.027535,14.565443,1100,0,1,0,0,No,3,4.8,314,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",9.319194
1,162,73,600,121.014101,14.553708,1200,0,1,0,0,No,3,4.5,591,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",5.250908
2,162,75,314,121.056831,14.581404,4000,0,1,0,0,No,4,4.4,270,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",10.520663
3,162,75,873,121.056475,14.585318,1500,0,0,0,0,No,4,4.9,365,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",7.848181
4,162,75,873,121.057508,14.58445,1500,0,1,0,0,No,4,4.8,229,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",8.808665


# Switch to order menu

In [43]:
# Integer-code the "Switch to order menu" flag (classes are numbered in sorted order).
order_menu_codes = label_encoder.fit_transform(df["Switch to order menu"])
df["Switch to order menu"] = order_menu_codes

In [44]:
# Preview the first five rows of the fully encoded frame.
df.head(n=5)

Unnamed: 0,Country Code,City,Locality Verbose,Longitude,Latitude,Average Cost for two,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Votes,Cuisines_Vector,Cuisines_L2_Norm
0,162,73,172,121.027535,14.565443,1100,0,1,0,0,0,3,4.8,314,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",9.319194
1,162,73,600,121.014101,14.553708,1200,0,1,0,0,0,3,4.5,591,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",5.250908
2,162,75,314,121.056831,14.581404,4000,0,1,0,0,0,4,4.4,270,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",10.520663
3,162,75,873,121.056475,14.585318,1500,0,0,0,0,0,4,4.9,365,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",7.848181
4,162,75,873,121.057508,14.58445,1500,0,1,0,0,0,4,4.8,229,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",8.808665


In [45]:
# Persist the preprocessed table for the later modelling step.
# NOTE(review): the row index is written as an extra unnamed first column
# (pandas default), and the array-valued "Cuisines_Vector" cells are stored
# as text — confirm the downstream loader expects both.
df.to_csv(path_or_buf="Preprocessed.csv")