In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

<font size='4'>**Train Dataset info**

In [2]:
train = pd.read_excel("Data_Train.xlsx")

<font size='4'>**Test Dataset info**

In [3]:
test = pd.read_excel("Data_Test.xlsx")

<font size='2'>**Add Column**

- Add a column {Source} in both dataset

In [4]:
train["source"] = "train"

In [5]:
test["source"] = "test"

<font size='2'>**Combine both Dataset**

- Combined Train & Test datset into one dataset

In [6]:
df = pd.concat([train,test])

<font size='4'>**CHECK NULL-VALUES**

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16921 entries, 0 to 4230
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   TITLE          16921 non-null  object 
 1   RESTAURANT_ID  16921 non-null  int64  
 2   CUISINES       16921 non-null  object 
 3   TIME           16921 non-null  object 
 4   CITY           16774 non-null  object 
 5   LOCALITY       16793 non-null  object 
 6   RATING         16917 non-null  object 
 7   VOTES          15315 non-null  object 
 8   COST           12690 non-null  float64
 9   source         16921 non-null  object 
dtypes: float64(1), int64(1), object(8)
memory usage: 1.4+ MB


<font size='3'>**Check Duplicates Values**

In [8]:
df.duplicated().sum()

26

- It have 26 duplicates values.

- So, We drop duplicates values.

In [9]:
df= df.drop_duplicates()

<font size='3'>**Null-Value Count**

In [10]:
df.isna().sum()

TITLE               0
RESTAURANT_ID       0
CUISINES            0
TIME                0
CITY              147
LOCALITY          128
RATING              4
VOTES            1602
COST             4230
source              0
dtype: int64

There are null values in CITY,LOCALITY,RATING,VOTES, COST

<font size='3'>**Add Column Subset**

- Make a subset of two column.

    - Add CITY & LOCALITY into Location

In [11]:
df['Location']=df['CITY']+' '+df['LOCALITY']
df.drop(columns=['CITY','LOCALITY'])

Unnamed: 0,TITLE,RESTAURANT_ID,CUISINES,TIME,RATING,VOTES,COST,source,Location
0,CASUAL DINING,9438,"Malwani, Goan, North Indian","11am – 4pm, 7:30pm – 11:30pm (Mon-Sun)",3.6,49 votes,1200.0,train,Thane Dombivali East
1,"CASUAL DINING,BAR",13198,"Asian, Modern Indian, Japanese",6pm – 11pm (Mon-Sun),4.2,30 votes,1500.0,train,Chennai Ramapuram
2,CASUAL DINING,10915,"North Indian, Chinese, Biryani, Hyderabadi","11am – 3:30pm, 7pm – 11pm (Mon-Sun)",3.8,221 votes,800.0,train,Chennai Saligramam
3,QUICK BITES,6346,"Tibetan, Chinese",11:30am – 1am (Mon-Sun),4.1,24 votes,800.0,train,Mumbai Bandra West
4,DESSERT PARLOR,15387,Desserts,11am – 1am (Mon-Sun),3.8,165 votes,300.0,train,Mumbai Lower Parel
...,...,...,...,...,...,...,...,...,...
4226,CASUAL DINING,9057,"North Indian, Mughlai, Chinese",11:30am – 11:30pm (Mon-Sun),3.9,287 votes,,test,New Delhi Punjabi Bagh
4227,,1247,"Biryani, North Indian, Sandwich, Salad, Wraps",11am – 1am (Mon-Sun),4.3,469 votes,,test,Bangalore HSR Layout
4228,QUICK BITES,8617,"Continental, North Indian",9:30am – 10:30pm (Mon-Sun),3.7,53 votes,,test,Faridabad Sector 86
4229,QUICK BITES,6485,"Rolls, Beverages","11am – 11:30pm (Mon, Tue, Wed, Thu, Sat, Sun),...",-,,,test,Kochi Kochi


- **Drop Subset Location**

In [12]:
df.dropna(subset=['Location'],inplace=True)

<font size='3'>**Correct Names**

- Correct CITY names.

In [13]:
from fuzzywuzzy import process
 
names_array=[]
def match_names(wrong_names,correct_names):
    for row in wrong_names:
        x=process.extractOne(row, correct_names)
        if x[1]<60:
            names_array.append('Others')
        else:
            names_array.append(x[0])
    return names_array
correct_names=['Bangalore','Thane',
'Hyderabad','Andheri',
'Delhi', 'Kerala',
'Chennai', 'Bandra',
'Mumbai', 'Telangana',
'Kochi', 
'Noida', 
'Gurgaon', 'Ernakulam',
'Faridabad', 'Ghaziabad',
'Secunderabad' ]
name_match=match_names(df.Location,correct_names)    

print(len(names_array))
df['Location']=names_array

16747


In [14]:
cuisines_list=[]
for row in df['CUISINES']:
    cuisines_list.append(list(row.split(',')))

df['CUISINES']=cuisines_list

In [15]:
df['CUISINES'].isna().sum()

0

In [16]:
df_cuisines=df['CUISINES'].apply(lambda x: pd.Series(1, x))

In [17]:
title_list=[]
for row in df['TITLE']:
    title_list.append(list(row.split(',')))
df['TITLE']=title_list

In [18]:
df_title=df['TITLE'].apply(lambda x: pd.Series(1, x))

In [19]:
df_title.head()

Unnamed: 0,CASUAL DINING,BAR,QUICK BITES,DESSERT PARLOR,CAFÉ,MICROBREWERY,BEVERAGE SHOP,IRANI CAFE,BAKERY,None,...,FOOD TRUCK,MESS,KIOSK,CLUB,CONFECTIONERY,DHABA,MEAT SHOP,COCKTAIL BAR,PAAN SHOP,BHOJANALYA
0,1.0,,,,,,,,,,...,,,,,,,,,,
1,1.0,1.0,,,,,,,,,...,,,,,,,,,,
2,1.0,,,,,,,,,,...,,,,,,,,,,
3,,,1.0,,,,,,,,...,,,,,,,,,,
4,,,,1.0,,,,,,,...,,,,,,,,,,


In [20]:
df[df['RATING'].isna()]

Unnamed: 0,TITLE,RESTAURANT_ID,CUISINES,TIME,CITY,LOCALITY,RATING,VOTES,COST,source,Location
204,[BAKERY],15062,[Bakery],Not Available,Kochi,Marine Drive,,,200.0,train,Kochi
421,[CAFÉ],14127,"[Cafe, Fast Food]",Not Available,Hyderabad,Banjara Hills,,,350.0,train,Hyderabad
2035,[QUICK BITES],2776,"[Kerala, Arabian]",Not Available,Kochi,Kakkanad,,,,test,Kochi
2758,[None],11301,"[Chinese, North Indian, South Indian]",Not Available,Kochi,Tripunithura,,,,test,Kochi


In [21]:
df["RATING"] = df.groupby("CITY").RATING.transform(lambda x : x.fillna(x.mode()[0]))

In [22]:
df['RATING']=df['RATING'].str.extract('(\d+)').astype(float)

In [23]:
df['VOTES'].isna().sum()

1579

- There are 1579 null-values in votes column.

In [24]:
df.VOTES.fillna('0',inplace=True)
df['VOTES']=df['VOTES'].str.extract('(\d+)').astype(float)

- **Drop CITY , LOCALITY , CUISINES from dataset**

In [25]:
df.drop(columns='CITY',inplace=True)
df.drop(columns='LOCALITY',inplace=True)
df.drop(columns='CUISINES',inplace=True)

In [26]:
df_City=pd.get_dummies(df['Location'])
df.drop(columns='Location',inplace=True)
df_City.head()

Unnamed: 0,Andheri,Bandra,Bangalore,Chennai,Delhi,Ernakulam,Faridabad,Ghaziabad,Gurgaon,Hyderabad,Kerala,Kochi,Mumbai,Noida,Others,Secunderabad,Telangana,Thane
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [27]:
df = pd.concat([df,df_City,df_cuisines,df_title], axis=1)

In [28]:
df.drop(columns='TITLE',inplace=True)

In [29]:
df_column_category = df.select_dtypes(exclude=np.number).columns
df_column_category

Index(['TIME', 'source'], dtype='object')

In [30]:
df.drop(columns='TIME',inplace=True)

- **Fill Null-values in dataset**

In [31]:
df.fillna(0,inplace=True)

In [32]:
train_final = df[df.source=="train"]

In [33]:
test_final = df[df.source=="test"]

- **Train Data Shape**

In [34]:
train_final.shape

(12552, 277)

In [35]:
train_final.drop(columns=["source"],inplace=True)

In [36]:
test_final.drop(columns=["source",'COST'],inplace=True)

In [37]:
train_X = train_final.drop(columns=["COST",'RESTAURANT_ID'])

In [38]:
train_Y = train_final["COST"]

In [39]:
test_X = test_final.drop(columns=["RESTAURANT_ID"])

- **Fill Null Values in Train data**

In [40]:
train_X.fillna(0,inplace=True)
train_X.isna().sum()

RATING          0
VOTES           0
Andheri         0
Bandra          0
Bangalore       0
               ..
DHABA           0
MEAT SHOP       0
COCKTAIL BAR    0
PAAN SHOP       0
BHOJANALYA      0
Length: 274, dtype: int64

# MACHINE LEARNING

<font size='4'>**Linear Regression**

In [41]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(train_X, train_Y)
dtrain_predictions = model.predict(train_X)

In [42]:
from sklearn.model_selection import cross_val_score
a = cross_val_score(model, train_X, train_Y, cv=5, scoring='neg_mean_squared_error')

In [43]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,mean_squared_log_error, r2_score
print("\nModel Report")
print("RMSE : %.4g" % np.sqrt(mean_squared_error(train_Y.values, dtrain_predictions)))
    
test_X.fillna(0,inplace=True)
test_final["res_linear"] =  model.predict(test_X)


Model Report
RMSE : 331.3


In [44]:
print('r2 train',r2_score(train_Y,dtrain_predictions))

r2 train 0.722462054786056


<font size='4'>**Decision Tree**

In [45]:
from sklearn.tree import DecisionTreeClassifier
model1 = DecisionTreeClassifier()
model1.fit(train_X, train_Y)
dtrain_predictions1 = model1.predict(train_X)

In [46]:
a = cross_val_score(model1, train_X, train_Y, cv=5, scoring='neg_mean_squared_error')

In [47]:
print("\nModel Report")
print("RMSE : %.4g" % np.sqrt(mean_squared_error(train_Y.values, dtrain_predictions)))
    
test_X.fillna(0,inplace=True)
test_final["reS_decision"] =  model1.predict(test_X)


Model Report
RMSE : 331.3


In [48]:
print('r2 train',r2_score(train_Y,dtrain_predictions1))

r2 train 0.9956345719366212


<font size='4'>**Random Forest**

In [49]:
from sklearn.ensemble import RandomForestClassifier
model2 = RandomForestClassifier()
model2.fit(train_X, train_Y)
dtrain_predictions2 = model2.predict(train_X)

In [50]:
a = cross_val_score(model2, train_X, train_Y, cv=5, scoring='neg_mean_squared_error')

In [51]:
print("\nModel Report")
print("RMSE : %.4g" % np.sqrt(mean_squared_error(train_Y.values, dtrain_predictions)))
    
test_X.fillna(0,inplace=True)
test_final["res_random"] =  model2.predict(test_X)


Model Report
RMSE : 331.3


In [52]:
print('r2 train',r2_score(train_Y,dtrain_predictions2))

r2 train 0.9956816058073311


- Random Forest Classifier have maximum r2 score values.(0.9956816058073311)

- So, we get our final result from random forest classifier regressor.



<font size='4'>**Final Result**

In [53]:
Random_submission = test_final[["RESTAURANT_ID","res_random"]]

In [54]:
Random_submission

Unnamed: 0,RESTAURANT_ID,res_random
0,4085,750.0
1,12680,300.0
2,1411,500.0
3,204,400.0
4,13453,200.0
...,...,...
4226,9057,700.0
4227,1247,600.0
4228,8617,400.0
4229,6485,300.0


- **res_random = Cost**