# Restaurant Recommendation System

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords 
from nltk.tokenize import WordPunctTokenizer

In [2]:
import warnings
# The 'ignore' action with no category/module filter silences every warning
# raised from here on, including pandas and scikit-learn deprecation notices.
warnings.filterwarnings('ignore')

## Import Data

In [3]:
# Location of the reviews CSV, kept in a variable so the path is easy to spot.
reviews_csv = 'D:\\Project\\Res_Nev_Halal_Final.csv'
res = pd.read_csv(reviews_csv)

In [4]:
# Keep only the identifier, text and rating columns of the loaded reviews.
review_columns = ['review_id', 'user_id', 'business_id', 'text', 'rating']
df = res[review_columns]

In [5]:
# Business metadata; the prediction cell reads it to print the recommendations.
business_csv = 'D:\\Project\\business.csv'
df_business = pd.read_csv(business_csv)

In [6]:
# Count the missing values in each column of the review frame.
df.isna().sum()

review_id      0
user_id        0
business_id    0
text           0
rating         0
dtype: int64

In [7]:
# Preview the first rows of the selected review columns.
df.head()

Unnamed: 0,review_id,user_id,business_id,text,rating
0,kCkyGrMMRQbqx69dgzmnEA,I143qmTjREqTwAVWVsB6sw,EfAqol3tWckyNrBMIooJmg,From take out box right to the trash can. I've...,3.5
1,JJUHx7FNoWnBYsIbEoBopw,GOGAWajma1T-dPZn8xls6A,EfAqol3tWckyNrBMIooJmg,First of all the owner needs to take a chill p...,3.5
2,4wUnRF4-DistlPJnUwU-Uw,X40NV4DapcQBiYiPMwQkOw,EfAqol3tWckyNrBMIooJmg,It was 108 degrees outside and the AC wasn't w...,3.5
3,L94lNIuNw3G1UaXgxAgTWg,sPr42M6_rRkKJXNepmfm1A,EfAqol3tWckyNrBMIooJmg,Amazing service and really good Chai tea! If y...,3.5
4,SGaOzRxoEuiGBdSCWUJ8fQ,ykmEwulkVsiEesovrrpVSA,EfAqol3tWckyNrBMIooJmg,I'm assuming it was the owner but he got up in...,3.5


In [8]:
# (rows, columns) of the selected review frame.
df.shape

(12024, 5)

### Select the business, user, rating and text columns

In [9]:
# Columns the recommender works with: both ids, the rating and the review text.
model_columns = ['business_id', 'user_id', 'rating', 'text']
res_data = df[model_columns]

In [10]:
import string
from nltk.corpus import stopwords

# text_process removes punctuation *before* it filters stop words, so the stop
# words themselves must have their punctuation removed as well or contractions
# (which contain an apostrophe) would never match.
# Stored as a set: text_process tests every token of every review for
# membership, and a set lookup avoids scanning the whole list each time.
stop = {
    ''.join(char for char in word if char not in string.punctuation)
    for word in stopwords.words('english')
}

In [11]:
def text_process(mess):
    """Remove punctuation and stop words from a piece of text.

    Characters listed in ``string.punctuation`` are dropped, the remainder is
    split on whitespace, and every token whose lowercase form is found in the
    module-level ``stop`` collection is discarded. The surviving tokens keep
    their original casing and are returned joined by single spaces.
    """
    stripped = ''.join(filter(lambda ch: ch not in string.punctuation, mess))

    kept_tokens = []
    for token in stripped.split():
        # Compare in lowercase only; the token itself is emitted unchanged.
        if token.lower() in stop:
            continue
        kept_tokens.append(token)

    return " ".join(kept_tokens)

### Clean Text

In [12]:
# Clean every review. res_data was sliced out of df, and writing a column into
# such a slice is what triggers pandas' SettingWithCopyWarning; assign() returns
# a new frame with the replaced 'text' column, so nothing is written in place.
res_data = res_data.assign(text=res_data['text'].apply(text_process))

In [13]:
# Train/validation split for evaluating the model later (currently disabled)
#vld_size=0.15
#X_train, X_valid, y_train, y_valid = train_test_split(res_data['text'], df['business_id'], test_size = vld_size) 

### Create two tables of user,text and business,text

In [14]:
# Two narrow views of the cleaned reviews: the text keyed by user and by business.
userid_df, business_df = (
    res_data[[key_column, 'text']] for key_column in ('user_id', 'business_id')
)

In [15]:
# Preview the per-review (user_id, cleaned text) pairs.
userid_df.head()

Unnamed: 0,user_id,text
0,I143qmTjREqTwAVWVsB6sw,take box right trash Ive Persian Arminian Turk...
1,GOGAWajma1T-dPZn8xls6A,First owner needs take chill pill let customer...
2,X40NV4DapcQBiYiPMwQkOw,108 degrees outside AC working asked owner ext...
3,sPr42M6_rRkKJXNepmfm1A,Amazing service really good Chai tea check get...
4,ykmEwulkVsiEesovrrpVSA,Im assuming owner got face confrontational ask...


In [16]:
# Preview the per-review (business_id, cleaned text) pairs.
business_df.head()

Unnamed: 0,business_id,text
0,EfAqol3tWckyNrBMIooJmg,take box right trash Ive Persian Arminian Turk...
1,EfAqol3tWckyNrBMIooJmg,First owner needs take chill pill let customer...
2,EfAqol3tWckyNrBMIooJmg,108 degrees outside AC working asked owner ext...
3,EfAqol3tWckyNrBMIooJmg,Amazing service really good Chai tea check get...
4,EfAqol3tWckyNrBMIooJmg,Im assuming owner got face confrontational ask...


### Join the review text for each user_id and each business_id

In [17]:
# Collapse each table to one row per id by joining that id's texts with a space.
join_reviews = {'text': ' '.join}
userid_df = userid_df.groupby('user_id').agg(join_reviews)
business_df = business_df.groupby('business_id').agg(join_reviews)

In [18]:
# Preview the grouped frame: one row of joined text per user_id.
userid_df.head()

Unnamed: 0_level_0,text
user_id,Unnamed: 1_level_1
-0ZKLS43isCG3V-nCObxmw,placed order jerks manager delivery boy night ...
-0kBwZEJqWbKgJtwnAsT2w,Best Indian Place LV horrible experience basic...
-17vo-ag35TT-gccu6XbnA,Stale lentils store bought naan beef stew niha...
-1BrzE0pqO_EVw9SGzHEnw,Duuuuuude place tho Scott behind bar amazing s...
-1D4lTzn2HzQlW7OZJo16Q,love eating large calzone eat time Italian sub...


## User Tfidf Vectorizer

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
# TF-IDF over the per-user joined text, tokenised with NLTK's WordPunctTokenizer
# and capped at 6000 vocabulary terms.
user_tokenize = WordPunctTokenizer().tokenize
userid_vectorizer = TfidfVectorizer(max_features=6000, tokenizer=user_tokenize)
userid_vectors = userid_vectorizer.fit_transform(userid_df['text'])
userid_vectors.shape

(10450, 6000)

In [21]:
# Show the repr of the user TF-IDF matrix returned by fit_transform.
userid_vectors

<10450x6000 sparse matrix of type '<class 'numpy.float64'>'
	with 419781 stored elements in Compressed Sparse Row format>

In [22]:
# TF-IDF over the per-business joined text, configured like the user vectorizer
# but fitted separately.
business_tokenize = WordPunctTokenizer().tokenize
businessid_vectorizer = TfidfVectorizer(max_features=6000, tokenizer=business_tokenize)
businessid_vectors = businessid_vectorizer.fit_transform(business_df['text'])
businessid_vectors.shape

(46, 6000)

# Matrix Factorization

In [23]:
# Users as rows, businesses as columns, rating as the cell value; pivot_table
# aggregates multiple rows for the same pair with its default aggfunc.
userid_rating_matrix = res_data.pivot_table(
    values='rating',
    index=['user_id'],
    columns=['business_id'],
)
userid_rating_matrix.shape

(10450, 46)

In [24]:
# Preview the user x business rating matrix.
userid_rating_matrix.head()

business_id,-sEbDB_5jI_yIlklu1o1VQ,3LWsVfsSmb_Nzbi2YQ-NIA,4L3VJwVqUareUUIGPE2zcw,4yAvytbVKHqSYAo3mkI9OA,5O7qB4gNmr3NgdYAuYbP4Q,5gv6AqHXfi3gJV4fb432Vw,9AnvV8V-UvA_rFhMCr_Dlw,ARp5inQqiKuDt2E9tIrX2Q,Cr066pnTj0ioEMZRTHgMOw,EIH5pDc75v4haOcww-SEJg,...,kkdm8TM6qTaVnrz5XsG0hQ,llifBVCFAnr124WdKXmtLg,nS6QENxe6YODNrQ45bpC-A,pcxssrp4IeSN3GlYpPwlQA,qTTBJ83d-nWSnQpXyd44gA,saQQmDQQJ14BVkm_sg53UA,suFSckaQV8NmND7dytpf4w,tRdVEMAtQRdz-Cgw-ff4-A,udh__7erx4PuM5quw8zuVg,vbkA2rfPXXorr-27pVB0DQ
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-0ZKLS43isCG3V-nCObxmw,,,,,,,,,,,...,,,3.0,,,,,,,
-0kBwZEJqWbKgJtwnAsT2w,,4.0,,,,,,,,,...,,,,,,,,,,
-17vo-ag35TT-gccu6XbnA,,,,,,,,,,,...,,,,,,,,,,
-1BrzE0pqO_EVw9SGzHEnw,,,3.5,,,,,,,,...,,,,,,,,,,
-1D4lTzn2HzQlW7OZJo16Q,,,,,,,,,,,...,,,,,,,,,,


In [25]:
def _vocabulary_terms(vectorizer):
    """Return the fitted vocabulary terms of ``vectorizer`` in column order.

    ``get_feature_names()`` was deprecated in scikit-learn 1.0 and removed in
    1.2 in favour of ``get_feature_names_out()``; use the new method when it
    exists and fall back to the old one on older installations.
    """
    getter = getattr(vectorizer, 'get_feature_names_out', None)
    if getter is None:
        getter = vectorizer.get_feature_names
    return getter()


# Dense, labelled versions of the TF-IDF matrices: rows are ids, columns are terms.
P = pd.DataFrame(userid_vectors.toarray(), index=userid_df.index, columns=_vocabulary_terms(userid_vectorizer))
Q = pd.DataFrame(businessid_vectors.toarray(), index=business_df.index, columns=_vocabulary_terms(businessid_vectorizer))

In [26]:
# Preview the business TF-IDF frame (rows: business_id, columns: vocabulary terms).
Q.head()

Unnamed: 0_level_0,0,05,1,10,100,1000,1010,1015,1030,1050,...,zealand,zeera,zero,zinger,zone,zu,zucchini,‍,。,，
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-sEbDB_5jI_yIlklu1o1VQ,0.018702,0.0,0.011827,0.011577,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3LWsVfsSmb_Nzbi2YQ-NIA,0.002391,0.0,0.01109,0.015296,0.004127,0.001072,0.001002,0.0,0.002309,0.0,...,0.0,0.0,0.000739,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4L3VJwVqUareUUIGPE2zcw,0.0,0.0,0.023968,0.014076,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4yAvytbVKHqSYAo3mkI9OA,0.0,0.0,0.005269,0.010315,0.003595,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5O7qB4gNmr3NgdYAuYbP4Q,0.0,0.0,0.026917,0.015056,0.0,0.0,0.0,0.0,0.0,0.022403,...,0.0,0.0,0.011277,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Prediction for input text

In [28]:
# Free-text query describing what the user wants.
words = "i want to eat pizza"

# Push the query through the same cleaning and TF-IDF steps as the user profiles.
test_df = pd.DataFrame([words], columns=['text'])
test_df['text'] = test_df['text'].apply(text_process)
test_vectors = userid_vectorizer.transform(test_df['text'])

# get_feature_names() was removed in scikit-learn 1.2; prefer the *_out variant
# and fall back to the old name on older installations.
_feature_names = getattr(userid_vectorizer, 'get_feature_names_out', None)
if _feature_names is None:
    _feature_names = userid_vectorizer.get_feature_names
test_v_df = pd.DataFrame(test_vectors.toarray(), index=test_df.index, columns=_feature_names())

# The query lives in the user vectorizer's vocabulary while Q uses the business
# vectorizer's. The two are fitted independently, so line the query up with Q's
# columns by term (terms unknown to Q are dropped, terms missing from the query
# count as 0) instead of relying on the column positions happening to match.
query_vector = test_v_df.loc[0].reindex(Q.columns, fill_value=0.0)

# Score every business by the dot product of its TF-IDF row with the query.
predictItemRating = pd.DataFrame(np.dot(query_vector.values, Q.values.T), index=Q.index, columns=['Rating'])
topRecommendations = predictItemRating.sort_values('Rating', ascending=False)[:5]

for i in topRecommendations.index:
    # Look the business up once instead of re-filtering df_business per field.
    matches = df_business[df_business['business_id'] == i]
    if matches.empty:
        # No metadata row for this id: report it rather than fail on .iloc[0].
        print(str(i) + ' (no entry in df_business)')
        print('')
        continue
    business = matches.iloc[0]
    print(business['name'])
    print(business['categories'])
    print(str(business['ratings']) + ' ' + str(business['review_count']))
    print('')


Little Italy Pizza
Halal, Italian, Pizza, Restaurants
3.5 153

Sorrento Pizza
Pizza, Restaurants, Halal, Salad, Chicken Wings
3.0 94

Pizza house & bakery Middle East Fast Food
Persian/Iranian, Restaurants, Sandwiches, Pizza, Mediterranean, Bakeries, Halal, Food
4.5 33

Verdi Pizza
Restaurants, Pizza, Cafes, Mediterranean, Halal, Chicken Wings, Italian
3.0 233

La Bocce
Chicken Wings, Restaurants, Italian, Salad, Halal, Pizza
4.0 139

