In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords 
from nltk.tokenize import WordPunctTokenizer

In [4]:
# Load the review table (presumably the Arizona subset, going by the file name).
# NOTE(review): this path is anchored at the home directory (`~/`), whereas
# yelp_business.csv is read relative to the working directory - confirm both
# locations are intended.
yelp = pd.read_csv('~/yelp_review_arizona.csv')

In [5]:
# Keep only the identifier, text, rating and timestamp columns of the raw reviews.
df = yelp[['review_id', 'user_id', 'business_id', 'text', 'stars', 'date']]

In [6]:
df.isnull().sum()

review_id      0
user_id        0
business_id    0
text           0
stars          0
date           0
dtype: int64

In [7]:
df.head()


Unnamed: 0,review_id,user_id,business_id,text,stars,date
0,V93SYj2OLh5m9Cquzf-7kg,ZwVz20be-hOZnyAbevyMyQ,2c9Vptks_vowLgVUMnCgjw,Came here while in town for a country concert....,4.0,2013-09-04 01:29:46
1,vNTFadc6T9HeH3Qa78dc_Q,91TB-gzcNyxFh46TL0pmnQ,6nKR80xEGHYf2UxAe_Cu_g,Best barbecue this side of the Mississippi!!!!...,5.0,2015-12-05 02:50:10
2,SXRFBCt5eXCBF7TlI7UG6Q,Y_QBiZpATJoz8hKUfYF66A,fbQaKW0Lte0JQ_opbnjdKg,Absolutely amazing. Think Chipotle for enchila...,5.0,2014-04-01 01:56:00
3,CqMNjtG0hNZGhDw4RDE-zw,_Jg-IA0M-GSjBlGu-wmejg,r8764MtYyt8JhxMvrfM_xQ,I was really disappointed with my most recent ...,2.0,2014-10-11 03:53:53
4,5hZLouGEW4wm6BTJ5aNUNw,1CqkFliipv_X15WYn5aPfg,QS3QxI7u5PRdtbGgI0-UsA,I grade sushi restaurants on 3 factors:\n- Qua...,4.0,2015-03-04 19:36:21


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26606 entries, 0 to 26605
Data columns (total 6 columns):
review_id      26606 non-null object
user_id        26606 non-null object
business_id    26606 non-null object
text           26606 non-null object
stars          26606 non-null float64
date           26606 non-null object
dtypes: float64(1), object(5)
memory usage: 1.2+ MB


In [11]:
# Business metadata table; its `business_id`, `name`, `categories`, `stars` and
# `review_count` columns are read at the end of the notebook to print recommendations.
business = pd.read_csv('yelp_business.csv')

In [15]:
business.info()
business.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192609 entries, 0 to 192608
Data columns (total 14 columns):
business_id     192609 non-null object
name            192609 non-null object
address         184927 non-null object
city            192608 non-null object
state           192609 non-null object
postal_code     191950 non-null object
latitude        192609 non-null float64
longitude       192609 non-null float64
stars           192609 non-null float64
review_count    192609 non-null int64
is_open         192609 non-null int64
attributes      163773 non-null object
categories      192127 non-null object
hours           147779 non-null object
dtypes: float64(3), int64(2), object(9)
memory usage: 20.6+ MB


business_id         0
name                0
address          7682
city                1
state               0
postal_code       659
latitude            0
longitude           0
stars               0
review_count        0
is_open             0
attributes      28836
categories        482
hours           44830
dtype: int64

In [16]:
# Columns needed for the recommender. Note this rebinds `yelp`, which previously
# held the raw CSV. `.copy()` makes the subset independent of `df`, so the later
# in-place rewrite of `yelp['text']` is an unambiguous assignment instead of a
# write to a possible view (SettingWithCopyWarning).
yelp = df[['business_id', 'user_id', 'stars', 'text']].copy()

In [18]:

import string
# `stopwords` is already imported in the first cell; this re-import is redundant.
from nltk.corpus import stopwords
import warnings
# NOTE(review): this silences *every* warning for the rest of the session
# (deprecations, pandas chained-assignment warnings, ...). Consider narrowing it
# to specific categories so real problems stay visible.
warnings.filterwarnings('ignore')

In [20]:
# NLTK's English stopwords with every punctuation character stripped
# (e.g. "don't" -> "dont"), so they still match words in text whose
# punctuation has already been removed.
stop = [
    ''.join(ch for ch in entry if ch not in string.punctuation)
    for entry in stopwords.words('english')
]

In [21]:
def text_process(mess):
    """
    Strip punctuation and stopwords from a piece of text.

    Takes in a string of text, then performs the following:
    1. Removes every character listed in ``string.punctuation``
    2. Splits on whitespace and drops each word whose lowercase form is in
       the module-level ``stop`` collection
    3. Returns the remaining words joined by single spaces

    Parameters
    ----------
    mess : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text as one space-separated string (not a list). Kept
        words retain their original casing.
    """
    # Delete all punctuation in a single pass instead of rebuilding the
    # string character by character.
    nopunc = mess.translate(str.maketrans('', '', string.punctuation))

    # Stopwords are matched case-insensitively; surviving words are emitted as written.
    return " ".join(word for word in nopunc.split() if word.lower() not in stop)

In [23]:
# Overwrite each review with its cleaned form (punctuation and stopwords removed
# by text_process). The raw text is no longer available in `yelp` after this cell.
yelp['text'] = yelp['text'].apply(text_process)

In [25]:
# Hold out a validation slice for testing the model later.
# Features and labels are both taken from `yelp` so they are guaranteed to be
# row-aligned, and `random_state` is fixed so a kernel restart reproduces the
# same split.
vld_size=0.15
X_train, X_valid, y_train, y_valid = train_test_split(
    yelp['text'], yelp['business_id'], test_size=vld_size, random_state=42
)


In [26]:
# Two projections of the cleaned reviews: review text keyed by its author, and
# review text keyed by the business being reviewed. Both still have one row per review.
userid_df = yelp[['user_id','text']]
business_df = yelp[['business_id', 'text']]

In [27]:
userid_df.head()

Unnamed: 0,user_id,text
0,ZwVz20be-hOZnyAbevyMyQ,Came town country concert better way start day...
1,91TB-gzcNyxFh46TL0pmnQ,Best barbecue side Mississippi Come car washed...
2,Y_QBiZpATJoz8hKUfYF66A,Absolutely amazing Think Chipotle enchiladas N...
3,_Jg-IA0M-GSjBlGu-wmejg,really disappointed recent visit Vintage 95 ma...
4,1CqkFliipv_X15WYn5aPfg,grade sushi restaurants 3 factors Quality Exce...


In [28]:
# All cleaned reviews written by one example user, selected with a single .loc call.
userid_df.loc[userid_df['user_id'] == 'ZwVz20be-hOZnyAbevyMyQ', 'text']

0        Came town country concert better way start day...
1098     Amazing Im golfer ended loving went family lov...
3909     saw many great reviews place decided try high ...
11491    Amazing Mexican Asian infusion good service sa...
19647    beautiful hotel excellent service Nick front a...
Name: text, dtype: object

In [29]:
business_df.head()

Unnamed: 0,business_id,text
0,2c9Vptks_vowLgVUMnCgjw,Came town country concert better way start day...
1,6nKR80xEGHYf2UxAe_Cu_g,Best barbecue side Mississippi Come car washed...
2,fbQaKW0Lte0JQ_opbnjdKg,Absolutely amazing Think Chipotle enchiladas N...
3,r8764MtYyt8JhxMvrfM_xQ,really disappointed recent visit Vintage 95 ma...
4,QS3QxI7u5PRdtbGgI0-UsA,grade sushi restaurants 3 factors Quality Exce...


In [30]:
# Collapse to one row per key by concatenating all of that key's reviews into a
# single space-separated document. After this, `user_id` / `business_id` is the
# index of the respective frame and `text` is its only column.
userid_df = userid_df.groupby('user_id').agg({'text': ' '.join})
business_df = business_df.groupby('business_id').agg({'text': ' '.join})

In [31]:
userid_df.head()

Unnamed: 0_level_0,text
user_id,Unnamed: 1_level_1
--2HUmLkcNHZp0xw6AMBPg,place JAM Surfer vibe great eats love machaca ...
--4rAAfZnEIAKJE80aIiYg,pulled pork spicy bbq sauce impressed Probably...
--Nnm_506G_p8MxAOQna5w,Cant say burger anything special Taste ok Shak...
--ty7Z9fEt08E3dS3_qoSA,know think important trust Yelp kind reviews t...
-0IiMAZI2SsQ7VmyzJjokQ,Ever fan roadside attractions Americana Id rea...


In [32]:
# The concatenated review document of the same example user (row label + column in one .loc).
userid_df.loc['ZwVz20be-hOZnyAbevyMyQ', 'text']

'Came town country concert better way start day Everything great service amazing time walked door sat table surprise beer whiskey tap table pay ounce screen show drank food great whiskey burger favorite country fans must Scottsdale Amazing Im golfer ended loving went family loves golf planned staying hour since golf 6 hours later left far drive place right food amazing expect good brisket tacos chorizo sliders favorite Good quality food many unique items highly recommend place people ages teens loved bays nice roomy 7 us bay 10 people must try back soon saw many great reviews place decided try high hopes loved atmosphere kids loved Texas trivia game thats good thing could say steak came cooked way asked took back fix steak flavor covered butter thought steak even kids eat theres waitress asked daughter something wrong since ate bites explained like said ok walked away shrimp good pricey bugers ok end pushing us take home since ate almost none politely declined still came boxes trying g

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [34]:
# User-side TF-IDF: one document per user (all of that user's cleaned reviews).
# Tokens come from NLTK's WordPunctTokenizer and the vocabulary is capped at
# 5000 terms, so each user becomes one sparse row with at most 5000 columns.
userid_vectorizer = TfidfVectorizer(tokenizer = WordPunctTokenizer().tokenize, max_features=5000)
userid_vectors = userid_vectorizer.fit_transform(userid_df['text'])
userid_vectors.shape

(10937, 5000)

In [35]:
userid_vectors

<10937x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 1083887 stored elements in Compressed Sparse Row format>

In [36]:
# Business-side TF-IDF: one document per business (all cleaned reviews it received),
# using the same tokenizer and 5000-term cap as the user-side vectorizer.
# NOTE(review): this vectorizer is fitted independently of userid_vectorizer, so
# column k of the two matrices is the same term only if both fits select and order
# the same vocabulary - confirm, since matrix_factorization later combines user and
# business rows positionally with np.dot.
businessid_vectorizer = TfidfVectorizer(tokenizer = WordPunctTokenizer().tokenize, max_features=5000)
businessid_vectors = businessid_vectorizer.fit_transform(business_df['text'])
businessid_vectors.shape

(1411, 5000)

In [38]:
# User x business matrix of star ratings; user/business pairs without a review are NaN.
# pivot_table aggregates with the mean by default, so several reviews of the same
# business by the same user are averaged into one cell.
userid_rating_matrix = pd.pivot_table(yelp, values='stars', index=['user_id'], columns=['business_id'])
userid_rating_matrix.shape

(10937, 1411)

In [39]:
userid_rating_matrix.head()

business_id,-050d_XIor1NpCuWkbIVaQ,-1UMR00eXtwaeh59pEiDjA,-4TMQnQJW1yd6NqGRDvAeA,-6h3K1hj0d4DRcZNUtHDuw,-8QlV3b_9H4BAh6LgMIr1g,-9eNGMp8XiygI8t8QFuFWw,-9nai28tnoylwViuJVrYEQ,-Bdw-5H5C4AYSMGnAvmnzw,-BxWyEIQ6wypT-37MzZizQ,-CfFjcCcGGDM9MVH_d42RQ,...,z_lDO8d8nkSmcvTjB4N69A,za9qr9ZZWLTfEgTfogRbUw,zbrFk-4ejesAJD8EwcdHxg,zfiSQ1dl3vTJ-og96eqXGA,zidkKI_N1OPxsiddTOQH_Q,zqNgwQjj0_XAll-neGikIw,zr93wrNyXzc-HW4IcK4iRQ,zrDi4gEaUi64lAMfJU51dw,zrTGcb83AsfyVTMrsCa65A,zuVvDYJkKAbXQTTBauAqJQ
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--2HUmLkcNHZp0xw6AMBPg,,,,,,,,,,,...,,,,,,,,,,
--4rAAfZnEIAKJE80aIiYg,,,,,,,,,,,...,,,,,,,,,,
--Nnm_506G_p8MxAOQna5w,,,,,,,,,,,...,,,,,,,,,,
--ty7Z9fEt08E3dS3_qoSA,,,,,,,,,,,...,,,,,,,,,,
-0IiMAZI2SsQ7VmyzJjokQ,,,,,,,,,,,...,,,,,,,,,,


In [40]:
def _vocabulary_terms(vectorizer):
    """Return a fitted vectorizer's terms in column order on old and new scikit-learn.

    `get_feature_names` was removed in scikit-learn 1.2 in favour of
    `get_feature_names_out`; the newer method is used when it exists and the
    legacy one is only touched as a fallback.
    """
    getter = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    return list(getter())


# Dense, labelled copies of the TF-IDF matrices: P has one row per user and Q one
# row per business, with one column per vocabulary term of the respective vectorizer.
# These are the initial factor matrices handed to matrix_factorization.
P = pd.DataFrame(userid_vectors.toarray(), index=userid_df.index, columns=_vocabulary_terms(userid_vectorizer))
Q = pd.DataFrame(businessid_vectors.toarray(), index=business_df.index, columns=_vocabulary_terms(businessid_vectorizer))

In [41]:
Q.head()

Unnamed: 0_level_0,0,1,10,100,1000,101,1015,1030,10pm,11,...,z,zero,zest,zinburger,zinc,zipps,ziti,zookz,ztejas,zucchini
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-050d_XIor1NpCuWkbIVaQ,0.0,0.008223,0.007567,0.018716,0.0,0.0,0.0,0.01094,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-1UMR00eXtwaeh59pEiDjA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-4TMQnQJW1yd6NqGRDvAeA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-6h3K1hj0d4DRcZNUtHDuw,0.0,0.028893,0.026588,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-8QlV3b_9H4BAh6LgMIr1g,0.0,0.0,0.017147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:

def matrix_factorization(R, P, Q, steps=25, gamma=0.001,lamda=0.02):
    """
    Refine user and item factor matrices by SGD over the observed ratings.

    Only cells of ``R`` holding a value greater than 0 are trained on; NaN and
    non-positive cells are skipped. Instead of probing every cell of ``R``
    with ``.loc`` on each pass, the observed cells are located once and the
    updates run on NumPy arrays, so a pass costs O(number of ratings) rather
    than O(rows * columns).

    Parameters
    ----------
    R : pandas.DataFrame
        Rating matrix. Row labels select rows of ``P``; column labels select
        rows of ``Q``.
    P : pandas.DataFrame
        User factors, indexed by the row labels of ``R`` (any order, unique).
    Q : pandas.DataFrame
        Item factors, indexed by the column labels of ``R`` (any order,
        unique). Must be as wide as ``P``; factor columns of ``P`` and ``Q``
        are paired by position, as in the dot product.
    steps : int
        Maximum number of passes over the observed ratings.
    gamma : float
        Learning rate.
    lamda : float
        L2 regularisation weight.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The same ``P`` and ``Q`` objects, updated in place.

    Raises
    ------
    KeyError
        If a rated row/column label of ``R`` has no row in ``P``/``Q``.
    ValueError
        Raised by ``np.dot`` if ``P`` and ``Q`` have different widths.
    """
    # Locate the observed ratings once. np.nonzero walks the mask in row-major
    # order, i.e. the same order as a nested "for row: for column" loop.
    observed = R.gt(0).to_numpy()
    row_pos, col_pos = np.nonzero(observed)
    ratings = R.to_numpy()[row_pos, col_pos].astype(float)

    if steps <= 0 or len(ratings) == 0:
        return P, Q

    # Translate R's labels into row positions of P and Q (lookup is by label, not order).
    user_pos = P.index.get_indexer(R.index[row_pos])
    item_pos = Q.index.get_indexer(R.columns[col_pos])
    if (user_pos < 0).any() or (item_pos < 0).any():
        raise KeyError("every rated row/column label of R must have a row in P/Q")

    user_factors = P.to_numpy(dtype=float, copy=True)
    item_factors = Q.to_numpy(dtype=float, copy=True)
    observations = list(zip(user_pos.tolist(), item_pos.tolist(), ratings.tolist()))

    for step in range(steps):
        for u, b, rating in observations:
            p_u = user_factors[u]  # row views: in-place updates land in the matrices
            q_b = item_factors[b]
            eij = rating - np.dot(p_u, q_b)
            p_u += gamma * (eij * q_b - lamda * p_u)
            # Sequential update: the item step sees the already-updated user row.
            q_b += gamma * (eij * p_u - lamda * q_b)

        # Regularised squared error over the observed ratings after this pass.
        e = 0.0
        for u, b, rating in observations:
            p_u = user_factors[u]
            q_b = item_factors[b]
            e += (rating - np.dot(p_u, q_b)) ** 2 + lamda * (np.dot(p_u, p_u) + np.dot(q_b, q_b))
        if e < 0.001:
            break

    # Write the trained factors back so callers holding P/Q see the update too.
    P.iloc[:, :] = user_factors
    Q.iloc[:, :] = item_factors
    return P, Q

In [45]:
%%time
P, Q = matrix_factorization(userid_rating_matrix, P, Q, steps=25, gamma=0.001,lamda=0.02)

KeyboardInterrupt: 

In [47]:

# Free-text query: it is cleaned with text_process and embedded with the *user*
# vectorizer, i.e. the query is treated like the review document of a new user.
words = "i want to have dinner with beautiful views"
test_df= pd.DataFrame([words], columns=['text'])
test_df['text'] = test_df['text'].apply(text_process)
test_vectors = userid_vectorizer.transform(test_df['text'])

# `get_feature_names` was removed in scikit-learn 1.2; prefer the replacement
# and only fall back to the legacy method on old versions.
feature_names_getter = getattr(userid_vectorizer, 'get_feature_names_out', None) or userid_vectorizer.get_feature_names
test_v_df = pd.DataFrame(test_vectors.toarray(), index=test_df.index, columns=list(feature_names_getter()))

# Score every business as the dot product of the query vector with its row of Q
# (positional pairing of columns), then keep the seven highest scores.
predictItemRating=pd.DataFrame(np.dot(test_v_df.loc[0],Q.T),index=Q.index,columns=['Rating'])
topRecommendations=predictItemRating.sort_values('Rating', ascending=False)[:7]

for i in topRecommendations.index:
    # Look the business up once instead of re-filtering the whole table per field.
    # As before, a business_id missing from `business` raises IndexError here.
    match = business[business['business_id']==i].iloc[0]
    print(match['name'])
    print(match['categories'])
    print(str(match['stars'])+ ' '+str(match['review_count']))
    print('')

Isabella's Kitchen
Bars, Nightlife, Beer, Wine & Spirits, Italian, Food, Restaurants
3.5 259

Compass Arizona Grill
Nightlife, American (New), Bars, Restaurants, Lounges
3.5 540

Chart House
Event Planning & Services, Restaurants, Venues & Event Spaces, American (Traditional), Seafood, Nightlife, Bars, Steakhouses, Wine Bars
3.0 229

Top of the Rock Restaurant
Restaurants, American (New), Hotels & Travel, Event Planning & Services, Hotels
3.5 325

Orange Sky
Seafood, Steakhouses, American (New), Restaurants
4.0 562

Different Pointe of View
Restaurants, American (Traditional), Italian, Mediterranean, Resorts, Hotels & Travel
4.0 512

The Wrigley Mansion Club
Breakfast & Brunch, Venues & Event Spaces, Restaurants, Nightlife, Music Venues, Arts & Entertainment, American (New), Event Planning & Services, Bars
3.5 238

