In [2]:
import sys
import string
import nltk as nltk
from nltk.tokenize import RegexpTokenizer,sent_tokenize,word_tokenize
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
import os
from sklearn.metrics import confusion_matrix,classification_report
import matplotlib.pyplot as plt
import pandas  as pd
import json
import numpy as np
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB
import json

In [3]:
# Directory holding the Yelp dataset dump. Override it by setting the
# YELP_DATA_DIR environment variable instead of editing the notebook; the
# default keeps the original location.
YELP_DATA_DIR = os.environ.get('YELP_DATA_DIR', '/home/rungsunan/data/yelp')
busfile = os.path.join(YELP_DATA_DIR, 'business.json')
reviewfile = os.path.join(YELP_DATA_DIR, 'review.json')

In [4]:
# Load the business file; lines=True reads it as one JSON object per line.
business_df = pd.read_json(path_or_buf=busfile, lines=True)

    The review file is too large to load with pd.read_json in a single call, so we read it line by line in a for loop.
    The file is newline-delimited JSON (one object per line), so each line is parsed on its own with json.loads().

In [6]:
# Parse the newline-delimited review file one record at a time, collecting
# each JSON object as a dict.
reviews = []
# JSON text is UTF-8; state the encoding explicitly so decoding does not
# depend on the machine's locale default.
with open(reviewfile, encoding='utf-8') as data_file:
    for line in data_file:
        # Skip blank lines (e.g. a trailing empty line), which json.loads
        # would reject.
        if line.strip():
            reviews.append(json.loads(line))

In [7]:
# Inspect the first parsed review record to see which fields are available.
reviews[0]

{'review_id': 'Q1sbwvVQXV2734tPgoKj4Q',
 'user_id': 'hG7b0MtEbXx5QzbzE6C_VA',
 'business_id': 'ujmEBvifdJM6h6RLv4wQIg',
 'stars': 1.0,
 'useful': 6,
 'funny': 1,
 'cool': 0,
 'text': 'Total bill for this horrible service? Over $8Gs. These crooks actually had the nerve to charge us $69 for 3 pills. I checked online the pills can be had for 19 cents EACH! Avoid Hospital ERs at all costs.',
 'date': '2013-05-07 04:34:36'}

We are making a dataset with just review text of Chinese or Italian restaurants. From the business dataframe we take the businesses with those types in the categories column. From the review list we select the business_id and the review text. Then merge the two dataframes to create a training/test set. 

In [8]:
# Keep only businesses whose category string mentions Chinese or Italian.
# na=False makes rows with a missing category string count as "no match".
cuisine_mask = business_df['categories'].str.contains("Chinese|Italian", na=False)
business_df = business_df[cuisine_mask]
# Only the id and the raw category string are needed from here on.
business_df = business_df[['business_id', 'categories']]

In [9]:
# Preview the filtered businesses.
business_df.head()

Unnamed: 0,business_id,categories
1,QXAEGFB4oINsVuTFxEYKFQ,"Specialty Food, Restaurants, Dim Sum, Imported..."
13,fweCYi8FmbJXHCqLnwuk8w,"Italian, Restaurants, Pizza, Chicken Wings"
17,PZ-LZzSlhSe9utkQYU8pFg,"Restaurants, Italian"
35,BvYU3jvGd0TJ7IyZdfiN2Q,"Sandwiches, Italian, American (Traditional), A..."
45,e_EMySqP0uwlVZfd8mRaaQ,"Chinese, Dim Sum, Restaurants"


In [10]:
# Derive a single cuisine label from the category string. The labels are
# applied in this order, so a business that lists both cuisines ends up
# labelled "Italian" (the later assignment overwrites the earlier one).
for cuisine_label in ("Chinese", "Italian"):
    matches_label = business_df['categories'].str.contains(cuisine_label)
    business_df.loc[matches_label, "cuisine"] = cuisine_label
# The raw category string is no longer needed once the label exists.
business_df = business_df[['business_id', 'cuisine']]

In [11]:
# Preview the business_id / cuisine label table.
business_df.head()

Unnamed: 0,business_id,cuisine
1,QXAEGFB4oINsVuTFxEYKFQ,Chinese
13,fweCYi8FmbJXHCqLnwuk8w,Italian
17,PZ-LZzSlhSe9utkQYU8pFg,Italian
35,BvYU3jvGd0TJ7IyZdfiN2Q,Italian
45,e_EMySqP0uwlVZfd8mRaaQ,Chinese


In [12]:
# Count restaurants per cuisine label using boolean masks.
is_italian = business_df.cuisine == 'Italian'
is_chinese = business_df.cuisine == 'Chinese'
print('Number of Italian or Chinese Restaurants: ' + str((is_italian | is_chinese).sum()))
print('Number of Italian Restaurants: ' + str(is_italian.sum()))
print('Number of Chinese Restaurants: ' + str(is_chinese.sum()))

Number of Italian or Chinese Restaurants: 9349
Number of Italian Restaurants: 4716
Number of Chinese Restaurants: 4633


In [13]:
# Turn the list of review dicts into a DataFrame (one row per review) and
# preview the first rows.
reviews_df = pd.DataFrame(reviews)
reviews_df.head()

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,ujmEBvifdJM6h6RLv4wQIg,0,2013-05-07 04:34:36,1,Q1sbwvVQXV2734tPgoKj4Q,1.0,Total bill for this horrible service? Over $8G...,6,hG7b0MtEbXx5QzbzE6C_VA
1,NZnhc2sEQy3RmzKTZnqtwQ,0,2017-01-14 21:30:33,0,GJXCdrto3ASJOqKeVWPi6Q,5.0,I *adore* Travis at the Hard Rock's new Kelly ...,0,yXQM5uF2jS6es16SJzNHfg
2,WTqjgwHlXbSFevF32_DJVw,0,2016-11-09 20:09:03,0,2TzJjDVDEuAW6MR5Vuc1ug,5.0,I have to say that this office really has it t...,3,n6-Gk65cPZL6Uz8qRm3NYw
3,ikCg8xy5JIg_NGPx-MSIDA,0,2018-01-09 20:56:38,0,yi0R0Ugj_xUx_Nek0-_Qig,5.0,Went in for a lunch. Steak sandwich was delici...,0,dacAIZ6fTM6mqwW5uxkskg
4,b1b1eb3uo-w561D0ZfCEiQ,0,2018-01-30 23:07:38,0,11a8sVPMUFtaC7_ABRkmtw,1.0,Today was my second out of three sessions I ha...,7,ssoyf2_x0EQMed6fgHeMyQ


We are only interested in the business_id and the text of each review.

In [14]:
# Drop every review column except the join key and the review body.
wanted_columns = ['business_id', 'text']
reviews_df = reviews_df[wanted_columns]
reviews_df.head()

Unnamed: 0,business_id,text
0,ujmEBvifdJM6h6RLv4wQIg,Total bill for this horrible service? Over $8G...
1,NZnhc2sEQy3RmzKTZnqtwQ,I *adore* Travis at the Hard Rock's new Kelly ...
2,WTqjgwHlXbSFevF32_DJVw,I have to say that this office really has it t...
3,ikCg8xy5JIg_NGPx-MSIDA,Went in for a lunch. Steak sandwich was delici...
4,b1b1eb3uo-w561D0ZfCEiQ,Today was my second out of three sessions I ha...


In [15]:
# Attach the cuisine label to every review via an inner join on business_id;
# reviews of businesses that are not in business_df are dropped.
final_df = pd.merge(business_df, reviews_df, on='business_id', how='inner')
# Preview a handful of rows only; rendering 1000 rows bloats the notebook
# without adding information.
final_df.head(10)

Unnamed: 0,business_id,cuisine,text
0,QXAEGFB4oINsVuTFxEYKFQ,Chinese,My girlfriend and I went for dinner at Emerald...
1,QXAEGFB4oINsVuTFxEYKFQ,Chinese,We've always been there on a Sunday so we were...
2,QXAEGFB4oINsVuTFxEYKFQ,Chinese,"***No automatic doors, not baby friendly!*** I..."
3,QXAEGFB4oINsVuTFxEYKFQ,Chinese,"Horrible service,\nI went there tonight with m..."
4,QXAEGFB4oINsVuTFxEYKFQ,Chinese,One of the gauges of a good Chinese restaurant...
5,QXAEGFB4oINsVuTFxEYKFQ,Chinese,"I've been a frequent at this place for years, ..."
6,QXAEGFB4oINsVuTFxEYKFQ,Chinese,We chose this restaurant for our Chinese New Y...
7,QXAEGFB4oINsVuTFxEYKFQ,Chinese,I went at 230 on a Monday. It was dimsum \n\nI...
8,QXAEGFB4oINsVuTFxEYKFQ,Chinese,My family and I were at Emerald yesterday duri...
9,QXAEGFB4oINsVuTFxEYKFQ,Chinese,"Because we are in Mississauga, I think this pl..."


In [16]:
# Report how many reviews fall under each cuisine label.
italian_reviews = final_df[final_df.cuisine == 'Italian']
chinese_reviews = final_df[final_df.cuisine == 'Chinese']
print('Number of Italian Restaurant reviews: ' + str(italian_reviews.shape[0]))
print('Number of Chinese Restaurant reviews: ' + str(chinese_reviews.shape[0]))

Number of Italian Restaurant reviews: 392125
Number of Chinese Restaurant reviews: 262695


For text analysis we encode the cuisine type as a numeric code (1 for Italian, -1 for Chinese) and convert all of the review text to lowercase ahead of stemming.

In [17]:
# Encode the label numerically: rows whose cuisine contains "Italian" get 1,
# every other row gets -1.
is_italian_review = final_df["cuisine"].str.contains("Italian")
final_df["cuisine_code"] = np.where(is_italian_review, 1, -1)
# Normalise case so later token comparisons are case-insensitive.
lowercased_text = final_df['text'].str.lower()
final_df['text'] = lowercased_text

In [18]:
# Character count of each review, computed with the vectorised string accessor.
final_df['length'] = final_df['text'].str.len()

In [19]:
# Summary statistics of the review lengths.
final_df['length'].describe()

count    654820.000000
mean        597.582896
std         560.578110
min           1.000000
25%         232.000000
50%         420.000000
75%         765.000000
max        5000.000000
Name: length, dtype: float64

In [20]:
 # Download the 'punkt' package through NLTK's downloader.
 nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/rungsunan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [25]:
# Work on the first 10,000 reviews while developing. Take an explicit copy so
# that adding columns to small_df later neither triggers pandas'
# SettingWithCopyWarning nor risks touching final_df.
# NOTE(review): head() takes the first rows in merge order, not a random
# sample - confirm this subset is representative enough for the experiment.
small_df = final_df.head(10000).copy()

In [32]:
# Fetch the English stop-word list from the stop_words package, print it for
# reference, and show how many entries it contains.
stop_words = get_stop_words("english")
print(stop_words)
len(stop_words)

['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', "can't", 'cannot', 'could', "couldn't", 'did', "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having', 'he', "he'd", "he'll", "he's", 'her', 'here', "here's", 'hers', 'herself', 'him', 'himself', 'his', 'how', "how's", 'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it', "it's", 'its', 'itself', "let's", 'me', 'more', 'most', "mustn't", 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'same', "shan't", 'she', "she'd", "she'll", "she's", 'should', "shouldn't", 'so', 'some', 'such', 'than', 'that', "that's", 'the', 'their', 'theirs', 'them', 'themselves', 't

174

In [23]:
# Split each review into tokens with NLTK's word_tokenize and keep the token
# lists in a new column.
small_df['word_tokenized'] = small_df['text'].apply(word_tokenize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [24]:
# Preview the working subset, including the new token column.
small_df.head()

Unnamed: 0,business_id,cuisine,text,cuisine_code,length,word_tokenized
0,QXAEGFB4oINsVuTFxEYKFQ,Chinese,my girlfriend and i went for dinner at emerald...,-1,1475,"[my, girlfriend, and, i, went, for, dinner, at..."
1,QXAEGFB4oINsVuTFxEYKFQ,Chinese,we've always been there on a sunday so we were...,-1,439,"[we, 've, always, been, there, on, a, sunday, ..."
2,QXAEGFB4oINsVuTFxEYKFQ,Chinese,"***no automatic doors, not baby friendly!*** i...",-1,252,"[***no, automatic, doors, ,, not, baby, friend..."
3,QXAEGFB4oINsVuTFxEYKFQ,Chinese,"horrible service,\ni went there tonight with m...",-1,1113,"[horrible, service, ,, i, went, there, tonight..."
4,QXAEGFB4oINsVuTFxEYKFQ,Chinese,one of the gauges of a good chinese restaurant...,-1,1668,"[one, of, the, gauges, of, a, good, chinese, r..."


In [138]:
# Show the per-review character counts for the working subset.
small_df['length']

0       1475
1        439
2        252
3       1113
4       1668
5       1714
6        409
7        422
8       1251
9        406
10       557
11       654
12       773
13       242
14       689
15       358
16      1268
17       301
18       175
19       792
20       181
21       932
22      1006
23       461
24       741
25       838
26       286
27      1503
28       158
29       990
        ... 
9970     330
9971     154
9972     556
9973     146
9974    1151
9975     716
9976     350
9977    1139
9978     494
9979     306
9980     756
9981     279
9982     880
9983     191
9984     228
9985     265
9986     318
9987     545
9988     300
9989     120
9990     220
9991    1203
9992     378
9993     663
9994     301
9995     179
9996     764
9997      59
9998     942
9999     465
Name: length, Length: 10000, dtype: int64

Let's pair each cuisine code with its review text as a list of (cuisine_code, review_text) tuples.

In [46]:
# Narrow the full data set to the label and the review body.
label_and_text_columns = ['cuisine_code', 'text']
textcode_df = final_df[label_and_text_columns]

In [47]:
# Build a list of (cuisine_code, text) pairs from plain Python lists.
codes = textcode_df['cuisine_code'].tolist()
texts = textcode_df['text'].tolist()
codetext = list(zip(codes, texts))

In [48]:
# Number of (cuisine_code, text) pairs.
len(codetext)

654820

In [105]:
# Split the pairs back into two parallel sequences: codes and review texts.
# Despite the "_list" suffix, zip(*...) yields tuples, so both names are
# bound to tuples.
cuisinetype_list, reviewtext_list = zip(*codetext)

In [121]:
# Confirm the runtime container type of reviewtext_list.
type(reviewtext_list)

tuple