# NLP: Analyzing Review Text


In [3]:
import gzip
import ujson as json

# Every line of the gzipped file is decoded as one JSON review record.
with gzip.open('yelp_train_academic_dataset_review_reduced.json.gz') as f:
    data = list(map(json.loads, f))

Scikit Learn will want the labels in a separate data structure, so let's pull those out now.

In [4]:
# Labels for the regressors: the star rating of each review, in the same order as `data`.
stars = [review['stars'] for review in data]

In [8]:
from sklearn import base

class ColumnSelectTransformer(base.BaseEstimator, base.TransformerMixin):
    """Stateless transformer that pulls the configured keys out of each record.

    The result is a single flat list built column by column: all values of the
    first name in ``col_names``, then all values of the second, and so on.
    With one column (as in ``['text']``) this is simply the list of that
    column's values, one per record.

    NOTE: with more than one column the output has
    ``len(col_names) * len(X)`` entries, so it no longer lines up one-to-one
    with the input records.
    """

    def __init__(self, col_names):
        # Stored under the same name as the argument; read back in transform().
        self.col_names = col_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return ``row[col]`` for every column and every row, column-major."""
        selected = []
        for col in self.col_names:
            selected.extend(row[col] for row in X)
        return selected

In [83]:
def mytokenizer(string):
    """Lemmatize each document and drop punctuation, stop words and '-pron-'.

    Despite its name, ``string`` is iterated: every element is passed to
    ``nlp`` and one cleaned, space-joined string is produced per element.

    Relies on the module-level names ``nlp`` and ``stop_words_lemma``, which
    are not defined in the visible cells of this notebook (presumably a spaCy
    pipeline and a collection of lemmatized stop words -- confirm).

    Returns a list of strings, in the same order as the input.
    """
    cleaned_docs = []
    for text in string:
        kept = []
        for token in nlp(text):
            if token.is_punct:
                continue
            lemma = token.lemma_.lower()
            # '-pron-' is presumably the lemmatizer's pronoun placeholder,
            # lowercased by the line above -- confirm.
            if lemma in stop_words_lemma or lemma == '-pron-':
                continue
            kept.append(lemma)
        cleaned_docs.append(" ".join(kept))
    return cleaned_docs

In [88]:
# Try the selector on the first two reviews; X is the flat list of their 'text' values.
cst=ColumnSelectTransformer(['text'])
X=cst.fit_transform(data[:2])
X

["I don't know what Dr. Goldberg was like before  moving to Arizona, but let me tell you, STAY AWAY from this doctor and this office. I was going to Dr. Johnson before he left and Goldberg took over when Johnson left. He is not a caring doctor. He is only interested in the co-pay and having you come in for medication refills every month. He will not give refills and could less about patients's financial situations. Trying to get your 90 days mail away pharmacy prescriptions through this guy is a joke. And to make matters even worse, his office staff is incompetent. 90% of the time when you call the office, they'll put you through to a voice mail, that NO ONE ever answers or returns your call. Both my adult children and husband have decided to leave this practice after experiencing such frustration. The entire office has an attitude like they are doing you a favor. Give me a break! Stay away from this doc and the practice. You deserve better and they will not be there when you really ne

In [20]:
# Run the tokenizer on the two sample texts selected above; one cleaned string comes back per text.
mytokenizer(X)

['know dr. goldberg like   arizona let tell stay away doctor office dr. johnson leave goldberg johnson leave care doctor interest co pay come medication refill month refill patient financial situation trying 90 day mail away pharmacy prescription guy joke matter wrong office staff incompetent 90 time office voice mail answer return adult child husband decide leave practice experience frustration entire office attitude like favor break stay away doc practice deserve well need feel compel write bad review meet pathetic excuse doctor money',
 'like lot lizard love pine cone']

## bag_of_words_model: Using Hash Vectorizer

In [29]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [120]:
# Bag-of-words baseline: select the review text, vectorize it with a
# HashingVectorizer left at its defaults, and regress the star rating with Ridge.
bag_of_words_est = Pipeline(steps=[
    ('cst', ColumnSelectTransformer(col_names=['text'])),  # raw review text per record
    ('cv', HashingVectorizer()),                           # text -> feature matrix
    ('predictor', Ridge()),                                # regressor on the star labels
])
bag_of_words_est.fit(data, stars)

Pipeline(memory=None,
         steps=[('cst', ColumnSelectTransformer(col_names=['text'])),
                ('cv',
                 HashingVectorizer(alternate_sign=True, analyzer='word',
                                   binary=False, decode_error='strict',
                                   dtype=<class 'numpy.float64'>,
                                   encoding='utf-8', input='content',
                                   lowercase=True, n_features=1048576,
                                   ngram_range=(1, 1), norm='l2',
                                   preprocessor=None, stop_words=None,
                                   strip_accents=None,
                                   token_pattern='(?u)\\b\\w\\w+\\b',
                                   tokenizer=None)),
                ('predictor',
                 Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                       max_iter=None, normalize=False, random_state=None,
                       solver='auto', tol=0.001

## normalized_model: Using TFIDF Vectorizer

In [122]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Same pipeline shape as the bag-of-words model, but the vectorizer is a
# TfidfVectorizer left at its defaults.
normalized_est = Pipeline(steps=[
    ('cst', ColumnSelectTransformer(col_names=['text'])),  # raw review text per record
    ('cv', TfidfVectorizer()),                             # text -> TF-IDF feature matrix
    ('predictor', Ridge()),                                # regressor on the star labels
])
normalized_est.fit(data, stars)

Pipeline(memory=None,
         steps=[('cst', ColumnSelectTransformer(col_names=['text'])),
                ('cv',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('predictor',
                 Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
       

## bigram_model: Including Bigrams

In [124]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF pipeline whose vectorizer uses ngram_range=(1, 2), i.e. single words
# and two-word sequences both become features.
bigram_est = Pipeline(steps=[
    ('cst', ColumnSelectTransformer(col_names=['text'])),  # raw review text per record
    ('cv', TfidfVectorizer(ngram_range=(1, 2))),           # unigram + bigram TF-IDF features
    ('predictor', Ridge()),                                # regressor on the star labels
])
bigram_est.fit(data, stars)

Pipeline(memory=None,
         steps=[('cst', ColumnSelectTransformer(col_names=['text'])),
                ('cv',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('predictor',
                 Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
       

## food_bigrams: Find the bigrams that show up frequently in food context

In [4]:
# Every line of the gzipped file is decoded as one JSON business record.
with gzip.open('yelp_train_academic_dataset_business.json.gz') as f:
    business_data = list(map(json.loads, f))

Each row of this file corresponds to a single business.  The `categories` key gives a list of categories for each business; keep every business whose list contains "Restaurants".

In [130]:
# Display the first business record to see which keys are available.
business_data[0]

{'business_id': 'vcNAWiLM4dR7D2nwwJ7nCA',
 'full_address': '4840 E Indian School Rd\nSte 101\nPhoenix, AZ 85018',
 'hours': {'Tuesday': {'close': '17:00', 'open': '08:00'},
  'Friday': {'close': '17:00', 'open': '08:00'},
  'Monday': {'close': '17:00', 'open': '08:00'},
  'Wednesday': {'close': '17:00', 'open': '08:00'},
  'Thursday': {'close': '17:00', 'open': '08:00'}},
 'open': True,
 'categories': ['Doctors', 'Health & Medical'],
 'city': 'Phoenix',
 'review_count': 7,
 'name': 'Eric Goldberg, MD',
 'neighborhoods': [],
 'longitude': -111.983758,
 'state': 'AZ',
 'stars': 3.5,
 'latitude': 33.499313,
 'attributes': {'By Appointment Only': True},
 'type': 'business'}

In [5]:
# Keep only the businesses whose category list contains "Restaurants".
restaurants = [
    business
    for business in business_data
    if "Restaurants" in business['categories']
]

In [6]:
# Number of businesses that passed the "Restaurants" filter.
len(restaurants)

12876

In [7]:
# Business ids of the restaurants, in the same order as `restaurants`.
restaurant_ids = [restaurant['business_id'] for restaurant in restaurants]

In [8]:
# Sanity check: the number of restaurant ids must equal the expected 12876.
assert len(restaurant_ids) == 12876

The "business_id" here is the same as in the review data.  Use this to extract the review text for all reviews of restaurants.

In [138]:
# Display the first restaurant record.
restaurants[0]

{'business_id': 'JwUE5GmEO-sH1FuwJgKBlQ',
 'full_address': '6162 US Highway 51\nDe Forest, WI 53532',
 'hours': {},
 'open': True,
 'categories': ['Restaurants'],
 'city': 'De Forest',
 'review_count': 26,
 'name': 'Pine Cone Restaurant',
 'neighborhoods': [],
 'longitude': -89.335844,
 'state': 'WI',
 'stars': 4.0,
 'latitude': 43.238893,
 'attributes': {'Take-out': True,
  'Good For': {'dessert': False,
   'latenight': False,
   'lunch': True,
   'dinner': False,
   'breakfast': False,
   'brunch': False},
  'Caters': False,
  'Noise Level': 'average',
  'Takes Reservations': False,
  'Delivery': False,
  'Ambience': {'romantic': False,
   'intimate': False,
   'touristy': False,
   'hipster': False,
   'divey': False,
   'classy': False,
   'trendy': False,
   'upscale': False,
   'casual': False},
  'Parking': {'garage': False,
   'street': False,
   'validated': False,
   'lot': True,
   'valet': False},
  'Has TV': True,
  'Outdoor Seating': False,
  'Attire': 'casual',
  'Alcoho

In [139]:
# Display the first review record; it carries the review's own fields such as 'stars' and 'text'.
data[0]

{'votes': {'funny': 0, 'useful': 0, 'cool': 0},
 'user_id': 'Qrs3EICADUKNFoUq2iHStA',
 'review_id': '_ePLBPrkrf4bhyiKWEn4Qg',
 'stars': 1,
 'date': '2013-04-19',
 'text': "I don't know what Dr. Goldberg was like before  moving to Arizona, but let me tell you, STAY AWAY from this doctor and this office. I was going to Dr. Johnson before he left and Goldberg took over when Johnson left. He is not a caring doctor. He is only interested in the co-pay and having you come in for medication refills every month. He will not give refills and could less about patients's financial situations. Trying to get your 90 days mail away pharmacy prescriptions through this guy is a joke. And to make matters even worse, his office staff is incompetent. 90% of the time when you call the office, they'll put you through to a voice mail, that NO ONE ever answers or returns your call. Both my adult children and husband have decided to leave this practice after experiencing such frustration. The entire office ha

In [140]:
# Total number of review records loaded.
len(data)

253272

In [9]:
import pandas as pd
# Reduce both tables to what the join needs: reviews keep the business key and
# the text, restaurants keep only the business key.
data_df = pd.DataFrame(data).loc[:, ['business_id', 'text']]
r_df = pd.DataFrame(restaurants).loc[:, ['business_id']]
r_df.head()

Unnamed: 0,business_id
0,JwUE5GmEO-sH1FuwJgKBlQ
1,uGykseHzyS5xAMWoN6YUqA
2,LRKJF43s9-3jG9Lgx4zODg
3,RgDg-k9S5YD_BaxMckifkg
4,_wZTYYL7cutanzAnJUTGMA


In [10]:
# Left-join the reviews onto the restaurant ids. dropna() then removes rows
# with missing values, e.g. restaurants that matched no review.
df = pd.merge(r_df, data_df, how='left', on='business_id').dropna()
df.shape

(143361, 2)

In [11]:
# Show the first rows of the joined restaurant-review table.
df.head()

Unnamed: 0,business_id,text
0,JwUE5GmEO-sH1FuwJgKBlQ,"If you like lot lizards, you'll love the Pine ..."
1,JwUE5GmEO-sH1FuwJgKBlQ,Only went here once about a year and a half ag...
2,JwUE5GmEO-sH1FuwJgKBlQ,Ate a Saturday morning breakfast at the Pine C...
3,JwUE5GmEO-sH1FuwJgKBlQ,This is definitely not your usual truck stop. ...
4,JwUE5GmEO-sH1FuwJgKBlQ,I like this location better than the one near ...


In [12]:
# Convert the joined frame to a list of {'business_id': ..., 'text': ...} dicts, one per row.
res=df.to_dict(orient='records')

In [13]:
# Review text of every restaurant review, in the row order of the joined frame.
restaurant_reviews = [record['text'] for record in res]

In [14]:
# Sanity check: the number of restaurant reviews must equal the expected 143361.
assert len(restaurant_reviews) == 143361

In [15]:
# Display the first restaurant review text.
restaurant_reviews[0]

"If you like lot lizards, you'll love the Pine Cone!"

In [2]:
import dill
#dill.dump(restaurant_reviews,open('reviews.pkd', 'wb'))

In [3]:
import dill
# Reload the cached review texts. The dump call that writes 'reviews.pkd' is
# commented out in the preceding cell, so the file must already exist on disk.
# NOTE(security): dill, like pickle, can execute arbitrary code while loading;
# only load a cache file that you wrote yourself.
with open('reviews.pkd', 'rb') as f:  # context manager closes the handle after loading
    restaurant_reviews = dill.load(f)

Calculate:

  $$ \frac{p(w_1 w_2)}{p(w_1) p(w_2)} $$




In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

In [5]:
def _feature_names(vectorizer):
    """Return the fitted vocabulary, in column order, on old and new scikit-learn."""
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # get_feature_names_out() is its replacement, so prefer it when it exists.
    if hasattr(vectorizer, 'get_feature_names_out'):
        return list(vectorizer.get_feature_names_out())
    return list(vectorizer.get_feature_names())


# Count single words: English stop words removed, and only terms that occur
# in at least 20 reviews are kept (min_df=20).
cv = CountVectorizer(min_df=20, stop_words='english')
single = cv.fit_transform(restaurant_reviews)

grams = _feature_names(cv)
counts = single.sum(axis=0).A1  # column sums flattened to a 1-D array

freq_distribution = Counter(dict(zip(grams, counts)))

# Count two-word sequences with the same stop-word and min_df settings.
cv = CountVectorizer(ngram_range=(2, 2), min_df=20, stop_words='english')
double = cv.fit_transform(restaurant_reviews)

bigrams = _feature_names(cv)
counts_bi = double.sum(axis=0).A1  # column sums flattened to a 1-D array

freq_distribution_bi = Counter(dict(zip(bigrams, counts_bi)))

In [6]:
# Constant added to every single-word count before normalizing.
SMOOTHING = 30

# Everything below is rebuilt from the raw count arrays (grams/counts and
# bigrams/counts_bi), which this cell never modifies. Re-running the cell
# therefore gives the same result instead of re-normalizing values that were
# already normalized.

# p(w): smoothed count over the unsmoothed total. The denominator ignores the
# smoothing mass, so these values do not sum to 1; that only rescales every
# ratio below by the same constant and leaves the ranking unchanged.
total_unigrams = counts.sum()
freq_distribution = Counter({
    word: (count + SMOOTHING) / total_unigrams
    for word, count in zip(grams, counts)
})

# p(w1 w2) / (p(w1) * p(w2)) for every counted bigram.
total_bigrams = counts_bi.sum()
freq_distribution_bi = Counter()
for bigram, count in zip(bigrams, counts_bi):
    first, second = bigram.split()
    p_bigram = count / total_bigrams
    freq_distribution_bi[bigram] = p_bigram / (
        freq_distribution[first] * freq_distribution[second]
    )

In [7]:
# The 100 bigrams with the largest ratio, highest first; only the bigram text is kept.
top100 = [bigram for bigram, _ in freq_distribution_bi.most_common(100)]

In [8]:
# Display the selected bigrams.
top100

['knick knacks',
 'rula bula',
 'ropa vieja',
 'itty bitty',
 'dac biet',
 'gulab jamun',
 'patatas bravas',
 'puerto rican',
 'wal mart',
 'bradley ogden',
 'lomo saltado',
 'vice versa',
 'valle luna',
 'kao tod',
 'sous vide',
 'artery clogging',
 'har gow',
 'pina colada',
 'bells whistles',
 'harry potter',
 'aguas frescas',
 'ping pang',
 'casey moore',
 'pin kaow',
 'cochinita pibil',
 'scantily clad',
 'demi glace',
 'lactose intolerant',
 'thit nuong',
 'kilt lifter',
 'moscow mule',
 'woody allen',
 'hustle bustle',
 'dulce leche',
 'cabo wabo',
 'kee mao',
 'mt everest',
 'tres leches',
 'arnold palmer',
 'coca cola',
 'stainless steel',
 'kool aid',
 'rick moonen',
 'osso bucco',
 'van buren',
 'huli huli',
 'fleur lys',
 'insult injury',
 'quench thirst',
 'bok choy',
 'fogo chao',
 'jean philippe',
 'toby keith',
 'tilted kilt',
 'identity crisis',
 'parmigiano reggiano',
 'hush puppies',
 'sierra bonita',
 'nba finals',
 'panna cotta',
 'apache junction',
 'petit fours',