# Data Ingest

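Source data: http://thedataincubator.s3.amazonaws.com/coursedata/mldata/yelp_train_academic_dataset_review.json.gz (gzipped; the first cell below reads the decompressed `yelp_train_academic_dataset_review.json` from the working directory)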

In [2]:
# Read the raw review dump into memory: one JSON document per line.
# The context manager closes the file handle as soon as the lines are read.
with open('yelp_train_academic_dataset_review.json') as review_file:
    data = review_file.readlines()

In [6]:
import io
import json
import gzip

import pandas as pd

# The business file is gzipped with one JSON object per line.
with gzip.open('../ml/yelp_train_academic_dataset_business.json.gz', 'rb') as f:
    # Binary mode yields bytes; decode so the lines can be joined as text
    # (works on both Python 2 and Python 3). Blank lines are skipped because
    # they would leave an empty slot between commas and break the JSON array.
    rest = [line.decode('utf-8').rstrip() for line in f if line.strip()]

# each element of 'rest' is an individual JSON object.
# i want to convert it into an *array* of JSON objects
# which, in and of itself, is one large JSON object
# basically... add square brackets to the beginning
# and end, and have all the individual business JSON objects
# separated by a comma
data_json_str = "[" + ','.join(rest) + "]"

# now, load it into pandas
# (wrapped in a file-like object: recent pandas releases deprecate passing
# the JSON text itself to read_json)
df_rest = pd.read_json(io.StringIO(data_json_str))

In [7]:
data[0]

'{"votes": {"funny": 0, "useful": 2, "cool": 1}, "user_id": "Xqd0DzHaiyRqVH3WRG7hzg", "review_id": "15SdjuK7DmYqUAj6rjGowg", "stars": 5, "date": "2007-05-17", "text": "dr. goldberg offers everything i look for in a general practitioner.  he\'s nice and easy to talk to without being patronizing; he\'s always on time in seeing his patients; he\'s affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i\'m sitting here trying to think of any complaints i have about him, but i\'m really drawing a blank.", "type": "review", "business_id": "vcNAWiLM4dR7D2nwwJ7nCA"}\n'

In [8]:
# Review body of every record, in file order.
reviews = [json.loads(raw_line)['text'] for raw_line in data]

In [9]:
# Business id of every record, aligned index-for-index with the raw lines.
ids = [json.loads(raw_line)['business_id'] for raw_line in data]

In [80]:
# Star rating of every record, aligned index-for-index with the raw lines.
stars = [json.loads(raw_line)['stars'] for raw_line in data]

In [30]:
len(reviews), len(stars)

(1012913, 1012913)

## Test JSON

In [60]:
records = [
    {"votes": {"funny": 0, "useful": 0, "cool": 0}, "user_id": "WsGQfLLy3YlP_S9jBE3j1w", "review_id": "kzFlI35hkmYA_vPSsMcNoQ", "stars": 5, "date": "2012-11-03", "text": "Love it!!!!! Love it!!!!!! love it!!!!!!!   Who doesn't love Culver's!", "type": "review", "business_id": "LRKJF43s9-3jG9Lgx4zODg"},
    {"votes": {"funny": 0, "useful": 0, "cool": 0}, "user_id": "Veue6umxTpA3o1eEydowZg", "review_id": "Tfn4EfjyWInS-4ZtGAFNNw", "stars": 3, "date": "2013-12-30", "text": "Everything was great except for the burgers they are greasy and very charred compared to other stores.", "type": "review", "business_id": "LRKJF43s9-3jG9Lgx4zODg"},
    {"votes": {"funny": 0, "useful": 0, "cool": 0}, "user_id": "u5xcw6LCnnMhddoxkRIgUA", "review_id": "ZYaS2P5EmK9DANxGTV48Tw", "stars": 5, "date": "2010-12-04", "text": "I really like both Chinese restaurants in town.  This one has outstanding crab rangoon.  Love the chicken with snow peas and mushrooms and General Tso Chicken.  Food is always ready in 10 minutes which is accurate.  Good place and they give you free pop.", "type": "review", "business_id": "RgDg-k9S5YD_BaxMckifkg"},
    {"votes": {"funny": 0, "useful": 0, "cool": 0}, "user_id": "kj18hvJRPLepZPNL7ySKpg", "review_id": "uOLM0vvnFdp468ofLnszTA", "stars": 3, "date": "2011-06-02", "text": "Above average takeout with friendly staff. The sauce on the pan fried noodle is tasty. Dumplings are quite good.", "type": "review", "business_id": "RgDg-k9S5YD_BaxMckifkg"},
    {"votes": {"funny": 0, "useful": 0, "cool": 0}, "user_id": "L5kqM35IZggaPTpQJqcgwg", "review_id": "b3u1RHmZTNRc0thlFmj2oQ", "stars": 4, "date": "2012-05-28", "text": "We order from Chang Jiang often and have never been disappointed.  The menu is huge, and can accomodate anyone's taste buds.  The service is quick, usually ready in 10 minutes.", "type": "review", "business_id": "RgDg-k9S5YD_BaxMckifkg"}
]

## bag_of_words_model

In [160]:
class ReviewTransformer():
    """
    Extracts the review text from a single review record.

    The record may be an already-parsed dict or a raw JSON string; in both
    cases the value stored under the 'text' key is used.
    """
    def __init__(self):
        # Stateless: nothing to configure.
        pass

    def fit(self, X, y=None):
        """
        No-op; returns self. `y` is accepted for call compatibility and ignored.
        """
        return self

    def transform(self, x):
        """
        Return the text of one review record.

        x: a dict (or dict subclass) with a 'text' key, or a JSON string that
           decodes to an object with a 'text' key.

        Returns a one-element list [text] when x is a dict, and the bare text
        string when x is a JSON string. The two shapes are kept as they were
        because existing callers rely on them.

        Raises KeyError when the record has no 'text' entry; json.loads raises
        on input that is not valid JSON.
        """
        if isinstance(x, dict):
            return [x['text']]
        return json.loads(x)['text']

In [161]:
class VectorizerTransformer():
    """
    Turns review records into dense bag-of-words count arrays.

    Review text is extracted with ReviewTransformer and counted with a
    scikit-learn CountVectorizer that drops NLTK's English stop words.
    """
    def __init__(self, min_df, max_df):
        # Import the corpus module explicitly instead of relying on
        # `import nltk.tokenize` happening to expose `nltk.corpus`.
        from nltk.corpus import stopwords
        from sklearn.feature_extraction.text import CountVectorizer
        self.trans = ReviewTransformer()
        self.bag_of_words_vectorizer = CountVectorizer(
            min_df=min_df, max_df=max_df,
            stop_words=stopwords.words('english'))

    def _as_documents(self, record):
        # self.trans.transform gives a list for dict records and a bare
        # string for JSON-string records; normalise both to a list of texts.
        extracted = self.trans.transform(record)
        if isinstance(extracted, list):
            return extracted
        return [extracted]

    def fit(self, X):
        """
        Fit the vocabulary on the records in X and return their counts.

        NOTE: unlike scikit-learn's convention this returns the transformed
        dense array, not self; callers in this notebook use that return value.
        """
        documents = []
        for record in X:
            documents.extend(self._as_documents(record))
        return self.bag_of_words_vectorizer.fit_transform(documents).toarray()

    def transform(self, x):
        """
        Return the dense count array for a single record x (dict or JSON
        string), using the vocabulary learned in fit.
        """
        documents = self._as_documents(x)
        return self.bag_of_words_vectorizer.transform(documents).toarray()

In [162]:
class BoWEstimator():
    """
    Ridge regression fitted on the features produced by VectorizerTransformer.
    """
    def __init__(self, alpha, min_df, max_df):
        self.trans = VectorizerTransformer(min_df, max_df)
        self.alpha = alpha

    def fit(self, X, y):
        """
        Build features from X, fit a Ridge model against y, and return self.
        """
        from sklearn import linear_model, utils, preprocessing
        features = self.trans.fit(X)
        ridge = linear_model.Ridge(alpha=self.alpha)
        self.model = ridge
        ridge.fit(features, y)
        return self

    def predict(self, x):
        """
        Predict for x; the result is also kept on self.y_pred.
        """
        features = self.trans.transform(x)
        self.y_pred = self.model.predict(features)
        return self.y_pred

In [165]:
est = BoWEstimator(alpha=0.01, min_df=0.1, max_df=0.9)

In [185]:
sample_data = data[:25000]

In [144]:
sample_data[0]

'{"votes": {"funny": 0, "useful": 2, "cool": 1}, "user_id": "Xqd0DzHaiyRqVH3WRG7hzg", "review_id": "15SdjuK7DmYqUAj6rjGowg", "stars": 5, "date": "2007-05-17", "text": "dr. goldberg offers everything i look for in a general practitioner.  he\'s nice and easy to talk to without being patronizing; he\'s always on time in seeing his patients; he\'s affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i\'m sitting here trying to think of any complaints i have about him, but i\'m really drawing a blank.", "type": "review", "business_id": "vcNAWiLM4dR7D2nwwJ7nCA"}\n'

In [118]:
sample_stars[0]

5

In [184]:
sample_stars = stars[:25000]

In [153]:
bow_model = est.fit(sample_data,sample_stars)

In [166]:
bow_model = est.fit(data,stars)

In [167]:
import dill

# Pickle output is binary data, so the target file is opened in binary mode.
with open('bow_model.dill', 'wb') as model_file:
    dill.dump(bow_model, model_file)

In [168]:
bow_model.predict(records[0])[0]

4.8659471568236796

# normalized_model

In [172]:
class TfidfTransformer():
    """
    Applies scikit-learn's TF-IDF weighting on top of the counts produced by
    VectorizerTransformer.
    """
    def __init__(self, min_df, max_df):
        import nltk.tokenize
        from sklearn.feature_extraction.text import CountVectorizer
        # Within this method the name below refers to scikit-learn's class,
        # not to the notebook class being defined here.
        from sklearn.feature_extraction.text import TfidfTransformer
        self.trans = VectorizerTransformer(min_df, max_df)
        self.tfidf_vectorizer = TfidfTransformer()

    def fit(self, X):
        """
        Fit both stages on X and return the weighted features (not self).
        """
        counts = self.trans.fit(X)
        return self.tfidf_vectorizer.fit_transform(counts)

    def transform(self, x):
        """
        Return the weighted features for x using the already-fitted stages.
        """
        counts = self.trans.transform(x)
        return self.tfidf_vectorizer.transform(counts)



In [173]:
class NormalizedEstimator():
    """
    Ridge regression fitted on the features produced by TfidfTransformer.
    """
    def __init__(self, alpha, min_df, max_df):
        self.trans = TfidfTransformer(min_df, max_df)
        self.alpha = alpha

    def fit(self, X, y):
        """
        Build features from X, fit a Ridge model against y, and return self.
        """
        from sklearn import linear_model, utils, preprocessing
        weighted = self.trans.fit(X)
        self.model = linear_model.Ridge(alpha=self.alpha)
        self.model.fit(weighted, y)
        return self

    def predict(self, x):
        """
        Predict for x; the result is also kept on self.y_pred.
        """
        weighted = self.trans.transform(x)
        predictions = self.model.predict(weighted)
        self.y_pred = predictions
        return predictions

In [174]:
est2 = NormalizedEstimator(alpha=0.001, min_df=0.1, max_df=0.9)

In [175]:
normalized_model = est2.fit(sample_data,sample_stars)

In [176]:
normalized_model.predict(records[0])[0]

4.6801734117259688

In [177]:
normalized_model = est2.fit(data,stars)

In [178]:
import dill

# Pickle output is binary data, so the target file is opened in binary mode.
with open('normalized_model.dill', 'wb') as model_file:
    dill.dump(normalized_model, model_file)

# bigram_model

In [179]:
class BigramVectorizerTransformer():
    """
    Turns review records into dense count arrays over unigrams and bigrams.

    Review text is extracted with ReviewTransformer and counted with a
    scikit-learn CountVectorizer (ngram_range=(1, 2)) that drops NLTK's
    English stop words.
    """
    def __init__(self, min_df, max_df):
        # Import the corpus module explicitly instead of relying on
        # `import nltk.tokenize` happening to expose `nltk.corpus`.
        from nltk.corpus import stopwords
        from sklearn.feature_extraction.text import CountVectorizer
        self.trans = ReviewTransformer()
        self.bag_of_words_vectorizer = CountVectorizer(
            min_df=min_df, max_df=max_df, ngram_range=(1, 2),
            stop_words=stopwords.words('english'))

    def _as_documents(self, record):
        # self.trans.transform gives a list for dict records and a bare
        # string for JSON-string records; normalise both to a list of texts.
        extracted = self.trans.transform(record)
        if isinstance(extracted, list):
            return extracted
        return [extracted]

    def fit(self, X):
        """
        Fit the vocabulary on the records in X and return their counts.

        NOTE: returns the transformed dense array, not self; callers in this
        notebook use that return value.
        """
        documents = []
        for record in X:
            documents.extend(self._as_documents(record))
        return self.bag_of_words_vectorizer.fit_transform(documents).toarray()

    def transform(self, x):
        """
        Return the dense count array for a single record x (dict or JSON
        string), using the vocabulary learned in fit.
        """
        documents = self._as_documents(x)
        return self.bag_of_words_vectorizer.transform(documents).toarray()


In [180]:
class TruncateTransformer():
    """
    Reduces the features from BigramVectorizerTransformer to `n_components`
    dimensions with scikit-learn's TruncatedSVD.
    """
    def __init__(self, min_df, max_df, n_components):
        import nltk.tokenize
        from sklearn.feature_extraction.text import CountVectorizer
        from sklearn.decomposition import TruncatedSVD
        self.trans = BigramVectorizerTransformer(min_df, max_df)
        self.svd = TruncatedSVD(n_components=n_components)

    def fit(self, X):
        """
        Fit both stages on X and return the reduced features (not self).
        """
        counts = self.trans.fit(X)
        return self.svd.fit_transform(counts)

    def transform(self, x):
        """
        Return the reduced features for x using the already-fitted stages.
        """
        counts = self.trans.transform(x)
        return self.svd.transform(counts)


In [181]:
class BigramEstimator():
    """
    Ridge regression fitted on the features produced by TruncateTransformer.
    """
    def __init__(self, alpha, min_df, max_df,n_components):
        self.trans = TruncateTransformer(min_df, max_df, n_components)
        self.alpha = alpha

    def fit(self, X, y):
        """
        Build features from X, fit a Ridge model against y, and return self.
        """
        from sklearn import linear_model, utils, preprocessing
        reduced = self.trans.fit(X)
        regressor = linear_model.Ridge(alpha=self.alpha)
        self.model = regressor
        regressor.fit(reduced, y)
        return self

    def predict(self, x):
        """
        Predict for x; the result is also kept on self.y_pred.
        """
        reduced = self.trans.transform(x)
        self.y_pred = self.model.predict(reduced)
        return self.y_pred

In [186]:
est3 = BigramEstimator(alpha=0.001, min_df=0.1, max_df=0.9, n_components = 50)

In [187]:
bigram_model = est3.fit(sample_data,sample_stars)

In [188]:
import dill

# Pickle output is binary data, so the target file is opened in binary mode.
with open('bigram_model.dill', 'wb') as model_file:
    dill.dump(bigram_model, model_file)

In [189]:
bigram_model.predict(records[0])[0]

4.8789948774999603

# food_bigrams

In [17]:
df_rest = df_rest[['categories', 'business_id']]

In [18]:
list_categories = df_rest['categories']

In [19]:
# One boolean per business: True when 'Restaurants' is among its categories.
list_restaurants = ['Restaurants' in items for items in list_categories]

In [20]:
df_rest['rest'] = list_restaurants

In [21]:
df_rest = df_rest.groupby('business_id').head(1)

In [22]:
df_rest = df_rest[(df_rest['rest']==True)]

In [23]:
df_id_review = pd.concat([pd.Series(ids), pd.Series(reviews)], axis=1)

In [24]:
df_id_review.columns = ['business_id', 'text']

In [26]:
import pandas as pd
# Outer-join the restaurant rows with the (business_id, text) review rows on
# business_id, then drop every row that has a missing value in any column.
df_filtered =pd.merge(df_rest, df_id_review, on='business_id', how='outer').dropna()

In [27]:
df_filtered

Unnamed: 0,categories,business_id,rest,text
0,[Restaurants],JwUE5GmEO-sH1FuwJgKBlQ,True,Good truck stop dining at the right price. We ...
1,[Restaurants],JwUE5GmEO-sH1FuwJgKBlQ,True,"If you like lot lizards, you'll love the Pine ..."
2,[Restaurants],JwUE5GmEO-sH1FuwJgKBlQ,True,Enjoyable experience for the whole family. The...
3,[Restaurants],JwUE5GmEO-sH1FuwJgKBlQ,True,One of my favorite truck stop diners with soli...
4,[Restaurants],JwUE5GmEO-sH1FuwJgKBlQ,True,Only went here once about a year and a half ag...
5,[Restaurants],JwUE5GmEO-sH1FuwJgKBlQ,True,Great truck stop restaurant. I've had breakfa...
6,[Restaurants],JwUE5GmEO-sH1FuwJgKBlQ,True,"Yeah, thats right a five freakin star rating. ..."
7,[Restaurants],JwUE5GmEO-sH1FuwJgKBlQ,True,Ate a Saturday morning breakfast at the Pine C...
8,[Restaurants],JwUE5GmEO-sH1FuwJgKBlQ,True,Attention fans of David Lynch. Do stop by thi...
9,[Restaurants],JwUE5GmEO-sH1FuwJgKBlQ,True,With a recent addition of a truck driver for a...


### sample data

In [28]:
rest_reviews = df_filtered['text']

In [29]:
sample_rest_reviews = rest_reviews[:500]

In [30]:
sample_rest_reviews[0]

u"Good truck stop dining at the right price. We love coming here on the weekends when we don't feel like cooking."

In [31]:
class UniGramTransformer():
    """
    Counts how often each unigram occurs across a collection of documents.

    Wraps a scikit-learn CountVectorizer configured with the given min_df /
    max_df and the built-in 'english' stop-word list.
    """
    def __init__(self, min_df, max_df):
        from sklearn.feature_extraction.text import CountVectorizer
        self.bag_of_words_vectorizer = CountVectorizer(min_df=min_df, max_df=max_df,
                                                       stop_words='english')
        # Populated by transform(): {unigram: total count}.
        self.unigram_dict = {}

    def fit(self, X):
        """
        No-op; returns self. The vectorizer is (re)fitted inside transform().
        """
        return self

    def transform(self, X):
        """
        Fit the vectorizer on X and return {unigram: total count over X}.

        The result is also stored on self.unigram_dict. Every call refits the
        vectorizer, so the vocabulary always reflects the X passed in.
        """
        import numpy as np

        vectorizer = self.bag_of_words_vectorizer
        term_matrix = vectorizer.fit_transform(X)
        # Newer scikit-learn exposes get_feature_names_out(); older releases
        # only have get_feature_names(). Support both.
        if hasattr(vectorizer, 'get_feature_names_out'):
            features = np.asarray(vectorizer.get_feature_names_out()).tolist()
        else:
            features = vectorizer.get_feature_names()
        # Column sums give the per-term totals; flatten whatever 1xN shape the
        # matrix type returns into a plain list.
        counts = np.asarray(term_matrix.sum(axis=0)).ravel().tolist()
        self.unigram_dict = dict(zip(features, counts))
        return self.unigram_dict

In [32]:
trans = UniGramTransformer(0.0001, 0.90)

In [515]:
unigram_dict = trans.transform(rest_reviews)

In [516]:
len(unigram_dict)

15747

In [33]:
class BiGramCountTransformer():
    """
    Counts how often each bigram occurs across a collection of documents.

    Wraps a scikit-learn CountVectorizer configured with ngram_range=(2, 2),
    the given min_df / max_df and the built-in 'english' stop-word list.
    """
    def __init__(self, min_df, max_df):
        from sklearn.feature_extraction.text import CountVectorizer
        self.bag_of_words_vectorizer = CountVectorizer(min_df=min_df, max_df=max_df,
                                                       ngram_range=(2, 2),
                                                       stop_words='english')
        # Populated by transform(): {bigram: total count}.
        self.bigram_dict = {}

    def fit(self, X):
        """
        No-op; returns self. The vectorizer is (re)fitted inside transform().
        """
        return self

    def transform(self, X):
        """
        Fit the vectorizer on X and return {bigram: total count over X}.

        The result is also stored on self.bigram_dict. Every call refits the
        vectorizer, so the vocabulary always reflects the X passed in.
        """
        import numpy as np

        vectorizer = self.bag_of_words_vectorizer
        term_matrix = vectorizer.fit_transform(X)
        # Newer scikit-learn exposes get_feature_names_out(); older releases
        # only have get_feature_names(). Support both.
        if hasattr(vectorizer, 'get_feature_names_out'):
            features = np.asarray(vectorizer.get_feature_names_out()).tolist()
        else:
            features = vectorizer.get_feature_names()
        # Column sums give the per-term totals; flatten whatever 1xN shape the
        # matrix type returns into a plain list.
        counts = np.asarray(term_matrix.sum(axis=0)).ravel().tolist()
        self.bigram_dict = dict(zip(features, counts))
        return self.bigram_dict

In [518]:
trans = BiGramCountTransformer(0.0001, 0.90)

In [519]:
bigram_dict = trans.transform(rest_reviews)

In [520]:
len(bigram_dict)

61677

In [34]:
class SpecialBigrams():
    """
    Ranks bigrams by count(bigram) / ((count(w1) + s) * (count(w2) + s)),
    where w1 and w2 are the bigram's words and s is the smoothing constant.
    """
    def __init__(self, min_uni, max_uni, min_bi, max_bi, smooth):
        self.uni_trans = UniGramTransformer(min_uni, max_uni)
        self.bi_trans = BiGramCountTransformer(min_bi, max_bi)
        self.s = smooth
        # Populated by transform(): {bigram: score}.
        self.bigram_dict = {}

    def fit(self, X):
        """
        No-op; returns self. All counting happens inside transform().
        """
        return self

    def transform(self, X, top_n=100):
        """
        Score every bigram found in X and return the best ones.

        X: the documents handed to both count transformers.
        top_n: how many of the highest-scoring bigrams to report (default 100).

        Returns the str() of the list of the top_n bigrams, best first. The
        full {bigram: score} mapping is kept on self.bigram_dict.

        Bigrams whose words are missing from the unigram counts (the two
        vectorizers use different document-frequency cut-offs) are skipped, so
        an un-normalised raw count can never end up in the ranking.
        """
        unigram_counts = self.uni_trans.transform(X)
        bigram_counts = self.bi_trans.transform(X)

        scores = {}
        for bigram, count in bigram_counts.items():
            words = bigram.split()
            if len(words) < 2:
                continue
            first, second = words[0], words[1]
            if first not in unigram_counts or second not in unigram_counts:
                continue
            denominator = (unigram_counts[first] + self.s) * (unigram_counts[second] + self.s)
            if denominator == 0:
                continue
            scores[bigram] = float(count) / denominator

        self.bigram_dict = scores
        ranked = sorted(scores, key=scores.get, reverse=True)
        return str(ranked[0:top_n])
    

In [47]:
trans = SpecialBigrams(0.000001, 0.90, 0.00006, 0.90, 90)

In [48]:
trans.transform(rest_reviews)

"[u'hodge podge', u'himal chuli', u'hoity toity', u'roka akor', u'knick knacks', u'reina pepiada', u'cien agaves', u'baskin robbins', u'itty bitty', u'khai hoan', u'riff raff', u'grana padano', u'tutti santi', u'ropa vieja', u'gulab jamun', u'ore ida', u'dac biet', u'rula bula', u'hu tieu', u'innis gunn', u'tammie coe', u'alain ducasse', u'feng shui', u'leaps bounds', u'hors oeuvres', u'marche bacchus', u'uuu uuu', u'nooks crannies', u'celine dion', u'luc lac', u'krispy kreme', u'perrier jouet', u'deja vu', u'molecular gastronomy', u'puerto rican', u'vice versa', u'patatas bravas', u'lloyd wright', u'holyrood 9a', u'pura vida', u'lomo saltado', u'valle luna', u'nuoc mam', u'wal mart', u'bradley ogden', u'barnes noble', u'haricot vert', u'kao tod', u'ak yelpcdn', u'porta alba', u'khao soi', u'malai kofta', u'aguas frescas', u'loup mer', u'yadda yadda', u'mccormick schmick', u'yada yada', u'shiner bock', u'artery clogging', u'ritz carlton', u'womp womp', u'chino bandido', u'sous vide', u

In [527]:
len(sample_rest_reviews)

500

In [441]:
# Score each bigram by its value divided by the product of its two words'
# unigram values.
# NOTE(review): `bigrams` and `unigrams` are not defined in any visible cell;
# presumably mappings keyed by bigram / word left over in the kernel - confirm.
likelihoods = []
for bigram in bigrams:
    w1, w2 = bigram.split(' ')[:2]
    # float() guards against truncating integer division on Python 2 when the
    # stored values are plain ints.
    ratio = float(bigrams[bigram]) / (unigrams[w1] * unigrams[w2])
    likelihoods.append((bigram, ratio))
    

In [442]:
sorted_likelihoods = sorted(likelihoods, key=lambda x: x[1], reverse= True)

In [443]:
sorted_likelihoods[:100]

[(u'hong kong', 28739.609326123718),
 (u'valle luna', 26940.369175563868),
 (u'buon gusto', 25715.059344318746),
 (u'tutti santi', 25029.79422204203),
 (u'foie gras', 24773.733199746483),
 (u'kung pao', 24470.779846809499),
 (u'wi fi', 24068.538673996998),
 (u'osso bucco', 23781.48937519782),
 (u'pei wei', 23765.823335620542),
 (u'dac biet', 23107.021989056153),
 (u'kilt lifter', 22977.886928209355),
 (u'prix fixe', 22814.968951392992),
 (u'peter piper', 20957.452201273616),
 (u'bok choy', 20793.720575078652),
 (u'huevos rancheros', 20741.413214710603),
 (u'tammie coe', 20563.172015835389),
 (u'monte cristo', 20460.76327248184),
 (u'cracker barrel', 20159.972564544492),
 (u'surf turf', 20030.276395189474),
 (u'tex mex', 19994.694562156936),
 (u'si senor', 19929.384643054607),
 (u'coca cola', 19705.916620377844),
 (u'tikka masala', 19699.033446114936),
 (u'prickly pear', 19536.975309906906),
 (u'bloody mary', 19325.493074029786),
 (u'butternut squash', 19060.193841439439),
 (u'loco moco

In [366]:
food_bigrams = [[str(pair[0])] for pair in sorted_likelihoods[:100]]

In [444]:
food_dic = dict(sorted_likelihoods[:100])

In [445]:
food_dic.keys()

[u'neck woods',
 u'au jus',
 u'pei wei',
 u'ooey gooey',
 u'wi fi',
 u'capital grille',
 u'miracle mile',
 u'beaten path',
 u'dac biet',
 u'tikka masala',
 u'texas roadhouse',
 u'aunt chilada',
 u'bang buck',
 u'royal palms',
 u'pet peeve',
 u'moo shu',
 u'buca di',
 u'royal taj',
 u'dimly lit',
 u'de gallo',
 u'cherry blossom',
 u'peter piper',
 u'mahi mahi',
 u'ho hum',
 u'si senor',
 u'casa grande',
 u'saving grace',
 u'creme brulee',
 u'papa johns',
 u'zen 32',
 u'au gratin',
 u'thee pitts',
 u'peach cobbler',
 u'bo hue',
 u'co worker',
 u'lo mein',
 u'valle luna',
 u'kilt lifter',
 u'pane bianco',
 u'cracker barrel',
 u'surf turf',
 u'pine nuts',
 u'cave creek',
 u'macadamia nut',
 u'tokyo lobby',
 u'buon gusto',
 u'cheba hut',
 u'track betting',
 u'chow mein',
 u'cotton candy',
 u'clam chowder',
 u'com biz_photos',
 u'foie gras',
 u'bowling alley',
 u'hash browns',
 u'alice cooper',
 u'san francisco',
 u'rustler rooste',
 u'prickly pear',
 u'award winning',
 u'language barrier',


In [367]:
np.transpose(food_bigrams)

array([['carne asada', 'strip mall', 'hole wall', 'ice cream',
        'highly recommend', 'happy hour', 'dining room', 'years ago',
        'chips salsa', 'parking lot', 'friday night', 'several times',
        '10 minutes', 'last night', 'nothing special', 'fried rice',
        'customer service', 'many times', 'even though', 'make sure',
        'wait staff', 'next time', 'coming back', 'staff friendly',
        'looks like', 'go wrong', 'every time', 'felt like', 'first time',
        'come back', 'much better', 'feel like', 'tasted like',
        'would recommend', 'long time', 'pretty much', 'mexican food',
        'go back', 'last time', 'going back', 'looked like', 'best ever',
        'fast food', 'one favorite', 'chinese food', 'pretty good',
        'one thing', 'one best', 'back try', 'quality food', 'love place',
        'really good', 'friendly service', 'service friendly',
        'really nice', 'service always', 'great service', 'always get',
        'would go', 'servic

In [338]:
import numpy as np