In [1]:
import gzip
import ujson
import pandas as pd
import sklearn as sk
import numpy as np

In [2]:
# Parse the gzipped dump line by line: each line is decoded as one standalone
# JSON document. NOTE: despite its name, `filedict` is a list of the parsed
# per-line objects, not a dict.
with gzip.open("data/yelp_train_academic_dataset_business.json.gz",'rb') as f:
    filedict = [ujson.loads(line) for line in f]

In [3]:
df = pd.DataFrame(filedict)

## City_model

In [4]:
city_df = df[['city','stars']]

In [5]:
# Mean star rating per city. Keep every city here -- truncating with .head() at
# this point would silently drop all but the first five cities from later
# lookups; the following cell already previews the table with .head().
gcity = city_df.groupby("city").agg({'stars': 'mean'}).reset_index()

In [6]:
gcity.head()

Unnamed: 0,city,stars
0,Ahwatukee,3.6875
1,Anthem,3.781818
2,Apache Junction,3.6375
3,Arcadia,5.0
4,Atlanta,3.5


In [7]:
gcity[gcity.city=="Atlanta"].stars.values[0]

3.5

In [8]:
class City_model():
    """Predict a star rating from the average rating of a business's city.

    Attributes populated by ``fit``:
        mean: mean of the whole ``stars`` column; returned for unknown cities.
        city_grouped: DataFrame with one row per city and the columns
            ``city`` and ``stars`` (the per-city mean).
    """
    def __init__(self):
        self.mean = None
        self.city_grouped = None
        # city name -> mean stars, built once in fit() so that predict() is a
        # single dict lookup instead of a scan over the grouped frame.
        self._city_means = None

    def fit(self, city_df):
        """Compute the global and per-city mean ratings.

        Args:
            city_df: DataFrame with a ``city`` column and a ``stars`` column.

        Returns:
            self, so that calls can be chained.
        """
        self.mean = city_df.stars.mean()
        # The string aggregator avoids depending on a module-level ``np``.
        self.city_grouped = city_df.groupby('city').agg({'stars': 'mean'}).reset_index()
        self._city_means = dict(
            zip(self.city_grouped.city.values, self.city_grouped.stars.values)
        )
        return self

    def predict(self, name):
        """Return the mean rating for city ``name``.

        Falls back to the global mean when ``name`` was not seen by ``fit``.

        Raises:
            AttributeError: if called before ``fit``.
        """
        if self._city_means is None:
            raise AttributeError("City_model.predict() called before fit()")
        return self._city_means.get(name, self.mean)

In [9]:
city_model = City_model()

In [10]:
city_model.fit(city_df)

In [11]:
city_model.predict("Atlanta")

3.5

In [12]:
import dill

In [13]:
with open('city_model.pkl', 'wb') as f:
    dill.dump(city_model,f)

In [14]:
with open('city_model.pkl', 'rb') as f:
    model = dill.load(f)

In [15]:
# NOTE(review): "Atanta" differs from "Atlanta" (presumably a deliberate
# misspelling -- confirm). If no city with this exact spelling was seen by
# fit(), predict() returns the global mean instead of a per-city mean.
model.predict("Atanta")

3.6729137013021247

## lat_long_model

In [16]:
df.head()

Unnamed: 0,attributes,business_id,categories,city,full_address,hours,latitude,longitude,name,neighborhoods,open,review_count,stars,state,type
0,{u'By Appointment Only': True},vcNAWiLM4dR7D2nwwJ7nCA,"[Doctors, Health & Medical]",Phoenix,"4840 E Indian School Rd\nSte 101\nPhoenix, AZ ...","{u'Thursday': {u'close': u'17:00', u'open': u'...",33.499313,-111.983758,"Eric Goldberg, MD",[],True,7,3.5,AZ,business
1,"{u'Take-out': True, u'Price Range': 1, u'Outdo...",JwUE5GmEO-sH1FuwJgKBlQ,[Restaurants],De Forest,"6162 US Highway 51\nDe Forest, WI 53532",{},43.238893,-89.335844,Pine Cone Restaurant,[],True,26,4.0,WI,business
2,"{u'Take-out': True, u'Outdoor Seating': False,...",uGykseHzyS5xAMWoN6YUqA,"[American (Traditional), Restaurants]",De Forest,"505 W North St\nDe Forest, WI 53532","{u'Monday': {u'close': u'22:00', u'open': u'06...",43.252267,-89.353437,Deforest Family Restaurant,[],True,16,4.0,WI,business
3,"{u'Take-out': True, u'Accepts Credit Cards': T...",LRKJF43s9-3jG9Lgx4zODg,"[Food, Ice Cream & Frozen Yogurt, Fast Food, R...",De Forest,"4910 County Rd V\nDe Forest, WI 53532","{u'Monday': {u'close': u'22:00', u'open': u'10...",43.251045,-89.374983,Culver's,[],True,7,4.5,WI,business
4,"{u'Take-out': True, u'Has TV': False, u'Outdoo...",RgDg-k9S5YD_BaxMckifkg,"[Chinese, Restaurants]",De Forest,"631 S Main St\nDe Forest, WI 53532","{u'Monday': {u'close': u'22:00', u'open': u'11...",43.240875,-89.343722,Chang Jiang Chinese Kitchen,[],True,3,4.0,WI,business


In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn import grid_search

In [18]:
rf = RandomForestRegressor()

In [19]:
# Tune the forest's max_depth over 1..10 with 5-fold cross-validation,
# scoring each candidate by mean squared error and running 2 jobs in parallel.
gs = grid_search.GridSearchCV(
    rf,
    {"max_depth":range(1,11)},
    cv = 5,
    n_jobs=2,
    scoring="mean_squared_error"
)

In [20]:
latlongdf = df[['latitude','longitude']]

In [21]:
gs.fit(latlongdf, df.stars)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=2,
       param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
       pre_dispatch='2*n_jobs', refit=True, score_func=None,
       scoring='mean_squared_error', verbose=0)

In [22]:
print gs.best_params_

{'max_depth': 6}


In [23]:
# predict() expects a 2-D (n_samples, n_features) input, so the single
# (latitude, longitude) point is wrapped in an outer list.
gs.predict([[43.252267,-89.353437]])

array([ 3.66910857])

In [24]:
with open('latlong.pkl', 'wb') as f:
    dill.dump(gs,f)

## category_model

In [25]:
df.categories[0]

[u'Doctors', u'Health & Medical']

In [26]:
from sklearn.feature_extraction import DictVectorizer

In [335]:
# One {category: 1} indicator dict per business, in row order. A list
# comprehension keeps `x` a real list (sliceable in later cells) and avoids
# a lambda parameter that shadows the name being assigned.
x = [dict.fromkeys(categories, 1) for categories in df.categories]

In [340]:
x[0:10]

[{u'Doctors': 1, u'Health & Medical': 1},
 {u'Restaurants': 1},
 {u'American (Traditional)': 1, u'Restaurants': 1},
 {u'Fast Food': 1,
  u'Food': 1,
  u'Ice Cream & Frozen Yogurt': 1,
  u'Restaurants': 1},
 {u'Chinese': 1, u'Restaurants': 1},
 {u'Mass Media': 1, u'Television Stations': 1},
 {u'Heating & Air Conditioning/HVAC': 1, u'Home Services': 1},
 {u'Libraries': 1, u'Public Services & Government': 1},
 {u'Pets': 1, u'Veterinarians': 1},
 {u'American (Traditional)': 1,
  u'Bars': 1,
  u'Lounges': 1,
  u'Nightlife': 1,
  u'Restaurants': 1}]

In [341]:
v = DictVectorizer()

In [342]:
transformed_X = v.fit_transform(x)

In [343]:
transformed_X

<37938x706 sparse matrix of type '<type 'numpy.float64'>'
	with 108205 stored elements in Compressed Sparse Row format>

In [347]:
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()

In [350]:
tfidf_x = transformer.fit_transform(transformed_X)

In [None]:
# NOTE(review): this cell was never executed (In [None]). `transformer` is the
# TfidfTransformer instance created above, and it is called here like a
# function -- presumably a leftover experiment. Confirm and remove, or replace
# with an explicit .transform(...) call on correctly shaped input.
transformer([0,1,0])

In [367]:
tfidf_x.toarray()

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.80348886,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [353]:
transformed_Y = df.stars 

In [354]:
from sklearn.linear_model import RidgeCV

In [361]:
clf = RidgeCV(alphas=range(20),cv=5)

In [362]:
clf.fit(tfidf_x,transformed_Y)

RidgeCV(alphas=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
    cv=5, fit_intercept=True, gcv_mode=None, normalize=False, scoring=None,
    store_cv_values=False)

In [363]:
clf.alpha_

3

In [365]:
with open('cat_transform.pkl', 'wb') as f:
    dill.dump(v,f)

In [366]:
with open('cat_model.pkl', 'wb') as f:
    dill.dump(clf,f)

In [369]:
with open('tfidftransform.pkl', 'wb') as f:
    dill.dump(transformer,f)

## attribute_knn_model

In [483]:
import sklearn as sk

In [500]:
class Attribute_transformer(sk.base.BaseEstimator, sk.base.TransformerMixin):
    """Turn nested attribute dicts into a sparse feature matrix.

    Every record is flattened into a single-level ``{feature: number}`` dict
    by ``flatten`` and then vectorized with a ``DictVectorizer`` stored on
    ``self.attribute_knn_v``.

    Imports are kept inside the methods, as in the original code, so the
    methods do not rely on names imported at module level.
    """
    def __init__(self):
        from sklearn.feature_extraction import DictVectorizer
        self.attribute_knn_v = DictVectorizer()

    @staticmethod
    def _mapping_type():
        """Return the ``Mapping`` ABC on both Python 2 and Python 3."""
        try:
            from collections.abc import Mapping
        except ImportError:  # Python 2 has no collections.abc
            from collections import Mapping
        return Mapping

    def flatten(self, d, parent_key='', sep='_'):
        """Flatten one (possibly nested) attribute dict.

        Args:
            d: mapping of attribute name to value; values may themselves be
                mappings.
            parent_key: prefix carried down from the enclosing mapping.
            sep: separator placed between a parent key and a child key.

        Returns:
            A flat dict. Booleans become 0/1, other real numbers are kept
            as-is, and any other value ``v`` under key ``k`` becomes the
            one-hot entry ``{k + "_" + v: 1}``. Empty nested mappings
            contribute nothing.
        """
        import numbers
        mapping_type = self._mapping_type()
        flat = {}
        for k, v in d.items():
            new_key = parent_key + sep + k if parent_key else k
            if isinstance(v, mapping_type):
                # Recurse through the bound method; a bare ``flatten`` is not
                # defined at module level.
                flat.update(self.flatten(v, new_key, sep=sep))
            elif isinstance(v, bool):
                # bool is checked before the numeric branch because it is a
                # subclass of int.
                flat[new_key] = int(v)
            elif isinstance(v, numbers.Real):
                flat[new_key] = v
            else:
                # Categorical value: fold it into the feature name. The "_"
                # joiner is fixed (independent of ``sep``), as before.
                # Formatting instead of ``+`` also tolerates non-string values.
                flat["%s_%s" % (new_key, v)] = 1
        return flat

    def fit(self, X, y=None):
        """Fit the vectorizer on an iterable of attribute dicts.

        Returns:
            The transformed training matrix (not ``self``). This deviates
            from the usual estimator convention but is preserved because
            existing callers use the return value of ``fit`` directly.
        """
        flat_x = [self.flatten(record) for record in X]
        transformed_x = self.attribute_knn_v.fit_transform(flat_x)
        return transformed_x

    def fit_transform(self, X, y=None, **fit_params):
        """Fit on ``X`` and return its transformed matrix.

        Overridden because the inherited implementation calls
        ``fit(X).transform(X)``, which cannot work while ``fit`` returns a
        matrix instead of ``self``.
        """
        return self.fit(X, y)

    def transform(self, X):
        """Vectorize one attribute dict or an iterable of attribute dicts.

        The records are flattened exactly as in ``fit`` before being handed
        to the fitted vectorizer, so nested attributes map onto the feature
        names learned during fitting.
        """
        records = [X] if isinstance(X, self._mapping_type()) else X
        flat_x = [self.flatten(record) for record in records]
        return self.attribute_knn_v.transform(flat_x)

In [501]:
transfomer = Attribute_transformer()
transformed_x = transfomer.fit(df.attributes)

In [502]:
from sklearn.linear_model import RidgeCV

In [503]:
attribute_model = RidgeCV(alphas=range(20),cv=5)

In [504]:
attribute_model.fit(transformed_x,df.stars)

RidgeCV(alphas=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
    cv=5, fit_intercept=True, gcv_mode=None, normalize=False, scoring=None,
    store_cv_values=False)

In [505]:
attribute_model.alpha_

13

In [506]:
with open('attribute_model.pkl', 'wb') as f:
    dill.dump(attribute_model,f)

In [507]:
with open('attribute_transform.pkl', 'wb') as f:
    dill.dump(transfomer,f)

In [508]:
# NOTE(review): the payload nests its features under an u'attributes' key,
# whereas fit() was given the values of df.attributes directly -- confirm that
# the feature names produced here line up with those seen at fit time.
trans_x = transfomer.transform({u'attributes': {u'Accepts Credit Cards': {}, u'Price Range': 2, u'Parking': {u'garage': False, u'street': False, u'validated': False, u'lot': False, u'valet': False}}})

In [509]:
float(attribute_model.predict(trans_x)[0])

3.6729137013021247