In [2]:
import ujson
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn import cross_validation, grid_search
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

In [3]:
# read and parse the file
with open('yelp_academic_dataset_business.json') as f:
    data_all = pd.DataFrame(ujson.loads(line) for line in f)

data = data_all[ [ 'Restaurants' in cat for cat in data_all['categories'] ] ]
star = data['stars']
data.head(1)

Unnamed: 0,attributes,business_id,categories,city,full_address,hours,latitude,longitude,name,neighborhoods,open,review_count,stars,state,type
0,"{u'Take-out': True, u'Drive-Thru': False, u'Ou...",5UmKMjUEUNdYWqANhGckJw,"[Fast Food, Restaurants]",Dravosburg,"4734 Lebanon Church Rd\nDravosburg, PA 15034","{u'Tuesday': {u'close': u'21:00', u'open': u'1...",40.354327,-79.900706,Mr Hoagie,[],True,4,4.5,PA,business


In [4]:
### for testing estimator ###
class test_estimator():
    def __init__(self, estimator, X, y):
        self.estimator = estimator
        self.X = X
        self.y = y
        
    def score(self):
        """
        return root mean squared error
        """
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(self.X, self.y, 
                                                                             test_size=0.2, random_state=22)
        self.estimator.fit(X_train, y_train)
        y_predict = self.estimator.predict(X_test)
        return np.sqrt(mean_squared_error(y_test, y_predict))

In [5]:
# predict based on average stars of the selected 'city'
class city_estimator(BaseEstimator, RegressorMixin):
    def fit(self, X, y=None):
        self.existing_cities = X.groupby('city').mean()['stars']
        self.other_cities = X.mean()['stars']
        return self
    
    def predict(self, X, y=None):
        try:
            getattr(self, "existing_cities")
        except AttributeError:
            raise RuntimeError("Must train the regressor before predicting data!")

        return [self.existing_cities[c] if c in self.existing_cities else self.other_cities for c in X['city']]

city_model = city_estimator()
test_estimator(city_model, data, star).score()

0.75060058989508882

In [6]:
### Customized transformers ###
class ColumnSelectTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, keys):
        self.keys = keys

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.keys]

# predict based on lat and long
lat_long_pipeline = Pipeline([
    ('trans', ColumnSelectTransformer(['latitude', 'longitude'])),
    ('est', KNeighborsRegressor())
])

cv = cross_validation.ShuffleSplit(int(len(data)*0.8), n_iter=20, test_size=0.2, random_state=22)
knn_param_grid = { "est__n_neighbors": range(10, 150, 10) } # best param ~ 70
knn_regression_cv = grid_search.GridSearchCV(lat_long_pipeline, 
                                             param_grid=knn_param_grid, cv=cv,
                                             scoring="mean_squared_error")

test_estimator(knn_regression_cv, data, star).score()

0.7428556021479974

In [7]:
### predict based on category ###
class CategoryTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        D = []
        for record in X.values:
            D.append({k: 1 if "Restaurant" not in k else 0 for k in record})
        return D

category_pipeline = Pipeline([
    ('trans', ColumnSelectTransformer('categories')),
    ('cat_trans', CategoryTransformer()),
    ('vect', DictVectorizer()),
    ('tfidf_vect', TfidfTransformer()),
    ('est', Ridge())
])

cv = cross_validation.ShuffleSplit(int(len(data)*0.8), n_iter=20, test_size=0.2, random_state=22)
param_grid = { "est__alpha": np.logspace(-1, 1.5, 10) }
ridge_cv = grid_search.GridSearchCV(category_pipeline,
                                    param_grid=param_grid, cv=cv,
                                    scoring="mean_squared_error")
test_estimator(ridge_cv, data, star).score()



0.70060986838061245

In [None]:
ridge_cv.best_estimator_.get_params()

Restaurants have (potentially nested) attributes:
{'Attire': 'casual',
 'Accepts Credit Cards': True,
 'Ambience': {'casual': False, 'classy': False}}
 
We will flatten the above into something like this:
{'Attire_casual' : 1,
 'Accepts Credit Cards': 1,
 'Ambience_casual': 0,
 'Ambience_classy': 0 }

In [13]:
### predict based on attributes ###
class AttributeTransformer(BaseEstimator, TransformerMixin):
    def _flatten(self, d, parent_key='', sep='_'):
        """ Flatten dictonary
        """
        items = []
        for k, v in d.items():
            new_key = parent_key + (sep + k if parent_key else k)
            if isinstance(v, dict) and len(v)>0:
                items.extend(self._flatten(v, new_key, sep=sep).items())
            else:
                new_v = 1 if v == True else 0
                items.append((new_key, new_v))
        return dict(items)

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        D = []
        for record in X.values:
            D.append(self._flatten(record))
        return D

attribute_pipeline = Pipeline([
    ('trans', ColumnSelectTransformer('attributes')),
    ('cat_trans', AttributeTransformer()),
    ('vect', DictVectorizer()),
    ('tfidf_vect', TfidfTransformer()),
    ('est', Ridge())
])

cv = cross_validation.ShuffleSplit(int(len(data)*0.8), n_iter=20, test_size=0.2, random_state=22)
ridge_param_grid = { "est__alpha": np.logspace(-6., -.3, 5) }
attribute_ridge_cv = grid_search.GridSearchCV(attribute_pipeline,
                                              param_grid=ridge_param_grid, cv=cv,
                                              scoring="mean_squared_error")
test_estimator(attribute_ridge_cv, data, star).score()

0.70257783693957587

In [37]:
### full_model ###
class ModelTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, model):
        self.model = model

    def fit(self, *args, **kwargs):
        self.model.fit(*args, **kwargs)
        return self

    def transform(self, X, **transform_params):
        return pd.DataFrame(self.model.predict(X))
    
full_pipeline = Pipeline([
    ('feature_union', FeatureUnion([
        ('lat_long_feature', Pipeline([
            ('col_select', ColumnSelectTransformer(['latitude', 'longitude'])),
            ('knn', ModelTransformer(KNeighborsRegressor(n_neighbors=70)))
            ])),
        ('category_feature', Pipeline([
            ('col_select', ColumnSelectTransformer('categories')),
            ('cat_trans', CategoryTransformer()),
            ('vect', DictVectorizer()),
            ('tfidf_vect', TfidfTransformer()),

            ])),
        ('attribute_feature', Pipeline([
            ('col_select', ColumnSelectTransformer('attributes')),
            ('attr_transformer', AttributeTransformer()),
            ('vect', DictVectorizer()),
            ('tfidf_vect', TfidfTransformer()),
            ]))
        ])),
    ('est', Ridge())
])

cv = cross_validation.ShuffleSplit(int(len(data)*0.8), n_iter=20, test_size=0.2, random_state=22)
ridge_param_grid = { "est__alpha": np.logspace(-6., -.3, 5) }
full_ridge_cv = grid_search.GridSearchCV(full_pipeline,
                                         param_grid=ridge_param_grid, cv=cv,
                                         scoring="mean_squared_error")
test_estimator(full_ridge_cv, data, star).score()

0.67899750793599023