In [3]:
import random
import json
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error


In [4]:
def feature(datum):
    '''
    :param: datum, yelp review item
    :type: dict
    
    '''
    feat = [1]
    attributes = datum['attributes']
    
    if 'BusinessAcceptsCreditCards' in attributes:
        if attributes['BusinessAcceptsCreditCards'] == True:
            feat.append(1)
        else:
            feat.append(0)
    else:
        feat.append(random.randint(0, 1))
        
        
    if 'WheelchairAccessible' in attributes:
        if attributes['WheelchairAccessible'] == True:
            feat.append(1)
        else: 
            feat.append(0)
    else:
        feat.append(random.randint(0, 1))
        
        
    if 'HappyHour' in attributes:
        if attributes['HappyHour'] == True:
            feat.append(1)
        else:
            feat.append(0)
    else:
        feat.append(random.randint(0, 1))
        
        
    if 'HasTV' in attributes:
        if attributes['HasTV'] == True:
            feat.append(1)
        else:
            feat.append(0)
    else:
        feat.append(random.randint(0, 1))
        
        
    if 'GoodForGroups' in attributes:
        if attributes['GoodForGroups'] == True:
            feat.append(1)
        else:
            feat.append(0)
    else:
        feat.append(random.randint(0, 1))
        
        
    if 'RestaurantsPriceRange2' in attributes:
        feat.append(attributes['RestaurantsPriceRange2'])
    else:
        feat.append(random.randint(1,4))
        
        
    if 'BusinessParking' in attributes:
        if (attributes['BusinessParking']['garage'] or attributes['BusinessParking']['lot'] or attributes['BusinessParking']['street']):
            feat.append(1)
        else:
            feat.append(0)
    else:
        feat.append(random.randint(0,1))
        
    return feat

In [5]:
def MSE(theta, X, y):
    '''
    :param: X,test feature set
    :type: list
    :param: y,test label set
    :type: list
    :param: theta
    :type: list
    '''
    theta = np.matrix(theta).T
    X = np.matrix(X)
    y = np.matrix(y).T
    diff = X*theta - y
    diffSq = diff.T*diff
    diffSqReg = diffSq / len(X)
    return diffSqReg.flatten().tolist()[0][0]

In [6]:
def decision_tree_regression(X, y, X_test):
    '''
    :param: X,training feature set
    :type: list
    :param: y ,training label set
    :type: list
    :param: X_test,test feature set
    :type: list
    '''
    
    regressor = DecisionTreeRegressor(random_state=0)
    regressor.fit(X, y)
    return regressor.predict(X_test)

In [7]:
data1 = []
for line in open('business.json','r'): 
     data1.append(json.loads(line))
   

In [8]:
x =[]
for d in data1:
    if (d['stars'] ):
        cat = d['categories']
        for c in cat:
            if (c == 'Restaurants'):
                x.append(d)

In [9]:
random.shuffle(x)
#split into training and test sets
train_data = x[0:30000]
test_data = x[30000:]
X_train = [feature(d) for d in train_data]
y_train = [d['stars'] for d in train_data]
X_test = [feature(d) for d in test_data]
y_test = [d['stars'] for d in test_data]

In [10]:
price = []
for i in range(len(x)):
    if 'RestaurantsPriceRange2' in x[i]['attributes']:
        price.append(x[i]['attributes']['RestaurantsPriceRange2'])

In [11]:
len(test_data)

24618

### Baseline Model

In [12]:
allRatings = []
for l in train_data:
    allRatings.append(l['stars'])    
globalAverage = sum(allRatings) / len(allRatings)

rate = []
for l in test_data:
     rate.append(l['stars'])

In [13]:
squareError = 0
for r in rate:
    squareError += (r - globalAverage)**2
mse_m1 = squareError / (len(rate))
print ('mse of model_1:',mse_m1)

mse of model_1: 0.6321944569372613


In [14]:
globalAverage

3.4554

### Regression

In [16]:
theta,residuals,rank,s = np.linalg.lstsq(X_train, y_train)
mse_m2=MSE(theta,X_test,y_test)
print ('mse of model_2',mse_m2)

mse of model_2 0.6169964875708768


In [17]:
theta

array([  3.41686729e+00,  -1.71727505e-01,   5.05941545e-02,
         3.01379513e-03,  -4.41573051e-03,   1.32554487e-02,
         2.54233780e-02,   1.84373112e-01])

### Decision Tree Regression

In [19]:
#Model_3: decesion_tree regression
y_pred = decision_tree_regression(X_train,y_train,X_test)
mse_m3=mean_squared_error(y_test, y_pred)
print ('mse of model_3:',mse_m3)

mse of model_3: 0.615594640675
