In [25]:
import json
import time
import os
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.externals import joblib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split, cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import precision_recall_curve, accuracy_score, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.multioutput import MultiOutputClassifier

import warnings
warnings.filterwarnings('ignore')
from datetime import datetime
os.chdir("/home/gu38/Common/DIVA/FinalProject/yelp/dataset/")
os.listdir()

['attributeDictFile',
 'attributeDicts.json',
 'attributeLists.json',
 'business.json',
 'business.json.zip',
 'checkin.json',
 'checkin.json.zip',
 'Dataset_Challenge_Dataset_Agreement.pdf',
 'holder.txt',
 'photos.json',
 'postal_codes.csv',
 'review.json',
 'tip.json',
 'user.json',
 'Yelp_Dataset_Challenge_Round_11.pdf']

In [26]:
'''
Function to calculate Total check-ins in a week for a business
'''
checkins = {}
checkin = {}
with open('checkin.json', 'r') as f:
    for line in f:
        checkin = json.loads(line)
        checkins[checkin['business_id']] = checkin['time']
        
def get_checkin_count(id):
    count = 0
    record = checkins[id]
    for day in record:
        for hour in record[day]:
            count += record[day][hour]
    return count

In [None]:
'''
Function to create binary vector for 'hours' attribute
'''
checkins = {}
checkin = {}
with open('checkin.json', 'r') as f:
    for line in f:
        checkin = json.loads(line)
        checkins[checkin['business_id']] = checkin['time']
        
def get_checkin_count(id):
    count = 0
    record = checkins[id]
    for day in record:
        for hour in record[day]:
            count += record[day][hour]
    return count

In [27]:
'''
Function to extract multi-valued Attributes from Business data
'''
def extract_feature_lists(businesses):
    # List of single-valued attributes
    attributeLists = {}

    #List of multi-valued(dict form) attributes
    attributeDicts = {}

    # print("Before \n", json.dumps(businesses[:1], indent=4))

    #Generating a list of attribute values:
    for entry in businesses:
        for item in entry['attributes']:
            param = "attribute_" + item
            entry[param] = entry['attributes'][item]

            if isinstance(entry[param], dict):
                if param in attributeDicts:
                    for var in entry[param]:
                        if var not in attributeDicts[param]:
                            attributeDicts[param][var] = entry[param][var]
                else:
                    attributeDicts[param] = entry[param]

            else:
                if param in attributeLists:
                    if entry[param] not in attributeLists[param]:
                        attributeLists[param].append(entry[param])
                else:
                    attributeLists[param] = [entry[param]]
                    
    return attributeLists, attributeDicts

In [28]:
'''
Function to convert business attributes to binary vectors
'''
def generate_binary_vectors(businesses, attributeListsFile, attributeDictsFile):
    
    with open(attributeListsFile, 'r') as f:
        attributeLists = json.load(f)
    with open(attributeDictsFile, 'r') as f:
        attributeDicts = json.load(f)
    
    #Converting the attribute lists into binary vectors
    for entry in businesses:
        for param in attributeLists:
            if param in entry:
                encodedEntry = []
                
                # Single valued alphanumerical attributes, will be converted to n-bit vectors
                # with a single bit as '1'
                for value in attributeLists[param]:
                    if entry[param] == value:
                        encodedEntry.append(1)
                    else:
                        encodedEntry.append(0)
            else:
                encodedEntry = [0] * len(attributeLists[param])
            entry[param] = encodedEntry

        for param in attributeDicts:
            # Multi valued binary attributes, will be converted to n-bit vectors 
            # with possibly multiple bits as '1' 
            if param in entry:
                encodedEntry = []
                # Checking whether the binary sub-attribute value is True
                for value in attributeDicts[param]:
                    if entry[param][value]:
                        encodedEntry.append(1)
                    else:
                        encodedEntry.append(0)
            else:
                encodedEntry = [0] * len(attributeDicts[param])

            entry[param] = encodedEntry

        # At this stage we dont need the original attributes parameter        
        del entry['attributes']

    return businesses

In [29]:
'''
Function to retrieve business parameters:

Usage: prepare_data < filename, city, Labelling parameter, No. of labels>
'''
def prepare_data(filename, cities, parameter, maxLabels):

    businesses = []

    with open(filename, 'r') as f:
        
        interesting_categories = ['Restaurants', 'Food', 'Nightlife', 'Bars', 'Active Life', 'Sandwiches',
                                  'Fast Food', 'American (Traditional)', 'Pizza', 'Coffee & Tea', 
                                  'Hotels & Travel', 'Italian', 'Burgers', 'Breakfast & Brunch', 
                                  'Mexican', 'Chinese', 'Specialty Food', 'Bakeries', 'Cafes', 
                                  'Chicken Wings', 'Beer', 'Wine & Spirits', 'Steakhouses', 
                                  'Dance Clubs', 'Cocktail Bars', 'Pubs', 'Adult Entertainment']

        interesting_params = ['is_open', 'stars', 'categories', parameter]
        for line in f:
            entry = json.loads(line)

            # Filtering relevant businesses by city=Vegas and selected categories
            if entry['city'] in cities and any( i in entry['categories'] for i in interesting_categories) \
            and entry[parameter] != '':
                relevantEntry = {}
                for item in interesting_params:
                    if item in entry:
                        if item == 'categories':
                            cat = {}
                            for category in interesting_categories:
                                if category in entry[item]:
                                    cat[category] = True
                                else:
                                    cat[category] = False
                            entry['attributes'][item] = cat
                        else:
                            entry['attributes'][item] = entry[item]

                relevantEntry['attributes'] = entry['attributes']

                businesses.append(relevantEntry)
        print("\n\n\n\nData with only useful records and attributes: \n",
#               json.dumps(businesses[0], indent=4))
              businesses[0])
                
        counter = {}
        for item in businesses:
            val = item['attributes'][parameter]
            if val not in counter:
                counter[val] = 1
            else:
                counter[val] += 1
        sortedCounter = dict(sorted(counter.items(), key=lambda item: item[1], reverse = True)[:maxLabels])
        
        refinedBusinesses = []
        for entry in businesses:
            if entry['attributes'][parameter] in sortedCounter:
                refinedBusinesses.append(entry)
                
        print("\n\nSize of the final Input Vector:\n", len(refinedBusinesses))
        print(refinedBusinesses[:1])

#         print("before \n", json.dumps(refinedBusinesses[:1], indent=4))
        #
        attributesAsLists, attributesAsDicts = extract_feature_lists(refinedBusinesses)
        
        with open('attributeLists.json', 'w') as f:
            f.write(json.dumps(attributesAsLists))
            
        with open('attributeDicts.json', 'w') as f:
            f.write(json.dumps(attributesAsDicts))
        
        businessVectors = generate_binary_vectors(refinedBusinesses, 'attributeLists.json', 'attributeDicts.json')
        
#         print("\n\nResultant input data with binary attributes:\n",json.dumps(businessVectors[0], indent=4))
        print("\n\nResultant input data with binary attributes:\n",businessVectors[0])
#         for cat in interesting_categories:
#             print("OR category LIKE " + "'" + cat + "'")
        return businessVectors

In [30]:
'''
Function to invert binarized features/Labels back to original text
'''
def custom_invert_transform(binVec, maxLabels):
    
    with open('attributeLists.json', 'w') as f:
        f.write(json.dumps(attributesAsLists))
    with open('attributeDicts.json', 'w') as f:
        f.write(json.dumps(attributesAsDicts))
        
    origList = []
    if param in attributesAsLists:
        for item in binVec:
            i = 0
            

In [31]:
'''
This function concatenates all the feature vectors for input and flattens the Label data
'''
def concat_bin_vectors(binVec, Multioutput):
    outputConVec = []
    outputLabelVec = []
    for entry in binVec:
        rowVec = []
        for item in entry:
            if item == 'attribute_postal_code':
                if(not Multioutput):
                    #Convert the list-attribute to a string and remove from binary feature vector
                    outputLabelVec.append(''.join(map(str, entry[item])))
                else:
                    #Convert the list-attribute to a string and remove from binary feature vector
                    outputLabelVec.append(entry[item])
            else:
                rowVec.extend(entry[item])
        outputConVec.append(rowVec)
    return outputConVec, outputLabelVec

In [32]:
'''
Custom function to calculate HIT @N for test data
i.e. Probability that actual zone falls into N recommended zones.

Usage: hit_n < trained_Classifier, x_test, y_test, N >
'''
def hit_n(clf, test_data, true_labels, n):
    
    probs = clf.predict_proba(test_data)
    best_n = np.argsort(probs, axis = 1)

    i = 0
    count = 0
    for label in true_labels:
        relevant_prediction_indices = best_n[i][-n:]
        for j in range(n):
            if clf.classes_[relevant_prediction_indices[j]] == label:
                count += 1
                break
        i += 1
    print(count)
    return (count/true_labels.size)

In [33]:
startTime0 = datetime.now()
cities = ['Las Vegas', 'Scottsdale', 'Tempe',
          'Glendale', 'Phoenix', 'Chandler', 'Henderson']

classificationVectors = prepare_data(
    'business.json', cities, 'postal_code', 55)

featureData, labelData = concat_bin_vectors(classificationVectors, False)

featureNames = []
for solidFeature in classificationVectors[100]:
    for i in classificationVectors[0][solidFeature]:
        if 'postal' not in solidFeature:
            featureNames.append(solidFeature)
#

print("Data pre-processing took:", datetime.now() - startTime0)

npx = np.array(featureData)
npy = np.array(labelData)

x_train, x_test, y_train, y_test = train_test_split(npx, npy, test_size=0.2)

print("Training Data size:", len(x_train))
print("Training Data Dimensions:", x_train.shape)

# # SVM classifier with Radial Basis Function

# startTime1 = datetime.now()

# # for i in range()
# # classifier1 = LinearSVC(loss='hinge', tol = .001)
# classifier1 = svm.SVC(kernel='rbf', C=1, decision_function_shape='ovr')

# classifier1.fit(x_train, y_train)
# predicted1 = classifier1.predict(x_test)
# print("\nTime elapsed in SVMrbf:", str(datetime.now() - startTime1))
# print("Accuracy for SVMrbf:", accuracy_score(y_test, predicted1))
# # print("F1 for SVMrbf: ", f1_score(predicted1, y_test, average='micro'))


# # SGD classifier
# startTime2 = datetime.now()
# classifier2 = SGDClassifier(loss='hinge', max_iter=200)

# classifier2.fit(x_train, y_train)
# predicted2 = classifier2.predict(x_test)

# print("\nTime elapsed in SGDC:", str(datetime.now() - startTime2))
# print("Accuracy for SGDC:", accuracy_score(y_test, predicted2))
# # print("F1 for SGDC: ", f1_score(predicted2, y_test, average='micro'))





Data with only useful records and attributes: 
 {'attributes': {'RestaurantsReservations': False, 'categories': {'Hotels & Travel': False, 'Chinese': False, 'Wine & Spirits': False, 'Breakfast & Brunch': False, 'Restaurants': True, 'Coffee & Tea': False, 'Bars': False, 'Fast Food': True, 'Specialty Food': False, 'Steakhouses': False, 'Italian': False, 'Cafes': False, 'Pubs': False, 'Food': False, 'Bakeries': False, 'Active Life': False, 'Dance Clubs': False, 'Cocktail Bars': False, 'Nightlife': False, 'Mexican': False, 'Pizza': False, 'Beer': False, 'Burgers': True, 'Chicken Wings': False, 'Sandwiches': False, 'American (Traditional)': False, 'Adult Entertainment': False}, 'RestaurantsGoodForGroups': True, 'Ambience': {'romantic': False, 'intimate': False, 'touristy': False, 'casual': False, 'upscale': False, 'hipster': False, 'divey': False, 'trendy': False, 'classy': False}, 'is_open': 1, 'RestaurantsAttire': 'casual', 'stars': 1.0, 'WiFi': 'free', 'GoodForMeal': {'dessert': Fals

### SVM with rbf kernel

In [24]:
classifier1 = svm.SVC(kernel='rbf', C=1, decision_function_shape='ovr', probability=True)

classifier1.fit(x_train, y_train)
predicted1 = classifier1.predict(x_test)
# print("\nTime elapsed in SVMrbf:", str(datetime.now() - startTime1))
print("Accuracy for SVMrbf:", accuracy_score(y_test, predicted1))

print("HIT@10 score for SVM with rbf kernel is:", hit_n(classifier1, x_test, y_test, 10))

TypeError: __init__() got an unexpected keyword argument 'n_jobs'

In [None]:
import requests

req = "http://0.0.0.0:8000/predict?category=Bars&category=\
Adult+Entertainment&AgesAllowed=21plus&Smoking=yes&BusinessParking=lot&Alcohol=yes"
resp = requests.get(req)
print(resp.content)

In [27]:
with open('attributeLists.json', 'r') as f:
    attributeLists = json.load(f)
with open('attributeDicts.json', 'r') as f:
    attributeDicts = json.load(f)
print(attributeLists)
print("\n",attributeDicts)

{'attribute_RestaurantsTableService': [False, True], 'attribute_WheelchairAccessible': [True, False], 'attribute_WiFi': ['free', 'paid', 'no'], 'attribute_RestaurantsTakeOut': [True, False], 'attribute_OutdoorSeating': [False, True], 'attribute_DriveThru': [True, False], 'attribute_AcceptsInsurance': [True, False], 'attribute_HappyHour': [True, False], 'attribute_HasTV': [True, False], 'attribute_RestaurantsGoodForGroups': [True, False], 'attribute_RestaurantsReservations': [False, True], 'attribute_RestaurantsDelivery': [False, True], 'attribute_Smoking': ['no', 'outdoor', 'yes'], 'attribute_BYOBCorkage': ['no', 'yes_free', 'yes_corkage'], 'attribute_BusinessAcceptsBitcoin': [False, True], 'attribute_DogsAllowed': [True, False], 'attribute_Alcohol': ['none', 'full_bar', 'beer_and_wine'], 'attribute_postal_code': ['85022', '89119', '85016', '89103', '85032', '85258', '85013', '89121', '89014', '89117', '89104', '89123', '85051', '85027', '85281', '89145', '85260', '89109', '85004', '85

In [None]:
# import csv
# with open('postal_codes.csv', 'r') as f:
#     zips = csv.reader(f, delimiter=',')
# #     print(type(zips))
#     i = 0
#     data_read = [row[0] for row in zips]
#     print(data_read[1:2000])

### Random Forest classifier

In [10]:
# Random Forest classifier
# estimator = [('clf', 
#                MultiOutputClassifier(RandomForestClassifier(n_estimators=200, max_features = 15, random_state=0),
#              n_jobs=-1))]
# pipe = Pipeline(estimator)
# pipe = make_pipeline(RandomForestClassifier())
# parameter_grid1 = {"randomforestclassifier__n_estimators" : [100, 500, 250],
#               "randomforestclassifier__max_features" : [1, 10, 40],
# #              "randomforestclassifier__max_features" : [x*10 for x in range(1, 11)],
#              "randomforestclassifier__min_impurity_decrease": [0.1, 0.3, 0.01]
#              }
# # pipe
clf1 = RandomForestClassifier(n_estimators=200, max_features = 15, random_state=0, n_jobs=-1)
# grid1 = GridSearchCV(pipe, param_grid=parameter_grid, cv=3, n_jobs=-1)
clf1.fit(x_train, y_train)
predictions = clf1.predict(x_test)
# print("MAP@10 score for RandomForest Classifier is:", map_n(clf1, x_test, y_test, 10))
# print("Accuracy for Random Forest:", accuracy_score(y_test, predictions))
# print("\n",grid.best_params_)

In [11]:
print("Accuracy for Random Forest:", accuracy_score(y_test, predictions))
print("HIT@10 score for RandomForest Classifier is:", hit_n(clf1, x_test, y_test, 10))

Accuracy for Random Forest: 0.11307327358987876
1557
HIT@10 score for RandomForest Classifier is: 0.4103848181338956


In [15]:
# a = [x*10 for x in range(1, 11)]
print(y_test.size)
# print(map_n(grid2, x_test, y_test, 10))
# print("\n",clf1.best_params_)
# grid2
print("Best parameters: ", sorted(zip(map(lambda x: round(x, 4), clf1.feature_importances_), featureNames), 
             reverse=True))


3630
Best parameters:  [(0.0315, 'attribute_WheelchairAccessible'), (0.021, 'attribute_stars'), (0.0193, 'attribute_stars'), (0.0186, 'attribute_RestaurantsTakeOut'), (0.018, 'attribute_RestaurantsPriceRange2'), (0.0175, 'attribute_stars'), (0.0169, 'attribute_stars'), (0.015, 'attribute_Caters'), (0.0136, 'attribute_is_open'), (0.0134, 'attribute_is_open'), (0.0133, 'attribute_RestaurantsPriceRange2'), (0.0132, 'attribute_stars'), (0.013, 'attribute_WiFi'), (0.0129, 'attribute_RestaurantsTableService'), (0.0129, 'attribute_BusinessAcceptsCreditCards'), (0.0124, 'attribute_RestaurantsDelivery'), (0.0124, 'attribute_Caters'), (0.0123, 'attribute_categories'), (0.0122, 'attribute_HappyHour'), (0.0117, 'attribute_OutdoorSeating'), (0.0117, 'attribute_Ambience'), (0.0115, 'attribute_categories'), (0.0114, 'attribute_BusinessAcceptsBitcoin'), (0.0113, 'attribute_Ambience'), (0.0112, 'attribute_GoodForMeal'), (0.0111, 'attribute_DriveThru'), (0.011, 'attribute_categories'), (0.0109, 'attribu

### XGBoost

In [12]:
# clf2 = XGBClassifier(max_depth=10, n_jobs = -1)

clf2 = XGBClassifier(n_estimators = 170, min_child_weight = 0.01, n_jobs = -1)
# parameter_grid2 = {"xgbclassifier__n_estimators" : [170, 120],
# #               "xgbclassifier__max_depth" : [3, 5, 10],
# #              "randomforestclassifier__max_features" : [x*10 for x in range(1, 11)],
#              "xgbclassifier__min_child_weight": [0.01, 0.03]
#              }
# # pipe
# # clf1 = RandomForestClassifier(n_estimators=200, max_features = 15, random_state=0)
# grid2 = GridSearchCV(clf2, param_grid=parameter_grid2, cv=3, n_jobs=-1)
clf2.fit(x_train, y_train)
print("HIT@10 score for Xtreme Gradient Boosting Classifier is:", hit_n(clf2, x_test, y_test, 10))
# print("Best parameters: {}".format(pipe.feature_importances_))

# print("Best features: ", sorted(zip(map(lambda x: round(x, 4), clf2.feature_importances_), featureNames), 
#              reverse=True))

# print("\nTime elapsed in Grid Search:", str(datetime.now() - startTime3))
# print(grid.best_params_)

# predicted3 = pipe.predict(x_test)
# pipe = make_pipeline(PreProcessing(),
#                     RandomForestClassifier())

# classifier3.fit(x_train, y_train)
# predicted3 = classifier3.predict(x_test)
# print("\nTime elapsed in Random Forest:", str(datetime.now() - startTime4))
# print("Accuracy for Random Forest:", accuracy_score(y_test, predicted3))

predicted3 = clf2.predict(x_test)
print("Accuracy for XGBoost:", accuracy_score(y_test, predicted3))

1837
HIT@10 score for Xtreme Gradient Boosting Classifier is: 0.4841855561412757
Accuracy for XGBoost: 0.13494992092778071


In [14]:
probabilities = clf2.predict_proba(x_test[:1])
best_n = np.argsort(-probabilities, axis = 1)

with open('attributeLists.json', 'r') as f:
    attributeLists = json.load(f)
    
n_top = 10
topZips = []
for index in best_n[0][:n_top]:
    topZips.append(attributeLists['attribute_postal_code'][index])
    
print(topZips)

['89147', '85301', '85225', '89145', '89052', '85254', '89109', '89139', '89118', '85032']


In [15]:
priority = sorted(zip(map(lambda x: round(x, 4), clf2.feature_importances_), featureNames), 
             reverse=True)
print("Best features:",priority)



Best features: [(0.0311, 'attribute_BusinessParking'), (0.0207, 'attribute_BusinessParking'), (0.0187, 'attribute_Smoking'), (0.0186, 'attribute_BusinessParking'), (0.0183, 'attribute_OutdoorSeating'), (0.0139, 'attribute_categories'), (0.0136, 'attribute_WheelchairAccessible'), (0.0128, 'attribute_BikeParking'), (0.0123, 'attribute_categories'), (0.0123, 'attribute_OutdoorSeating'), (0.0122, 'attribute_NoiseLevel'), (0.0118, 'attribute_BikeParking'), (0.0117, 'attribute_RestaurantsPriceRange2'), (0.0117, 'attribute_BusinessParking'), (0.0115, 'attribute_stars'), (0.0115, 'attribute_categories'), (0.0113, 'attribute_GoodForMeal'), (0.0111, 'attribute_WiFi'), (0.0111, 'attribute_Smoking'), (0.0105, 'attribute_GoodForMeal'), (0.0105, 'attribute_Ambience'), (0.0104, 'attribute_stars'), (0.0104, 'attribute_stars'), (0.0104, 'attribute_stars'), (0.0102, 'attribute_GoodForKids'), (0.0094, 'attribute_Ambience'), (0.0093, 'attribute_RestaurantsPriceRange2'), (0.0092, 'attribute_stars'), (0.009

In [16]:
prob = clf2.predict_proba(x_test[:1])
best_n = np.argsort(-prob, axis = 1)
print(best_n[0][:10])
print(y_test[0])
with open('attributeLists.json', 'r') as f:
    attributeLists = json.load(f)
n_top = 10
topZips = []
for index in best_n[0][:n_top]:
    topZips.append(attributeLists['attribute_postal_code'][index])
print(topZips)
print(attributeLists)

[40 38 52 15 35 50 17 53 34  4]
0000000000000010000000000000000000000000000000000000000
['89147', '85301', '85225', '89145', '89052', '85254', '89109', '89139', '89118', '85032']
{'attribute_RestaurantsReservations': [False, True], 'attribute_stars': [1.0, 3.0, 4.0, 2.5, 1.5, 5.0, 3.5, 2.0, 4.5], 'attribute_Caters': [False, True], 'attribute_postal_code': ['85022', '89119', '85016', '89103', '85032', '85258', '85013', '89121', '89014', '89117', '89104', '89123', '85051', '85027', '85281', '89145', '85260', '89109', '85004', '85018', '85226', '85044', '85224', '89102', '89149', '85286', '85034', '85255', '85008', '89128', '89130', '85251', '89169', '85014', '89118', '89052', '89101', '85282', '85301', '85308', '89147', '85283', '85257', '89135', '89108', '85020', '89113', '89148', '85284', '85003', '85254', '89146', '85225', '89139', '89015'], 'attribute_BikeParking': [True, False], 'attribute_BusinessAcceptsBitcoin': [False, True], 'attribute_WiFi': ['free', 'paid', 'no'], 'attribute_H

In [None]:
print((predicted3)[:10])
print((y_test)[:10])

print(predicted3.size)
print(y_test.size)

print(pipe.classes_)

main = pipe.classes_

probs = pipe.predict_proba(x_test)
best_n = np.argsort(probs)
arr = best_n[10][-5:]

In [None]:
# import pickle
import xgboost as xgb
filename = "Model_v0.3.pkl"

# with open('../Models/'+filename, 'wb') as file:
#     pickle.dump(clf2, file)

clf2.save_model('../Models/'+filename)

In [None]:
with open('./attributeLists.json', 'r') as f:
    attributeLists = json.load(f)

for encodedZip in y_test[1:10]:
    i = 0
    for bit in encodedZip:
        if bit == 1:
            print(attributeLists['attribute_postal_code'][i])
        i += 1

In [None]:
classificationVectors[0]
binnis = []
with open('business.json', 'r') as f:
    for lines in f:
        binnis.append(json.loads(lines))

In [None]:
from sklearn.datasets import make_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle
import numpy as np
X, y1 = make_classification(n_samples=10, n_features=100, n_informative=30, n_classes=3, random_state=1)
X1, y1 = make_classification(n_samples=10, n_features=100, n_informative=30, n_classes=3)
y2 = shuffle(y1, random_state=1)
y3 = shuffle(y1, random_state=2)
Y = np.vstack((y1, y2, y3)).T
Z = np.array([[1, 0, 0],
       [0, 0, 1],
       [1, 0, 0],
       [0, 1, 0],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 0],
       [1, 0, 0],
       [0, 1, 0]])
n_samples, n_features = X.shape # 10,100
n_outputs = Y.shape[1] # 3
n_classes = 3
forest = RandomForestClassifier(n_estimators=100, random_state=1)
multi_target_forest = MultiOutputClassifier(forest, n_jobs=-1)
p = multi_target_forest.fit(X, Z).predict(X1)
print(p)
# print("Accuracy for Random Forest:", accuracy_score(Y, p))

In [24]:
from timeit import timeit
print(timeit("[]"))
print(timeit("list()"))

0.020265811999706784
0.07899421399997664


In [13]:
binnis[0]

NameError: name 'binnis' is not defined

In [None]:
with open('/media/gu38/Common/DIVA/FinalProject/yelp/dummy_data.json', 'r') as f:
    dummyList = json.load(f)
print(dummyList[0])
dummyList2 = [
  "85282", 
  "89147", 
  "85224", 
  "85226", 
  "89139", 
  "85004", 
  "89130", 
  "89146", 
  "85301", 
  "85251"
]


In [None]:
print(attribufeatures[1][10:])

In [None]:
d1={1:2,3:4}
d2={5:6,7:9}
d3={10:8,13:22}
d4 = dict(d1.items() + d2.items())
print(d4)