In [2]:
# %load load.py
import os
import json
from business import Business
from sklearn.cross_validation import train_test_split
import modeldata as md
import mlmodels as mlm
import pickle
import random as rd

'''
Takes as input a file with a business info on each line and splits it into separate files for each state
'''
def split_business_by_state(infile, outfile=None):
    if outfile is None:
        outfile = infile
    outfiles = {}
    with open(infile) as f:
        for line in iter(f):
            jline = json.loads(line)
            state = jline['state']
            if state not in outfiles:
                print ('adding new state {}'.format(state))
                o = outfile + '_' + state
                outfiles[state] = open(o, 'w')
            outfiles[state].write(line)
    for v in iter(outfiles.values()):
        v.close()


'''
Takes a file with a set of businesses and reviews, save a set of reviews that only relevant to the businesses
'''
def get_reviews_for_businesses(businesses, reviews, outfile=None):
    if outfile is None:
        outfile = businesses + '_reviews'
    b_ids = []
    with open(businesses) as f:
        for line in iter(f):
            jline = json.loads(line)
            b_ids.append(jline['business_id'])
        b_set = set(b_ids)

    with open(reviews) as f:
        with open(outfile, 'w') as o:
            for line in iter(f):
                jline = json.loads(line)
                b_id = jline['business_id']
                if b_id in b_set:
                    o.write(line)

'''
Based on a business file, output a file with all possible attributes and values
'''
def get_attributes(business_file, outfile=None):
    if outfile is None:
        outfile = business_file + '_attributes'
    attr_hash = {}
    with open(business_file) as f:
        for line in iter(f):
            jline = json.loads(line)
            attributes = jline['attributes']
            for k, v in iter(attributes.items()):
                if k not in attr_hash:
                    attr_hash[k] = []
                values = []
                if not isinstance(v, dict):
                    values.append(v)
                else:
                    for val in iter(v.keys()):
                        values.append(val)
                for value in values:
                    if value not in attr_hash[k]:
                        attr_hash[k].append(value)
    with open(outfile, 'w') as o:
        for k, v in iter(attr_hash.items()):
            o.write(k + str(v) + '\n')

'''
Reads a business file and get only restaurants
'''
def get_restaurants(business_file, outfile=None):
    if outfile is None:
        outfile = business_file + '_restaurants'
    with open(outfile, 'w') as o:
        with open(business_file, 'r') as f:
            for line in iter(f):
                jline = json.loads(line)
                if 'Restaurants' in jline['categories']:
                    o.write(line)
'''
number_set: 1-?
train_weight: 1-9.
'''
def split_data(businessfile, reviewfile, number_sets, train_weight):
    all_sets = {}
    for i  in range(1, number_sets+1):
        all_sets["train" + str(i)] = {}
        all_sets["test" + str(i)] = {}

    with open(businessfile) as rst:
        for line in rst:
            jline = json.loads(line)
            b_id = jline['business_id']
            set_num = rd.randint(1, number_sets)
            isTrain = rd.randint(1, 10)
            if isTrain > train_weight:
                dict_type = "test"
            else:
                dict_type = "train"
            dict_temp = {}
            if (dict_type + str(set_num)) in all_sets.keys():
                dict_temp = all_sets[dict_type + str(set_num)]
            dict_temp[b_id] = Business(jline)
            all_sets[dict_type + str(set_num)]  = dict_temp

    all_keys = all_sets.keys()
    with open(reviewfile) as rvw:
        for line in rvw:
            jline = json.loads(line)
            b_id = jline['business_id']
            for i_dict in all_keys:
                if b_id in all_sets[i_dict].keys():
                    dict_temp = all_sets[i_dict]
                    dict_temp[b_id].add_review(jline)
                    all_sets[i_dict]  = dict_temp
    return all_sets
if __name__ == '__main__':
    data_dir = '../data/'
    parsed_dir = data_dir + 'parsed/'
    raw_dir = data_dir + 'yelp_data/'
    '''
    some setting up
    '''
    #raw_reviews = raw_dir + 'yelp_academic_dataset_review.json'
    #business_data = raw_dir + 'yelp_academic_dataset_business.json'
    #split_business_by_state(business_data, outfile=parsed_dir + 'businesses')
    #get_restaurants('../data/parsed/businesses_WI')
    #get_reviews_for_businesses(parsed_dir + 'businesses_WI_restaurants', raw_reviews)
    #get_attributes('../data/yelp_data/yelp_academic_dataset_business.json', '../data/parsed/attributes')
    '''
    Creates a bag of words representation based on the WI restaurants and reviews
    '''
    
    business_file = 'yelp_academic_dataset_business.json'
    review_file = 'yelp_academic_dataset_review.json'
    #bag_of_words = md.create_bag_of_wods(raw_dir + business_file,
    #                                     raw_dir + review_file)


    #bag_of_ngrams = md.create_bag_of_ngrams(parsed_dir + 'WI_test_restaurants',
    #parsed_dir + 'WI_test_reviews')


    #bag_of_ngrams.make_tfidf_matrix()
    #print(bag_of_ngrams.datamatrix)
    #print(bag_of_ngrams.datamatrix.shape)

    #print(bag_of_ngrams.labels)
    #print(bag_of_ngrams.labels.shape)
    #exit()
    # get_reviews_for_state('../data/parsed/businesses_TX', '../data/yelp_data/yelp_academic_dataset_review.json')
    #all_sets = split_data(parsed_dir + 'businesses_WI_restaurants', parsed_dir + 'businesses_WI_restaurants_reviews', 2, 5)
    #bag_of_words = md.create_bag_of_words(ba_aggr=all_sets['train1'], attribute="Price Range")
    bag_of_words = md.create_bag_of_words(parsed_dir + 'businesses_WI_restaurants',
                                           parsed_dir + 'businesses_WI_restaurants_reviews')
    bag_of_words.make_sparse_datamtrix()
    bag_of_words.make_tfidf_matrix()
    #print(len(bag_of_words.datamatrix.data))
    print(bag_of_words.datamatrix.shape)
    print(bag_of_words.labels[:, 0])

    #json_data = open(parsed_dir + 'businesses_WI_restaurants').read()
    #data = json.load(json_data)
 # get_reviews_for_state('../data/parsed/businesses_TX', '../data/yelp_data/yelp_academic_dataset_review.json')
 
    '''
    Used this for the development of the One Vs Rest model. Want to confer on architecture of program before I move things around
    Figured it'd either be like this or maybe Class setup that calls the method and prints out internally.
    '''
    #print(bag_of_words.labels[0:10])
    #X_train, X_test, y_train, y_test = train_test_split(bag_of_words.datamatrix, bag_of_words.labels, test_size=0.3)
    #one_v_rest = mlm.one_vs_rest(X_train,y_train)
    #first10Predict = one_v_rest.predict(bag_of_words.datamatrix[0:10])
    #print (first10Predict)
    #print(one_v_rest.score(X_test,y_test))


(1107, 91088)
  (0, 0)	2.0
  (1, 0)	2.0
  (2, 0)	2.0
  (3, 0)	2.0
  (4, 0)	2.0
  (5, 0)	2.0
  (6, 0)	2.0
  (7, 0)	2.0
  (8, 0)	2.0
  (9, 0)	2.0
  (10, 0)	2.0
  (11, 0)	2.0
  (12, 0)	2.0
  (13, 0)	2.0
  (14, 0)	2.0
  (15, 0)	1.0
  (16, 0)	2.0
  (17, 0)	2.0
  (18, 0)	2.0
  (20, 0)	2.0
  (21, 0)	2.0
  (22, 0)	2.0
  (23, 0)	2.0
  (24, 0)	2.0
  (25, 0)	2.0
  :	:
  (1082, 0)	2.0
  (1083, 0)	2.0
  (1084, 0)	2.0
  (1085, 0)	2.0
  (1086, 0)	2.0
  (1087, 0)	2.0
  (1088, 0)	2.0
  (1089, 0)	2.0
  (1090, 0)	2.0
  (1091, 0)	2.0
  (1092, 0)	2.0
  (1093, 0)	2.0
  (1094, 0)	2.0
  (1095, 0)	2.0
  (1096, 0)	2.0
  (1097, 0)	1.0
  (1098, 0)	2.0
  (1099, 0)	2.0
  (1100, 0)	2.0
  (1101, 0)	2.0
  (1102, 0)	2.0
  (1103, 0)	2.0
  (1104, 0)	2.0
  (1105, 0)	2.0
  (1106, 0)	2.0


In [16]:
# %load mlmodels.py
import nltk
import business
from business import Business
import json
#import xgboost
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC


def one_vs_rest(x_train,y_train):
    one_v_rest = OneVsRestClassifier(LinearSVC(random_state=0)).fit(x_train, y_train)
    return one_v_rest




In [None]:
import numpy as np
X_train, X_test, y_train, y_test = train_test_split(bag_of_words.datamatrix, bag_of_words.labels, test_size=0.3)

print("tr_err_rate \t test_err rate")
for c in range(0, y_train.shape[1]): 
    y_tr = []
    y_te = []
    ct_data = 0
    
    # get training data
    for r_tr in range(0, y_train.shape[0]):
        y_lb_tr = y_train[r_tr,c]
        if y_lb_tr > 0:
            ct_data = ct_data + 1
        y_tr.append(y_lb_tr)    
            
            
    # get test data        
    for r_te in range(0, y_test.shape[0]):
        y_te.append(y_test[r_te,c])
        
    # check if valid training data is more than 70%  
    if ct_data/y_train.shape[0] < 0.7:
        print("not enough data")
    else:
        one_v_rest = OneVsRestClassifier(LinearSVC(random_state=0)).fit(X_train, y_tr)
        print("%.2f" % (100*(1-one_v_rest.score(X_train, y_tr))) + " \t " + "%.2f" % (100*(1-one_v_rest.score(X_test, y_te))))
    


tr_err_rate 	 test_err rate
6.07 	 7.81
not enough data
not enough data
