In [1]:
#import libraries
import json
import numpy as np
from sklearn.externals import joblib

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from collections import defaultdict

In [2]:
# load predict topic model
import predict_topic
predict_topic.init('nmf_model.joblib', 'vectorizer.joblib')

# load predict sentiment model
import predict_sentiment


In [3]:
# load data and set #data
total_data= 200000
data_filename= 'restaurant_review.json'

In [4]:
# returns topic and sentiment dictionary {D[topic#]['SA'] = #} per review
def rating_per_review(review):
    #print review
    d = {}
    i = 0
    for sentence in review.split("."):
        SA = predict_sentiment.get_sentiment(sentence)[0]
        i += 1
        topic = predict_topic.predict_topic(sentence)
        d[topic] = d.get(topic,{})
        d[topic][SA] = d[topic].get(SA,0)
        d[topic][SA] += 1   
    return d    

In [5]:
# for given data, returns average rating of a business considering both topic and sentiment analysis
def rating_per_business(documents):
    #{ i : 0 for i in range(4) }
    final_d = {}
    aggregate_rating = { i : 0.0 for i in range(5) }
    for review in documents:
        r = rating_per_review(review["text"])
        advanced_r = r
        
        # combine ratings for all reviews of a business
        for topic in r:
            final_d[topic] = final_d.get(topic,{})           
            for SA in r[topic]:  
                final_d[topic][SA] = final_d[topic].get(SA,0)
                final_d[topic][SA] += r[topic][SA]
    
    # find the average SA per topic (P / P + N)
    for topic in final_d:
               # print d[topic]
            positive = final_d[topic]['P'] if 'P' in final_d[topic] else 0
            negative = final_d[topic]['N'] if 'N' in final_d[topic] else 0

            aggregate_rating[topic] = (positive*1.0)/(positive+ negative)*1.0
    # return aggregate rating for entire business
    # {0: 1.0, 1: 0.8571428571428571, 2: 0.0, 3: 1.0, 4: 0.0}
    return aggregate_rating  


In [6]:
# load data
with open(data_filename) as f:
        head = [json.loads(next(f)) for x in range(total_data)]
        
businesses = {}
users = {}
users_business_topic_rating =defaultdict(list)
business_id_map ={}
id_business_map ={}
count_business = 0
count_user = 0
user_id_map = {}
id_user_map = {}


# loop through each review in the data
for i in head:
    b_id = i["business_id"]
    # create business->id and id->business map
    if b_id not in business_id_map:
        business_id_map[b_id] = count_business 
        id_business_map[count_business] = b_id
        count_business += 1
    # businesses contains all the reviews for a given business
    # each businesses[b_id] will have an array of reviews for that particular b_id
    businesses[b_id] = businesses.get(b_id, [])
    businesses[b_id].append(i)
    
    # create user->id and id->user map
    u_id = i["user_id"]
    if u_id not in user_id_map:
        user_id_map[u_id] = count_user 
        id_user_map[count_user] = u_id
        count_user += 1
    # users contains all the reviews given by a user
    # each users[u_id] will have an array of reviews given by that particular u_id
    users[u_id] = users.get(u_id, [])
    users[u_id].append(i)
    text = i["text"]
    
    # for each review, get the average rating per topic (based on sentiment analysis)
    rating_for_topic = rating_per_business([i])
    # loop through topics of the particular review
    for topic in rating_for_topic:
        # store per topic rating separately, topic: [[user, business, rating]]
        # {0: [[0, 0, 0.5],
        #      [0, 1, 1.0]...
        users_business_topic_rating[topic].append([user_id_map[u_id],business_id_map[b_id],rating_for_topic[topic]])


In [7]:
#users_business_topic_rating

In [8]:
# get average rating per user
users_rating_per_topic = defaultdict(dict)
for x in users:
    users_rating_per_topic[x] =   rating_per_business(users[x])
    print (x,users_rating_per_topic[x] )
    

In [9]:
from pyspark import SparkContext
sc = SparkContext("local", "Recommend")

In [10]:
# train model per topic
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
rank = 10
numIterations = 10
topic_model = {}
# train each model using topic-specific data and store in topic_model
for topic in range(5):
    ratings = users_business_topic_rating[topic][:140000]
    ratings = sc.parallelize(ratings)
    model = ALS.train(ratings, rank, numIterations)
    topic_model[topic] = model
    
    

In [11]:
# compute MSE per topic
for topic in range(5):
    # retrieve test data for the topic
    test_ratings = users_business_topic_rating[topic][1200:]
    test_ratings = sc.parallelize(test_ratings)
    # format testdata to get (user_id, business_id)
    testdata = test_ratings.map(lambda x: (x[0], x[1]))
    # retrieve topic specific trained model
    model = topic_model[topic]
    # predict
    predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
    ratesAndPreds = test_ratings.map(lambda r: ((r[0], r[1]), r[2])).leftOuterJoin(predictions)
    # take the average rating of a user in case there is no prediction (when the user has not been seen in the training data)
    ratesAndPreds =ratesAndPreds.map(lambda r:((r[0][0], r[0][1]), (r[1][0],r[1][1] or users_rating_per_topic[id_user_map[r[0][0]]][topic]) ))
    # calculate MSE
    MSE = (ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    print("Mean Squared Error = " + str(MSE))


Mean Squared Error = 0.016426485424
Mean Squared Error = 0.069038751528
Mean Squared Error = 0.095547248426
Mean Squared Error = 0.071963895695
Mean Squared Error = 0.0


In [16]:
# predict for specific existing user

import operator
new_user = 'qI9WTIXOi2OTSG4bsc55fw'
print(new_user)
avg_topic_rating_for_user = users_rating_per_topic[new_user]
selected_topic = max(avg_topic_rating_for_user.iteritems(), key=operator.itemgetter(1))[0]
# select model based on average rating of user (aka which topic the user prefers the most)
selected_model = topic_model[selected_topic]

qI9WTIXOi2OTSG4bsc55fw


In [17]:
# rate all businesses using above model and pick top 10

test_data_for_selected_user = []

for business in id_business_map:
    test_data_for_selected_user.append((user_id_map[new_user], business))
test_data_for_selected_user = sc.parallelize(test_data_for_selected_user)
predictions = selected_model.predictAll(test_data_for_selected_user)#.map(lambda r: ((r[0], r[1]), r[2]))

In [19]:
# sort predictions
sorted_predictions = sorted(predictions.collect(),key = lambda x: x[1], reverse = True)
sorted_predictions

In [20]:
# predict top 10
for x in sorted_predictions[:10]:
    print (id_user_map[x[0]],id_business_map[x[1]], x[2])
