In [1]:
import findspark
findspark.init()

import findspark
findspark.init()
import pyspark
import random
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import col
import pyspark.sql.functions
from pyspark.sql.functions import sum

In [2]:
# Create (or reuse) a Spark session named "Yelp" with master set to "local".
spark = (
    SparkSession.builder
    .master("local")
    .appName("Yelp")
    .getOrCreate()
)

In [5]:
# Grab the SparkContext first: both the SQLContext and the file loads below need it.
sc = spark.sparkContext

# SQLContext expects a SparkContext as its first argument (not a SparkSession);
# the existing session is passed separately as the second argument.
sqlContext = SQLContext(sc, spark)

# Raw text lines of the training and test CSV files.
training_data = sc.textFile("tempe_train.csv")
test_data = sc.textFile("tempe_test.csv")

In [6]:
# First line of each file; treated as the header row and dropped before parsing.
header = training_data.first()
header_test = test_data.first()


def _parse_ratings(lines, header_line):
    """Drop every line equal to ``header_line`` and keep the first three CSV fields.

    Returns an RDD of 3-tuples of strings. The rest of the notebook uses field 0
    as the user id, field 1 as the business id and field 2 as the rating.
    NOTE(review): a plain ``split(',')`` assumes no quoted commas in the data.
    """
    return (
        lines.filter(lambda line: line != header_line)
        .map(lambda line: line.split(','))
        .map(lambda fields: (fields[0], fields[1], fields[2]))
    )


train_rdd = _parse_ratings(training_data, header)
# Fix: the test file is filtered against its own header rather than the
# training header, so a differing test header no longer leaks in as data.
test_rdd = _parse_ratings(test_data, header_test)

# --- business id -> integer index -------------------------------------------
all_dist_busId_test = test_rdd.map(lambda x: x[1]).distinct()
all_dist_busId_train = train_rdd.map(lambda x: x[1]).distinct()

# Every business id seen in either split, de-duplicated.
all_dist_busId_final = all_dist_busId_test.union(all_dist_busId_train).distinct()

# zipWithUniqueId pairs each id with a unique integer (unique, but not
# necessarily consecutive); the pairs are collected to the driver as a dict.
uniqueId_forAll_busId = all_dist_busId_final.zipWithUniqueId()
all_busId_dict = uniqueId_forAll_busId.collectAsMap()

# --- user id -> integer index -----------------------------------------------
all_dist_userId_test = test_rdd.map(lambda x: x[0]).distinct()
all_dist_userId_train = train_rdd.map(lambda x: x[0]).distinct()

# Every user id seen in either split, de-duplicated.
all_dist_userId_final = all_dist_userId_test.union(all_dist_userId_train).distinct()

uniqueId_forAll_userId = all_dist_userId_final.zipWithUniqueId()
all_userId_dict = uniqueId_forAll_userId.collectAsMap()



In [45]:
# Replace the string user/business ids with their integer indices.
final_train_rdd2 = train_rdd.map(lambda x: ((all_userId_dict[x[0]], all_busId_dict[x[1]], x[2])))

# (The former bare `final_train_rdd2.collect()` was removed: it pulled the whole
# training set onto the driver and threw the result away.)

# Convert the rating string to float explicitly when building the Rating records.
final_ratings = final_train_rdd2.map(lambda x: Rating(x[0], x[1], float(x[2])))

# ALS hyperparameters.
rank = 3
numIterations = 20

# Same settings as before, passed by keyword: regularization 0.01, fixed seed 300.
model = ALS.train(final_ratings, rank, iterations=numIterations, lambda_=0.01, seed=300)

# Test rows with each id kept next to its integer index:
# ((user index, user id), (business index, business id), rating string).
final_test_rdd = test_rdd.map(lambda x: ((all_userId_dict[x[0]], x[0]), (all_busId_dict[x[1]], x[1]), x[2]))

# (user index, business index) pairs to score.
final_test_rdd2 = test_rdd.map(lambda x: (all_userId_dict[x[0]], all_busId_dict[x[1]]))

# Re-key each prediction as ((user index, business index), predicted rating).
predictions = model.predictAll(final_test_rdd2).map(lambda r: ((r[0], r[1]), r[2]))

In [8]:
# Preview the first five predictions: ((user index, business index), predicted rating).
print(predictions.take(5))

[((19021, 134), 3.7304237155072144), ((4926, 182), 3.756680551052405), ((4992, 1873), 3.605760805970192), ((4992, 1384), 3.2747931462352415), ((16132, 526), 0.6716863503201616)]


In [28]:
def _rescale_prediction(entry):
    """Return the same (user, business) key with the rating multiplied by 5 then divided by 5."""
    (user_idx, business_idx), rating = entry
    return ((user_idx, business_idx), float(rating) * 5/5)


predictions_to_4 = predictions.map(_rescale_prediction)

In [29]:
# Preview the first five entries of predictions_to_4.
print(predictions_to_4.take(5))

[((19021, 134), 3.7304237155072144), ((4926, 182), 3.756680551052405), ((4992, 1873), 3.605760805970192), ((4992, 1384), 3.274793146235241), ((16132, 526), 0.6716863503201616)]


In [30]:
# Load the business sentiment file as an RDD of raw text lines.
tempe_business_sentiments = sc.textFile("tempe_sentiments.csv")


In [39]:
def _to_indexed_sentiment(fields):
    """Turn split CSV fields into (business index, value of the second field as float)."""
    return (all_busId_dict[fields[0]], float(fields[1]) * 1)


# Split each line on commas, then swap the business id for its integer index.
business_sentiments = (
    tempe_business_sentiments
    .map(lambda line: line.split(','))
    .map(_to_indexed_sentiment)
)

print(business_sentiments.take(5))

[(498, 0.44350135496684484), (724, 0.44899668715987745), (275, 0.4638065046818245), (331, 0.45467577934015224), (416, 0.44911617120107017)]


In [40]:
# Collect the (business index, sentiment value) pairs to the driver as a dict for lookups.
business_sentiments_map = business_sentiments.collectAsMap()

In [41]:
def _add_business_sentiment(entry):
    """Add the business's value from business_sentiments_map to the predicted rating."""
    (user_idx, business_idx), rating = entry
    return ((user_idx, business_idx), (rating + business_sentiments_map[business_idx]))


predictions_to_5 = predictions_to_4.map(_add_business_sentiment)

In [42]:
# Preview the first five sentiment-adjusted predictions.
print(predictions_to_5.take(5))


[((19021, 134), 4.198342974544855), ((4926, 182), 4.222502760388946), ((4992, 1873), 4.075064379638195), ((4992, 1384), 3.753532395876722), ((16132, 526), 1.1148299575704432)]


In [43]:
import math
import collections

In [46]:
# Key the test ratings by (user index, business index) so they can be joined with
# the predictions; the original string ids and the rating string are the value.
final_test_rdd = test_rdd.map(lambda x: ((all_userId_dict[x[0]], all_busId_dict[x[1]]), (x[0], x[1], x[2])))

# Joined record shape: ((user idx, business idx), ((user id, business id, rating str), prediction)).
# Cached because it feeds the MSE, the error histogram and the exported results.
ratesAndPreds = final_test_rdd.join(predictions_to_5).cache()

MSE = ratesAndPreds.map(lambda x: ((float(x[1][0][2]) - x[1][1]) ** 2)).mean()

# Absolute error per joined pair; cached because it is scanned once per bucket below.
abs_diff_between_rate_and_pred = ratesAndPreds.map(lambda x: abs(float(x[1][0][2]) - x[1][1])).cache()

# Half-open buckets [a, b) so each error is counted exactly once and the counts
# agree with the ">=a and <b" labels printed below.
zero_to_one = abs_diff_between_rate_and_pred.filter(lambda x: 0 <= x < 1.0).count()
one_to_two = abs_diff_between_rate_and_pred.filter(lambda x: 1.0 <= x < 2.0).count()
two_to_three = abs_diff_between_rate_and_pred.filter(lambda x: 2.0 <= x < 3.0).count()
three_to_four = abs_diff_between_rate_and_pred.filter(lambda x: 3.0 <= x < 4.0).count()
greater_than_four = abs_diff_between_rate_and_pred.filter(lambda x: x >= 4.0).count()

print("Root Mean Squared Error = " + str(math.sqrt(MSE)))
print(">=0 and <1: " + str(zero_to_one))
print(">=1 and <2: " + str(one_to_two))
print(">=2 and <3: " + str(two_to_three))
print(">=3 and <4: " + str(three_to_four))
print(">=4: " + str(greater_than_four))

# (user id, business id) -> adjusted prediction, collected to the driver.
final_results_dict = ratesAndPreds.map(lambda x: ((x[1][0][0], x[1][0][1]), x[1][1])).collectAsMap()

# Sort by (user id, business id) so the output file has a stable order.
test_final_results_dict = collections.OrderedDict(sorted(final_results_dict.items()))

# The context manager guarantees the file is flushed and closed after writing.
with open("tempe_ModelBasedCF.txt", 'w') as f:
    for k, v in test_final_results_dict.items():
        f.write(str(k[0]) + "," + str(k[1]) + "," + str(v) + '\n')

Root Mean Squared Error = 1.4779735847492919
>=0 and <1: 13110
>=1 and <2: 5685
>=2 and <3: 2382
>=3 and <4: 1189
>=4: 223
