In [1]:
import os
import sys

# Root of the local Spark installation. SPARK_HOME / HADOOP_HOME are exported
# with exactly this value for the processes launched from this kernel.
spark_path = "/opt/spark/"

os.environ['SPARK_HOME'] = spark_path
os.environ['HADOOP_HOME'] = spark_path

# Build every entry with os.path.join: spark_path already ends in "/", so
# plain string concatenation with "/python" produced "/opt/spark//python".
spark_python = os.path.join(spark_path, "python")
spark_python_lib = os.path.join(spark_python, "lib")

# Pick up whichever py4j source zip ships with this Spark build instead of
# pinning one version; fall back to the previously hard-coded name when the
# directory cannot be inspected.
import glob
py4j_zips = sorted(glob.glob(os.path.join(spark_python_lib, "py4j-*-src.zip")))
if not py4j_zips:
    py4j_zips = [os.path.join(spark_python_lib, "py4j-0.9-src.zip")]

spark_sys_paths = [
    os.path.join(spark_path, "bin"),
    spark_python,
    os.path.join(spark_python, "pyspark"),
    spark_python_lib,
    os.path.join(spark_python_lib, "pyspark.zip"),
] + py4j_zips

# Skip entries that are already present so re-running the cell does not keep
# growing sys.path.
for spark_entry in spark_sys_paths:
    if spark_entry not in sys.path:
        sys.path.append(spark_entry)

In [2]:
from pyspark import SparkContext
from pyspark import SparkConf

In [3]:
# Obtain the context through getOrCreate so that re-running this cell reuses
# the live SparkContext instead of failing because one already exists.
# Master ("local") and application name ("test") are unchanged.
conf = SparkConf().setMaster("local").setAppName("test")
sc = SparkContext.getOrCreate(conf)

In [4]:
sc

<pyspark.context.SparkContext at 0x7f0872d5c990>

In [5]:
# Location of the raw input; each element of the resulting RDD is one text
# line of that file.
RATINGS_PATH = '/home/ubuntu/paktour_cust_prod'
data = sc.textFile(RATINGS_PATH)

In [6]:
data.take(5)

[u',product_id,cust_id,rating',
 u'0,18,6093827,1',
 u'1,18,110380,1',
 u'2,18,131057,1',
 u'3,18,133977,1']

In [7]:
# Remove the CSV header by content rather than by position. The previous
# form (zipWithIndex + "index > 0") reassigned `data` from itself, so every
# re-execution of this cell silently discarded one more real record.
# Filtering on the header text is idempotent and needs no indexing pass.
header_line = u',product_id,cust_id,rating'
data = data.filter(lambda line: line != header_line)

In [8]:
data.take(5)

[u'0,18,6093827,1',
 u'1,18,110380,1',
 u'2,18,131057,1',
 u'3,18,133977,1',
 u'4,18,171879,1']

In [9]:
data.count()

12817

In [10]:
# Tokenise every CSV line into its comma-separated string fields.
clean_data = data.map(lambda line: line.split(','))

In [11]:
clean_data.take(5)

[[u'0', u'18', u'6093827', u'1'],
 [u'1', u'18', u'110380', u'1'],
 [u'2', u'18', u'131057', u'1'],
 [u'3', u'18', u'133977', u'1'],
 [u'4', u'18', u'171879', u'1']]

In [12]:
# Fourth CSV field (the `rating` column) parsed as an integer.
ratings = clean_data.map(lambda fields: int(fields[3]))

In [13]:
ratings.take(5)

[1, 1, 1, 1, 1]

In [14]:
# Third CSV field (the `cust_id` column) parsed as an integer.
cust_id = clean_data.map(lambda fields: int(fields[2]))

In [15]:
cust_id.take(5)

[6093827, 110380, 131057, 133977, 171879]

In [17]:
# Number of distinct customer ids in the parsed data.
cust_id.distinct().count()

1042

In [18]:
# Second CSV field (the `product_id` column) parsed as an integer.
product_id = clean_data.map(lambda fields: int(fields[1]))

In [19]:
product_id.take(5)

[18, 18, 18, 18, 18]

In [20]:
# Number of distinct product ids in the parsed data.
product_id.distinct().count()

22

In [21]:
from pyspark.mllib.recommendation import ALS,MatrixFactorizationModel, Rating

In [22]:
# Split each header-free CSV line into its fields; these feed the Rating
# construction in the next cell.
pkr = data.map(lambda line: line.split(','))

In [23]:
# Convert the split fields into MLlib Rating records:
#   user    <- cust_id    (field 2, int)
#   product <- product_id (field 1, int)
#   rating  <- rating     (field 3, float)
ratings_rdd = pkr.map(
    lambda fields: Rating(int(fields[2]), int(fields[1]), float(fields[3]))
)

In [24]:
ratings_rdd.take(10)

[Rating(user=6093827, product=18, rating=1.0),
 Rating(user=110380, product=18, rating=1.0),
 Rating(user=131057, product=18, rating=1.0),
 Rating(user=133977, product=18, rating=1.0),
 Rating(user=171879, product=18, rating=1.0),
 Rating(user=194181, product=18, rating=1.0),
 Rating(user=233893, product=8, rating=1.0),
 Rating(user=25936, product=18, rating=1.0),
 Rating(user=2884394, product=0, rating=1.0),
 Rating(user=2889669, product=18, rating=1.0)]

In [30]:
# 70/30 random split with a fixed seed so the partitioning is repeatable.
# NOTE(review): the input contains repeated (user, product) rows, and the
# samples displayed below show the pair (194181, 18) in both train and test,
# so the held-out set is not disjoint from the training set at the pair
# level - confirm whether rows should be de-duplicated or aggregated first.
split_weights = [0.7, 0.3]
split_seed = 7856
train, test = ratings_rdd.randomSplit(split_weights, split_seed)

In [80]:
train.take(10)

[Rating(user=6093827, product=18, rating=1.0),
 Rating(user=110380, product=18, rating=1.0),
 Rating(user=131057, product=18, rating=1.0),
 Rating(user=133977, product=18, rating=1.0),
 Rating(user=194181, product=18, rating=1.0),
 Rating(user=233893, product=8, rating=1.0),
 Rating(user=25936, product=18, rating=1.0),
 Rating(user=2884394, product=0, rating=1.0),
 Rating(user=2889669, product=18, rating=1.0),
 Rating(user=2917725, product=0, rating=1.0)]

In [77]:
test.take(10)

[Rating(user=171879, product=18, rating=1.0),
 Rating(user=2897162, product=18, rating=1.0),
 Rating(user=2988180, product=18, rating=1.0),
 Rating(user=3015879, product=18, rating=1.0),
 Rating(user=3113366, product=0, rating=1.0),
 Rating(user=3183957, product=18, rating=1.0),
 Rating(user=17094, product=18, rating=1.0),
 Rating(user=186761, product=18, rating=1.0),
 Rating(user=194181, product=18, rating=1.0),
 Rating(user=252601, product=16, rating=1.0)]

In [31]:
train.count()

8952

In [32]:
test.count()

3865

In [33]:
# Mark the training split for caching; it is reused by model training and by
# the training-set evaluation cells further down.
train.cache()

PythonRDD[32] at RDD at PythonRDD.scala:48

In [34]:
# Mark the held-out split for caching; it is reused by the test-set
# evaluation cells further down.
test.cache()

PythonRDD[33] at RDD at PythonRDD.scala:48

In [35]:
# Hyper-parameters passed to ALS.trainImplicit in the next cell.
rank = 5  # second positional argument (rank) of trainImplicit
numIterations = 10  # third positional argument (iterations) of trainImplicit

In [36]:
# Fit an implicit-feedback ALS matrix-factorisation model on the training
# split. The seed is fixed so the factor initialisation is repeatable.
model = ALS.trainImplicit(
    train,
    rank=rank,
    iterations=numIterations,
    seed=1234,
)

In [37]:
model.productFeatures().first()

(0,
 array('d', [-0.39466553926467896, 1.2001080513000488, 1.607826590538025, 0.25164759159088135, 0.22817805409431458]))

In [38]:
model.userFeatures().first()

(9880,
 array('d', [-0.0012007708428427577, 0.00026443350361660123, -0.001196755445562303, 2.4438581021968275e-05, 9.264115942642093e-05]))

In [39]:
# The 10 users with the highest model score for product 18.
model.recommendUsers(18,10)

[Rating(user=5920534, product=18, rating=21.203112316943255),
 Rating(user=5230430, product=18, rating=20.66856537226009),
 Rating(user=5222617, product=18, rating=20.66856537226009),
 Rating(user=5273747, product=18, rating=20.40318478614303),
 Rating(user=5688023, product=18, rating=20.12365626193903),
 Rating(user=5926374, product=18, rating=19.56808102554572),
 Rating(user=5695327, product=18, rating=19.56808102554572),
 Rating(user=5969687, product=18, rating=19.001521098613303),
 Rating(user=5819774, product=18, rating=18.803799666911612),
 Rating(user=98467, product=18, rating=18.423647168775986)]

In [40]:
# The 10 products with the highest model score for user 6093827.
model.recommendProducts(6093827,10)

[Rating(user=6093827, product=18, rating=8.126538989529486),
 Rating(user=6093827, product=0, rating=0.08086517869625887),
 Rating(user=6093827, product=20, rating=0.02845241618164044),
 Rating(user=6093827, product=1, rating=0.015480153609008784),
 Rating(user=6093827, product=8, rating=0.013914613321859282),
 Rating(user=6093827, product=6, rating=0.007473584648103455),
 Rating(user=6093827, product=17, rating=0.005608635156515785),
 Rating(user=6093827, product=5, rating=0.003573862051404905),
 Rating(user=6093827, product=9, rating=0.0015142052569148967),
 Rating(user=6093827, product=10, rating=0.0014442033768933271)]

In [41]:
# Up to 25 recommendations per user, pulled back to the driver as a list of
# (user, recommendations) pairs. Only 22 distinct products were counted
# earlier, so each user can receive fewer than 25 entries.
recs_per_user = model.recommendProductsForUsers(25)
user_rec = recs_per_user.collect()

In [44]:
user_rec[5]

(5930882,
 (Rating(user=5930882, product=4, rating=4.704511190178094),
  Rating(user=5930882, product=10, rating=0.9819770585555956),
  Rating(user=5930882, product=0, rating=0.06116753002386144),
  Rating(user=5930882, product=9, rating=0.012094476989011671),
  Rating(user=5930882, product=16, rating=0.01134953970003394),
  Rating(user=5930882, product=12, rating=0.005203388060541117),
  Rating(user=5930882, product=8, rating=0.0018939326353730745),
  Rating(user=5930882, product=20, rating=0.00023553555155199704),
  Rating(user=5930882, product=17, rating=7.599832199737272e-05),
  Rating(user=5930882, product=6, rating=7.526155218133329e-05),
  Rating(user=5930882, product=15, rating=1.6523481415939342e-13),
  Rating(user=5930882, product=7, rating=-1.3693262315356038e-33),
  Rating(user=5930882, product=19, rating=-1.8602471734464353e-22),
  Rating(user=5930882, product=21, rating=-4.890465618097322e-14),
  Rating(user=5930882, product=13, rating=-0.00036825409728252767),
  Rating(u

In [45]:
# Model score for the single (user=6093827, product=18) pair.
model.predict(6093827, 18)

8.126538989529486

In [46]:
# Strip the rating from every training record, leaving the (user, product)
# pairs that predictAll expects as input.
pred_input = train.map(lambda rec: (rec.user, rec.product))

In [47]:
pred_input.take(5)

[(6093827, 18), (110380, 18), (131057, 18), (133977, 18), (194181, 18)]

In [130]:
# Score every training (user, product) pair, then collapse the repeated
# predictions that come from repeated input pairs.
train_scores = model.predictAll(pred_input)
pred = train_scores.distinct()

In [131]:
pred.count()

1140

In [132]:
pred.take(10)

[Rating(user=2935298, product=18, rating=5.615681836066915),
 Rating(user=6066007, product=18, rating=4.737447060300018),
 Rating(user=6035126, product=4, rating=9.640016068494987),
 Rating(user=36843, product=18, rating=5.615681836066915),
 Rating(user=4488376, product=4, rating=11.843196248679135),
 Rating(user=4637001, product=11, rating=3.8792295181574046),
 Rating(user=5715314, product=13, rating=0.0005157008344005956),
 Rating(user=5697684, product=10, rating=4.742016850221945),
 Rating(user=5784947, product=18, rating=9.704355372663466),
 Rating(user=5700252, product=18, rating=3.8372818366900945)]

In [133]:
# Key the observed training ratings by (user, product) so they can be joined
# with the predictions.
true_reorg = train.map(lambda rec: ((rec.user, rec.product), rec.rating))

In [134]:
true_reorg.take(5)

[((6093827, 18), 1.0),
 ((110380, 18), 1.0),
 ((131057, 18), 1.0),
 ((133977, 18), 1.0),
 ((194181, 18), 1.0)]

In [135]:
# Key the training-set predictions by (user, product), mirroring true_reorg.
pred_reorg = pred.map(lambda rec: ((rec.user, rec.product), rec.rating))

In [136]:
pred_reorg.take(5)

[((2935298, 18), 5.615681836066915),
 ((6066007, 18), 4.737447060300018),
 ((6035126, 4), 9.640016068494987),
 ((36843, 18), 5.615681836066915),
 ((4488376, 4), 11.843196248679135)]

In [139]:
# Pair each observed training rating with its prediction:
#   ((user, product), (actual, predicted))
# The keyed predictions are derived from `pred` right here rather than read
# from the shared name `pred_reorg`, because a later cell rebinds that name
# to the test-set predictions; re-running this cell afterwards would
# otherwise join training truth against the wrong predictions.
# distinct() collapses the repeated rows produced by repeated training pairs.
train_pred_by_pair = pred.map(lambda rec: ((rec[0], rec[1]), rec[2]))
true_pred = true_reorg.join(train_pred_by_pair).distinct()

In [140]:
true_pred.count()

1140

In [141]:
true_pred.take(10)

[((5939760, 10), (1.0, 3.840286517778763)),
 ((5809016, 18), (1.0, 8.126538989529486)),
 ((5930550, 16), (1.0, 1.0442273593396028)),
 ((5962544, 18), (1.0, 4.737447060300018)),
 ((5934639, 10), (1.0, 5.622091706032647)),
 ((5934626, 4), (1.0, 4.721417968448088)),
 ((5914036, 18), (1.0, 4.737447060300018)),
 ((483803, 4), (1.0, 3.8265610014965805)),
 ((5797168, 2), (1.0, 1.8505177657880005)),
 ((4934184, 18), (1.0, 13.349134531874578))]

In [142]:
from math import sqrt

In [143]:
# Mean squared difference between the observed and predicted value of every
# joined training pair.
# NOTE(review): the model was fit with ALS.trainImplicit; confirm that an
# RMSE against the raw rating is the intended metric for its scores.
MSE = (
    true_pred
    .values()
    .map(lambda actual_predicted: (actual_predicted[0] - actual_predicted[1]) ** 2)
    .mean()
)

In [144]:
# Root of the training-set mean squared error computed in the previous cell.
RMSE = sqrt(MSE)

In [145]:
RMSE

6.615543338033035

In [113]:
# (user, product) pairs of the held-out split, in the shape predictAll
# expects.
test_input = test.map(lambda rec: (rec.user, rec.product))

In [114]:
test_input.take(5)

[(171879, 18), (2897162, 18), (2988180, 18), (3015879, 18), (3113366, 0)]

In [149]:
# Score every held-out (user, product) pair and drop the repeated
# predictions that come from repeated input pairs.
test_scores = model.predictAll(test_input)
pred_test = test_scores.distinct()

In [150]:
pred_test.take(5)

[Rating(user=2935298, product=18, rating=5.615681836066915),
 Rating(user=6066007, product=18, rating=4.737447060300018),
 Rating(user=6035126, product=4, rating=9.640016068494987),
 Rating(user=36843, product=18, rating=5.615681836066915),
 Rating(user=4488376, product=4, rating=11.843196248679135)]

In [151]:
pred_test.count()

1083

In [152]:
# Key the observed held-out ratings by (user, product) for the join below.
test_reorg = test.map(lambda rec: ((rec.user, rec.product), rec.rating))

In [153]:
test_reorg.take(5)

[((171879, 18), 1.0),
 ((2897162, 18), 1.0),
 ((2988180, 18), 1.0),
 ((3015879, 18), 1.0),
 ((3113366, 0), 1.0)]

In [154]:
# Key the held-out predictions by (user, product).
# NOTE(review): this rebinds `pred_reorg`, which earlier held the keyed
# training-set predictions; cells that read the name depend on run order.
pred_reorg = pred_test.map(lambda rec: ((rec.user, rec.product), rec.rating))

In [155]:
pred_reorg.take(5)

[((2935298, 18), 5.615681836066915),
 ((6066007, 18), 4.737447060300018),
 ((6035126, 4), 9.640016068494987),
 ((36843, 18), 5.615681836066915),
 ((4488376, 4), 11.843196248679135)]

In [159]:
# Pair each observed held-out rating with its prediction:
#   ((user, product), (actual, predicted))
# The keyed predictions are derived from `pred_test` right here rather than
# read from the shared name `pred_reorg`, which is also bound to the
# training-set predictions earlier in the notebook; with out-of-order
# execution that name may not hold the test predictions.
# distinct() collapses the repeated rows produced by repeated test pairs.
test_pred_by_pair = pred_test.map(lambda rec: ((rec[0], rec[1]), rec[2]))
test_pred = test_reorg.join(test_pred_by_pair).distinct()

In [160]:
test_pred.take(5)

[((5939760, 10), (1.0, 3.840286517778763)),
 ((5809016, 18), (1.0, 8.126538989529486)),
 ((5930550, 16), (1.0, 1.0442273593396028)),
 ((5962544, 18), (1.0, 4.737447060300018)),
 ((5934639, 10), (1.0, 5.622091706032647))]

In [161]:
test_pred.count()

1083

In [162]:
# Mean squared difference between the observed and predicted value of every
# joined held-out pair.
test_MSE = (
    test_pred
    .values()
    .map(lambda actual_predicted: (actual_predicted[0] - actual_predicted[1]) ** 2)
    .mean()
)

In [163]:
# Root of the held-out mean squared error computed in the previous cell.
test_RMSE = sqrt(test_MSE)

In [164]:
test_RMSE

6.690816574782381