In [1]:
import os
import sys

# Root of the local Spark installation used by this notebook.
spark_path = "/opt/spark/"

# Both variables are pointed at the Spark directory.
os.environ['SPARK_HOME'] = spark_path
os.environ['HADOOP_HOME'] = spark_path

# Make the PySpark sources and the bundled py4j archive importable.
# os.path.join avoids the doubled slash that plain concatenation produced
# ("/opt/spark//bin"), and the membership check keeps sys.path from growing
# every time this cell is re-run.
for _sub in (
    "bin",
    "python",
    "python/pyspark/",
    "python/lib",
    "python/lib/pyspark.zip",
    "python/lib/py4j-0.9-src.zip",
):
    _entry = os.path.join(spark_path, _sub)
    if _entry not in sys.path:
        sys.path.append(_entry)

from pyspark import SparkContext
from pyspark import SparkConf

# getOrCreate returns the already-running context when this cell is executed
# again, instead of failing because a second SparkContext cannot be started
# in the same process.
sc = SparkContext.getOrCreate(
    SparkConf().setMaster("local").setAppName("test")
)

In [2]:
# Echo the SparkContext to confirm it was created.
sc

<pyspark.context.SparkContext at 0x7f59045eafd0>

In [3]:
# Load the raw input as an RDD of text lines.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
data = sc.textFile("/home/ubuntu/sql_trans_data/prod_int")

In [4]:
# Peek at the first five raw lines (comma-separated, four fields each).
data.take(5)

[u'0,18,6093827,1',
 u'1,18,110380,1',
 u'2,18,131057,1',
 u'3,18,133977,1',
 u'4,18,171879,1']

In [5]:
from pyspark.mllib.recommendation import ALS,MatrixFactorizationModel, Rating

In [6]:
# Split every comma-separated line into its list of string fields.
mls = data.map(lambda line: line.split(','))

In [7]:
# Build Rating records from the split fields: field 2 is used as the user id,
# field 1 as the product id and field 3 as the rating value (field 0 is unused).
ratings = mls.map(
    lambda fields: Rating(
        user=int(fields[2]),
        product=int(fields[1]),
        rating=float(fields[3])
    )
)

In [8]:
# Random 70/30 split with a fixed seed so the partition is repeatable.
# NOTE(review): the sample outputs below show the same (user, product) pair in
# both splits (e.g. user 194181 / product 18), so the test set is not disjoint
# from the training set at the pair level — confirm this is intended.
train, test = ratings.randomSplit(weights=[0.7, 0.3], seed=7856)

In [9]:
# Number of rating rows that landed in the training split.
train.count()

8952

In [10]:
# Inspect the first ten training ratings.
train.take(10)

[Rating(user=6093827, product=18, rating=1.0),
 Rating(user=110380, product=18, rating=1.0),
 Rating(user=131057, product=18, rating=1.0),
 Rating(user=133977, product=18, rating=1.0),
 Rating(user=194181, product=18, rating=1.0),
 Rating(user=233893, product=8, rating=1.0),
 Rating(user=25936, product=18, rating=1.0),
 Rating(user=2884394, product=0, rating=1.0),
 Rating(user=2889669, product=18, rating=1.0),
 Rating(user=2917725, product=0, rating=1.0)]

In [11]:
# Number of rating rows that landed in the test split.
test.count()

3865

In [12]:
# Inspect the first ten test ratings.
test.take(10)

[Rating(user=171879, product=18, rating=1.0),
 Rating(user=2897162, product=18, rating=1.0),
 Rating(user=2988180, product=18, rating=1.0),
 Rating(user=3015879, product=18, rating=1.0),
 Rating(user=3113366, product=0, rating=1.0),
 Rating(user=3183957, product=18, rating=1.0),
 Rating(user=17094, product=18, rating=1.0),
 Rating(user=186761, product=18, rating=1.0),
 Rating(user=194181, product=18, rating=1.0),
 Rating(user=252601, product=16, rating=1.0)]

In [13]:
# Rank passed to ALS.trainImplicit below.
rank = 5

In [14]:
# Iteration count passed to ALS.trainImplicit below.
numIterations = 10

In [15]:
# Fit the implicit-feedback ALS model on the training split; the seed is fixed
# so repeated runs produce the same factors.
# NOTE(review): later cells score this model with RMSE against the raw rating
# value — confirm that metric is meaningful for an implicit-feedback model.
model = ALS.trainImplicit(
    ratings=train,
    rank=rank,
    iterations=numIterations,
    seed=1234
)

In [16]:
# Top 10 users recommended for product 18.
model.recommendUsers(18,10)

[Rating(user=5920534, product=18, rating=21.203112316943255),
 Rating(user=5230430, product=18, rating=20.66856537226009),
 Rating(user=5222617, product=18, rating=20.66856537226009),
 Rating(user=5273747, product=18, rating=20.40318478614303),
 Rating(user=5688023, product=18, rating=20.12365626193903),
 Rating(user=5926374, product=18, rating=19.56808102554572),
 Rating(user=5695327, product=18, rating=19.56808102554572),
 Rating(user=5969687, product=18, rating=19.001521098613303),
 Rating(user=5819774, product=18, rating=18.803799666911612),
 Rating(user=98467, product=18, rating=18.423647168775986)]

In [17]:
# Top 10 products recommended for user 6093827.
model.recommendProducts(6093827,10)

[Rating(user=6093827, product=18, rating=8.126538989529486),
 Rating(user=6093827, product=0, rating=0.08086517869625887),
 Rating(user=6093827, product=20, rating=0.02845241618164044),
 Rating(user=6093827, product=1, rating=0.015480153609008784),
 Rating(user=6093827, product=8, rating=0.013914613321859282),
 Rating(user=6093827, product=6, rating=0.007473584648103455),
 Rating(user=6093827, product=17, rating=0.005608635156515785),
 Rating(user=6093827, product=5, rating=0.003573862051404905),
 Rating(user=6093827, product=9, rating=0.0015142052569148967),
 Rating(user=6093827, product=10, rating=0.0014442033768933271)]

In [18]:
# Top 10 products recommended for user 110380.
model.recommendProducts(110380,10)

[Rating(user=110380, product=18, rating=6.385107447077223),
 Rating(user=110380, product=0, rating=0.06725401802569575),
 Rating(user=110380, product=20, rating=0.022351146186237136),
 Rating(user=110380, product=1, rating=0.012557237077898575),
 Rating(user=110380, product=8, rating=0.011093915332444249),
 Rating(user=110380, product=6, rating=0.005870481568039169),
 Rating(user=110380, product=17, rating=0.00440527589917609),
 Rating(user=110380, product=5, rating=0.0028332307333731044),
 Rating(user=110380, product=10, rating=0.0019913592683229453),
 Rating(user=110380, product=16, rating=0.0014657296838397893)]

In [19]:
# Request up to 25 product recommendations for every user and pull the whole
# result back to the driver as a Python list.
user_rec = model.recommendProductsForUsers(25).collect()

In [20]:
# Number of users for which recommendations were returned.
len(user_rec)

1042

In [21]:
# Show one (user, recommendations) entry as a sample.
user_rec[1]

(4739814,
 (Rating(user=4739814, product=4, rating=3.8265610014965805),
  Rating(user=4739814, product=0, rating=0.043881886003016124),
  Rating(user=4739814, product=9, rating=0.009563914620168534),
  Rating(user=4739814, product=16, rating=0.00861046153131556),
  Rating(user=4739814, product=8, rating=0.0013243181244931845),
  Rating(user=4739814, product=20, rating=0.00013336890217498752),
  Rating(user=4739814, product=17, rating=4.670241599482245e-05),
  Rating(user=4739814, product=6, rating=4.6481336262653916e-05),
  Rating(user=4739814, product=15, rating=1.2247990014284581e-13),
  Rating(user=4739814, product=7, rating=-9.396239537925962e-34),
  Rating(user=4739814, product=19, rating=-1.250103452816004e-22),
  Rating(user=4739814, product=21, rating=-3.2710903814098587e-14),
  Rating(user=4739814, product=13, rating=-0.00024732039461868374),
  Rating(user=4739814, product=12, rating=-0.0003631066199264246),
  Rating(user=4739814, product=10, rating=-0.0007263769157073163),
  

In [22]:
# Predicted score for a single (user, product) pair.
model.predict(6093827, 18)

8.126538989529486

In [23]:
# Reduce each training Rating to its (user, product) pair — the input shape
# used for predictAll below.
pred_input = train.map(lambda r: (r.user, r.product))

In [24]:
# Inspect the first ten (user, product) pairs.
pred_input.take(10)

[(6093827, 18),
 (110380, 18),
 (131057, 18),
 (133977, 18),
 (194181, 18),
 (233893, 8),
 (25936, 18),
 (2884394, 0),
 (2889669, 18),
 (2917725, 0)]

In [25]:
# Score every training (user, product) pair with the fitted model.
pred = model.predictAll(pred_input)

In [26]:
# Drop repeated prediction records.
pred1 = pred.distinct()

In [27]:
# Inspect five of the de-duplicated training predictions.
pred1.take(5)

[Rating(user=2935298, product=18, rating=5.615681836066915),
 Rating(user=6066007, product=18, rating=4.737447060300018),
 Rating(user=6035126, product=4, rating=9.640016068494987),
 Rating(user=36843, product=18, rating=5.615681836066915),
 Rating(user=4488376, product=4, rating=11.843196248679135)]

In [28]:
# Key each training rating by (user, product) so it can be joined with the
# predictions for the same pair.
true_reorg = train.map(lambda r: ((r.user, r.product), r.rating))

In [29]:
# Inspect five of the keyed training ratings.
true_reorg.take(5)

[((6093827, 18), 1.0),
 ((110380, 18), 1.0),
 ((131057, 18), 1.0),
 ((133977, 18), 1.0),
 ((194181, 18), 1.0)]

In [30]:
# Key each de-duplicated training prediction by (user, product), mirroring the
# layout of true_reorg.
pred_reorg = pred1.map(lambda p: ((p.user, p.product), p.rating))

In [31]:
# Inspect ten of the keyed training predictions.
pred_reorg.take(10)

[((2935298, 18), 5.615681836066915),
 ((6066007, 18), 4.737447060300018),
 ((6035126, 4), 9.640016068494987),
 ((36843, 18), 5.615681836066915),
 ((4488376, 4), 11.843196248679135),
 ((4637001, 11), 3.8792295181574046),
 ((5715314, 13), 0.0005157008344005956),
 ((5697684, 10), 4.742016850221945),
 ((5784947, 18), 9.704355372663466),
 ((5700252, 18), 3.8372818366900945)]

In [32]:
# Join on (user, product): each value becomes an (actual, predicted) pair.
true_pred = true_reorg.join(pred_reorg)

In [33]:
# Row count of the joined training set, before de-duplication.
true_pred.count()

8952

In [34]:
# Collapse repeated ((user, product), (actual, predicted)) rows so each
# distinct combination is counted once in the error metric below.
true_pred1 = true_pred.distinct()

In [35]:
# Inspect five de-duplicated (actual, predicted) rows.
true_pred1.take(5)

[((5939760, 10), (1.0, 3.840286517778763)),
 ((5809016, 18), (1.0, 8.126538989529486)),
 ((5930550, 16), (1.0, 1.0442273593396028)),
 ((5962544, 18), (1.0, 4.737447060300018)),
 ((5934639, 10), (1.0, 5.622091706032647))]

In [36]:
# Row count after de-duplication; compare with the pre-distinct count above to
# see how many joined rows were repeats.
true_pred1.count()

1140

In [37]:
from math import sqrt

In [38]:
# Mean squared error on the training pairs: each row's value is
# (actual, predicted), so square their difference and average.
MSE = (
    true_pred1
    .map(lambda row: (row[1][0] - row[1][1]) ** 2)
    .mean()
)

In [39]:
# Root mean squared error on the training pairs.
RMSE = sqrt(MSE)

In [40]:
# Display the training RMSE.
RMSE

6.615543338033035

In [41]:
# Reduce each held-out Rating to its (user, product) pair for predictAll.
test_input = test.map(lambda r: (r.user, r.product))

In [42]:
# Score every held-out (user, product) pair with the fitted model.
pred_test = model.predictAll(test_input)

In [43]:
# Drop repeated test prediction records.
pred_test1 = pred_test.distinct()

In [44]:
# Inspect five of the de-duplicated test predictions.
pred_test1.take(5)

[Rating(user=2935298, product=18, rating=5.615681836066915),
 Rating(user=6066007, product=18, rating=4.737447060300018),
 Rating(user=6035126, product=4, rating=9.640016068494987),
 Rating(user=36843, product=18, rating=5.615681836066915),
 Rating(user=4488376, product=4, rating=11.843196248679135)]

In [45]:
# Number of distinct test predictions.
pred_test1.count()

1083

In [46]:
# Key each held-out rating by (user, product) so it can be joined with the
# test predictions.
test_reorg = test.map(lambda r: ((r.user, r.product), r.rating))

In [47]:
pred_reorg = pred_test1.map(lambda x:((x[0],x[1]), x[2]))

In [48]:
test_pred = test_reorg.join(pred_reorg)

In [49]:
# Collapse repeated ((user, product), (actual, predicted)) rows so each
# distinct combination is counted once in the test error metric.
test_pred1 = test_pred.distinct()

In [50]:
# Inspect five de-duplicated (actual, predicted) rows from the test join.
test_pred1.take(5)

[((5939760, 10), (1.0, 3.840286517778763)),
 ((5809016, 18), (1.0, 8.126538989529486)),
 ((5930550, 16), (1.0, 1.0442273593396028)),
 ((5962544, 18), (1.0, 4.737447060300018)),
 ((5934639, 10), (1.0, 5.622091706032647))]

In [51]:
# Number of distinct joined test rows.
test_pred1.count()

1083

In [52]:
# Mean squared error on the held-out pairs: each row's value is
# (actual, predicted), so square their difference and average.
test_MSE = (
    test_pred1
    .map(lambda row: (row[1][0] - row[1][1]) ** 2)
    .mean()
)

In [53]:
# Root mean squared error on the held-out pairs.
test_RMSE = sqrt(test_MSE)

In [54]:
# Display the test RMSE.
test_RMSE

6.690816574782381