In [1]:
import glob
import os
import sys

# Root directory of the local Spark installation.
spark_path = "/opt/spark/"

# Both variables are pointed at the same directory.
# NOTE(review): HADOOP_HOME reuses the Spark root - confirm that is intended.
os.environ['SPARK_HOME'] = spark_path
os.environ['HADOOP_HOME'] = spark_path

# Build every entry with os.path.join so the trailing "/" on spark_path does
# not produce "//" in the resulting paths.
_spark_python = os.path.join(spark_path, "python")
_spark_lib = os.path.join(_spark_python, "lib")

# Prefer whichever py4j source zip ships with this Spark build; fall back to
# the originally pinned file name when nothing is found on disk.
_py4j_found = sorted(glob.glob(os.path.join(_spark_lib, "py4j-*-src.zip")))
_py4j_zip = _py4j_found[-1] if _py4j_found else os.path.join(_spark_lib, "py4j-0.10.3-src.zip")

for _entry in (
    os.path.join(spark_path, "bin"),
    _spark_python,
    os.path.join(_spark_python, "pyspark"),
    _spark_lib,
    os.path.join(_spark_lib, "pyspark.zip"),
    _py4j_zip,
):
    # Skip entries that are already present so re-running this cell does not
    # keep growing sys.path.
    if _entry not in sys.path:
        sys.path.append(_entry)

In [2]:
from pyspark import SparkContext
from pyspark import SparkConf

In [3]:
# Create the local Spark context (master "local", application name "test").
# getOrCreate returns the already-running context when this cell is executed
# again, instead of failing because a second SparkContext cannot be started
# in the same process.
sc = SparkContext.getOrCreate(SparkConf().setMaster("local").setAppName("test"))

In [4]:
# Echo the context object to confirm it exists.
sc

<pyspark.context.SparkContext at 0x7fa5db7ed850>

In [5]:
# Location of the raw input; each line is read as one string element.
_client_data_path = '/home/ubuntu/client_demo/client_data'
data = sc.textFile(_client_data_path)

In [6]:
# Preview the first five raw lines (comma-separated, four fields each in the recorded output).
data.take(5)

[u'0,3893506,5,1',
 u'1,3793254,5,1',
 u'2,6148794,5,1',
 u'3,6149220,17,1',
 u'4,6149432,17,1']

In [7]:
# Number of lines in the input file.
data.count()

4428

In [8]:
def _split_fields(line):
    """Break one comma-separated input line into its list of fields."""
    return line.split(',')


clean_data = data.map(_split_fields)

In [9]:
# Preview the first five records after splitting; fields are still strings.
clean_data.take(5)

[[u'0', u'3893506', u'5', u'1'],
 [u'1', u'3793254', u'5', u'1'],
 [u'2', u'6148794', u'5', u'1'],
 [u'3', u'6149220', u'17', u'1'],
 [u'4', u'6149432', u'17', u'1']]

In [10]:
def _rating_of(fields):
    """Return field 3 of a split record converted to int."""
    return int(fields[3])


ratings = clean_data.map(_rating_of)

In [11]:
# Preview the first five parsed rating values.
ratings.take(5)

[1, 1, 1, 1, 1]

In [16]:
def _customer_of(fields):
    """Return field 1 of a split record converted to int."""
    return int(fields[1])


cust_id = clean_data.map(_customer_of)

In [17]:
# Preview the first five customer ids.
cust_id.take(5)

[3893506, 3793254, 6148794, 6149220, 6149432]

In [18]:
# Number of distinct customer ids in the data set.
cust_id.distinct().count()

3087

In [19]:
def _product_of(fields):
    """Return field 2 of a split record converted to int."""
    return int(fields[2])


product_id = clean_data.map(_product_of)

In [20]:
# Preview the first five product ids.
product_id.take(5)

[5, 5, 5, 17, 17]

In [21]:
# Number of distinct product ids in the data set.
product_id.distinct().count()

18

In [22]:
from pyspark.mllib.recommendation import ALS,MatrixFactorizationModel, Rating

In [23]:
# Split every raw line on commas again; this is the same transformation that
# produced clean_data, kept under its own name for the model-building cells.
pkr = data.map(lambda raw_line: raw_line.split(','))

In [40]:
def _to_rating(fields):
    """Build a Rating from a split record: field 1 -> user, field 2 -> product, field 3 -> rating."""
    user = int(fields[1])
    product = int(fields[2])
    score = float(fields[3])
    return Rating(user, product, score)


ratings_rdd = pkr.map(_to_rating)

In [41]:
# Preview the first ten Rating records.
ratings_rdd.take(10)

[Rating(user=3893506, product=5, rating=1.0),
 Rating(user=3793254, product=5, rating=1.0),
 Rating(user=6148794, product=5, rating=1.0),
 Rating(user=6149220, product=17, rating=1.0),
 Rating(user=6149432, product=17, rating=1.0),
 Rating(user=6149434, product=17, rating=1.0),
 Rating(user=6149306, product=17, rating=1.0),
 Rating(user=6149487, product=17, rating=1.0),
 Rating(user=6149972, product=17, rating=1.0),
 Rating(user=6150546, product=16, rating=1.0)]

In [42]:
# Split the ratings with weights 0.7 / 0.3 into train and test; an explicit
# seed (7856) is passed so the split can be reproduced.
train, test = ratings_rdd.randomSplit(weights=[0.7, 0.3], seed=7856)

In [43]:
# Preview the first ten training records.
train.take(10)

[Rating(user=3893506, product=5, rating=1.0),
 Rating(user=3793254, product=5, rating=1.0),
 Rating(user=6148794, product=5, rating=1.0),
 Rating(user=6149220, product=17, rating=1.0),
 Rating(user=6149434, product=17, rating=1.0),
 Rating(user=6149306, product=17, rating=1.0),
 Rating(user=6149487, product=17, rating=1.0),
 Rating(user=6149972, product=17, rating=1.0),
 Rating(user=6150546, product=16, rating=1.0),
 Rating(user=6150881, product=5, rating=1.0)]

In [44]:
# Preview the first ten held-out records.
test.take(10)

[Rating(user=6149432, product=17, rating=1.0),
 Rating(user=6150640, product=5, rating=1.0),
 Rating(user=6150741, product=5, rating=1.0),
 Rating(user=6151910, product=5, rating=1.0),
 Rating(user=6149278, product=0, rating=1.0),
 Rating(user=6152619, product=17, rating=1.0),
 Rating(user=6152605, product=15, rating=1.0),
 Rating(user=6153514, product=5, rating=1.0),
 Rating(user=6155594, product=16, rating=1.0),
 Rating(user=6156018, product=5, rating=1.0)]

In [45]:
# Size of the training split.
train.count()

3105

In [46]:
# Size of the held-out split.
test.count()

1323

In [47]:
# Mark the training split for caching; the cell echoes the RDD handle.
train.cache()

PythonRDD[273] at RDD at PythonRDD.scala:48

In [48]:
# Mark the held-out split for caching; the cell echoes the RDD handle.
test.cache()

PythonRDD[274] at RDD at PythonRDD.scala:48

In [49]:
# ALS hyper-parameters passed to the training call: factor rank and number of iterations.
rank, numIterations = 5, 10

In [50]:
# Fit the implicit-feedback ALS model on the training split with a fixed seed.
model = ALS.trainImplicit(
    ratings=train,
    rank=rank,
    iterations=numIterations,
    seed=1234,
)

In [51]:
# Show one (product id, factor vector) pair; the vector has `rank` entries in the recorded output.
model.productFeatures().first()

(0,
 array('d', [-0.7588036060333252, 0.04100150614976883, -0.44809016585350037, 0.058412328362464905, -0.01026863045990467]))

In [52]:
# Show one (user id, factor vector) pair.
model.userFeatures().first()

(2942914,
 array('d', [0.31526196002960205, 0.11034972965717316, -0.6776371002197266, -0.9516424536705017, 0.23508934676647186]))

In [53]:
# Ten highest-scoring users for product 5.
model.recommendUsers(5,10)

[Rating(user=6174453, product=5, rating=4.473476219009033),
 Rating(user=6159515, product=5, rating=4.473476219009033),
 Rating(user=6192551, product=5, rating=3.6618448707906364),
 Rating(user=6153473, product=5, rating=3.6618448707906364),
 Rating(user=6164548, product=5, rating=3.6618448707906364),
 Rating(user=6152401, product=5, rating=3.6618448707906364),
 Rating(user=6194356, product=5, rating=3.6618448707906364),
 Rating(user=6196094, product=5, rating=3.6618448707906364),
 Rating(user=6202046, product=5, rating=3.6618448707906364),
 Rating(user=6202123, product=5, rating=3.6618448707906364)]

In [54]:
# Ten highest-scoring products for user 6174453 (first user listed by the previous cell's recorded output).
model.recommendProducts(6174453,10)

[Rating(user=6174453, product=5, rating=4.473476219009033),
 Rating(user=6174453, product=17, rating=0.08132307497178104),
 Rating(user=6174453, product=9, rating=0.0787295478578347),
 Rating(user=6174453, product=0, rating=0.04633956587192367),
 Rating(user=6174453, product=7, rating=0.02058602961345457),
 Rating(user=6174453, product=6, rating=8.989022924973337e-05),
 Rating(user=6174453, product=11, rating=1.3965404940256192e-06),
 Rating(user=6174453, product=13, rating=-1.9798479837181924e-13),
 Rating(user=6174453, product=16, rating=-1.5745973094315711e-09),
 Rating(user=6174453, product=1, rating=-0.00023624591212737314)]

In [55]:
# Ask for up to 25 recommendations per user, then pull the whole result to the driver as a list.
# NOTE(review): only 18 distinct product ids were counted earlier, so 25 exceeds the catalogue size.
_per_user_recs = model.recommendProductsForUsers(25)
user_rec = _per_user_recs.collect()

In [56]:
# Inspect one collected entry: a (user id, tuple of Rating) pair.
user_rec[5]

(6384461,
 (Rating(user=6384461, product=17, rating=0.9748988702702939),
  Rating(user=6384461, product=0, rating=0.0143337025800499),
  Rating(user=6384461, product=5, rating=0.0036510107803227054),
  Rating(user=6384461, product=2, rating=0.0014364013326553936),
  Rating(user=6384461, product=9, rating=0.001389210550876263),
  Rating(user=6384461, product=6, rating=0.0009143814519004381),
  Rating(user=6384461, product=8, rating=0.00061081695245501),
  Rating(user=6384461, product=11, rating=3.1198044105870386e-07),
  Rating(user=6384461, product=13, rating=-4.439715612445885e-14),
  Rating(user=6384461, product=16, rating=-3.532266335910306e-10),
  Rating(user=6384461, product=1, rating=-5.877382141879553e-05),
  Rating(user=6384461, product=10, rating=-6.065337173995103e-05),
  Rating(user=6384461, product=15, rating=-7.060672346943537e-05),
  Rating(user=6384461, product=14, rating=-0.00016355757940065717),
  Rating(user=6384461, product=3, rating=-0.00017533257695423856),
  Ratin

In [57]:
# Score a single (user, product) pair; the recorded value equals the top entry shown for this user in user_rec[5].
model.predict(6384461, 17)

0.9748988702702939

In [58]:
# Strip the rating so only (user, product) pairs remain as prediction input.
pred_input = train.map(lambda rec: (rec.user, rec.product))

In [59]:
# Preview the first five (user, product) pairs.
pred_input.take(5)

[(3893506, 5), (3793254, 5), (6148794, 5), (6149220, 17), (6149434, 17)]

In [60]:
# Score every training pair, then drop repeated predictions.
_train_scored = model.predictAll(pred_input)
pred = _train_scored.distinct()

In [61]:
# Number of distinct predictions; the recorded value is lower than train.count(),
# presumably because the training split repeats some (user, product) pairs - TODO confirm.
pred.count()

2449

In [62]:
# Preview ten predicted Rating records for the training pairs.
pred.take(10)

[Rating(user=6318969, product=13, rating=4.669173131429221e-24),
 Rating(user=6259231, product=7, rating=0.9781195515862344),
 Rating(user=6303430, product=5, rating=0.984296331591888),
 Rating(user=6343733, product=12, rating=0.4742517647434568),
 Rating(user=6284504, product=17, rating=0.9748988702702939),
 Rating(user=6328893, product=5, rating=0.984296331591888),
 Rating(user=6382938, product=9, rating=0.9782879078708778),
 Rating(user=6205543, product=5, rating=0.984296331591888),
 Rating(user=6342766, product=5, rating=0.984296331591888),
 Rating(user=6260622, product=9, rating=0.9782879078708778)]

In [63]:
# Key each training record by its (user, product) pair so it can be joined with predictions.
true_reorg = train.map(lambda rec: ((rec.user, rec.product), rec.rating))

In [64]:
# Preview the first five ((user, product), rating) pairs.
true_reorg.take(5)

[((3893506, 5), 1.0),
 ((3793254, 5), 1.0),
 ((6148794, 5), 1.0),
 ((6149220, 17), 1.0),
 ((6149434, 17), 1.0)]

In [65]:
# Key each training-set prediction by its (user, product) pair, mirroring true_reorg.
pred_reorg = pred.map(lambda rec: ((rec.user, rec.product), rec.rating))

In [66]:
# Preview the first five keyed predictions.
pred_reorg.take(5)

[((6318969, 13), 4.669173131429221e-24),
 ((6259231, 7), 0.9781195515862344),
 ((6303430, 5), 0.984296331591888),
 ((6343733, 12), 0.4742517647434568),
 ((6284504, 17), 0.9748988702702939)]

In [67]:
# Pair every observed rating with its prediction on the shared key, then drop repeated rows.
_train_joined = true_reorg.join(pred_reorg)
true_pred = _train_joined.distinct()

In [68]:
# Number of distinct (key, (actual, predicted)) rows used for the training error.
true_pred.count()

2449

In [69]:
# Preview ten ((user, product), (actual, predicted)) rows.
true_pred.take(10)

[((6204582, 7), (1.0, 0.9781195515862344)),
 ((6261683, 7), (1.0, 2.7658980805433924)),
 ((6177454, 17), (1.0, 1.8851355613289007)),
 ((6220949, 9), (1.0, 3.5815742387368985)),
 ((6202728, 5), (1.0, 0.984296331591888)),
 ((6216098, 12), (1.0, 0.4742517647434568)),
 ((6300732, 17), (1.0, 0.9748988702702939)),
 ((6302075, 5), (1.0, 1.920350452564406)),
 ((6200264, 17), (1.0, 0.9748988702702939)),
 ((6209960, 17), (1.0, 2.737033367185215))]

In [70]:
from math import sqrt

In [71]:
def _train_squared_error(joined):
    """Squared difference between the actual and predicted value of one joined row."""
    actual, predicted = joined[1]
    return (actual - predicted) ** 2


# Mean squared error over the training pairs.
# NOTE(review): the model was fitted with ALS.trainImplicit - confirm that an
# error against the raw rating values is the intended metric.
MSE = true_pred.map(_train_squared_error).mean()

In [72]:
# Root of the training mean squared error.
RMSE = sqrt(MSE)

In [73]:
# Display the training RMSE.
RMSE

0.6641876244510859

In [74]:
# Strip the rating from the held-out records, leaving (user, product) pairs to score.
test_input = test.map(lambda rec: (rec.user, rec.product))

In [75]:
# Preview the first five held-out (user, product) pairs.
test_input.take(5)

[(6149432, 17), (6150640, 5), (6150741, 5), (6151910, 5), (6149278, 0)]

In [76]:
# Score every held-out pair, then drop repeated predictions.
_test_scored = model.predictAll(test_input)
pred_test = _test_scored.distinct()

In [77]:
# Preview the first five predictions for held-out pairs.
pred_test.take(5)

[Rating(user=6208598, product=7, rating=0.9781195515862344),
 Rating(user=6320790, product=17, rating=0.9748988702702939),
 Rating(user=6235612, product=9, rating=0.9782879078708778),
 Rating(user=6195233, product=17, rating=1.8851355613289007),
 Rating(user=6205543, product=5, rating=0.984296331591888)]

In [78]:
# Number of distinct held-out predictions. The recorded value is far below
# test.count(); presumably pairs whose user or product never appeared in
# training receive no prediction (plus de-duplication) - TODO confirm.
pred_test.count()

458

In [79]:
# Key each held-out record by its (user, product) pair so it can be joined with predictions.
test_reorg = test.map(lambda rec: ((rec.user, rec.product), rec.rating))

In [80]:
# Preview the first five held-out ((user, product), rating) pairs.
test_reorg.take(5)

[((6149432, 17), 1.0),
 ((6150640, 5), 1.0),
 ((6150741, 5), 1.0),
 ((6151910, 5), 1.0),
 ((6149278, 0), 1.0)]

In [81]:
# Key each held-out prediction by its (user, product) pair.
# NOTE(review): this rebinds pred_reorg, which earlier held the training-set
# predictions; re-running the training join after this cell would silently
# use the held-out predictions instead.
pred_reorg = pred_test.map(lambda rec: ((rec.user, rec.product), rec.rating))

In [82]:
# Preview the first five keyed held-out predictions.
pred_reorg.take(5)

[((6208598, 7), 0.9781195515862344),
 ((6320790, 17), 0.9748988702702939),
 ((6235612, 9), 0.9782879078708778),
 ((6195233, 17), 1.8851355613289007),
 ((6205543, 5), 0.984296331591888)]

In [83]:
# Pair every held-out rating with its prediction on the shared key, then drop repeated rows.
_test_joined = test_reorg.join(pred_reorg)
test_pred = _test_joined.distinct()

In [84]:
# Preview five ((user, product), (actual, predicted)) rows from the held-out join.
test_pred.take(5)

[((6312532, 5), (1.0, 0.984296331591888)),
 ((6312402, 5), (1.0, 0.984296331591888)),
 ((6202728, 5), (1.0, 0.984296331591888)),
 ((6230432, 17), (1.0, 0.9748988702702939)),
 ((6310794, 5), (1.0, 0.984296331591888))]

In [85]:
# Number of held-out rows that actually enter the test error below.
test_pred.count()

458

In [86]:
def _test_squared_error(joined):
    """Squared difference between the actual and predicted value of one joined row."""
    actual, predicted = joined[1]
    return (actual - predicted) ** 2


# Mean squared error over the held-out pairs that received a prediction.
test_MSE = test_pred.map(_test_squared_error).mean()

In [87]:
# Root of the held-out mean squared error.
test_RMSE = sqrt(test_MSE)

In [88]:
# Display the held-out RMSE.
test_RMSE

0.8099566852076174