In [1]:
import os
import sys

# Root of the Spark installation. A SPARK_HOME that is already exported takes
# precedence, so the notebook also runs where Spark is not under /opt/spark.
spark_path = os.environ.get('SPARK_HOME') or "/opt/spark/"

# Both variables point at the same directory, as in the original setup.
os.environ['SPARK_HOME'] = spark_path
os.environ['HADOOP_HOME'] = spark_path

# Join path components instead of concatenating strings: the default root ends
# with "/", so plain concatenation produced entries such as "/opt/spark//bin".
_spark_root = spark_path.rstrip("/") or "/"
_spark_py_lib = os.path.join(_spark_root, "python", "lib")

# Use whichever py4j source zip ships with this Spark build; fall back to the
# 0.10.3 file name when the lib directory cannot be inspected.
if os.path.isdir(_spark_py_lib):
    _py4j_zips = sorted(
        name for name in os.listdir(_spark_py_lib)
        if name.startswith("py4j-") and name.endswith("-src.zip")
    )
else:
    _py4j_zips = []
if not _py4j_zips:
    _py4j_zips = ["py4j-0.10.3-src.zip"]

_spark_sys_paths = [
    os.path.join(_spark_root, "bin"),
    os.path.join(_spark_root, "python"),
    os.path.join(_spark_root, "python", "pyspark"),
    _spark_py_lib,
    os.path.join(_spark_py_lib, "pyspark.zip"),
] + [os.path.join(_spark_py_lib, name) for name in _py4j_zips]

# Skip entries that are already present so re-running this cell does not keep
# growing sys.path.
for _entry in _spark_sys_paths:
    if _entry not in sys.path:
        sys.path.append(_entry)

In [2]:
from pyspark import SparkContext
from pyspark import SparkConf

In [3]:
# Reuse the running context when this cell is executed again; building a second
# SparkContext in the same kernel fails. Master "local" and app name "test" are
# the same settings the constructor call used.
sc = SparkContext.getOrCreate(SparkConf().setMaster("local").setAppName("test"))

In [4]:
# Display the context object to confirm it was created.
sc

<pyspark.context.SparkContext at 0x7f41f0cc2d10>

In [5]:
# Location of the raw interaction file (comma-separated text, one record per line).
CLIENT_DATA_PATH = '/home/ubuntu/client_demo/client_data'

# Load the file lazily as an RDD of text lines.
data = sc.textFile(CLIENT_DATA_PATH)

In [6]:
# Peek at the first five raw lines to check the comma-separated layout.
data.take(5)

[u'0,3893506,5,1',
 u'1,3793254,5,1',
 u'2,6148794,5,1',
 u'3,6149220,17,1',
 u'4,6149432,17,1']

In [7]:
# Total number of lines (records) in the input file.
data.count()

4428

In [8]:
# Break every raw line into its list of comma-separated string fields.
clean_data = data.map(lambda line: line.split(','))

In [9]:
# Show the first five split records; each is a list of four string fields.
clean_data.take(5)

[[u'0', u'3893506', u'5', u'1'],
 [u'1', u'3793254', u'5', u'1'],
 [u'2', u'6148794', u'5', u'1'],
 [u'3', u'6149220', u'17', u'1'],
 [u'4', u'6149432', u'17', u'1']]

In [10]:
# Fourth field (index 3) of every record, converted to an integer.
ratings = clean_data.map(lambda fields: int(fields[3]))

In [11]:
# Sample the first five parsed rating values.
ratings.take(5)

[1, 1, 1, 1, 1]

In [12]:
# Second field (index 1) of every record, converted to an integer customer id.
cust_id = clean_data.map(lambda fields: int(fields[1]))

In [13]:
# Sample the first five customer ids.
cust_id.take(5)

[3893506, 3793254, 6148794, 6149220, 6149432]

In [14]:
# Number of unique customers in the data set.
cust_id.distinct().count()

3087

In [15]:
# Third field (index 2) of every record, converted to an integer product id.
product_id = clean_data.map(lambda fields: int(fields[2]))

In [16]:
# Sample the first five product ids.
product_id.take(5)

[5, 5, 5, 17, 17]

In [17]:
# Number of unique products in the data set.
product_id.distinct().count()

18

In [18]:
from pyspark.mllib.recommendation import ALS,MatrixFactorizationModel, Rating

In [19]:
# Split the raw lines on commas again (the same split that builds clean_data);
# the resulting field lists are the input for constructing Rating objects.
pkr = data.map(lambda row: row.split(','))

In [20]:
# Turn each field list into an MLlib Rating: field 1 is the user id, field 2
# the product id and field 3 the rating value (stored as a float).
ratings_rdd = pkr.map(
    lambda fields: Rating(
        user=int(fields[1]),
        product=int(fields[2]),
        rating=float(fields[3]),
    )
)

In [21]:
# Inspect the first ten Rating objects.
ratings_rdd.take(10)

[Rating(user=3893506, product=5, rating=1.0),
 Rating(user=3793254, product=5, rating=1.0),
 Rating(user=6148794, product=5, rating=1.0),
 Rating(user=6149220, product=17, rating=1.0),
 Rating(user=6149432, product=17, rating=1.0),
 Rating(user=6149434, product=17, rating=1.0),
 Rating(user=6149306, product=17, rating=1.0),
 Rating(user=6149487, product=17, rating=1.0),
 Rating(user=6149972, product=17, rating=1.0),
 Rating(user=6150546, product=16, rating=1.0)]

In [22]:
# Random 70/30 split into training and hold-out sets; the fixed seed keeps the
# split the same from run to run.
train, test = ratings_rdd.randomSplit(weights=[0.7, 0.3], seed=7856)

In [23]:
# Inspect the first ten ratings that landed in the training split.
train.take(10)

[Rating(user=3893506, product=5, rating=1.0),
 Rating(user=3793254, product=5, rating=1.0),
 Rating(user=6148794, product=5, rating=1.0),
 Rating(user=6149220, product=17, rating=1.0),
 Rating(user=6149434, product=17, rating=1.0),
 Rating(user=6149306, product=17, rating=1.0),
 Rating(user=6149487, product=17, rating=1.0),
 Rating(user=6149972, product=17, rating=1.0),
 Rating(user=6150546, product=16, rating=1.0),
 Rating(user=6150881, product=5, rating=1.0)]

In [24]:
# Inspect the first ten ratings that landed in the hold-out split.
test.take(10)

[Rating(user=6149432, product=17, rating=1.0),
 Rating(user=6150640, product=5, rating=1.0),
 Rating(user=6150741, product=5, rating=1.0),
 Rating(user=6151910, product=5, rating=1.0),
 Rating(user=6149278, product=0, rating=1.0),
 Rating(user=6152619, product=17, rating=1.0),
 Rating(user=6152605, product=15, rating=1.0),
 Rating(user=6153514, product=5, rating=1.0),
 Rating(user=6155594, product=16, rating=1.0),
 Rating(user=6156018, product=5, rating=1.0)]

In [25]:
# Size of the training split.
train.count()

3105

In [26]:
# Size of the hold-out split.
test.count()

1323

In [27]:
# Ask Spark to cache the training split; it is reused by model training and by
# the training-set evaluation cells further down.
train.cache()

PythonRDD[23] at RDD at PythonRDD.scala:48

In [28]:
# Ask Spark to cache the hold-out split; it is reused by the test-set
# evaluation cells further down.
test.cache()

PythonRDD[24] at RDD at PythonRDD.scala:48

In [29]:
# ALS hyper-parameters: number of latent factors and number of iterations.
rank, numIterations = 5, 10

In [30]:
# Fit an implicit-feedback ALS matrix factorization on the training split.
# The fixed seed makes the factor initialisation repeatable.
model = ALS.trainImplicit(
    train,
    rank,
    iterations=numIterations,
    seed=1234,
)

In [31]:
# Look at one (product id, latent factor array) pair learned by the model.
model.productFeatures().first()

(0,
 array('d', [-0.7588036060333252, 0.04100150614976883, -0.44809016585350037, 0.058412328362464905, -0.01026863045990467]))

In [None]:
# Look at one (user id, latent factor array) pair learned by the model.
model.userFeatures().first()

In [None]:
# Top 10 users the model recommends for product 5.
model.recommendUsers(5,10)

In [None]:
# Top 10 products the model recommends for user 6174453.
# NOTE(review): assumes this user id is present in the training split - confirm.
model.recommendProducts(6174453,10)

In [32]:
# Ask for up to 25 recommended products per user, then pull the whole result
# to the driver as a Python list.
_per_user_recs = model.recommendProductsForUsers(25)
user_rec = _per_user_recs.collect()

In [33]:
# Inspect one collected entry: a (user id, tuple of Rating objects) pair.
user_rec[5]

(6384461,
 (Rating(user=6384461, product=17, rating=0.9748988702702939),
  Rating(user=6384461, product=0, rating=0.0143337025800499),
  Rating(user=6384461, product=5, rating=0.0036510107803227054),
  Rating(user=6384461, product=2, rating=0.0014364013326553936),
  Rating(user=6384461, product=9, rating=0.001389210550876263),
  Rating(user=6384461, product=6, rating=0.0009143814519004381),
  Rating(user=6384461, product=8, rating=0.00061081695245501),
  Rating(user=6384461, product=11, rating=3.1198044105870386e-07),
  Rating(user=6384461, product=13, rating=-4.439715612445885e-14),
  Rating(user=6384461, product=16, rating=-3.532266335910306e-10),
  Rating(user=6384461, product=1, rating=-5.877382141879553e-05),
  Rating(user=6384461, product=10, rating=-6.065337173995103e-05),
  Rating(user=6384461, product=15, rating=-7.060672346943537e-05),
  Rating(user=6384461, product=14, rating=-0.00016355757940065717),
  Rating(user=6384461, product=3, rating=-0.00017533257695423856),
  Ratin

In [None]:
# Number of users for which recommendations were collected.
len(user_rec)

In [None]:
# Predicted score for a single (user, product) pair: user 6384461, product 17.
model.predict(6384461, 17)

In [None]:
# Reduce every training Rating to its (user, product) pair, which is the input
# shape handed to predictAll below.
pred_input = train.map(lambda r: (r.user, r.product))

In [None]:
# Sample five (user, product) pairs from the training split.
pred_input.take(5)

In [None]:
# Score every training (user, product) pair, then drop repeated results.
_train_scored = model.predictAll(pred_input)
pred = _train_scored.distinct()

In [None]:
# Number of distinct training-set predictions.
pred.count()

In [None]:
# Inspect ten training-set predictions.
pred.take(10)

In [None]:
# Key the observed training ratings by (user, product) so they can be joined
# with the predictions.
true_reorg = train.map(lambda r: ((r.user, r.product), r.rating))

In [None]:
# Sample five ((user, product), observed rating) records.
true_reorg.take(5)

In [None]:
# Key the training-set predictions by (user, product) to match true_reorg.
pred_reorg = pred.map(lambda p: ((p.user, p.product), p.rating))

In [None]:
# Sample five ((user, product), predicted rating) records.
pred_reorg.take(5)

In [None]:
# Pair each observed training rating with its prediction on the shared
# (user, product) key, then remove repeated rows.
_train_joined = true_reorg.join(pred_reorg)
true_pred = _train_joined.distinct()

In [None]:
# Number of distinct (observed, predicted) pairs on the training split.
true_pred.count()

In [None]:
# Inspect ten ((user, product), (observed, predicted)) records.
true_pred.take(10)

In [None]:
from math import sqrt

In [None]:
# Mean squared difference between observed and predicted ratings on the
# training split. Each value is an (observed, predicted) tuple.
_train_sq_err = true_pred.values().map(lambda pair: (pair[0] - pair[1]) ** 2)
MSE = _train_sq_err.mean()

In [None]:
# Root mean squared error on the training split.
RMSE = sqrt(MSE)

In [None]:
# Display the training-set RMSE.
RMSE

In [None]:
# Reduce every hold-out Rating to its (user, product) pair for scoring.
test_input = test.map(lambda r: (r.user, r.product))

In [None]:
# Sample five (user, product) pairs from the hold-out split.
test_input.take(5)

In [None]:
# Score every hold-out (user, product) pair, then drop repeated results.
# NOTE(review): pairs whose user or product never appeared in training
# presumably get no prediction - compare pred_test.count() with test.count().
_test_scored = model.predictAll(test_input)
pred_test = _test_scored.distinct()

In [None]:
# Inspect five hold-out predictions.
pred_test.take(5)

In [None]:
# Number of distinct hold-out predictions.
pred_test.count()

In [None]:
# Key the observed hold-out ratings by (user, product) for the join below.
test_reorg = test.map(lambda r: ((r.user, r.product), r.rating))

In [None]:
# Sample five ((user, product), observed rating) records from the hold-out split.
test_reorg.take(5)

In [None]:
# Key the hold-out predictions by (user, product).
# NOTE(review): this rebinds pred_reorg, which the training-set evaluation also
# assigns; re-running the training join after this cell would pair training
# truths with hold-out predictions.
pred_reorg = pred_test.map(lambda p: ((p.user, p.product), p.rating))

In [None]:
# Sample five ((user, product), predicted rating) records for the hold-out split.
pred_reorg.take(5)

In [None]:
# Pair each observed hold-out rating with its prediction on the shared
# (user, product) key, then remove repeated rows.
_test_joined = test_reorg.join(pred_reorg)
test_pred = _test_joined.distinct()

In [None]:
# Inspect five ((user, product), (observed, predicted)) hold-out records.
test_pred.take(5)

In [None]:
# Number of distinct (observed, predicted) pairs on the hold-out split.
test_pred.count()

In [None]:
# Mean squared difference between observed and predicted ratings on the
# hold-out split. Each value is an (observed, predicted) tuple.
_test_sq_err = test_pred.values().map(lambda pair: (pair[0] - pair[1]) ** 2)
test_MSE = _test_sq_err.mean()

In [None]:
# Root mean squared error on the hold-out split.
test_RMSE = sqrt(test_MSE)

In [None]:
# Display the hold-out RMSE.
test_RMSE