In [1]:
import os
import sys

# Root of the local Spark installation; exported as both SPARK_HOME and HADOOP_HOME.
spark_path = "/opt/spark/"

os.environ['SPARK_HOME'] = spark_path
os.environ['HADOOP_HOME'] = spark_path

# Locations under spark_path that are put on sys.path so PySpark can be imported.
# The py4j archive name is tied to the installed Spark release.
_spark_sys_path_entries = [
    "bin",
    "python",
    "python/pyspark",
    "python/lib",
    "python/lib/pyspark.zip",
    "python/lib/py4j-0.10.3-src.zip",
]

for _relative in _spark_sys_path_entries:
    # os.path.join avoids the "//" produced by concatenating onto a base that
    # already ends in "/"; the membership test keeps re-runs from duplicating entries.
    _entry = os.path.join(spark_path, _relative)
    if _entry not in sys.path:
        sys.path.append(_entry)

In [2]:
from pyspark import SparkContext
from pyspark import SparkConf

In [3]:
# Reuse the already-running context when this cell is executed again:
# constructing a second SparkContext in the same kernel raises an error.
sc = SparkContext.getOrCreate(SparkConf().setMaster("local").setAppName("test"))

In [4]:
# Display the context object to confirm it was created.
sc

<pyspark.context.SparkContext at 0x7fc32b6db750>

In [5]:
# Read the input file as an RDD of text lines.
client_data_file = '/home/ubuntu/client_demo/client_data_cat'
data = sc.textFile(client_data_file)

In [6]:
# Peek at the first five raw lines.
data.take(5)

[u'0,7,5,1', u'1,6,5,1', u'2,10,5,1', u'3,15,17,1', u'4,21,17,1']

In [7]:
# Total number of lines in the input.
data.count()

4428

In [8]:
# Split every comma-separated line into its list of string fields.
clean_data = data.map(lambda line: line.split(','))

In [9]:
# Peek at the first five split records.
clean_data.take(5)

[[u'0', u'7', u'5', u'1'],
 [u'1', u'6', u'5', u'1'],
 [u'2', u'10', u'5', u'1'],
 [u'3', u'15', u'17', u'1'],
 [u'4', u'21', u'17', u'1']]

In [10]:
# Field at index 3 of every record, parsed as an integer.
ratings = clean_data.map(lambda fields: int(fields[3]))

In [11]:
# Peek at the first five parsed values from field 3.
ratings.take(5)

[1, 1, 1, 1, 1]

In [12]:
# Field at index 1 of every record, parsed as an integer.
cust_id = clean_data.map(lambda fields: int(fields[1]))

In [13]:
# Peek at the first five parsed values from field 1.
cust_id.take(5)

[7, 6, 10, 15, 21]

In [14]:
# Number of distinct values found in field 1.
cust_id.distinct().count()

3087

In [15]:
# Field at index 2 of every record, parsed as an integer.
product_id = clean_data.map(lambda fields: int(fields[2]))

In [16]:
# Peek at the first five parsed values from field 2.
product_id.take(5)

[5, 5, 5, 17, 17]

In [17]:
# Number of distinct values found in field 2.
product_id.distinct().count()

18

In [18]:
from pyspark.mllib.recommendation import ALS,MatrixFactorizationModel, Rating

In [19]:
# Split each raw line on commas (same transformation that produced clean_data).
pkr = data.map(lambda line: line.split(','))

In [20]:
# Build MLlib Rating records from fields 1 (user), 2 (product) and 3 (rating).
ratings_rdd = pkr.map(
    lambda fields: Rating(
        user=int(fields[1]),
        product=int(fields[2]),
        rating=float(fields[3]),
    )
)

In [21]:
# Peek at the first ten Rating records.
ratings_rdd.take(10)

[Rating(user=7, product=5, rating=1.0),
 Rating(user=6, product=5, rating=1.0),
 Rating(user=10, product=5, rating=1.0),
 Rating(user=15, product=17, rating=1.0),
 Rating(user=21, product=17, rating=1.0),
 Rating(user=22, product=17, rating=1.0),
 Rating(user=18, product=17, rating=1.0),
 Rating(user=24, product=17, rating=1.0),
 Rating(user=28, product=17, rating=1.0),
 Rating(user=30, product=16, rating=1.0)]

In [22]:
# Random 70/30 split of the ratings; the fixed seed makes the split repeatable.
train, test = ratings_rdd.randomSplit([0.7, 0.3], seed=7856)

In [23]:
# Peek at the first ten records of the training split.
train.take(10)

[Rating(user=7, product=5, rating=1.0),
 Rating(user=6, product=5, rating=1.0),
 Rating(user=10, product=5, rating=1.0),
 Rating(user=15, product=17, rating=1.0),
 Rating(user=22, product=17, rating=1.0),
 Rating(user=18, product=17, rating=1.0),
 Rating(user=24, product=17, rating=1.0),
 Rating(user=28, product=17, rating=1.0),
 Rating(user=30, product=16, rating=1.0),
 Rating(user=36, product=5, rating=1.0)]

In [24]:
# Peek at the first ten records of the test split.
test.take(10)

[Rating(user=21, product=17, rating=1.0),
 Rating(user=32, product=5, rating=1.0),
 Rating(user=33, product=5, rating=1.0),
 Rating(user=50, product=5, rating=1.0),
 Rating(user=16, product=0, rating=1.0),
 Rating(user=60, product=17, rating=1.0),
 Rating(user=59, product=15, rating=1.0),
 Rating(user=74, product=5, rating=1.0),
 Rating(user=102, product=16, rating=1.0),
 Rating(user=113, product=5, rating=1.0)]

In [25]:
# Number of records in the training split.
train.count()

3105

In [26]:
# Number of records in the test split.
test.count()

1323

In [27]:
# Ask Spark to cache the training split so later cells can reuse it.
train.cache()

PythonRDD[23] at RDD at PythonRDD.scala:48

In [28]:
# Ask Spark to cache the test split so later cells can reuse it.
test.cache()

PythonRDD[24] at RDD at PythonRDD.scala:48

In [29]:
# ALS hyperparameters: number of latent factors and number of iterations.
rank, numIterations = 5, 10

In [30]:
# Fit an implicit-feedback ALS model on the training split (seeded for repeatability).
model = ALS.trainImplicit(
    train,
    rank,
    iterations=numIterations,
    seed=1234,
)

In [31]:
# Inspect the first (id, factor vector) entry of the learned product features.
model.productFeatures().first()

(0,
 array('d', [-0.7588036060333252, 0.04100150614976883, -0.44809016585350037, 0.058412328362464905, -0.01026863045990467]))

In [32]:
# Inspect the first (id, factor vector) entry of the learned user features.
model.userFeatures().first()

(0,
 array('d', [0.31526196002960205, 0.11034972965717316, -0.6776371002197266, -0.9516424536705017, 0.23508934676647186]))

In [33]:
# Top 10 users recommended for product 5.
model.recommendUsers(5,10)

[Rating(user=364, product=5, rating=4.473476219009033),
 Rating(user=155, product=5, rating=4.473476219009033),
 Rating(user=611, product=5, rating=3.6618448707906364),
 Rating(user=73, product=5, rating=3.6618448707906364),
 Rating(user=216, product=5, rating=3.6618448707906364),
 Rating(user=56, product=5, rating=3.6618448707906364),
 Rating(user=627, product=5, rating=3.6618448707906364),
 Rating(user=644, product=5, rating=3.6618448707906364),
 Rating(user=696, product=5, rating=3.6618448707906364),
 Rating(user=698, product=5, rating=3.6618448707906364)]

In [34]:
# Top 10 products recommended for user 364.
model.recommendProducts(364,10)

[Rating(user=364, product=5, rating=4.473476219009033),
 Rating(user=364, product=17, rating=0.08132307497178104),
 Rating(user=364, product=9, rating=0.0787295478578347),
 Rating(user=364, product=0, rating=0.04633956587192367),
 Rating(user=364, product=7, rating=0.02058602961345457),
 Rating(user=364, product=6, rating=8.989022924973337e-05),
 Rating(user=364, product=11, rating=1.3965404940256192e-06),
 Rating(user=364, product=13, rating=-1.9798479837181924e-13),
 Rating(user=364, product=16, rating=-1.5745973094315711e-09),
 Rating(user=364, product=1, rating=-0.00023624591212737314)]

In [41]:
# Up to 20 product recommendations per user; the result is an RDD, not a list.
user_rec = model.recommendProductsForUsers(num=20)

In [42]:
# Number of users that received recommendations.
user_rec.count()

2370

In [43]:
# Up to 10 user recommendations per product; the result is an RDD, not a list.
prod_rec = model.recommendUsersForProducts(num=10)

In [44]:
# Number of products that received user recommendations.
prod_rec.count()

17

In [None]:
# Predicted score for the single pair (user 2493, product 17).
model.predict(2493, 17)

In [None]:
# (user, product) pairs of the training split, used as input for prediction.
pred_input = train.map(lambda r: (r.user, r.product))

In [None]:
# Peek at the first five (user, product) pairs.
pred_input.take(5)

In [None]:
# Score every training (user, product) pair and drop duplicate predictions.
pred = (
    model
    .predictAll(pred_input)
    .distinct()
)

In [None]:
# Number of distinct predictions for the training pairs.
pred.count()

In [None]:
# Peek at the first ten training-set predictions.
pred.take(10)

In [None]:
# Key the observed training ratings by (user, product) so they can be joined.
true_reorg = train.map(lambda r: ((r.user, r.product), r.rating))

In [None]:
# Peek at the first five keyed training ratings.
true_reorg.take(5)

In [None]:
# Key the training-set predictions by (user, product) so they can be joined.
pred_reorg = pred.map(lambda p: ((p.user, p.product), p.rating))

In [None]:
# Peek at the first five keyed predictions.
pred_reorg.take(5)

In [None]:
# Pair each observed rating with its prediction: ((user, product), (true, predicted)).
true_pred = (
    true_reorg
    .join(pred_reorg)
    .distinct()
)

In [None]:
# Number of distinct (observed, predicted) pairs on the training split.
true_pred.count()

In [None]:
# Peek at the first ten joined (observed, predicted) records.
true_pred.take(10)

In [None]:
from math import sqrt

In [None]:
# Mean squared difference between observed and predicted values on the training split.
MSE = (
    true_pred
    .values()
    .map(lambda observed_predicted: (observed_predicted[0] - observed_predicted[1])**2)
    .mean()
)

In [None]:
# Root of the training-split mean squared error.
RMSE = sqrt(MSE)

In [None]:
# Display the training-split RMSE.
RMSE

In [None]:
# (user, product) pairs of the held-out split, used as input for prediction.
test_input = test.map(lambda r: (r.user, r.product))

In [None]:
# Peek at the first five held-out (user, product) pairs.
test_input.take(5)

In [None]:
# Score every held-out (user, product) pair and drop duplicate predictions.
pred_test = (
    model
    .predictAll(test_input)
    .distinct()
)

In [None]:
# Peek at the first five held-out predictions.
pred_test.take(5)

In [None]:
# Number of distinct predictions for the held-out pairs.
pred_test.count()

In [None]:
# Key the observed held-out ratings by (user, product) so they can be joined.
test_reorg = test.map(lambda r: ((r.user, r.product), r.rating))

In [None]:
# Peek at the first five keyed held-out ratings.
test_reorg.take(5)

In [None]:
# Key the held-out predictions by (user, product).
# NOTE: this rebinds pred_reorg, which earlier held the training-set predictions.
pred_reorg = pred_test.map(lambda p: ((p.user, p.product), p.rating))

In [None]:
# Peek at the first five keyed predictions currently bound to pred_reorg.
pred_reorg.take(5)

In [None]:
# Pair each held-out rating with its prediction: ((user, product), (true, predicted)).
test_pred = (
    test_reorg
    .join(pred_reorg)
    .distinct()
)

In [None]:
# Peek at the first five joined (observed, predicted) held-out records.
test_pred.take(5)

In [None]:
# Number of distinct (observed, predicted) pairs on the held-out split.
test_pred.count()

In [None]:
# Mean squared difference between observed and predicted values on the held-out split.
test_MSE = (
    test_pred
    .values()
    .map(lambda observed_predicted: (observed_predicted[0] - observed_predicted[1])**2)
    .mean()
)

In [None]:
# Root of the held-out mean squared error.
test_RMSE = sqrt(test_MSE)

In [None]:
# Display the held-out RMSE.
test_RMSE

In [None]:
# user_rec is an RDD, which does not support len(); count() returns its size.
user_rec.count()

In [None]:
# user_rec is an RDD and cannot be indexed; first() fetches its first
# (user, recommendations) record.
user_rec.first()

In [None]:
# user_rec is an RDD and cannot be indexed; fetch the first record with first(),
# then take the (user, product, rating) triple of its recommendation at index 16.
user_rec.first()[1][16][0:3]

In [None]:
# Start with an empty list that will hold flattened recommendation rows.
rec = list()

In [None]:
# Flatten the per-user recommendations into [user, product, rating] rows.
# user_rec is an RDD, so it supports neither len() nor indexing: collect() brings
# the (user, recommendations) records to the driver first.
# Slicing to the first 17 entries keeps the original per-user cap without raising
# IndexError for users that have fewer recommendations.
# rec is rebuilt from scratch so re-running this cell does not duplicate rows.
rec = [
    list(recommendation[0:3])
    for _user, recommendations in user_rec.collect()
    for recommendation in recommendations[:17]
]

In [None]:
# Inspect the third flattened recommendation row.
rec[2]

In [None]:
# Total number of flattened recommendation rows.
len(rec)