In [1]:
import os
import sys

# Local import keeps this cell self-contained; glob is only needed for the py4j lookup.
import glob

# Root of the local Spark installation; SPARK_HOME and HADOOP_HOME both point at it.
spark_path = "/opt/spark/"

os.environ['SPARK_HOME'] = spark_path
os.environ['HADOOP_HOME'] = spark_path

# Build every entry with os.path.join so the trailing slash on spark_path
# does not leak "//" into the resulting paths.
spark_lib_path = os.path.join(spark_path, "python", "lib")

# Use whichever py4j source zip ships with this Spark build; fall back to the
# 0.9 file name when nothing matches (e.g. the directory is not readable yet).
py4j_zips = sorted(glob.glob(os.path.join(spark_lib_path, "py4j-*-src.zip")))
if not py4j_zips:
    py4j_zips = [os.path.join(spark_lib_path, "py4j-0.9-src.zip")]

spark_sys_paths = [
    os.path.join(spark_path, "bin"),
    os.path.join(spark_path, "python"),
    os.path.join(spark_path, "python", "pyspark"),
    spark_lib_path,
    os.path.join(spark_lib_path, "pyspark.zip"),
] + py4j_zips

# Append only missing entries so re-running the cell does not keep growing sys.path.
for entry in spark_sys_paths:
    if entry not in sys.path:
        sys.path.append(entry)

from pyspark import SparkContext
from pyspark import SparkConf

# Reuse the already-running context when this cell is executed again; building a
# second SparkContext in the same process raises an error. Master and app name
# are the same "local" / "test" values as before.
sc = SparkContext.getOrCreate(SparkConf().setMaster("local").setAppName("test"))

In [2]:
# Display the context object to confirm it was created.
sc

<pyspark.context.SparkContext at 0x7f19403f3110>

In [5]:
# Load the raw transaction file as an RDD of text lines.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
data = sc.textFile("/home/ubuntu/sql_trans_data/prod_int")

In [6]:
# Peek at the first raw line to see the comma-separated layout.
data.first()

u'0,18,6093827,1'

In [7]:
# Total number of lines in the input.
data.count()

12817

In [8]:
# Break each raw line into its comma-separated string fields.
clean_data = data.map(lambda line: line.split(','))

In [9]:
# Sample five split rows; each row is a list of four string fields.
clean_data.take(5)

[[u'0', u'18', u'6093827', u'1'],
 [u'1', u'18', u'110380', u'1'],
 [u'2', u'18', u'131057', u'1'],
 [u'3', u'18', u'133977', u'1'],
 [u'4', u'18', u'171879', u'1']]

In [10]:
# Fourth field (index 3) of every row, parsed as an integer rating.
rate = clean_data.map(lambda fields: int(fields[3]))

In [11]:
# Sample five parsed rating values.
rate.take(5)

[1, 1, 1, 1, 1]

In [12]:
# Average rating over all rows.
# NOTE(review): the recorded result is exactly 1.0 and the sampled values are all 1,
# which suggests every rating is 1 (implicit feedback) - confirm before relying on RMSE.
rate.mean()

1.0

In [13]:
# Third field (index 2) of every row as an integer; this is the column later used as the Rating user.
users = clean_data.map(lambda fields: int(fields[2]))

In [14]:
# Number of distinct user ids in the data.
users.distinct().count()

1042

In [15]:
# Second field (index 1) of every row as an integer; this is the column later used as the Rating product.
prod = clean_data.map(lambda fields: int(fields[1]))

In [16]:
# Number of distinct product ids in the data.
prod.distinct().count()

22

In [17]:
from pyspark.mllib.recommendation import ALS,MatrixFactorizationModel, Rating

In [18]:
# Same comma-split rows as `clean_data` (built from `data` with an identical map),
# so reuse that RDD instead of defining the transformation a second time.
mls = clean_data

In [19]:
# Turn each split row into a Rating: field 2 is the user, field 1 the product,
# field 3 the rating value.
ratings = mls.map(
    lambda fields: Rating(
        user=int(fields[2]),
        product=int(fields[1]),
        rating=float(fields[3]),
    )
)

In [20]:
# Sample five Rating records to check the field mapping.
ratings.take(5)

[Rating(user=6093827, product=18, rating=1.0),
 Rating(user=110380, product=18, rating=1.0),
 Rating(user=131057, product=18, rating=1.0),
 Rating(user=133977, product=18, rating=1.0),
 Rating(user=171879, product=18, rating=1.0)]

In [21]:
# 70/30 random split into training and test sets; the seed keeps the split repeatable.
train, test = ratings.randomSplit(weights=[0.7, 0.3], seed=7856)

In [22]:
# Size of the training split.
train.count()

8952

In [23]:
# Size of the test split.
test.count() 

3865

In [24]:
# Mark the training split for caching, since it is reused for fitting and evaluation.
# NOTE(review): this runs after train.count(), so that count was computed uncached.
train.cache()

PythonRDD[23] at RDD at PythonRDD.scala:48

In [25]:
# Mark the test split for caching.
# NOTE(review): this runs after test.count(), so that count was computed uncached.
test.cache()

PythonRDD[24] at RDD at PythonRDD.scala:48

In [26]:
# Number of latent factors; passed as the rank argument to ALS.train.
rank = 5

In [27]:
# Number of ALS iterations; passed as the iterations argument to ALS.train.
numIterations = 10

In [28]:
# Fit the ALS matrix factorisation on the training split.
# A fixed seed makes the fitted factors repeatable across kernel restarts;
# the value matches the one used for randomSplit.
# NOTE(review): the recorded rating mean is exactly 1.0, so the ratings look like
# implicit feedback; ALS.trainImplicit may be the more appropriate trainer - confirm.
model = ALS.train(train, rank, numIterations, seed=7856)

In [29]:
# Show one (product id, latent factor array) pair from the fitted model.
model.productFeatures().first()

(0,
 array('d', [-0.31673702597618103, -0.1078350841999054, 0.29361799359321594, 0.42980921268463135, 0.429941326379776]))

In [30]:
# Show one (user id, latent factor array) pair from the fitted model.
model.userFeatures().first()

(9880,
 array('d', [0.48689135909080505, -0.026064211502671242, 0.4611184298992157, 0.4464118182659149, 0.7518187761306763]))

In [31]:
# Ten highest-scoring users for product 18.
model.recommendUsers(18,10)

[Rating(user=110380, product=18, rating=0.9704080664878593),
 Rating(user=5273747, product=18, rating=0.9678817949454945),
 Rating(user=33393, product=18, rating=0.9673271994421575),
 Rating(user=46643, product=18, rating=0.9673271994421575),
 Rating(user=33203, product=18, rating=0.9673271994421575),
 Rating(user=36843, product=18, rating=0.9673271994421575),
 Rating(user=35703, product=18, rating=0.9673271994421575),
 Rating(user=36572, product=18, rating=0.9673271994421575),
 Rating(user=38782, product=18, rating=0.9673271994421575),
 Rating(user=17094, product=18, rating=0.9673271994421575)]

In [32]:
# Ten highest-scoring products for user 6093827.
model.recommendProducts(6093827,10)

[Rating(user=6093827, product=9, rating=1.647656703977314),
 Rating(user=6093827, product=8, rating=1.171318006708086),
 Rating(user=6093827, product=16, rating=1.0967139880786374),
 Rating(user=6093827, product=0, rating=1.0159458848591814),
 Rating(user=6093827, product=2, rating=1.0149173950354147),
 Rating(user=6093827, product=1, rating=0.9966455236189922),
 Rating(user=6093827, product=18, rating=0.9673271994421575),
 Rating(user=6093827, product=20, rating=0.9637151197737379),
 Rating(user=6093827, product=6, rating=0.9568025980664192),
 Rating(user=6093827, product=3, rating=0.9566818646616697)]

In [33]:
# Predicted rating for the single pair (user 6093827, product 18).
model.predict(6093827, 18)

0.9673271994421575

In [34]:
# (user, product) pairs from the training ratings, in the form predictAll expects.
pred_input = train.map(lambda r: (r.user, r.product))

In [35]:
# Model predictions for every training (user, product) pair.
pred = model.predictAll(pred_input) 

In [36]:
# Key the observed training ratings by (user, product) so they can be joined.
true_reorg = train.map(lambda r: ((r.user, r.product), r.rating))

In [37]:
# Key the training-set predictions by (user, product), mirroring the observed side.
pred_reorg = pred.map(lambda p: ((p.user, p.product), p.rating))


In [38]:
# Pair each observed training rating with its prediction, keyed by (user, product).
# The prediction side is derived from `pred` right here rather than read from
# `pred_reorg`: that name is rebound to the test-set predictions further down, so
# re-running this cell afterwards would otherwise join train truth with test output.
true_pred = true_reorg.join(pred.map(lambda p: ((p.user, p.product), p.rating)))

In [39]:
from math import sqrt

In [40]:
# Mean squared error on the training split: each joined value is an
# (observed, predicted) pair.
squared_errors = true_pred.values().map(lambda pair: (pair[0] - pair[1]) ** 2)
MSE = squared_errors.mean()

In [41]:
# Root mean squared error on the training split.
RMSE = sqrt(MSE)

In [42]:
# Display the training RMSE.
RMSE

0.029285067207448962

In [43]:
# (user, product) pairs from the held-out ratings, in the form predictAll expects.
test_input = test.map(lambda r: (r.user, r.product))

In [44]:
# Model predictions for every test (user, product) pair.
pred_test = model.predictAll(test_input)

In [45]:
# Key the observed test ratings by (user, product) so they can be joined.
test_reorg = test.map(lambda r: ((r.user, r.product), r.rating))

In [46]:
# Key the test-set predictions by (user, product).
# NOTE(review): this rebinds `pred_reorg`, which an earlier cell assigned from the
# training predictions; a distinct name would avoid out-of-order re-run surprises.
pred_reorg = pred_test.map(lambda x:((x[0],x[1]), x[2]))

In [47]:
# Pair each observed test rating with its prediction, keyed by (user, product).
# The prediction side is derived from `pred_test` right here rather than read from
# `pred_reorg`: that name is also assigned from the training predictions earlier,
# so the join must not depend on which cell ran last.
test_pred = test_reorg.join(pred_test.map(lambda p: ((p.user, p.product), p.rating)))

In [48]:
# Mean squared error on the test split: each joined value is an
# (observed, predicted) pair.
test_squared_errors = test_pred.values().map(lambda pair: (pair[0] - pair[1]) ** 2)
test_MSE = test_squared_errors.mean()

In [49]:
# Root mean squared error on the test split.
test_RMSE = sqrt(test_MSE)

In [50]:
# Display the test RMSE for comparison with the training RMSE.
test_RMSE

0.030440131728224722