## Connecting Jupyter Notebook with PySpark

In [1]:
import os
import sys

import glob  # stdlib; only needed here to locate the bundled py4j archive

# Root of the local Spark installation. Exported through the environment
# before pyspark is imported further down in this cell.
spark_path = "/opt/spark/"

os.environ['SPARK_HOME'] = spark_path
os.environ['HADOOP_HOME'] = spark_path

# Build every entry with os.path.join so the trailing slash on spark_path
# does not produce "//" in the middle of the resulting paths.
_spark_python = os.path.join(spark_path, "python")
_spark_python_lib = os.path.join(_spark_python, "lib")

_pyspark_paths = [
    os.path.join(spark_path, "bin"),
    _spark_python,
    os.path.join(_spark_python, "pyspark"),
    _spark_python_lib,
    os.path.join(_spark_python_lib, "pyspark.zip"),
]

# The py4j archive name carries a version number that differs between Spark
# releases, so discover it instead of hard-coding one specific version.
_pyspark_paths.extend(
    sorted(glob.glob(os.path.join(_spark_python_lib, "py4j-*-src.zip")))
)

# Skip entries that are already present so re-running this cell does not
# keep growing sys.path with duplicates.
for _path in _pyspark_paths:
    if _path not in sys.path:
        sys.path.append(_path)

from pyspark import SparkContext
from pyspark import SparkConf

# Reuse the running SparkContext if this cell is executed again; calling
# SparkContext() a second time in the same kernel raises because only one
# context may be active at a time.
sc = SparkContext.getOrCreate()

In [2]:
sc

<pyspark.context.SparkContext at 0x7f463ede7f50>

In [3]:
# Raw input: one comma-separated record per line (e.g. '0,18,6093827,1').
prod_int_path = "/home/ubuntu/sql_trans_data/prod_int"
data = sc.textFile(prod_int_path)

In [4]:
data.first()

u'0,18,6093827,1'

In [5]:
data.count()

12817

In [6]:
# Split every CSV line into its list of string fields.
clean_data = data.map(lambda line: line.split(','))

In [7]:
clean_data.take(5)

[[u'0', u'18', u'6093827', u'1'],
 [u'1', u'18', u'110380', u'1'],
 [u'2', u'18', u'131057', u'1'],
 [u'3', u'18', u'133977', u'1'],
 [u'4', u'18', u'171879', u'1']]

In [8]:
# Field 3 of each record (the rating value), parsed as an integer.
rate = clean_data.map(lambda fields: int(fields[3]))

In [9]:
rate.take(5)

[1, 1, 1, 1, 1]

In [10]:
# Field 2 of each record (the user id), parsed as an integer.
users = clean_data.map(lambda fields: int(fields[2]))

In [11]:
users.take(5)

[6093827, 110380, 131057, 133977, 171879]

In [12]:
type(users)

pyspark.rdd.PipelinedRDD

In [13]:
users.distinct().count()

1042

In [14]:
# Field 1 of each record (the product id), parsed as an integer.
prod = clean_data.map(lambda fields: int(fields[1]))

In [15]:
prod.distinct().count()

22

In [16]:
prod.take(5)

[18, 18, 18, 18, 18]

### Importing pyspark libraries ###

In [17]:
from pyspark.mllib.recommendation import ALS,MatrixFactorizationModel, Rating

In [18]:
# `clean_data` already holds exactly this transformation of `data`
# (each line split on ','), so reuse it rather than defining a second,
# identical map under another name.
mls = clean_data

### Converting the records into Rating objects (user, product, rating)

In [19]:
def _to_rating(fields):
    """Build a Rating(user, product, rating) from one split CSV record."""
    user_id = int(fields[2])
    product_id = int(fields[1])
    score = float(fields[3])
    return Rating(user_id, product_id, score)


ratings = mls.map(_to_rating)

In [20]:
ratings.take(5)

[Rating(user=6093827, product=18, rating=1.0),
 Rating(user=110380, product=18, rating=1.0),
 Rating(user=131057, product=18, rating=1.0),
 Rating(user=133977, product=18, rating=1.0),
 Rating(user=171879, product=18, rating=1.0)]

### Train and Test Data ###

In [21]:
# 70/30 train/test split; the fixed seed keeps the split repeatable.
train, test = ratings.randomSplit(weights=[0.7, 0.3], seed=7856)

In [22]:
train.count()

9011

In [23]:
test.count() 

3806

In [24]:
# Mark the training split for caching so later actions can reuse it.
# Note: train.count() was already executed in an earlier cell, i.e. before
# this cache() call.
train.cache()

PythonRDD[21] at RDD at PythonRDD.scala:48

In [25]:
# Mark the test split for caching so later actions can reuse it.
# Note: test.count() was already executed in an earlier cell, i.e. before
# this cache() call.
test.cache()

PythonRDD[22] at RDD at PythonRDD.scala:48

### Building the ALS Model

In [26]:
# Passed as the `rank` argument of ALS.train; the feature vectors shown by
# productFeatures()/userFeatures() each have this many entries.
rank = 5

In [27]:
# Number of ALS iterations; passed as the third argument to ALS.train.
numIterations = 10

In [28]:
# Fit the ALS matrix factorisation on the training split.
# ALS starts from random factors, so without a seed every run yields
# different feature vectors, recommendations and error figures. Pinning the
# seed makes a Restart & Run All reproduce the same model.
model = ALS.train(train, rank, numIterations, seed=7856)

In [29]:
model.productFeatures().first()

(0,
 array('d', [-0.27195557951927185, 0.12122367322444916, 0.5701993107795715, 0.04255702719092369, -0.4566425383090973]))

In [30]:
model.userFeatures().first()

(25936,
 array('d', [-0.3827344477176666, 0.6472576856613159, 1.0456136465072632, -1.115449070930481, -0.5860641002655029]))

### Recommending 1 product to 10 USERS

In [31]:
model.recommendUsers(18,10)

[Rating(user=110380, product=18, rating=0.9683423749185396),
 Rating(user=5273747, product=18, rating=0.9666871532439214),
 Rating(user=5985297, product=18, rating=0.9664072032721154),
 Rating(user=48865, product=18, rating=0.9664072032721154),
 Rating(user=5888497, product=18, rating=0.9664072032721154),
 Rating(user=5362369, product=18, rating=0.9664072032721154),
 Rating(user=5921041, product=18, rating=0.9664072032721154),
 Rating(user=444785, product=18, rating=0.9664072032721154),
 Rating(user=5933313, product=18, rating=0.9664072032721154),
 Rating(user=5982225, product=18, rating=0.9664072032721154)]

### Recommending 10 PRODUCTS to 1 USER

In [32]:
model.recommendProducts(6093827,10)

[Rating(user=6093827, product=12, rating=1.598101954579504),
 Rating(user=6093827, product=2, rating=1.085047053931742),
 Rating(user=6093827, product=3, rating=1.0231703202747173),
 Rating(user=6093827, product=0, rating=0.9989095052052206),
 Rating(user=6093827, product=1, rating=0.9807907864390204),
 Rating(user=6093827, product=18, rating=0.9664072032721154),
 Rating(user=6093827, product=20, rating=0.9633011046996591),
 Rating(user=6093827, product=6, rating=0.9597417497602403),
 Rating(user=6093827, product=15, rating=0.6948202493734517),
 Rating(user=6093827, product=16, rating=0.6461031083352351)]

In [33]:
# Request up to 25 recommendations for every user and collect the result to
# the driver as a list of (user, (Rating, Rating, ...)) pairs.
# NOTE(review): only 22 distinct products were counted in the data, so
# presumably each user gets at most 22 entries rather than 25 -- confirm.
user_rec = model.recommendProductsForUsers(25).collect()

In [35]:
user_rec[2]

(5940992,
 (Rating(user=5940992, product=12, rating=1.5758266347623486),
  Rating(user=5940992, product=2, rating=1.0843256653239366),
  Rating(user=5940992, product=3, rating=1.0224980066544385),
  Rating(user=5940992, product=0, rating=0.9986368515704873),
  Rating(user=5940992, product=1, rating=0.9796277606523816),
  Rating(user=5940992, product=18, rating=0.9543945097501862),
  Rating(user=5940992, product=20, rating=0.9510248800864094),
  Rating(user=5940992, product=6, rating=0.9472897097830173),
  Rating(user=5940992, product=15, rating=0.712230553875381),
  Rating(user=5940992, product=16, rating=0.6348607231185577),
  Rating(user=5940992, product=7, rating=0.5441554834968767),
  Rating(user=5940992, product=11, rating=0.5251948160366964),
  Rating(user=5940992, product=19, rating=0.4556456902862003),
  Rating(user=5940992, product=5, rating=0.006466963001510728),
  Rating(user=5940992, product=21, rating=-0.03173497883362986),
  Rating(user=5940992, product=10, rating=-0.1125

In [36]:
type(user_rec)

list

In [37]:
import pandas as pd

In [38]:
len(user_rec)

1042

In [39]:
rec = []

In [40]:
# Flatten user_rec -- a list of (user, (Rating, ...)) pairs -- into rows of
# [user, product, rating].
#
# The list is (re)initialised here so that re-running this cell does not
# append a second copy of every row. The loops walk the collected data
# directly instead of assuming exactly 1042 users with 22 recommendations
# each, which would raise IndexError or silently skip rows on other data.
rec = []
for _user_id, user_ratings in user_rec:
    for rating in user_ratings:
        rec.append(list(rating[0:3]))

In [41]:
len(rec)

22924

In [42]:
rec[0:5]

[[6101760, 12, 1.598101954579504],
 [6101760, 2, 1.085047053931742],
 [6101760, 3, 1.0231703202747173],
 [6101760, 0, 0.9989095052052206],
 [6101760, 1, 0.9807907864390204]]

In [45]:
# One row per (customer, recommended product) with its predicted rating.
rec_columns = ['cust_id', 'rec_product_id', 'rating']
rec_df = pd.DataFrame(data=rec, columns=rec_columns)

In [46]:
rec_df.head(6)

Unnamed: 0,cust_id,rec_product_id,rating
0,6101760,12,1.598102
1,6101760,2,1.085047
2,6101760,3,1.02317
3,6101760,0,0.99891
4,6101760,1,0.980791
5,6101760,18,0.966407


### Predicting the rating for a single user-product pair

In [None]:
model.predict(6093827, 18)

In [None]:
# (user, product) pairs of the training split, i.e. the ratings with the
# rating value dropped, as input for predictAll.
pred_input = train.map(lambda r: (r.user, r.product))

In [None]:
pred = model.predictAll(pred_input) 

### Predicting all the ratings

In [None]:
pred.take(5)

In [None]:
# Key the true training ratings by (user, product) so they can be joined
# with the predictions.
true_reorg = train.map(lambda r: ((r.user, r.product), r.rating))

In [None]:
true_reorg.take(5)

In [None]:
# Key the training-set predictions by (user, product).
pred_reorg = pred.map(lambda p: ((p.user, p.product), p.rating))

In [None]:
pred_reorg.take(5)

In [None]:
# Join true and predicted training ratings on the (user, product) key,
# giving ((user, product), (true_rating, predicted_rating)).
#
# The predictions are keyed directly from `pred` instead of being read from
# `pred_reorg`: that name is rebound to the *test* predictions further down,
# so re-running this cell after the test evaluation would otherwise join
# training truths against test predictions without any error.
true_pred = true_reorg.join(pred.map(lambda p: ((p[0], p[1]), p[2])))

### Actual and Predicted ratings ###

In [None]:
true_pred.take(5)

### Recommendation Evaluation ###

In [None]:
from math import sqrt

In [None]:
def _train_squared_error(joined):
    """Squared difference between the true and predicted rating of one joined record."""
    actual, predicted = joined[1]
    return (actual - predicted) ** 2


# Mean squared error over the training split.
MSE = true_pred.map(_train_squared_error).mean()

In [None]:
RMSE = sqrt(MSE)

In [None]:
RMSE

In [None]:
# (user, product) pairs of the held-out split, as input for predictAll.
test_input = test.map(lambda r: (r.user, r.product))

In [None]:
pred_test = model.predictAll(test_input)

In [None]:
# Key the true held-out ratings by (user, product) for the join below.
test_reorg = test.map(lambda r: ((r.user, r.product), r.rating))

In [None]:
# Key the test-set predictions by (user, product).
# NOTE(review): this rebinds `pred_reorg`, which an earlier cell bound to the
# *training* predictions. Re-running the training join after this cell would
# pair training truths with test predictions; a distinct name would be safer.
pred_reorg = pred_test.map(lambda x:((x[0],x[1]), x[2]))

In [None]:
# Join true and predicted held-out ratings on the (user, product) key,
# giving ((user, product), (true_rating, predicted_rating)).
#
# The predictions are keyed directly from `pred_test` instead of being read
# from `pred_reorg`: that name is bound to the training predictions by an
# earlier cell, so this join would silently use the wrong data whenever the
# cells are executed out of order.
test_pred = test_reorg.join(pred_test.map(lambda p: ((p[0], p[1]), p[2])))

In [None]:
# Mean squared error over the held-out split: drop the (user, product) keys
# and average the squared (true - predicted) differences.
test_MSE = test_pred.values().map(lambda tp: (tp[0] - tp[1]) ** 2).mean()

In [None]:
test_RMSE = sqrt(test_MSE)

In [None]:
test_RMSE