## Connecting Jupyter Notebook with PySpark

In [1]:
import glob
import os
import sys

# Root of the local Spark installation; SPARK_HOME and HADOOP_HOME both point here.
spark_path = "/opt/spark/"

os.environ['SPARK_HOME'] = spark_path
os.environ['HADOOP_HOME'] = spark_path

# Locate the py4j source zip shipped with this Spark build instead of pinning one
# version; fall back to the originally hard-coded file name when nothing matches.
spark_lib = os.path.join(spark_path, "python", "lib")
py4j_zips = sorted(glob.glob(os.path.join(spark_lib, "py4j-*-src.zip")))
if not py4j_zips:
    py4j_zips = [os.path.join(spark_lib, "py4j-0.9-src.zip")]

# os.path.join avoids the "/opt/spark//bin"-style double slashes that plain string
# concatenation produced with the trailing slash in spark_path.
spark_import_paths = [
    os.path.join(spark_path, "bin"),
    os.path.join(spark_path, "python"),
    os.path.join(spark_path, "python", "pyspark"),
    spark_lib,
    os.path.join(spark_lib, "pyspark.zip"),
] + py4j_zips

# Skip entries that are already present so re-running the cell does not keep
# growing sys.path.
for import_path in spark_import_paths:
    if import_path not in sys.path:
        sys.path.append(import_path)

# These imports must come after the sys.path changes above.
from pyspark import SparkContext
from pyspark import SparkConf

# getOrCreate returns the already-running context when there is one, so this cell
# can be executed again without "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(SparkConf().setMaster("local").setAppName("test"))

In [2]:
sc

<pyspark.context.SparkContext at 0x7f11e8014f90>

In [26]:
# Read the raw interaction file as an RDD with one comma-separated text line per element.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
data = sc.textFile("/home/ubuntu/prod_int")

In [27]:
data.first()

u'0,18,6093827,1'

In [5]:
data.count()

12817

In [6]:
# Split every raw text line on commas, giving one list of string fields per row.
clean_data = data.map(lambda line: line.split(','))

In [7]:
clean_data.take(5)

[[u'0', u'18', u'6093827', u'1'],
 [u'1', u'18', u'110380', u'1'],
 [u'2', u'18', u'131057', u'1'],
 [u'3', u'18', u'133977', u'1'],
 [u'4', u'18', u'171879', u'1']]

In [8]:
# Pull every parsed row back to the driver as a plain Python list of field lists.
data_list = clean_data.collect()

In [None]:
import pandas as pd

In [None]:
# Build a pandas frame from the collected rows; the values are still the strings
# produced by split(','), in the file's column order.
df = pd.DataFrame(
    data_list,
    columns=["index", "product_id", "cust_id", "rating"],
)

In [None]:
df.head()

In [None]:
# Rows whose (product_id, cust_id) pair occurs more than once, counted per pair.
# The group keys are passed as a list: pandas interprets a tuple as ONE key
# (deprecated in 0.23, a KeyError in later releases), while a list of column
# names works on every version.
(
    df[df.duplicated(['product_id', 'cust_id'], keep=False)]
    .groupby(['cust_id', 'product_id'])
    .count()
)

In [None]:
# Select the product_id column for all rows. The original `df.loc[:"product_id"]`
# was a ROW slice up to the label "product_id", which fails on the default
# integer index of this frame.
df.loc[:, "product_id"]

In [None]:
def get_stats(group):
    """Return a one-entry dict mapping 'count' to the result of ``group.count()``."""
    n_values = group.count()
    return dict(count=n_values)

In [None]:
# Group the product_id values by customer, summarise each group with get_stats,
# then unstack the result into a table.
# NOTE(review): this relies on pandas expanding the dict returned by get_stats into
# an extra index level -- verify against the installed pandas version.
df['product_id'].groupby(df['cust_id']).apply(get_stats).unstack()

In [None]:
# The original cell contained only an unfinished `grp.` expression, which is a
# SyntaxError on a name that is never defined.
# NOTE(review): presumably `grp` was meant to be the groupby built in the previous
# cell -- confirm the intended inspection.
grp = df['product_id'].groupby(df['cust_id'])
grp.count().head()

In [None]:
clean_data.take(5)

In [None]:
# Fourth field of every row, parsed as an integer rating.
rate = clean_data.map(lambda fields: int(fields[3]))

In [None]:
rate.take(5)

In [None]:
# Third field of every row, parsed as an integer customer id.
users = clean_data.map(lambda fields: int(fields[2]))

In [None]:
users.take(5)

In [None]:
type(users)

In [None]:
users.distinct().count()

In [None]:
# Second field of every row, parsed as an integer product id.
prod = clean_data.map(lambda fields: int(fields[1]))

In [None]:
prod.distinct().count()

In [None]:
prod.take(5)

### Importing pyspark libraries ###

In [9]:
from pyspark.mllib.recommendation import ALS,MatrixFactorizationModel, Rating

In [10]:
# Re-split the raw lines into field lists as the input for the Rating conversion.
mls = data.map(lambda line: line.split(','))

### Converting rows into `Rating` objects, which take USER, PRODUCT, RATING

In [11]:
# Rating expects (user, product, rating). The file stores the product id before the
# customer id, so field 2 (customer) is passed first and field 1 (product) second.
ratings = mls.map(lambda x: Rating(int(x[2]),int(x[1]), float(x[3])))

In [12]:
ratings.take(10)

[Rating(user=6093827, product=18, rating=1.0),
 Rating(user=110380, product=18, rating=1.0),
 Rating(user=131057, product=18, rating=1.0),
 Rating(user=133977, product=18, rating=1.0),
 Rating(user=171879, product=18, rating=1.0),
 Rating(user=194181, product=18, rating=1.0),
 Rating(user=233893, product=8, rating=1.0),
 Rating(user=25936, product=18, rating=1.0),
 Rating(user=2884394, product=0, rating=1.0),
 Rating(user=2889669, product=18, rating=1.0)]

In [None]:
rat_dist = ratings.distinct()

In [None]:
# Preview a handful of distinct ratings instead of collecting the whole RDD into
# the cell output, matching the ratings.take(10) preview used earlier.
rat_dist.take(10)

### Train and Test Data ###

In [13]:
# 70/30 random split of the ratings; the fixed seed keeps the split repeatable.
train, test = ratings.randomSplit([0.7, 0.3], seed=7856)

In [14]:
train.count()

8952

In [15]:
test.count() 

3865

In [16]:
# Mark the training split for caching; it is reused by model training and evaluation.
train.cache()

PythonRDD[9] at RDD at PythonRDD.scala:48

In [17]:
# Mark the held-out split for caching; it is reused by the test-set evaluation cells.
test.cache()

PythonRDD[10] at RDD at PythonRDD.scala:48

# Building ALS Model

In [18]:
rank = 5

In [19]:
numIterations = 10

In [20]:
# Fit the ALS matrix-factorisation model on the training split; the fixed seed
# makes the learned factors repeatable.
model = ALS.train(
    train,
    rank,
    iterations=numIterations,
    seed=1234,
)

In [21]:
model.productFeatures().first()

(0,
 array('d', [0.21219994127750397, 0.4538339674472809, 0.5483592748641968, 0.3008931577205658, 0.011649126186966896]))

In [22]:
model.userFeatures().first()

(9880,
 array('d', [0.40670064091682434, -0.18512223660945892, -0.800706148147583, 0.290196031332016, 0.21232886612415314]))

### Recommending the top 10 USERS for 1 PRODUCT

In [23]:
# Top 10 users for product 18 (first argument is the product id, second the number of users).
model.recommendUsers(18,10)

[Rating(user=110380, product=18, rating=0.9720420212506538),
 Rating(user=5273747, product=18, rating=0.9676849635974458),
 Rating(user=33203, product=18, rating=0.9671326256930186),
 Rating(user=36572, product=18, rating=0.9671326256930186),
 Rating(user=36843, product=18, rating=0.9671326256930186),
 Rating(user=38782, product=18, rating=0.9671326256930186),
 Rating(user=41656, product=18, rating=0.9671326256930186),
 Rating(user=25936, product=18, rating=0.9671326256930186),
 Rating(user=18818, product=18, rating=0.9671326256930186),
 Rating(user=39150, product=18, rating=0.9671326256930186)]

### Recommending 10 PRODUCTS to 1 USER

In [24]:
# Top 10 products for user 6093827 (first argument is the user id, second the number of products).
model.recommendProducts(6093827,10)

[Rating(user=6093827, product=11, rating=1.0790226640706733),
 Rating(user=6093827, product=3, rating=1.058707745921379),
 Rating(user=6093827, product=1, rating=1.0252500659929247),
 Rating(user=6093827, product=9, rating=1.0087828931727782),
 Rating(user=6093827, product=2, rating=1.005962089615221),
 Rating(user=6093827, product=0, rating=1.0020988642840674),
 Rating(user=6093827, product=18, rating=0.9671326256930186),
 Rating(user=6093827, product=20, rating=0.9635470369972088),
 Rating(user=6093827, product=12, rating=0.9496698103668879),
 Rating(user=6093827, product=6, rating=0.9482423445586228)]

In [25]:
model.recommendProducts(110380,10)

[Rating(user=110380, product=11, rating=1.0799223724134805),
 Rating(user=110380, product=9, rating=1.0573775179302856),
 Rating(user=110380, product=12, rating=1.0424448454579704),
 Rating(user=110380, product=3, rating=0.9906744335512521),
 Rating(user=110380, product=1, rating=0.9785075952742659),
 Rating(user=110380, product=18, rating=0.9720420212506538),
 Rating(user=110380, product=20, rating=0.9702727019239066),
 Rating(user=110380, product=6, rating=0.9590593824906416),
 Rating(user=110380, product=0, rating=0.9554782531419261),
 Rating(user=110380, product=2, rating=0.9485189564381329)]

In [1]:
# Request up to 25 recommended products for every user and collect them to the driver.
# NOTE(review): the saved output below is a NameError from running this cell on a
# fresh kernel; it needs the model-training cell to have been executed first.
user_rec = model.recommendProductsForUsers(25).collect()

NameError: name 'model' is not defined

In [None]:
user_rec[0]

In [None]:
len(user_rec)

In [None]:
type(user_rec)

In [None]:
import pandas as pd

In [None]:
len(user_rec)

In [None]:
rec = []

In [None]:
# Flatten user_rec into one [user, product, rating] row per recommendation.
# Each user_rec entry carries its sequence of Rating tuples at index 1.
# Slicing keeps the original cap of 21 recommendations per user but, unlike the
# fixed range(21) index loop, does not raise IndexError when a user has fewer.
# Assigning a fresh list (instead of appending) makes the cell safe to re-run.
MAX_RECS_PER_USER = 21
rec = [
    list(rating[0:3])
    for entry in user_rec
    for rating in entry[1][:MAX_RECS_PER_USER]
]

In [None]:
len(rec)

In [None]:
rec[0:5]

In [None]:
# Tabulate the flattened recommendations; columns follow the (user, product, rating)
# order of each row.
rec_df = pd.DataFrame(
    rec,
    columns=['cust_id', 'rec_product_id', 'rating'],
)

In [None]:
rec_df.head(6)

### Predicting the rating for a user-product pair

In [None]:
model.predict(6093827, 18)

In [None]:
# Keep only the (user, product) part of every training rating as prediction input.
pred_input = train.map(lambda rating: (rating[0], rating[1]))

In [None]:
pred_input.take(10)

In [None]:
pred = model.predictAll(pred_input) 

### Predicting all the ratings

In [None]:
pred.take(20)

In [None]:
# Re-key the training ratings as ((user, product), rating) so they can be joined.
true_reorg = train.map(lambda rating: ((rating[0], rating[1]), rating[2]))

In [None]:
true_reorg.take(5)

In [None]:
# Re-key the predictions the same way: ((user, product), predicted rating).
pred_reorg = pred.map(lambda rating: ((rating[0], rating[1]), rating[2]))

In [None]:
pred_reorg.take(5)

In [None]:
# Join on the (user, product) key; each value becomes an (actual, predicted) pair.
true_pred = true_reorg.join(pred_reorg)

In [None]:
true_pred.count()

### Actual and Predicted ratings ###

In [None]:
true_pred.take(10)

In [None]:
# `true_pred_dis` is indexed by the following cells but is never assigned in the
# visible notebook, so this cell raised NameError on a fresh run.
# NOTE(review): presumably the de-duplicated join collected to the driver -- confirm.
true_pred_dis = true_pred.distinct().collect()
len(true_pred_dis)

In [None]:
true_pred_dis[0][1]

In [None]:
actual_pred = []

In [None]:
# Keep the (actual, predicted) pair of every joined record. Iterating over the list
# itself replaces the hard-coded range(1140), which raised IndexError on shorter
# results and silently truncated longer ones. Assigning a fresh list also makes
# the cell safe to re-run.
actual_pred = [record[1] for record in true_pred_dis]

In [None]:
actual_pred[1]

In [None]:
predicted = []

In [None]:
# Keep only the predicted rating (second element of the value pair) of every joined
# record. Iterating over the list itself replaces the hard-coded range(1140), which
# raised IndexError on shorter results and silently truncated longer ones.
predicted = [record[1][1] for record in true_pred_dis]

In [None]:
predicted[0:10]

### Recommendation Evaluation ###

In [None]:
from math import sqrt

In [None]:
# Mean of the squared (actual - predicted) differences over the joined training pairs.
MSE = (
    true_pred
    .map(lambda pair: (pair[1][0] - pair[1][1]) ** 2)
    .mean()
)

In [None]:
RMSE = sqrt(MSE)

In [None]:
RMSE

In [None]:
# Keep only the (user, product) part of every held-out rating as prediction input.
test_input = test.map(lambda rating: (rating[0], rating[1]))

In [None]:
pred_test = model.predictAll(test_input)

In [None]:
# Re-key the held-out ratings as ((user, product), rating) so they can be joined.
test_reorg = test.map(lambda rating: ((rating[0], rating[1]), rating[2]))

In [None]:
# Re-key the test-set predictions as ((user, product), predicted rating).
# NOTE(review): this rebinds pred_reorg, which earlier held the training-set
# predictions; re-running the training-side join after this cell would use test data.
pred_reorg = pred_test.map(lambda x:((x[0],x[1]), x[2]))

In [None]:
# Join on the (user, product) key; each value becomes an (actual, predicted) pair.
test_pred = test_reorg.join(pred_reorg)

In [None]:
test_pred.take(10)

In [None]:
# Mean of the squared (actual - predicted) differences over the joined held-out pairs.
test_MSE = (
    test_pred
    .map(lambda pair: (pair[1][0] - pair[1][1]) ** 2)
    .mean()
)

In [None]:
test_RMSE = sqrt(test_MSE)

In [None]:
test_RMSE