In [1]:
import pandas as pd

In [2]:
# Load the customer/product interaction table. index_col=None keeps every
# CSV column as data and gives the frame a default integer index.
# NOTE(review): absolute, machine-specific path; consider a configurable DATA_DIR.
prod_views_df = pd.read_csv(
    filepath_or_buffer='/home/ubuntu/recommendation_new/client',
    index_col=None,
)

In [3]:
# Preview the first 6 rows to see which columns the CSV provided.
prod_views_df.head(6)

Unnamed: 0.1,Unnamed: 0,cust_id,product_id,rating
0,0,3893506,com.gopaktor.subscription.v1.premium.1m,1
1,1,3793254,com.gopaktor.subscription.v1.premium.1m,1
2,2,6148794,com.gopaktor.subscription.v1.premium.1m,1
3,3,6149220,com.gopaktor.subscription.v4.premium.1m,1
4,4,6149432,com.gopaktor.subscription.v4.premium.1m,1
5,5,6149434,com.gopaktor.subscription.v4.premium.1m,1


In [4]:
# Remove the leftover "Unnamed: ..." column shown in the preview above
# (presumably an index that was saved into the CSV; confirm with the data owner).
# Selecting by name instead of position 0, and rebinding instead of using
# inplace=True, keeps this cell safe to re-run: a second execution finds
# nothing to drop rather than silently removing cust_id.
unnamed_cols = [col for col in prod_views_df.columns
                if str(col).startswith('Unnamed')]
prod_views_df = prod_views_df.drop(unnamed_cols, axis=1)

In [5]:
# Re-display the first rows to confirm the first column was dropped.
prod_views_df.head()

Unnamed: 0,cust_id,product_id,rating
0,3893506,com.gopaktor.subscription.v1.premium.1m,1
1,3793254,com.gopaktor.subscription.v1.premium.1m,1
2,6148794,com.gopaktor.subscription.v1.premium.1m,1
3,6149220,com.gopaktor.subscription.v4.premium.1m,1
4,6149432,com.gopaktor.subscription.v4.premium.1m,1


In [6]:
# (rows, columns) of the interaction table after the column drop.
prod_views_df.shape

(4428, 3)

In [7]:
from sklearn import preprocessing

In [8]:
# One independent encoder per id column; each is fitted separately below.
user_id_le, product_id_le = preprocessing.LabelEncoder(), preprocessing.LabelEncoder()

In [9]:
# Learn the set of distinct customer ids (stored on the encoder as classes_).
user_id_le.fit(prod_views_df['cust_id'])

LabelEncoder()

In [10]:
# Learn the set of distinct product ids (stored on the encoder as classes_).
product_id_le.fit(prod_views_df['product_id'])

LabelEncoder()

In [11]:
# Report how many distinct ids each encoder learned.
# Each message is built with str.format so print() receives a single string:
# with several arguments, a Python 2 kernel prints a tuple repr such as
# ('Number of unique users: ', '3087') instead of a readable line.
print('Number of unique users: {0}'.format(len(user_id_le.classes_)))
print('Number of unique products: {0}'.format(len(product_id_le.classes_)))

('Number of unique users: ', '3087')
('Number of unique products: ', '18')


In [12]:
# Work on an independent copy. A plain assignment would only alias
# prod_views_df, so the label-encoding cells below would overwrite the raw
# ids in the original frame and fail (unseen labels) when re-executed.
n_prod_views_df = prod_views_df.copy()

In [13]:
# Replace raw customer ids with the integer codes learned by user_id_le.
n_prod_views_df['cust_id'] = user_id_le.transform(prod_views_df['cust_id'])

In [14]:
# Replace raw product ids with the integer codes learned by product_id_le.
n_prod_views_df['product_id'] = product_id_le.transform(prod_views_df['product_id'])

In [15]:
# cust_id and product_id should now show encoder codes instead of raw ids.
n_prod_views_df.head()

Unnamed: 0,cust_id,product_id,rating
0,7,5,1
1,6,5,1
2,10,5,1
3,15,17,1
4,21,17,1


In [16]:
import findspark

In [17]:
# Must run before the pyspark imports in the following cells.
# NOTE(review): presumably locates the local Spark install (e.g. via
# SPARK_HOME) and makes pyspark importable; confirm on the target machine.
findspark.init()

In [18]:
from pyspark import SparkContext
from pyspark.sql import SQLContext

In [19]:
from pyspark.mllib.recommendation import ALS

In [20]:
# Use getOrCreate so re-executing this cell reuses the running context;
# calling SparkContext(...) a second time in the same kernel raises
# "Cannot run multiple SparkContexts at once".
# The app name only takes effect when a new context is actually created.
from pyspark import SparkConf

sc = SparkContext.getOrCreate(SparkConf().setAppName("BuildProductRecommendations"))

In [21]:
# SQL entry point bound to the SparkContext; used below to turn the pandas
# frame into a Spark DataFrame.
sqlContext = SQLContext(sparkContext=sc)

In [22]:
# pandas frame -> Spark DataFrame -> RDD of Row objects.
n_prod_views_rdd = (
    sqlContext
    .createDataFrame(n_prod_views_df)
    .rdd
)

In [23]:
# Peek at 5 records; each is a Row whose fields are the frame's columns.
n_prod_views_rdd.take(5)

[Row(cust_id=7, product_id=5, rating=1),
 Row(cust_id=6, product_id=5, rating=1),
 Row(cust_id=10, product_id=5, rating=1),
 Row(cust_id=15, product_id=17, rating=1),
 Row(cust_id=21, product_id=17, rating=1)]

In [24]:
# 6:2:2 train / validation / test split; the fixed seed keeps the split
# reproducible between runs.
training_rdd, validation_rdd, test_rdd = n_prod_views_rdd.randomSplit(
    weights=[6, 2, 2], seed=1345)

In [25]:
# Keep only (cust_id, product_id): the pairs the model is asked to score.
validation_for_predict_rdd = validation_rdd.map(
    lambda row: (row.cust_id, row.product_id))

In [26]:
# Same projection for the held-out test split: (cust_id, product_id) pairs.
test_for_predict_rdd = test_rdd.map(
    lambda row: (row.cust_id, row.product_id))

In [27]:
# Spot-check: (cust_id, product_id) pairs with the rating stripped.
validation_for_predict_rdd.take(5)

[(7, 5), (21, 17), (30, 16), (74, 5), (113, 5)]

In [28]:
# Training records keep the full Row (cust_id, product_id, rating); this is
# the RDD later passed to ALS.trainImplicit.
training_rdd.take(5)

[Row(cust_id=6, product_id=5, rating=1),
 Row(cust_id=10, product_id=5, rating=1),
 Row(cust_id=15, product_id=17, rating=1),
 Row(cust_id=22, product_id=17, rating=1),
 Row(cust_id=18, product_id=17, rating=1)]

In [29]:
# Show 5 sample records from each RDD used for training and prediction.
# Each message is built with str.format so print() receives a single string:
# with several arguments, a Python 2 kernel prints a tuple repr (escaped
# "\n" included) instead of the intended multi-line output.
print('Training RDD\n{0}'.format(training_rdd.take(5)))
print('\nValidation for prediction RDD\n{0}'.format(validation_for_predict_rdd.take(5)))
print('\nTest for Prediction RDD\n{0}'.format(test_for_predict_rdd.take(5)))

('Training RDD\n', [Row(cust_id=6, product_id=5, rating=1), Row(cust_id=10, product_id=5, rating=1), Row(cust_id=15, product_id=17, rating=1), Row(cust_id=22, product_id=17, rating=1), Row(cust_id=18, product_id=17, rating=1)])
('\nValidation for prediction RDD\n', [(7, 5), (21, 17), (30, 16), (74, 5), (113, 5)])
('\nTest for Prediction RDD\n', [(24, 17), (28, 17), (33, 5), (48, 5), (16, 0)])


In [30]:
import math

In [31]:
# Hyper-parameter grid for the ALS search, plus the containers that collect
# one validation error per combination.
seed, iterations = 49247, 10
ranks = [16]
lambdas = [0.01, 0.1]
alphas = [1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 40.0, 80.0]

# One slot per (lambda, rank, alpha) combination; err_index is the write cursor.
errors = [0] * (len(alphas) * len(ranks) * len(lambdas))
err_index = 0

In [32]:
# Grid-search implicit-feedback ALS over lambda x rank x alpha, scoring every
# model by RMSE between the validation ratings and the model's predictions.
# NOTE(review): RMSE against the raw rating is only a rough proxy for an
# implicit-feedback model; consider a ranking metric.

# Reset the cursor so re-running this cell overwrites errors[] from slot 0
# instead of indexing past the end of the list.
err_index = 0

# Track the best combination so the search result is not thrown away.
best_error = float('inf')
best_model = None
best_params = None

# The ground-truth side of the join does not depend on the hyper-parameters,
# so it is built once instead of once per combination.
validation_truth_rdd = validation_rdd.map(lambda r: ((int(r[0]), int(r[1])), float(r[2])))

for lambda_ in lambdas:
    for rank in ranks:
        for alpha in alphas:
            model = ALS.trainImplicit(training_rdd, rank, seed=seed, iterations=iterations,
                                      lambda_=lambda_, alpha=alpha)
            # Key predictions by (user, product) so they can be joined with the truth.
            predictions = model.predictAll(validation_for_predict_rdd).map(lambda r: ((r[0], r[1]), r[2]))
            views_and_preds = validation_truth_rdd.join(predictions)
            error = math.sqrt(views_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
            errors[err_index] = error
            err_index += 1
            print('For rank {0} at alpha: {1} and lambda: {2}, the RMSE is {3}'.format(rank, alpha, lambda_, error))
            if error < best_error:
                best_error, best_model, best_params = error, model, (rank, alpha, lambda_)

# Later cells read `model`; point it at the best-scoring model rather than at
# whichever combination happened to be trained last.
if best_model is not None:
    model = best_model
    print('Best model: rank {0}, alpha {1}, lambda {2} (RMSE {3})'.format(
        best_params[0], best_params[1], best_params[2], best_error))

For rank 16 at alpha: 1.0 and lambda: 0.01, the RMSE is0.3618524676
For rank 16 at alpha: 2.0 and lambda: 0.01, the RMSE is0.320635344415
For rank 16 at alpha: 4.0 and lambda: 0.01, the RMSE is0.304084575444
For rank 16 at alpha: 8.0 and lambda: 0.01, the RMSE is0.298953713989
For rank 16 at alpha: 16.0 and lambda: 0.01, the RMSE is0.29777185821
For rank 16 at alpha: 32.0 and lambda: 0.01, the RMSE is0.297652481366
For rank 16 at alpha: 40.0 and lambda: 0.01, the RMSE is0.297683583103
For rank 16 at alpha: 80.0 and lambda: 0.01, the RMSE is0.29784487528
For rank 16 at alpha: 1.0 and lambda: 0.1, the RMSE is0.347145244249
For rank 16 at alpha: 2.0 and lambda: 0.1, the RMSE is0.31411679644
For rank 16 at alpha: 4.0 and lambda: 0.1, the RMSE is0.301544573613
For rank 16 at alpha: 8.0 and lambda: 0.1, the RMSE is0.29732915119
For rank 16 at alpha: 16.0 and lambda: 0.1, the RMSE is0.295813446016
For rank 16 at alpha: 32.0 and lambda: 0.1, the RMSE is0.295446642748
For rank 16 at alpha: 40.0

In [None]:
# Ask the model for 10 recommendations for user 6. The id is the LabelEncoder
# code, not the original cust_id (the training data was encoded above).
# NOTE(review): this rebinds `predictions`, which the grid-search cell used
# for an RDD of predicted ratings; a distinct name would be clearer.
predictions = model.recommendProducts(6,10)

In [None]:
# Display the recommendations computed in the previous cell.
predictions

In [38]:
# Request up to 20 recommendations per user for every user in the model.
# NOTE(review): the recorded output above reports only 18 unique products, so
# fewer than 20 can be returned per user.
all_user = model.recommendProductsForUsers(20)

In [39]:
# Number of (user, recommendations) records produced.
all_user.count()

774

In [40]:
# Inspect 5 records; each pairs a user id with a tuple of Rating objects.
all_user.take(5)

[(96,
  (Rating(user=96, product=15, rating=0.9606531729819345),
   Rating(user=96, product=1, rating=0.0013493345938071916),
   Rating(user=96, product=0, rating=0.00040243511720881864),
   Rating(user=96, product=5, rating=6.589012819243484e-05),
   Rating(user=96, product=11, rating=3.592912558986888e-05),
   Rating(user=96, product=14, rating=-0.00010497835261521343),
   Rating(user=96, product=12, rating=-0.00010607336281896404),
   Rating(user=96, product=9, rating=-0.00013881282123665364),
   Rating(user=96, product=7, rating=-0.00021604348097167825),
   Rating(user=96, product=8, rating=-0.0002665368430527504),
   Rating(user=96, product=16, rating=-0.0003544141334719149),
   Rating(user=96, product=6, rating=-0.0006783216777427655),
   Rating(user=96, product=17, rating=-0.0007002553741833548),
   Rating(user=96, product=10, rating=-0.0010782224520942407),
   Rating(user=96, product=2, rating=-0.0011421241666532966),
   Rating(user=96, product=13, rating=-0.0019091638084096363