In [1]:
import os
import sys

# Root of the local Spark installation; every path below is derived from it.
spark_path = "/opt/spark/"

# Point both SPARK_HOME and HADOOP_HOME at the same installation directory.
os.environ['SPARK_HOME'] = spark_path
os.environ['HADOOP_HOME'] = spark_path

# Make the bundled PySpark and py4j sources importable.
# os.path.join avoids the "/opt/spark//bin" double slash that plain string
# concatenation produced (spark_path already ends in "/"), and the membership
# check keeps sys.path free of duplicates when this cell is re-run.
# NOTE: the py4j archive name is tied to the installed Spark release.
for _relative in (
    "bin",
    "python",
    "python/pyspark/",
    "python/lib",
    "python/lib/pyspark.zip",
    "python/lib/py4j-0.10.3-src.zip",
):
    _entry = os.path.join(spark_path, _relative)
    if _entry not in sys.path:
        sys.path.append(_entry)

In [2]:
from pyspark import SparkContext
from pyspark import SparkConf

In [3]:
# Reuse the running context if this cell is executed again: constructing a
# second SparkContext in the same kernel raises a ValueError.
# Master "local" and app name "test" are unchanged from the original call.
sc = SparkContext.getOrCreate(SparkConf().setMaster("local").setAppName("test"))

In [4]:
# Display the context object to confirm it was created.
sc

<pyspark.context.SparkContext at 0x7f533c134e10>

In [5]:
# Load the raw client file as an RDD with one text line per element.
data = sc.textFile(
    '/home/ubuntu/client_demo/client_data'
)

In [6]:
# Peek at the first five raw lines (comma-separated, four fields each).
data.take(5)

[u'0,3893506,5,1',
 u'1,3793254,5,1',
 u'2,6148794,5,1',
 u'3,6149220,17,1',
 u'4,6149432,17,1']

In [16]:
# Total number of lines in the input file.
data.count()

4428

In [8]:
# Split every raw line on commas, yielding one list of string fields per row.
clean_data = data.map(lambda line: line.split(','))

In [9]:
# First five rows after splitting; every field is still a string.
clean_data.take(5)

[[u'0', u'3893506', u'5', u'1'],
 [u'1', u'3793254', u'5', u'1'],
 [u'2', u'6148794', u'5', u'1'],
 [u'3', u'6149220', u'17', u'1'],
 [u'4', u'6149432', u'17', u'1']]

In [10]:
# Fourth field of each row as an integer (the value later used as the rating).
rate = clean_data.map(lambda fields: int(fields[3]))

In [11]:
# Sample of the extracted rating values.
rate.take(5)

[1, 1, 1, 1, 1]

In [12]:
# Second field of each row as an integer (the value later used as the user id).
users = clean_data.map(lambda fields: int(fields[1]))

In [13]:
# Sample of the extracted user ids.
users.take(5)

[3893506, 3793254, 6148794, 6149220, 6149432]

In [15]:
# Number of distinct user ids in the data set.
users.distinct().count()

3087

In [17]:
# Third field of each row as an integer (the value later used as the product id).
products = clean_data.map(lambda fields: int(fields[2]))

In [18]:
# Sample of the extracted product ids.
products.take(5)

[5, 5, 5, 17, 17]

In [19]:
# Number of distinct product ids in the data set.
products.distinct().count()

18

In [20]:
from pyspark.mllib.recommendation import ALS,MatrixFactorizationModel, Rating

In [23]:
# Same comma-split rows as clean_data; reuse that RDD instead of repeating the
# parsing lambda so the two names cannot drift apart.
cli = clean_data

In [24]:
# First five split rows that feed the Rating construction.
cli.take(5)

[[u'0', u'3893506', u'5', u'1'],
 [u'1', u'3793254', u'5', u'1'],
 [u'2', u'6148794', u'5', u'1'],
 [u'3', u'6149220', u'17', u'1'],
 [u'4', u'6149432', u'17', u'1']]

In [25]:
# Build one Rating per row: field 1 is the user, field 2 the product and
# field 3 the rating value. Field 0 is not used.
ratings = cli.map(
    lambda fields: Rating(
        user=int(fields[1]),
        product=int(fields[2]),
        rating=float(fields[3]),
    )
)

In [26]:
# Sample of the constructed Rating records.
ratings.take(5)

[Rating(user=3893506, product=5, rating=1.0),
 Rating(user=3793254, product=5, rating=1.0),
 Rating(user=6148794, product=5, rating=1.0),
 Rating(user=6149220, product=17, rating=1.0),
 Rating(user=6149432, product=17, rating=1.0)]

In [27]:
# Random 70/30 split of the ratings; the fixed seed keeps the split repeatable.
train, test = ratings.randomSplit(weights=[0.7, 0.3], seed=4567)

In [28]:
# Sizes of the two splits. Each split is marked for caching before its first
# action so the count materializes it once and later actions can reuse it;
# cache() returns the RDD itself, so the displayed tuple is unchanged.
train.cache().count(), test.cache().count()

(3070, 1358)

In [29]:
# Mark both splits as cached so the training runs below can reuse them;
# each cache() call returns its RDD, which is what the cell displays.
train.cache(), test.cache()

(PythonRDD[23] at RDD at PythonRDD.scala:48,
 PythonRDD[24] at RDD at PythonRDD.scala:48)

In [30]:
# ALS hyperparameters passed to trainImplicit below: the rank of the
# factorization and the number of training iterations.
rank, numIterations = 5, 10

In [31]:
# Fit an implicit-feedback ALS model on the training split.
# The explicit seed pins the random initialization of the factors so the
# recommendations shown below are repeatable across kernel restarts.
als_model = ALS.trainImplicit(train, rank, numIterations, seed=4567)

In [37]:
# Fit a second implicit-feedback ALS model, this time on the held-out split.
# The explicit seed pins the random initialization so results are repeatable.
# NOTE(review): this trains on `test` rather than evaluating als_model against
# it, so the split is not acting as a validation set. Confirm that is intended.
als_model_test = ALS.trainImplicit(test, rank, numIterations, seed=4567)

In [32]:
# Top five product recommendations for user 3893506 from the training-split model.
als_model.recommendProducts(3893506,5)

[Rating(user=3893506, product=5, rating=1.8894254683920302),
 Rating(user=3893506, product=0, rating=0.054241911169662205),
 Rating(user=3893506, product=12, rating=0.0036363513896504607),
 Rating(user=3893506, product=2, rating=0.0001436881167315848),
 Rating(user=3893506, product=8, rating=0.00012993597361245918)]

In [33]:
# Top five users recommended for product 5 from the training-split model.
als_model.recommendUsers(5,5)

[Rating(user=6155796, product=5, rating=4.310742689548564),
 Rating(user=6176509, product=5, rating=3.5520600900707366),
 Rating(user=6156187, product=5, rating=3.5520600900707366),
 Rating(user=6162458, product=5, rating=3.5520600900707366),
 Rating(user=6173154, product=5, rating=3.5520600900707366)]

In [40]:
# Per-user recommendations from the training-split model, de-duplicated.
# NOTE(review): 25 products are requested per user, but the data set contains
# only 18 distinct products. Confirm 25 is intended.
use_rec = (
    als_model
    .recommendProductsForUsers(25)
    .distinct()
)

In [41]:
# Number of per-user recommendation entries produced by the training-split model.
use_rec.count()

2314

In [44]:
# Per-user recommendations from the model fitted on the held-out split,
# de-duplicated and collected to the driver as a Python list.
use_rec_test = (
    als_model_test
    .recommendProductsForUsers(25)
    .distinct()
    .collect()
)

In [46]:
# Number of per-user recommendation entries collected from the held-out-split model.
len(use_rec_test)

1192