# Load Spark Session

In [1]:
import findspark
# Let findspark locate the local Spark installation so that the pyspark
# imports in the following cells can be resolved.
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Start the notebook's Spark session, or attach to one that is already running.
spark = (
    SparkSession.builder
    .appName("Recommendation Systems")
    .getOrCreate()
)

In [3]:
# Sanity check: confirm that a SparkSession object was created.
print(spark)

<pyspark.sql.session.SparkSession object at 0x0000022B6736AF28>


# Preprocessing

In [4]:
from pyspark.ml.evaluation import RegressionEvaluator

In [5]:
from pyspark.ml.recommendation import ALS

In [6]:
from pyspark.sql import Row

In [7]:
# Read the ratings file as plain text and switch to the RDD API; each element
# is a Row with a single `value` field holding one raw line of the file.
lines = spark.read.text("dataset5.txt").rdd

In [8]:
# Peek at the first five raw records to confirm the tab-separated layout.
print(lines.take(5))

[Row(value='1\t6\t5\t887431973'), Row(value='1\t10\t3\t875693118'), Row(value='1\t12\t5\t878542960'), Row(value='1\t14\t5\t874965706'), Row(value='1\t17\t3\t875073198')]


In [9]:
# Split every raw text line on tabs, giving one list of string fields per rating.
part = lines.map(lambda line: line.value.split("\t"))

In [10]:
# Check that each line was split into four string fields.
print(part.take(5))

[['1', '6', '5', '887431973'], ['1', '10', '3', '875693118'], ['1', '12', '5', '878542960'], ['1', '14', '5', '874965706'], ['1', '17', '3', '875073198']]


In [11]:
def to_rating_row(fields):
    """Convert one split line into a typed rating Row.

    `fields` is the list of four strings produced by splitting a raw line on
    tabs: user id, item id, rating and timestamp, in that order. Ids and the
    timestamp are parsed as ints and the rating as a float.
    """
    return Row(
        userId=int(fields[0]),
        itemId=int(fields[1]),
        rating=float(fields[2]),
        timestamp=int(fields[3]),
    )


data = part.map(to_rating_row)

In [12]:
# Verify the typed rows: ints for the ids and timestamp, float for the rating.
print(data.take(5))

[Row(itemId=6, rating=5.0, timestamp=887431973, userId=1), Row(itemId=10, rating=3.0, timestamp=875693118, userId=1), Row(itemId=12, rating=5.0, timestamp=878542960, userId=1), Row(itemId=14, rating=5.0, timestamp=874965706, userId=1), Row(itemId=17, rating=3.0, timestamp=875073198, userId=1)]


In [13]:
# Turn the RDD of Rows into a DataFrame; the column names come from the Row
# field names (itemId, rating, timestamp, userId).
rec = spark.createDataFrame(data)

In [14]:
# Preview the ratings DataFrame (show() prints only the first 20 rows).
rec.show()

+------+------+---------+------+
|itemId|rating|timestamp|userId|
+------+------+---------+------+
|     6|   5.0|887431973|     1|
|    10|   3.0|875693118|     1|
|    12|   5.0|878542960|     1|
|    14|   5.0|874965706|     1|
|    17|   3.0|875073198|     1|
|    20|   4.0|887431883|     1|
|    23|   4.0|875072895|     1|
|    24|   3.0|875071713|     1|
|    27|   2.0|876892946|     1|
|    31|   3.0|875072144|     1|
|    33|   4.0|878542699|     1|
|    36|   2.0|875073180|     1|
|    39|   4.0|875072173|     1|
|    44|   5.0|878543541|     1|
|    47|   4.0|875072125|     1|
|    49|   3.0|878542478|     1|
|    51|   4.0|878543275|     1|
|    53|   3.0|876893206|     1|
|    54|   3.0|878543308|     1|
|    56|   4.0|875072716|     1|
+------+------+---------+------+
only showing top 20 rows



# Train Model and Generate Recommendations

In [15]:
# Hold out roughly 20% of the ratings for evaluation and train on the rest.
# A fixed seed makes the same rows land in each split on every re-run, so the
# reported RMSE is comparable across Restart & Run All executions.
(training, test) = rec.randomSplit([0.8, 0.2], seed=42)

In [16]:
# Fit an Alternating Least Squares (matrix factorisation) recommender on the
# training split.
# - coldStartStrategy="drop" discards rows whose prediction would be NaN
#   (users/items not present in the training data), so the evaluation metric
#   is not NaN.
# - seed pins the random initialisation of the factor matrices so repeated
#   runs on the same training data produce the same model; the train/test
#   split must also be seeded for fully repeatable results.
# NOTE(review): the recorded run reports RMSE of about 1.47 and recommendation
# scores well above 5, while the ratings shown top out at 5.0. regParam=0.01
# may be under-regularised; consider tuning it (and maxIter). Confirm.
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="itemId", ratingCol="rating",
          coldStartStrategy="drop", seed=42)
model = als.fit(training)

In [17]:
# Score the held-out ratings with the fitted model and report how far the
# predicted ratings are from the true ones (root-mean-square error).
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)
predictions = model.transform(test)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.4740823056361259


In [18]:
# Top 10 recommended items for every user: one row per userId, with an array
# of (itemId, predicted score) pairs in the `recommendations` column.
recuser = model.recommendForAllUsers(10)

In [19]:
# Top 10 recommended users for every item: one row per itemId, with an array
# of (userId, predicted score) pairs in the `recommendations` column.
recmovie = model.recommendForAllItems(10)

In [20]:
# Preview the per-user recommendations (first 20 users; long arrays are truncated).
recuser.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|   148|[[770, 7.830649],...|
|   392|[[1100, 7.861853]...|
|   243|[[727, 5.6191583]...|
|    31|[[582, 8.057912],...|
|   251|[[671, 8.962639],...|
|   451|[[763, 7.378364],...|
|    85|[[664, 4.876044],...|
|   137|[[813, 9.26441], ...|
|    65|[[1100, 9.796213]...|
|   458|[[114, 4.69701], ...|
|    53|[[408, 8.038853],...|
|   255|[[408, 9.5308], [...|
|   133|[[512, 5.617704],...|
|   296|[[1039, 6.736469]...|
|   322|[[101, 8.41664], ...|
|    78|[[224, 10.928649]...|
|   321|[[512, 5.6258135]...|
|   362|[[664, 6.4836144]...|
|   375|[[770, 7.517901],...|
|   155|[[895, 7.667574],...|
+------+--------------------+
only showing top 20 rows



In [21]:
# Preview the per-item recommendations (first 20 items; long arrays are truncated).
recmovie.show()

+------+--------------------+
|itemId|     recommendations|
+------+--------------------+
|   471|[[127, 7.2769604]...|
|  1591|[[445, 3.9397576]...|
|  1342|[[432, 2.0495114]...|
|   463|[[282, 8.47162], ...|
|   833|[[164, 5.753483],...|
|   496|[[261, 10.102193]...|
|   148|[[225, 6.911752],...|
|  1088|[[432, 3.6580672]...|
|  1238|[[341, 2.5881758]...|
|   540|[[127, 6.5051794]...|
|   392|[[261, 8.603705],...|
|  1522|[[436, 1.9089429]...|
|   243|[[127, 4.931908],...|
|   623|[[261, 6.427389],...|
|  1084|[[412, 5.9489646]...|
|  1025|[[182, 9.078432],...|
|  1395|[[432, 2.0495114]...|
|   737|[[282, 11.114335]...|
|  1127|[[176, 4.9422536]...|
|  1270|[[282, 3.4825702]...|
+------+--------------------+
only showing top 20 rows

