In [3]:
# findspark.init is called before the pyspark import so that the Spark
# installation at this path is the one pyspark is loaded from.
import findspark
spark_home = '/home/ubuntu/spark-2.2.0-bin-hadoop2.7'
findspark.init(spark_home)

from pyspark.sql import SparkSession
# Build (or reuse) the session under the application name 'rec'.
session_builder = SparkSession.builder.appName('rec')
spark = session_builder.getOrCreate()

In [7]:
# The file name carries its own leading slash, so plain concatenation with
# the folder produces the full path.
file = '/movielens_ratings.csv'
folder = '/home/ubuntu/data/raw'
ratings_path = folder + file
# The first CSV row is the header; column types are inferred from the values.
data = spark.read.csv(ratings_path, inferSchema=True, header=True)
data.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- userId: integer (nullable = true)



In [9]:
# Each row is one rating: the user in `userId` rated the movie in `movieId`.
# For example, user 0 gave movie 2 a rating of 3.0 and movie 3 a rating of 1.0.
data.show(n=5)

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|      2|   3.0|     0|
|      3|   1.0|     0|
|      5|   2.0|     0|
|      9|   4.0|     0|
|     11|   1.0|     0|
+-------+------+------+
only showing top 5 rows



In [12]:
# Print summary statistics (count, mean, stddev, min, max) for every column.
summary_stats = data.describe()
summary_stats.show()
# Last expression of the cell: echo the list of column names.
data.columns

+-------+------------------+------------------+------------------+
|summary|           movieId|            rating|            userId|
+-------+------------------+------------------+------------------+
|  count|              1501|              1501|              1501|
|   mean| 49.40572951365756|1.7741505662891406|14.383744170552964|
| stddev|28.937034065088994| 1.187276166124803| 8.591040424293272|
|    min|                 0|               1.0|                 0|
|    max|                99|               5.0|                29|
+-------+------------------+------------------+------------------+



['movieId', 'rating', 'userId']

In [13]:
import pandas as pd
import numpy as np
# Feature correlations

from pyspark.sql.functions import corr, covar_pop
important_columns = ['movieId', 'rating', 'userId']

# Square matrix of pairwise Pearson correlations: rows and columns are both
# labelled with the names in `important_columns`.
corr_df = pd.DataFrame(columns=important_columns, index=important_columns)

# Every ordered (row, column) pair, so the full matrix is filled, diagonal included.
column_pairs = [(feature, feature2)
                for feature in important_columns
                for feature2 in important_columns]

# Compute all correlations in a single Spark aggregation instead of launching
# one job per pair; the resulting Row holds the values in `column_pairs` order.
corr_values = data.select([corr(feature, feature2)
                           for feature, feature2 in column_pairs]).head()

for (feature, feature2), value in zip(column_pairs, corr_values):
    # Assign with a single .loc[row, col] call. The chained form
    # corr_df[col][row] = ... may write to a temporary copy and is silently
    # ignored when pandas Copy-on-Write is enabled.
    corr_df.loc[feature, feature2] = value
corr_df.head()


Unnamed: 0,movieId,rating,userId
movieId,1.0,0.0365686,0.00326711
rating,0.0365686,1.0,0.0564114
userId,0.00326711,0.0564114,1.0


# Split Data

In [17]:
# Randomly split the ratings roughly 80% / 20% into training and test sets.
# The split is random, so a fixed seed is passed to get the same partition
# (and therefore comparable metrics) each time the notebook is re-run.
data_train, data_test = data.randomSplit([0.8, 0.2], seed=42)
data_train.describe().show()

+-------+------------------+------------------+-----------------+
|summary|           movieId|            rating|           userId|
+-------+------------------+------------------+-----------------+
|  count|              1188|              1188|             1188|
|   mean| 49.34848484848485|1.7516835016835017|14.51010101010101|
| stddev|29.147328279052385|1.1587354148298872|8.611471297083183|
|    min|                 0|               1.0|                0|
|    max|                99|               5.0|               29|
+-------+------------------+------------------+-----------------+



# Make ALS Model

In [33]:
from pyspark.ml.recommendation import ALS

In [34]:
# ALS collaborative-filtering model on the explicit ratings in `rating`,
# keyed by `userId` and `movieId`.
als = ALS(maxIter=5, regParam=0.01,
          userCol='userId',
          itemCol='movieId',
          ratingCol='rating',
          # The factor matrices are initialised randomly; fixing the seed makes
          # the fitted model repeatable across runs.
          seed=42,
          # Users or movies that were not in the training split get a NaN
          # prediction by default, which turns the RMSE into NaN. 'drop'
          # removes those rows from the transform output instead.
          coldStartStrategy='drop')
als_trained = als.fit(data_train)


# Evaluate Model

In [35]:
# Score the held-out ratings; the output gains a `prediction` column next to
# the actual `rating`.
predictions = als_trained.transform(data_test)
predictions.show(n=20)
# Observations from the recorded run:
#   - user 27 rated movie 31 with 1.0 (predicted ~0.8)
#   - user 12 rated movie 31 with 4.0 (predicted ~2.1)
#   - predictions for movie 85 are far from the actual ratings

+-------+------+------+-----------+
|movieId|rating|userId| prediction|
+-------+------+------+-----------+
|     31|   1.0|    27|   0.825664|
|     31|   4.0|    12|  2.1081233|
|     31|   1.0|    13|  1.6430783|
|     31|   1.0|     4|  1.2927738|
|     31|   3.0|     8| -0.3394109|
|     31|   3.0|     7|  1.8622601|
|     31|   1.0|     0| 0.27617615|
|     85|   3.0|     6| 0.20896734|
|     85|   1.0|     5|  2.5142992|
|     85|   1.0|    15| 0.60128534|
|     85|   5.0|     8| -1.5788807|
|     85|   1.0|    23|  -1.662189|
|     85|   4.0|     7| 0.22075252|
|     85|   3.0|    21| 0.66022676|
|     65|   1.0|    19|  1.2252744|
|     65|   1.0|     4| 0.13463509|
|     53|   1.0|    12|  1.1106814|
|     53|   3.0|    13| -0.5959698|
|     53|   1.0|    25|-0.21666104|
|     53|   5.0|    21|  2.6518016|
+-------+------+------+-----------+
only showing top 20 rows



In [36]:
# Build a regression evaluator that compares `prediction` against `rating`
# using root-mean-squared error.
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(predictionCol='prediction',
                                labelCol='rating',
                                metricName='rmse')
rmse = evaluator.evaluate(predictions)
print('RMSE: ', rmse)
# Summary statistics of the scored test set, including the prediction column.
predictions.describe().show()

# An RMSE of about 1.7 on a 1-5 rating scale is a poor fit.

RMSE:  1.7078394179966152
+-------+------------------+------------------+------------------+------------------+
|summary|           movieId|            rating|            userId|        prediction|
+-------+------------------+------------------+------------------+------------------+
|  count|               313|               313|               313|               313|
|   mean| 49.62300319488818|1.8594249201277955|13.904153354632587|1.2495884822708325|
| stddev|28.169218195974587|1.2882788862416283| 8.509633417456003|1.2405238763651232|
|    min|                 0|               1.0|                 0|        -1.7079314|
|    max|                98|               5.0|                29|          4.296867|
+-------+------------------+------------------+------------------+------------------+



In [37]:
#Evaluation
from pyspark.ml.evaluation import RegressionEvaluator


# Predictions for Specific Users

In [38]:
# Test-set rows belonging to user 11, reduced to the two ID columns.
is_user_11 = data_test['userId'] == 11
single_user = data_test.where(is_user_11).select('movieId', 'userId')
single_user.show()

+-------+------+
|movieId|userId|
+-------+------+
|      0|    11|
|     23|    11|
|     37|    11|
|     40|    11|
|     50|    11|
|     59|    11|
|     61|    11|
|     62|    11|
|     66|    11|
|     71|    11|
|     76|    11|
|     77|    11|
|     78|    11|
|     81|    11|
|     90|    11|
+-------+------+



In [41]:
# Which movies should we recommend to user 11?
# Score the user's (movieId, userId) pairs, then list them from the highest
# predicted rating to the lowest.
recommendations = als_trained.transform(single_user)
ranked_recommendations = recommendations.orderBy('prediction', ascending=False)
ranked_recommendations.show()

+-------+------+------------+
|movieId|userId|  prediction|
+-------+------+------------+
|     76|    11|    4.120113|
|     50|    11|   2.8945935|
|     66|    11|    2.474843|
|     37|    11|   1.6769702|
|     81|    11|    1.585211|
|     23|    11|   1.4453884|
|     62|    11|   1.3736115|
|     90|    11|   1.1445829|
|     77|    11|   0.9150108|
|      0|    11|   0.5708736|
|     78|    11|   0.5675636|
|     61|    11|  0.51073366|
|     71|    11|    0.503191|
|     59|    11|-0.012784004|
|     40|    11|  -0.8294847|
+-------+------+------------+

