# Creating the SparkSession object

In [1]:
from pyspark.sql import SparkSession


# Obtain the Spark session for this notebook under the application name "ALSWork1";
# getOrCreate() hands back an already-running session if one exists.
spark = (
    SparkSession.builder
    .appName("ALSWork1")
    .getOrCreate()
)


In [2]:
import os

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

# Creating a DataFrame from the ratings file

In [3]:
# Load the MovieLens ratings CSV into a Spark DataFrame.
# The file location can be overridden with the RATINGS_CSV environment variable;
# when it is not set, the original local path is used so existing setups keep working.
ratings_path = os.environ.get(
    "RATINGS_CSV",
    "/home/tushar/Desktop/gitRepos/Movie-Lens-recommendation-ALS-/ml-latest-small/ratings.csv",
)

# header=True takes the column names from the first row; inferSchema=True lets
# Spark detect the column types instead of reading every field as a string.
df = spark.read.csv(ratings_path, inferSchema=True, header=True)

# Preview the first 20 rows.
df.show()

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
|     1|     70|   3.0|964982400|
|     1|    101|   5.0|964980868|
|     1|    110|   4.0|964982176|
|     1|    151|   5.0|964984041|
|     1|    157|   5.0|964984100|
|     1|    163|   5.0|964983650|
|     1|    216|   5.0|964981208|
|     1|    223|   3.0|964980985|
|     1|    231|   5.0|964981179|
|     1|    235|   4.0|964980908|
|     1|    260|   5.0|964981680|
|     1|    296|   3.0|964982967|
|     1|    316|   3.0|964982310|
|     1|    333|   5.0|964981179|
|     1|    349|   4.0|964982563|
+------+-------+------+---------+
only showing top 20 rows



In [4]:
# Sanity-check the data with per-column summary statistics (count, mean, stddev, min, max).
# Identical "count" values across all columns suggest that no column has missing entries.
column_stats = df.describe()
column_stats.show()

+-------+------------------+----------------+------------------+--------------------+
|summary|            userId|         movieId|            rating|           timestamp|
+-------+------------------+----------------+------------------+--------------------+
|  count|            100836|          100836|            100836|              100836|
|   mean|326.12756356856676|19435.2957177992| 3.501556983616962|1.2059460873684695E9|
| stddev| 182.6184914635004|35530.9871987003|1.0425292390606342|2.1626103599513078E8|
|    min|                 1|               1|               0.5|           828124615|
|    max|               610|          193609|               5.0|          1537799250|
+-------+------------------+----------------+------------------+--------------------+



In [53]:
# Fixed seed so the train/test split, the fitted model and the reported RMSE
# are reproducible across kernel restarts.
SEED = 42

# Split the ratings into 70% training and 30% test data.
training, test = df.randomSplit([0.7, 0.3], seed=SEED)

# nonnegative=True applies non-negativity constraints to the factorisation,
# which prevents negative predicted ratings.
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    nonnegative=True,
    seed=SEED,
)

# Users or movies that appear only in the test split have no learned factors and
# would get NaN predictions; "drop" removes those rows from the prediction
# DataFrame so the evaluation metric is not NaN.
als.setColdStartStrategy("drop")

ALS_485a841a248b9ed92757

In [55]:
# Train the ALS recommender on the training split.
model = als.fit(dataset=training)

In [56]:
# Score the held-out test split; the result carries an extra "prediction" column.
predictions = model.transform(dataset=test)

In [57]:
# Preview the first 20 predictions next to the actual ratings.
predictions.show(n=20)

+------+-------+------+----------+----------+
|userId|movieId|rating| timestamp|prediction|
+------+-------+------+----------+----------+
|   385|    471|   4.0| 850766697| 3.2162707|
|   602|    471|   4.0| 840876085| 2.8254867|
|   500|    471|   1.0|1005528017|  4.002975|
|   610|    471|   4.0|1479544381| 3.2390215|
|   520|    471|   5.0|1326609921| 3.2856066|
|   273|    471|   5.0| 835861348| 3.3858485|
|   216|    471|   3.0| 975212641| 3.9241257|
|   260|    471|   4.5|1109409455| 3.4699805|
|   373|    471|   5.0| 846830388| 4.3931475|
|   357|    471|   3.5|1348627082|  4.055018|
|   599|    833|   1.5|1519330029| 1.5111393|
|   169|   1088|   4.5|1059427717|  4.336215|
|    41|   1088|   1.5|1458939142| 3.2766645|
|   286|   1088|   3.5|1119561544|  3.535578|
|   387|   1088|   1.5|1095040878|  2.577851|
|   381|   1088|   3.5|1168664508|  3.854556|
|   583|   1088|   3.5|1481474480| 3.9458833|
|   414|   1088|   3.0| 961514273| 3.2335653|
|   188|   1088|   4.0| 962560253|

In [58]:
# Ratings are continuous values, so the predictions are scored with a regression
# metric: RMSE (root mean squared error) between "rating" and "prediction".
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)

In [59]:
# Compute the RMSE of the test-set predictions and print it.
rmse = evaluator.evaluate(dataset=predictions)
print(rmse)

# Looking up all movies rated by a user in the test dataset


In [63]:
# Ask for a user id and list the (movieId, userId) pairs that this user has in
# the test split; these pairs are the input for the per-user predictions.
# The value is stored as `user_id` rather than `id` so the built-in id() is not
# shadowed for the rest of the kernel session.
user_id = int(input("enter user id:"))
user = test.filter(test["userId"] == user_id).select(["movieId", "userId"])
user.show()

enter user id11
+-------+------+
|movieId|userId|
+-------+------+
|    165|    11|
|    208|    11|
|    292|    11|
|    377|    11|
|    466|    11|
|    474|    11|
|    529|    11|
|    736|    11|
|   1100|    11|
|   1101|    11|
|   1385|    11|
|   1391|    11|
|   1438|    11|
|   1518|    11|
|   1597|    11|
|   1604|    11|
|   1616|    11|
|   1687|    11|
|   1882|    11|
|   1918|    11|
+-------+------+



# Movie ratings are predicted for the user from their previous ratings

In [67]:
# Predict how the selected user would rate each of their test-set movies.
recommeded = model.transform(dataset=user)

# Show the predicted ratings ordered by movieId, with prediction as the secondary key.
# NOTE(review): sorting by movieId first does not rank movies by predicted rating;
# if a "most liked first" list is intended, sort by prediction descending - confirm.
recommeded.orderBy("movieId", "prediction").show()

+-------+------+----------+
|movieId|userId|prediction|
+-------+------+----------+
|    165|    11| 3.6684828|
|    208|    11|  2.711164|
|    292|    11|  3.586186|
|    377|    11| 3.5656059|
|    466|    11| 2.2957363|
|    474|    11|  4.100631|
|    529|    11| 3.7298448|
|    736|    11| 3.1874573|
|   1100|    11| 3.3059604|
|   1101|    11| 3.7830806|
|   1385|    11| 2.9659758|
|   1391|    11| 2.7141316|
|   1438|    11| 2.7901266|
|   1518|    11| 3.6351182|
|   1597|    11|  3.470031|
|   1616|    11| 3.7754712|
|   1687|    11|  3.297877|
|   1882|    11| 2.4247594|
|   1918|    11| 3.2869189|
+-------+------+----------+

