<a href="https://colab.research.google.com/github/viniciusnatanss/datetime_and_pandas/blob/master/ciandt01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Install a headless Java 8 JDK with apt (JAVA_HOME is pointed at it further down).
# -qq and the stdout redirect to /dev/null keep the cell output quiet.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [0]:
# downloading spark/hadoop
!wget -q https://archive.apache.org/dist/spark/spark-2.3.4/spark-2.3.4-bin-hadoop2.6.tgz

In [3]:
# Sanity check: list the working directory; the downloaded Spark archive should appear.
!ls

sample_data  spark-2.3.4-bin-hadoop2.6.tgz


In [0]:
# Extract the Spark tarball into the current directory
# (creates the spark-2.3.4-bin-hadoop2.6/ folder that SPARK_HOME refers to).
!tar xf spark-2.3.4-bin-hadoop2.6.tgz

In [0]:
# Install findspark, the helper imported later to initialise the Spark environment.
# -q keeps pip's output quiet.
# NOTE(review): the version is not pinned, so the installed release may differ between runs.
!pip install -q findspark

In [0]:
# Point the environment at the Java 8 runtime and the extracted Spark distribution.
# Both variables are set for the current process (and anything it spawns).
import os

os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-2.3.4-bin-hadoop2.6",
})

In [0]:
# Initialise findspark so that pyspark can be imported, then start (or reuse)
# a SparkSession whose master is "local[*]", i.e. Spark runs locally in this
# runtime rather than on a remote cluster.
import findspark
findspark.init()
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [8]:
# loading our database
!wget http://files.grouplens.org/datasets/movielens/ml-latest-small.zip

--2020-04-30 21:09:58--  http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip’


2020-04-30 21:09:59 (7.67 MB/s) - ‘ml-latest-small.zip’ saved [978202/978202]



In [9]:
# Sanity check: list the working directory; ml-latest-small.zip should now be present.
!ls

ml-latest-small.zip  spark-2.3.4-bin-hadoop2.6
sample_data	     spark-2.3.4-bin-hadoop2.6.tgz


In [10]:
# unzip file
!unzip ml-latest-small.zip

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


In [0]:
# import library
import pandas as pd

In [12]:
# List the extracted dataset files (links, movies, ratings, tags and a README).
!ls ml-latest-small

links.csv  movies.csv  ratings.csv  README.txt	tags.csv


In [0]:
# Load the ratings CSV into a pandas DataFrame
# (columns: userId, movieId, rating, timestamp).
ratings_df = pd.read_csv('ml-latest-small/ratings.csv')

In [14]:
# Size of the sample: number of rows (one per rating) in the pandas DataFrame
len(ratings_df)

100836

In [15]:
# Preview the first rows (up to 20) of the ratings DataFrame
ratings_df.head(20)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868
7,1,110,4.0,964982176
8,1,151,5.0,964984041
9,1,157,5.0,964984100


In [16]:
# View the content of the movies file (columns: movieId, title, genres).
# The frame is only displayed here; it is not bound to a variable, so later
# cells cannot reuse it.
pd.read_csv('ml-latest-small/movies.csv')

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [0]:
# Read the same ratings CSV as a Spark DataFrame: the first line is treated as
# the header and column types are inferred from the data.
ratings_sdf = (
    spark.read
    .format('csv')
    .options(header='true', inferSchema='true')
    .load('ml-latest-small/ratings.csv')
)

In [18]:
# Print the first 10 rows of the Spark DataFrame as a text table
ratings_sdf.show(10)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
|     1|     70|   3.0|964982400|
|     1|    101|   5.0|964980868|
|     1|    110|   4.0|964982176|
|     1|    151|   5.0|964984041|
|     1|    157|   5.0|964984100|
+------+-------+------+---------+
only showing top 10 rows



In [0]:
# loading libraries to build and evaluate our recommendation model
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [0]:
# Create training and test datasets with a random 80% / 20% split of the ratings.
# A fixed seed makes the split repeatable, so results computed from it can be
# compared between runs instead of changing on every execution.
(training, test) = ratings_sdf.randomSplit([0.8, 0.2], seed=42)

In [0]:
# Build the recommendation model using ALS on the training data.
# - coldStartStrategy="drop" discards predictions that would be NaN, so the
#   evaluation metric does not become NaN.
# - seed is set explicitly so the random initialisation is pinned in the
#   notebook; for the same training data, repeated runs fit the same model.
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop", seed=42)
model = als.fit(training)

In [22]:
# Score the held-out test ratings with the fitted model and report the
# root-mean-square error between the "rating" and "prediction" columns.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)
predictions = model.transform(test)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.0832568042658028


In [23]:
# Generate top 10 movie recommendations for each user,
# then display the first 5 result rows (one Row per userId).
userRecs = model.recommendForAllUsers(10)
userRecs.head(5)

[Row(userId=471, recommendations=[Row(movieId=52435, rating=9.165359497070312), Row(movieId=2469, rating=7.743799686431885), Row(movieId=3270, rating=7.608659267425537), Row(movieId=8638, rating=7.497716903686523), Row(movieId=3030, rating=7.465417385101318), Row(movieId=89753, rating=7.294299125671387), Row(movieId=8042, rating=7.223275184631348), Row(movieId=2730, rating=7.12715482711792), Row(movieId=158783, rating=7.1158857345581055), Row(movieId=1635, rating=7.028524875640869)]),
 Row(userId=463, recommendations=[Row(movieId=97304, rating=7.538291931152344), Row(movieId=2986, rating=7.250640392303467), Row(movieId=166534, rating=7.196099281311035), Row(movieId=556, rating=6.839203357696533), Row(movieId=143859, rating=6.833131790161133), Row(movieId=5013, rating=6.790536403656006), Row(movieId=112623, rating=6.7399187088012695), Row(movieId=55276, rating=6.692137718200684), Row(movieId=86644, rating=6.629750728607178), Row(movieId=3438, rating=6.626420974731445)]),
 Row(userId=496

In [24]:
# Generate top 10 user recommendations for each movie,
# then print the first 10 result rows as a (truncated) text table.
movieRecs = model.recommendForAllItems(10)
movieRecs.show(10)

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|   1580|[[399, 5.531799],...|
|   4900|[[461, 13.276947]...|
|   6620|[[485, 7.9614954]...|
|   7340|[[335, 5.3420205]...|
|  32460|[[598, 9.589021],...|
|  54190|[[231, 9.348149],...|
|    471|[[296, 8.233962],...|
|   1591|[[231, 7.3192863]...|
| 140541|[[289, 6.9959207]...|
|   1342|[[147, 7.860112],...|
+-------+--------------------+
only showing top 10 rows



In [25]:
# Take 3 distinct user ids from the ratings and generate the top 10 movie
# recommendations for just those users.
# NOTE(review): distinct().limit(3) has no ordering, so which 3 users are
# picked is presumably not guaranteed to be stable between runs.
users = (
    ratings_sdf
    .select(als.getUserCol())
    .distinct()
    .limit(3)
)
userSubsetRecs = model.recommendForUserSubset(users, 10)
userSubsetRecs.head(10)

[Row(userId=471, recommendations=[Row(movieId=52435, rating=9.165359497070312), Row(movieId=2469, rating=7.743799686431885), Row(movieId=3270, rating=7.608659267425537), Row(movieId=8638, rating=7.497716903686523), Row(movieId=3030, rating=7.465417385101318), Row(movieId=89753, rating=7.294299125671387), Row(movieId=8042, rating=7.223275184631348), Row(movieId=2730, rating=7.12715482711792), Row(movieId=158783, rating=7.1158857345581055), Row(movieId=1635, rating=7.028524875640869)]),
 Row(userId=463, recommendations=[Row(movieId=97304, rating=7.538291931152344), Row(movieId=2986, rating=7.250640392303467), Row(movieId=166534, rating=7.196099281311035), Row(movieId=556, rating=6.839203357696533), Row(movieId=143859, rating=6.833131790161133), Row(movieId=5013, rating=6.790536403656006), Row(movieId=112623, rating=6.7399187088012695), Row(movieId=55276, rating=6.692137718200684), Row(movieId=86644, rating=6.629750728607178), Row(movieId=3438, rating=6.626420974731445)]),
 Row(userId=148

In [26]:
# Take 3 distinct movie ids from the ratings and generate the top 10 user
# recommendations for just those movies.
# head() with no argument displays only the first result Row.
movies = (
    ratings_sdf
    .select(als.getItemCol())
    .distinct()
    .limit(3)
)
movieSubSetRecs = model.recommendForItemSubset(movies, 10)
movieSubSetRecs.head()

Row(movieId=1580, recommendations=[Row(userId=399, rating=5.531798839569092), Row(userId=231, rating=5.192637920379639), Row(userId=485, rating=5.139752388000488), Row(userId=335, rating=5.000114440917969), Row(userId=60, rating=4.966618537902832), Row(userId=154, rating=4.952124118804932), Row(userId=544, rating=4.929132461547852), Row(userId=543, rating=4.922102928161621), Row(userId=147, rating=4.858897686004639), Row(userId=93, rating=4.800593852996826)])