In [1]:
import pyspark as ps
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
import pandas as pd
import numpy as np

In [2]:
spark = SparkSession.builder.getOrCreate()

In [3]:
ratings_df = pd.read_csv('data/training.csv')
train_df = ratings_df.sort_values('timestamp', ascending=True)[:-100000]

In [4]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 4 columns):
user         800000 non-null int64
movie        800000 non-null int64
rating       800000 non-null int64
timestamp    800000 non-null int64
dtypes: int64(4)
memory usage: 24.4 MB


In [5]:
ratings_spark_df = spark.createDataFrame(ratings_df)

In [6]:
ratings_spark_df = ratings_spark_df.drop('timestamp')

In [7]:
als_model = ALS(
    itemCol='user',
    userCol='movie',
    ratingCol='rating',
    nonnegative=True,    
    regParam=0.1,
    rank=10
    )

In [10]:
from src.recommender import MovieRecommender as MR

In [11]:
ratings_fit = als_model.fit(ratings_spark_df)

In [12]:
requests = spark.read.csv('data/requests.csv')

In [14]:
ratings_fit.transform(ratings_spark_df).show()

+----+-----+------+----------+
|user|movie|rating|prediction|
+----+-----+------+----------+
| 833|  593|     5| 4.2482715|
| 833| 2840|     3| 3.2312734|
| 833| 3798|     4| 3.8320644|
| 833| 1517|     2| 2.7849631|
| 833| 3827|     4| 3.6565695|
| 833|  266|     5| 3.9061263|
| 833| 1035|     5| 4.2161407|
| 833|  316|     4|  3.681997|
| 833|  318|     5|  4.504649|
| 833| 1271|     4|  4.161791|
| 833| 2688|     5| 3.6521587|
| 833| 3101|     3|  3.915731|
| 833|  527|     4|  4.604813|
| 833| 1704|     4| 4.2295113|
| 833| 2858|     5| 4.0516725|
| 833|  457|     3| 4.1674733|
| 833| 2028|     5| 4.3511543|
| 833|  344|     2| 2.6099944|
| 833| 2762|     4| 4.4686275|
| 833| 1193|     5|   4.20213|
+----+-----+------+----------+
only showing top 20 rows

