In [1]:
import pyspark as ps
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
import pandas as pd
import numpy as np

In [2]:
# Attach to the active SparkSession, or start one with default settings.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [3]:
# Raw ratings; the captured .info() output shows user, movie, rating and
# timestamp columns.
ratings_df = pd.read_csv('data/training.csv')

# Oldest-to-newest ordering with the 100,000 most recent rows cut off.
# NOTE(review): train_df is not referenced again in the visible cells -- the
# Spark frame is built from ratings_df -- confirm that is intended.
train_df = ratings_df.sort_values(by='timestamp').iloc[:-100000]

In [4]:
# Summarise the ratings frame: column dtypes, non-null counts and memory use.
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 4 columns):
user         800000 non-null int64
movie        800000 non-null int64
rating       800000 non-null int64
timestamp    800000 non-null int64
dtypes: int64(4)
memory usage: 24.4 MB


In [5]:
# Convert the full pandas ratings frame (ratings_df, not train_df) into a
# Spark DataFrame.
ratings_spark_df = spark.createDataFrame(ratings_df)

In [6]:
# Drop the timestamp column; the ALS estimator configured in this notebook
# only names the user, movie and rating columns.
ratings_spark_df = ratings_spark_df.drop('timestamp')

In [7]:
# ALS collaborative-filtering estimator for the (user, movie, rating) frame.
# userCol and itemCol must name the matching columns: they were previously
# swapped (itemCol='user', userCol='movie'), which mislabels the user and
# item roles of the fitted model (its user/item factors and anything
# derived from them).
als_model = ALS(
    userCol='user',
    itemCol='movie',
    ratingCol='rating',
    nonnegative=True,  # constrain the learned factors to be non-negative
    regParam=0.1,      # regularisation strength
    rank=10,           # number of latent factors
)

In [10]:
from src.recommender import MovieRecommender as MR

In [11]:
# Fit the ALS estimator on the Spark ratings frame; the fitted model is what
# later scores the request pairs via .transform().
ratings_fit = als_model.fit(ratings_spark_df)

In [45]:
# Load the (user, movie) pairs to be scored.
# The schema is given as a DDL string so this cell is self-contained: the
# `schema` StructType object and the pyspark.sql.types import are only
# defined in later cells, so referencing them here raises NameError on a
# fresh "Restart & Run All". An explicit schema also makes inferSchema
# unnecessary.
requests = spark.read.csv(
    'data/requests.csv',
    header=True,
    schema='user INT, movie INT',
)

In [26]:
def casting_function(row):
    """Convert a two-field row into a tuple of two ints.

    Args:
        row: An iterable holding exactly two values, each accepted by int().

    Returns:
        A 2-tuple ``(int(first), int(second))`` in the original field order.

    Raises:
        ValueError: If ``row`` does not unpack into exactly two values, or a
            value is a string that int() cannot parse.
        TypeError: If a value is of a type int() does not accept (e.g. None).
    """
    user_id, movie_id = row
    return tuple(map(int, (user_id, movie_id)))

In [27]:
# Interactive check of what kind of object `requests` currently is.
type(requests)

pyspark.sql.dataframe.DataFrame

In [46]:
# Convert every row to a tuple of two Python ints via casting_function.
# NOTE: this rebinds `requests` from a DataFrame to an RDD, so the cell is not
# re-runnable on its own (an RDD has no `.rdd` attribute) until `requests` is
# turned back into a DataFrame.
requests = requests.rdd.map(casting_function)

In [37]:
from pyspark.sql.types import *

In [47]:
# Two nullable integer columns, in the order the request pairs are stored.
schema = StructType([
    StructField(column_name, IntegerType(), True)
    for column_name in ('user', 'movie')
])

In [48]:
# Turn the mapped (user, movie) tuples back into a Spark DataFrame with the
# explicit integer schema.
requests = spark.createDataFrame(requests, schema)

In [49]:
# Confirm that both request columns are integers.
requests.printSchema()

root
 |-- user: integer (nullable = true)
 |-- movie: integer (nullable = true)



In [50]:
# Preview the first rows of the request pairs.
requests.show()

+----+-----+
|user|movie|
+----+-----+
|4958| 1924|
|4958| 3264|
|4958| 2634|
|4958| 1407|
|4958| 2399|
|4958| 3489|
|4958| 2043|
|4958| 2453|
|5312| 3267|
|5948| 3098|
|5948| 1180|
|3158| 2648|
| 403| 1036|
|3693|  468|
|5950| 1262|
|5950| 3555|
|5950| 3793|
|5950| 3578|
|5950| 3948|
|5950| 3893|
+----+-----+
only showing top 20 rows



In [53]:
# Score every requested (user, movie) pair with the fitted model, then pull
# all result rows back to the driver as a Python list.
# NOTE(review): roughly half of the resulting predictions are null in the
# captured .info() output -- presumably pairs whose user or movie was not seen
# during fitting; confirm.
preds = ratings_fit.transform(requests).collect()

In [69]:
# Wrap the collected rows in a pandas frame. Column names are assigned by
# position, so the model's output (third) column is labelled 'rating'.
preds_df = pd.DataFrame(preds, columns=['user', 'movie', 'rating'])

In [70]:
# Check the row count and how many predicted ratings are non-null.
preds_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200209 entries, 0 to 200208
Data columns (total 3 columns):
user      200209 non-null int64
movie     200209 non-null int64
rating    104439 non-null float64
dtypes: float64(1), int64(2)
memory usage: 4.6 MB


In [71]:
# Preview only the first rows; displaying the bare frame would render the
# whole ~200k-row prediction table as cell output.
preds_df.head(10)

Unnamed: 0,user,movie,rating
0,148,1088,
1,148,1580,
2,148,2122,
3,148,2142,
4,148,2366,
5,148,3175,
6,148,858,
7,148,1127,
8,148,1721,
9,148,3698,


In [72]:
# Write the predictions to the submission file: missing ratings are emitted
# as the literal text NaN and the row index is left out.
preds_df.to_csv('submit.csv', index=False, na_rep='NaN')