In [1]:
import pyspark as ps
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
import pandas as pd
import numpy as np

In [6]:
spark = SparkSession.builder.master('local[4]').getOrCreate()

In [7]:
ratings_df = pd.read_csv('data/training.csv')
train_df = ratings_df.sort_values('timestamp', ascending=True)[:-100000]
test_df = ratings_df.sort_values('timestamp', ascending=True)[-100000:]

In [8]:
train_spark_df = spark.createDataFrame(train_df)
test_spark_df = spark.createDataFrame(test_df)

In [9]:
train_spark_df = train_spark_df.drop('timestamp')
test_spark_df = test_spark_df.drop('timestamp')

In [117]:
ALS?

In [10]:
als_model = ALS(
    itemCol='user',
    userCol='movie',
    ratingCol='rating',
    nonnegative=True,    
    regParam=0.1,
    rank=100,
    maxIter=10
    )

In [10]:
from src.recommender import MovieRecommender as MR

In [11]:
ratings_fit = als_model.fit(train_spark_df)

In [14]:
from pyspark.sql.types import *

In [15]:
schema = StructType( [
        StructField('user', IntegerType(), True),
        StructField('movie', IntegerType(), True)])

In [16]:
requests = spark.read.csv('data/requests.csv', header=True, schema = schema, inferSchema=False)

In [17]:
def casting_function(row):
    id, movie = row
    return int(id), int(movie)

In [18]:
type(requests)

pyspark.sql.dataframe.DataFrame

In [19]:
requests = requests.rdd.map(casting_function)

In [20]:
requests = spark.createDataFrame(requests, schema)

In [21]:
requests.printSchema()

root
 |-- user: integer (nullable = true)
 |-- movie: integer (nullable = true)



In [50]:
requests.show()

+----+-----+
|user|movie|
+----+-----+
|4958| 1924|
|4958| 3264|
|4958| 2634|
|4958| 1407|
|4958| 2399|
|4958| 3489|
|4958| 2043|
|4958| 2453|
|5312| 3267|
|5948| 3098|
|5948| 1180|
|3158| 2648|
| 403| 1036|
|3693|  468|
|5950| 1262|
|5950| 3555|
|5950| 3793|
|5950| 3578|
|5950| 3948|
|5950| 3893|
+----+-----+
only showing top 20 rows



In [39]:
preds = ratings_fit.transform(test_spark_df)

In [49]:
preds.show()

+----+-----+------+----------+
|user|movie|rating|prediction|
+----+-----+------+----------+
| 833|  593|     5|       NaN|
| 833| 2840|     3|       NaN|
| 833| 3798|     4|       NaN|
| 833| 1517|     2|       NaN|
| 833| 3827|     4|       NaN|
| 833|  266|     5|       NaN|
| 833| 1035|     5|       NaN|
| 833|  316|     4|       NaN|
| 833|  318|     5|       NaN|
| 833| 1271|     4|       NaN|
| 833| 2688|     5|       NaN|
| 833| 3101|     3|       NaN|
| 833|  527|     4|       NaN|
| 833| 1704|     4|       NaN|
| 833| 2858|     5|       NaN|
| 833|  457|     3|       NaN|
| 833| 2028|     5|       NaN|
| 833|  344|     2|       NaN|
| 833| 2762|     4|       NaN|
| 833| 1193|     5|       NaN|
+----+-----+------+----------+
only showing top 20 rows



In [61]:
preds_rating = preds.select('rating', 'prediction')

In [58]:
from pyspark.ml.param import Params

In [60]:
paramap = Params()

In [25]:
from pyspark.ml.evaluation import RegressionEvaluator

In [65]:
evaluator = RegressionEvaluator(labelCol='rating', predictionCol='prediction')

In [66]:
evaluator.evaluate(preds_rating)

nan

In [105]:
preds_df = pd.DataFrame(preds, columns=['user', 'movie', 'rating'])

In [106]:
preds_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200209 entries, 0 to 200208
Data columns (total 3 columns):
user      200209 non-null int64
movie     200209 non-null int64
rating    104439 non-null float64
dtypes: float64(1), int64(2)
memory usage: 4.6 MB


In [72]:
pd.DataFrame.to_csv(preds_df, 'submit.csv', na_rep='NaN', index=False)

In [70]:
ratings_pandas_df = train_spark_df.toPandas()

In [71]:
avg_movie_ratings_pandas_df = ratings_pandas_df.groupby('movie').mean()

In [72]:
avg_movie_ratings_pandas_df.drop('user', axis=1, inplace=True)

In [107]:
preds_df_nonan = preds_df.join(avg_movie_ratings_pandas_df, on='movie', how='left', rsuffix='movie')

In [85]:
avg_movie_ratings_pandas_df.columns

Index([u'rating'], dtype='object')

In [82]:
ratings_pandas_df

Unnamed: 0,user,movie,rating
0,6040,858,4
1,6040,593,5
2,6040,2384,4
3,6040,1961,4
4,6040,2019,5
5,6040,1419,3
6,6040,573,4
7,6040,3111,5
8,6040,213,5
9,6040,3505,4


In [108]:
preds_df_nonan['rating'].fillna(preds_df_nonan.ratingmovie)

0         3.334552
1         3.730519
2         2.519337
3         2.803681
4         3.691943
5         3.766690
6         4.517886
7         3.690180
8         3.585072
9         3.187192
10        3.983509
11        3.340267
12        3.179832
13        3.500000
14        2.068376
15        3.709184
16        2.262195
17        3.733945
18        3.000000
19        3.510204
20        3.986348
21        3.228814
22        4.119298
23        3.683721
24        2.634573
25        4.359833
26        3.671932
27        3.685484
28        3.862182
29        3.296954
            ...   
200179    3.527469
200180    3.171310
200181    4.151720
200182    2.999188
200183    4.193201
200184    2.817289
200185    3.379818
200186    2.726467
200187    3.565174
200188    2.947255
200189    3.195085
200190    2.904865
200191    3.257633
200192    3.743273
200193    2.629044
200194    3.963777
200195    2.980510
200196    4.240873
200197    3.903620
200198    4.120227
200199    3.581655
200200    3.

In [109]:
preds_df_nonan['rating'] = preds_df_nonan['rating'].fillna(preds_df_nonan.ratingmovie)

In [110]:
preds_df2 = preds_df_nonan.drop('ratingmovie', axis=1)

In [111]:
preds_df2

Unnamed: 0,user,movie,rating
0,148,1088,3.334552
1,148,1580,3.730519
2,148,2122,2.519337
3,148,2142,2.803681
4,148,2366,3.691943
5,148,3175,3.766690
6,148,858,4.517886
7,148,1127,3.690180
8,148,1721,3.585072
9,148,3698,3.187192


In [101]:
pd.DataFrame.to_csv(preds_df2, 'submit.csv', na_rep='NaN', index=False)