In [1]:
import pyspark as ps
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
import pandas as pd
import numpy as np

In [2]:
spark = SparkSession.builder.master('local[4]').getOrCreate()

In [3]:
ratings_df = pd.read_csv('data/training.csv')
train_df = ratings_df.sort_values('timestamp', ascending=True)[:-100000]
test_df = ratings_df.sort_values('timestamp', ascending=True)[-100000:]

In [4]:
train_spark_df = spark.createDataFrame(train_df)
test_spark_df = spark.createDataFrame(test_df)

In [5]:
train_spark_df = train_spark_df.drop('timestamp')
test_spark_df = test_spark_df.drop('timestamp')

In [117]:
ALS?

In [6]:
als_model = ALS(
    itemCol='user',
    userCol='movie',
    ratingCol='rating',
    nonnegative=True,    
    regParam=0.1,
    rank=100,
    maxIter=10
    )

In [7]:
from src.recommender import MovieRecommender as MR

In [8]:
ratings_fit = als_model.fit(train_spark_df)

In [10]:
from pyspark.sql.types import *

In [11]:
schema = StructType( [
        StructField('user', IntegerType(), True),
        StructField('movie', IntegerType(), True)])

In [12]:
requests = spark.read.csv('data/requests.csv', header=True, schema = schema, inferSchema=False)

In [13]:
def casting_function(row):
    id, movie = row
    return int(id), int(movie)

In [14]:
type(requests)

pyspark.sql.dataframe.DataFrame

In [15]:
requests = requests.rdd.map(casting_function)

In [16]:
requests = spark.createDataFrame(requests, schema)

In [17]:
requests.printSchema()

root
 |-- user: integer (nullable = true)
 |-- movie: integer (nullable = true)



In [18]:
requests.show(1)

+----+-----+
|user|movie|
+----+-----+
|4958| 1924|
+----+-----+
only showing top 1 row



In [19]:
preds = ratings_fit.transform(test_spark_df)

In [20]:
preds.show(1)

+----+-----+------+----------+
|user|movie|rating|prediction|
+----+-----+------+----------+
| 833|  593|     5|       NaN|
+----+-----+------+----------+
only showing top 1 row



In [21]:
preds_to_pandas = preds.toPandas()

In [24]:
preditions_df = preds_to_pandas.drop('rating', axis = 1)
ratings_df = preds_to_pandas.drop('prediction', axis = 1)

In [32]:
#evaluate with this function
def compute_score(predictions, actual):
    """Look at 5% of most highly predicted movies for each user.
    Return the average actual rating of those movies.
    """
    df = pd.merge(predictions, actual, on=['user','movie']).fillna(1.0)
    #df = pd.concat([predictions.fillna(1.0), actual.actualrating], axis=1)

    # for each user
    g = df.groupby('user')

    # detect the top_5 movies as predicted by your algorithm
    top_5 = g.rating.transform(
        lambda x: x >= x.quantile(.95)
    )

    # return the mean of the actual score on those
    #print top_5
    return df[top_5==1].mean()

In [33]:
score_model = compute_score(preditions_df, ratings_df)
score_model

user          1154.052222
movie         1757.252727
prediction       1.730692
rating           4.888756
dtype: float64

In [58]:
#from pyspark.ml.param import Params
#paramap = Params()
#from pyspark.ml.evaluation import RegressionEvaluator

In [65]:
evaluator = RegressionEvaluator(labelCol='rating', predictionCol='prediction')

In [66]:
evaluator.evaluate(preds_rating)

nan

In [105]:
preds_df = pd.DataFrame(preds, columns=['user', 'movie', 'rating'])

In [106]:
preds_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200209 entries, 0 to 200208
Data columns (total 3 columns):
user      200209 non-null int64
movie     200209 non-null int64
rating    104439 non-null float64
dtypes: float64(1), int64(2)
memory usage: 4.6 MB


In [72]:
pd.DataFrame.to_csv(preds_df, 'submit.csv', na_rep='NaN', index=False)

In [38]:
ratings_pandas_df = train_spark_df.toPandas()
avg_movie_ratings_pandas_df = ratings_pandas_df.groupby('movie').mean()
avg_movie_ratings_pandas_df.drop('user', axis=1, inplace=True)
preds_df_nonan = preds_df.join(avg_movie_ratings_pandas_df, on='movie', how='left', rsuffix='movie')

NameError: name 'preds_df' is not defined

In [35]:
pred

NameError: name 'preds_df' is not defined

In [85]:
avg_movie_ratings_pandas_df.columns

Index([u'rating'], dtype='object')

In [39]:
#ratings_pandas_df

In [40]:
#preds_df_nonan['rating'].fillna(preds_df_nonan.ratingmovie)

In [109]:
preds_df_nonan['rating'] = preds_df_nonan['rating'].fillna(preds_df_nonan.ratingmovie)

In [110]:
preds_df2 = preds_df_nonan.drop('ratingmovie', axis=1)

In [41]:
#preds_df2

In [101]:
pd.DataFrame.to_csv(preds_df2, 'submit.csv', na_rep='NaN', index=False)