### Prepare CSV for ALS

In [7]:
from helpers_als import prepare_csv_for_als
# Reminder: the header line "Id,Prediction" has to be removed from the CSV.
# NOTE(review): prepare_csv_for_als lives in helpers_als (not shown here);
# presumably it rewrites the file into the comma-separated
# movieId,userId,rating rows that the loading cells parse -- confirm.
prepare_csv_for_als("data/sampleSubmission_processed.csv")

### Load training data (data_train_processed.csv)

In [20]:
from pyspark.sql import Row
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists, otherwise create it.
spark = (
    SparkSession.builder
    .appName("ALSExample")
    .getOrCreate()
)


def _to_rating_row(fields):
    """Turn the split fields [movieId, userId, rating] of one line into a Row."""
    return Row(movieId=int(fields[0]), userId=int(fields[1]),
               rating=float(fields[2]))


# Read the file as plain text; each record exposes its line as `value`.
# Alternative inputs noted by the author: "data/data_train_woHeader.csv",
# "data/try_woHeader.csv".
lines = spark.read.text("data/data_train_processed.csv").rdd
parts = lines.map(lambda row: row.value.split(","))
ratingsRDD = parts.map(_to_rating_row)
ratings = spark.createDataFrame(ratingsRDD)

### Load sampleSubmission_processed.csv for prediction by ALS

In [21]:
# Parse the sample submission: every line is split on "," and the three
# fields are converted to movieId (int), userId (int) and rating (float).
# A header line would make the int() conversion fail, so the file must not
# contain one.
lines = spark.read.text("data/sampleSubmission_processed.csv").rdd
parts = lines.map(lambda row: row.value.split(","))
ratingsRDD = parts.map(
    lambda p: Row(
        movieId=int(p[0]),
        userId=int(p[1]),
        rating=float(p[2]),
    )
)
submission_ratings = spark.createDataFrame(ratingsRDD)

### ALS without cross-validation

In [36]:
from pyspark.ml.evaluation import RegressionEvaluator

# RMSE between the true "rating" column and the model's "prediction" column.
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")

# Random split with weights 0.8 / 0.2 (train / test). The seed is fixed so
# that the split -- and every metric computed from it -- is the same after a
# kernel restart; without it each run evaluates on different rows.
SPLIT_SEED = 42
(training, test) = ratings.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

In [38]:
from pyspark.ml.recommendation import ALS

# Build the recommendation model using ALS on the training data.
# A fixed seed makes the random initialisation of the factor matrices
# reproducible, so repeated runs on the same split give the same model.
# NOTE(review): the RMSE values recorded further down (about 0.007 on the
# training split vs. about 3.30 on the test split) indicate heavy
# overfitting; a larger regParam is worth trying.
# NOTE(review): users/movies that appear only in the test split get NaN
# predictions under the default cold-start behaviour; on Spark >= 2.2
# coldStartStrategy="drop" avoids a NaN RMSE -- confirm the Spark version.
als = ALS(rank=20, maxIter=10, regParam=0.01,
          userCol="userId", itemCol="movieId", ratingCol="rating",
          seed=42)
model = als.fit(training)

In [39]:
# Score the held-out test split with the fitted ALS model; the result
# is what the evaluator reads for the test RMSE.
predictions_te = model.transform(test)

In [42]:
# RMSE of the model on the held-out test split.
rmse_te = evaluator.evaluate(predictions_te)
print("Root-mean-square error = " + str(rmse_te))
# The Spark session is deliberately left running: later cells still use it.

Root-mean-square error = 3.2957867503547407


In [43]:
# Score the training split itself, to compare training error with test error.
predictions_tr = model.transform(training)

In [44]:
# RMSE on the training split (the printed label is the same as for the test RMSE).
rmse_tr = evaluator.evaluate(predictions_tr)
print("Root-mean-square error = " + str(rmse_tr))

Root-mean-square error = 0.007158377883916877


### ALS with cross-validation

In [22]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator

# One seed for both the ALS initialisation and the fold assignment, so the
# cross-validation result is reproducible across kernel restarts.
CV_SEED = 42

# RMSE between the true "rating" column and the model's "prediction" column.
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")

# ALS estimator that is fitted on the folds and finally on the full data.
als_cv = ALS(rank=60, maxIter=14, regParam=0.01,
             userCol="userId", itemCol="movieId", ratingCol="rating",
             seed=CV_SEED)

# An empty builder yields a single (empty) parameter map: the estimator is
# cross-validated with exactly the hyper-parameters set above, no search.
paramGrid = ParamGridBuilder().build()

# NOTE(review): if a validation fold contains a user or movie that is absent
# from the corresponding training folds, ALS predicts NaN for it by default
# and the fold RMSE becomes NaN -- check the fitted model's avgMetrics.
crossval = CrossValidator(
    estimator=als_cv,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=4,
    seed=CV_SEED)

In [23]:
# Run the 4-fold cross-validation on the full ratings DataFrame.
# Note: this rebinds `model`, which the "ALS without cross-validation"
# section also assigns; after this cell it is the cross-validated model.
model = crossval.fit(ratings)

In [32]:
# Display the fitted cross-validation model (shows its identifier).
model

CrossValidatorModel_4eb3b8aa18d411682097

In [24]:
# Predict a rating for every (movieId, userId) pair of the sample submission;
# the result keeps the input columns and adds a "prediction" column.
pred_cv = model.transform(submission_ratings)

In [30]:
# RMSE between the predictions and the "rating" column of the sample submission.
# NOTE(review): those ratings come from the sample submission file and are
# presumably placeholders rather than true ratings; if so, this value is not
# a measure of model accuracy -- confirm before reporting it.
rmse = evaluator.evaluate(pred_cv)

In [31]:
# Display the RMSE computed against the sample submission's rating column.
rmse

1.292028356338118

### Prepare file for submission

In [25]:
# Save the predictions as CSV with a header row. coalesce(1) reduces the
# DataFrame to one partition so a single part file is written inside the
# output directory "data/sampleSubmission_new".
# mode("overwrite") replaces an output directory left over from a previous
# run; with the default save mode the write fails when the path already
# exists, which made this cell non-re-runnable.
(pred_cv.coalesce(1)
    .write
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .mode("overwrite")
    .save("data/sampleSubmission_new"))
# The part file gets a generated name; the next cell locates it with glob
# and writes the final submission file from it.

In [29]:
import glob
from helpers_als import make_submission

# Locate the CSV part file(s) Spark wrote into the output directory.
# NOTE(review): glob.glob returns a list of paths, so make_submission
# receives a list as input_file -- confirm the helper expects that.
part_files = glob.glob("data/sampleSubmission_new/*.csv")
make_submission(input_file=part_files, output_file="data/newSubmission.csv")