In [1]:
# Useful starting lines
%matplotlib inline

import numpy as np
import scipy
import scipy.io
import scipy.sparse as sp
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

### My example - Arrange csv file

In [106]:
import fileinput

# DON'T FORGET TO REMOVE HEADER Id,Prediction
with fileinput.FileInput("data/data_train_woHeader.csv", inplace=True, backup='.txt') as file: # sampleSubmission_woHeader
    for line in file:
        print(line.replace("_", ","), end='')
        
with fileinput.FileInput("data/data_train_woHeader.csv", inplace=True, backup='.txt') as file:
    for line in file:
        print(line.replace("r", ""), end='')
        
with fileinput.FileInput("data/data_train_woHeader.csv", inplace=True, backup='.txt') as file:
    for line in file:
        print(line.replace("c", ""), end='')

### My example - Load submissionSample.csv for prediction by ALS

In [105]:
from pyspark.sql import Row 
from pyspark.sql import SparkSession

spark = SparkSession\
        .builder\
        .appName("ALSExample")\
        .getOrCreate()



In [21]:
lines = spark.read.text("data/sampleSubmission_woHeader.csv").rdd 
parts = lines.map(lambda row: row.value.split(","))
ratingsRDD = parts.map(lambda p: Row(movieId=int(p[0]), userId=int(p[1]), 
                                     rating=float(p[2])))
submission_ratings = spark.createDataFrame(ratingsRDD)

### My example - Use pyspark ALS

In [15]:
from pyspark.sql import Row 
from pyspark.sql import SparkSession

spark = SparkSession\
        .builder\
        .appName("ALSExample")\
        .getOrCreate()

lines = spark.read.text("data/data_train_woHeader.csv").rdd 
parts = lines.map(lambda row: row.value.split(","))
ratingsRDD = parts.map(lambda p: Row(movieId=int(p[0]), userId=int(p[1]), 
                                     rating=float(p[2])))
ratings = spark.createDataFrame(ratingsRDD)

In [None]:
(training, test) = ratings.randomSplit([0.8, 0.2])

In [47]:
from pyspark.ml.recommendation import ALS
# Build the recommendation model using ALS on the training data
als = ALS(rank=20, maxIter=10, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating")  
model = als.fit(training)

In [48]:
predictions = model.transform(test)

In [49]:
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                    predictionCol="prediction")

In [50]:
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))
# $example off$
spark.stop()

Root-mean-square error = 1.1231989279282748


### My example - Prepare file for submission

In [24]:
# save DataFrame as csv
pred_cv.coalesce(1).write.format("com.databricks.spark.csv").option("header", "true").save("data/sampleSubmission_new_to_be_processed") # predictions.write.csv('data/pred_als_spark.csv')

In [1]:
def deal_line(line):
    itemId, rating, userId, pred = line.split(',') # such is the ordering in the csv file created from 
    return int(itemId), int(userId), round(float(pred))

In [25]:
import csv
with open("data/sampleSubmission_new_to_be_processed/sampleSubmission_new_to_be_processed.csv", "r") as f:
    data = f.read().splitlines()
    data_transit = [deal_line(line) for line in data[1:]]
#f.close()

In [26]:
with open("data/sampleSubmission_als_20_10_cv_1.csv", 'w') as csvfile:
    fieldnames = ['Id', 'Prediction']
    writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
    writer.writeheader()
    
    data_transit.sort(key=lambda line: (line[1],line[0])) # order the coordinates by columns
    for item_ in data_transit:
        writer.writerow({'Id':'r'+'{}'.format(item_[0])+'_c'+'{}'.format(item_[1]),'Prediction':float(item_[2])})

In [27]:
with open("data/sampleSubmission_als_20_10_cv_1.csv","r") as f:
    reader = csv.reader(f,delimiter = ",")
    data = list(reader)
    row_count = len(data)

### My example - Cross-validation

In [16]:
from pyspark.ml.recommendation import ALS
# Build the recommendation model using ALS on the training data
als_perso = ALS(rank=20, maxIter=10, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating")  

In [17]:
from pyspark.ml.tuning import ParamGridBuilder
paramGrid = ParamGridBuilder().build() #\
                    #.addGrid(als_perso.rank, [8, 12]) \
                    #.addGrid(als_perso.maxIter, [10, 15]) \
                    #.build()

In [18]:
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                    predictionCol="prediction")

In [19]:
from pyspark.ml.tuning import CrossValidator
crossval = CrossValidator(
    estimator=als_perso,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3)

In [20]:
model = crossval.fit(ratings)

In [22]:
pred_cv = model.transform(submission_ratings)

In [81]:
rmse = evaluator.evaluate(pred_cv)

In [82]:
rmse

0.8259579478838837

In [23]:
pred_cv.count()

1176952