In [170]:
from pyspark.ml.recommendation import ALS
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import *
import pandas as pd
# Reuse the already-running SparkContext when there is one, so this cell can
# be re-executed; constructing SparkContext() a second time in the same kernel
# raises ValueError (only one context may be active per JVM).
sc = SparkContext.getOrCreate()
# SQLContext bound to that context; used further down to build the ratings
# DataFrame from an RDD.
sql_sc = SQLContext(sc)

In [182]:
from pyspark.ml.evaluation import RegressionEvaluator
# SparkSession is used below but was never imported anywhere in the notebook,
# which raises NameError on a fresh kernel.
from pyspark.sql import SparkSession

# Get (or create) the session for this notebook under the app name 'rec'.
spark = SparkSession.builder.appName('rec').getOrCreate()

# Load the ratings CSV as an RDD of raw text lines; the header line is still
# part of the data at this point.
RDD_df_ratings = sc.textFile("C://Users/user/Desktop/ml-latest-small/ratings1.csv")

In [115]:
# First line of the text RDD as a plain string; per the sample output below
# this is the CSV header row ('userId,movieId,rating').
header = RDD_df_ratings.first()

In [116]:
# Peek at the first five raw lines (header row plus four ratings).
RDD_df_ratings.take(5)

[u'userId,movieId,rating', u'1,31,2.5', u'1,1029,3', u'1,1061,3', u'1,1129,2']

In [117]:
# Header text with any single-quote characters stripped; `header` must still
# be the string from `.first()` here (it is rebound to an RDD in a later cell).
schema = header.replace("'", '')

In [118]:
# One nullable string-typed StructField per comma-separated header column;
# the real column types are assigned in the next cell.
fields = [
    StructField(column, StringType(), True)
    for column in schema.split(',')
]
fields

[StructField(userId,StringType,true),
 StructField(movieId,StringType,true),
 StructField(rating,StringType,true)]

In [119]:
# Overwrite the placeholder string types, by position:
# userId -> integer, movieId -> integer, rating -> float.
for position, spark_type in enumerate((IntegerType(), IntegerType(), FloatType())):
    fields[position].dataType = spark_type

In [120]:
# Rebind `schema` from the header string to the StructType built from the
# typed fields; this is the schema later passed to createDataFrame.
schema = StructType(fields)

In [121]:
# Rebind `header` to an RDD holding every line that contains the text
# "userId" (the header row), so it can be subtracted from the data.
header = RDD_df_ratings.filter(lambda line: "userId" in line)

In [122]:
# Sanity check: the filtered RDD should contain only the header line.
header.collect()

[u'userId,movieId,rating']

In [123]:
# Drop the header row from the data. `header` is the one-element RDD produced
# by the "userId" filter cell; the original referenced `taxiHeader`, a name
# never defined in this notebook (NameError on a fresh kernel).
# Note: subtract() does not keep the original line order (see first() below).
noHeader = RDD_df_ratings.subtract(header)

In [124]:
# Confirm the first remaining element is a rating line, not the header.
noHeader.first()

u'564,2302,4'

In [70]:
# Number of lines in the raw file RDD; this count still includes the header row.
RDD_df_ratings.count()

100005

In [125]:
# Split each CSV line into its list of string columns.
temp = noHeader.map(lambda row: row.split(','))

In [126]:
# Inspect one split record (a list of three strings).
temp.first()

[u'564', u'2302', u'4']

In [127]:
# Cast the string columns to (int userId, int movieId, float rating) tuples
# so they line up with the StructType schema.
temp1 = temp.map(lambda cols: (int(cols[0]), int(cols[1]), float(cols[2])))

In [145]:
# Inspect the first five typed tuples.
temp1.take(5)

[(564, 2302, 4.0),
 (285, 1754, 4.0),
 (285, 1641, 3.0),
 (623, 8636, 3.5),
 (363, 2939, 3.0)]

In [143]:
# Display the final schema (integer userId/movieId, float rating).
schema

StructType(List(StructField(userId,IntegerType,true),StructField(movieId,IntegerType,true),StructField(rating,FloatType,true)))

In [148]:
# Build the ratings DataFrame from the typed tuples using the explicit schema.
df = sql_sc.createDataFrame(temp1, schema=schema)

In [150]:
# Preview the first ten rows of the ratings DataFrame.
df.head(10)

[Row(userId=564, movieId=2302, rating=4.0),
 Row(userId=285, movieId=1754, rating=4.0),
 Row(userId=285, movieId=1641, rating=3.0),
 Row(userId=623, movieId=8636, rating=3.5),
 Row(userId=363, movieId=2939, rating=3.0),
 Row(userId=353, movieId=4826, rating=3.0),
 Row(userId=612, movieId=50872, rating=3.5),
 Row(userId=481, movieId=3629, rating=5.0),
 Row(userId=408, movieId=123, rating=5.0),
 Row(userId=529, movieId=7073, rating=3.0)]

In [176]:
# 80/20 train/test split. A fixed seed is passed so the split is not re-drawn
# at random on every run; without it the reported RMSE changes from run to run.
training, test = df.randomSplit([0.8, 0.2], seed=42)

In [173]:
# ALS recommender over (userId, movieId, rating). The seed fixes the random
# factor initialisation so repeated fits on the same data are comparable.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    seed=42,
)

In [178]:
# Fit the ALS model on the training split.
model = als.fit(training)

In [179]:
# Score the held-out split; the filter in the next cell shows that the
# `prediction` column may contain NaN values.
predictions = model.transform(test)

In [180]:
# Keep only the rows whose prediction is not NaN, so they do not poison the
# RMSE (presumably NaN comes from users/movies absent from training — verify).
# NOTE(review): this relies on Spark SQL treating NaN as equal to NaN in
# comparisons, unlike plain Python floats — confirm for the Spark version in use.
predicted_ratings_df = predictions.where(predictions['prediction'] != float('nan'))

In [185]:
# RMSE between the true `rating` and the model's `prediction`, computed on
# the NaN-filtered predictions.
reg_eval = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)
error = reg_eval.evaluate(predicted_ratings_df)

In [191]:
# Report the test-set RMSE computed in the previous cell.
print('RMSE: {0}'.format(error))
# Author's note: this RMSE is higher than that of the model trained on the
# PySpark DataFrame (that comparison run is not part of this section).

RMSE: 1.09984646774
