In [1]:
import pyspark
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession
from pyspark.sql import Row
import pyspark.sql.functions as func
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator
from sklearn.model_selection import KFold
import numpy as np

# Local Spark session: two worker threads, 4g of driver memory.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("COM6012 Assignment 1 Task2 QC")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

# Handle on the underlying SparkContext, used below to register the
# checkpoint directory.
sc = spark.sparkContext
sc.setCheckpointDir('checkpoint/')


In [2]:
def _read_csv(path, schema):
    """Load a headered CSV file into a DataFrame with an explicit schema.

    Uses Spark's CSV reader rather than ``line.split(",")`` so that quoted
    fields containing commas stay in a single column; a naive split shifts
    the remaining columns and makes the numeric casts fail on such rows
    (free-text tag fields are the likely case).

    Args:
        path: Location of the CSV file; its first line must be the header.
        schema: DDL-style schema string, columns listed in file order.

    Returns:
        A DataFrame with the columns and types given by ``schema``.
    """
    return spark.read.csv(
        path,
        schema=schema,
        header=True,   # skip the header row instead of filtering it out by hand
        quote='"',
        escape='"',    # treat a doubled quote inside a quoted field as a literal quote
    )


# BIGINT / DOUBLE match the types previously inferred from Python int / float.
ratings = _read_csv(
    "ml-25m/ratings.csv",
    "userId BIGINT, movieId BIGINT, rating DOUBLE, timestamp BIGINT",
)
scores = _read_csv(
    "ml-25m/genome-scores.csv",
    "movieId BIGINT, tagId BIGINT, relevance DOUBLE",
)
tags = _read_csv(
    "ml-25m/genome-tags.csv",
    "tagId BIGINT, tag STRING",
)
count_tags = _read_csv(
    "ml-25m/tags.csv",
    "userId BIGINT, movieId BIGINT, tag STRING, timestamp BIGINT",
)

# RDD views kept under their previous names in case other cells still use them.
ratingsRDD = ratings.rdd
scoresRDD = scores.rdd
tagsRDD = tags.rdd
count_tagsRDD = count_tags.rdd


In [3]:
# Split the ratings into three equally weighted random folds (seed=1).
data_list = ratings.randomSplit([1.0, 1.0, 1.0], seed=1)
fold_0, fold_1, fold_2 = data_list

# Same folds rotated right by one position: [fold_2, fold_0, fold_1].
test_list = data_list[-1:] + data_list[:-1]

In [4]:
# ALS estimator shared by every split.
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    maxIter=10,
    regParam=0.1,
    coldStartStrategy="drop",
)

In [5]:
def cross_validate(als_version, folds=None):
    """Fit one model per cross-validation split and collect its item factors.

    For split ``i`` the training set is the union of ``folds[i]`` and the
    folds that follow it cyclically, leaving out exactly one fold (the one
    cyclically preceding ``folds[i]``). With three folds this is
    ``folds[i].union(folds[(i + 1) % 3])``.

    Args:
        als_version: Estimator exposing ``fit(df)``; the fitted model must
            expose an ``itemFactors`` attribute.
        folds: Sequence of DataFrame-like objects supporting ``union``.
            Defaults to the module-level ``data_list``.

    Returns:
        A list with one ``itemFactors`` value per split, in fold order.

    Raises:
        ValueError: If fewer than two folds are supplied, since no fold
            could then be held out.
    """
    if folds is None:
        folds = data_list
    n_folds = len(folds)
    if n_folds < 2:
        raise ValueError("cross_validate needs at least 2 folds, got %d" % n_folds)

    item_factors = []
    for start in range(n_folds):
        # Union the n_folds - 1 folds beginning at `start`, wrapping around.
        train = folds[start]
        for offset in range(1, n_folds - 1):
            train = train.union(folds[(start + offset) % n_folds])
        item_factors.append(als_version.fit(train).itemFactors)
    return item_factors

In [None]:
# Fit the ALS estimator on each training split; the result holds one
# itemFactors DataFrame per split, in split order.
ItemFactors_list = cross_validate(als)

In [None]:
from pyspark.ml.linalg import Vectors
def transData(data):
    """Convert a two-column DataFrame into (movieId, dense feature vector) rows.

    The first column of each row is kept as ``movieId`` and the second is
    wrapped in ``Vectors.dense`` and exposed as ``features``.
    """
    def to_id_and_vector(row):
        return [row[0], Vectors.dense(row[1])]

    converted = data.rdd.map(to_id_and_vector)
    return converted.toDF(['movieId', 'features'])
                                                                   
# Vectorised item factors for the first, second and third split.
df_1_vec, df_2_vec, df_3_vec = [
    transData(ItemFactors_list[split_index]) for split_index in range(3)
]

In [None]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.clustering import KMeansModel
from pyspark.ml.evaluation import ClusteringEvaluator

# One estimator configuration (k=25, seed=1), fitted separately on each split.
kmeans = KMeans(k=25, seed=1)

model_1, model_2, model_3 = [
    kmeans.fit(split_vectors) for split_vectors in (df_1_vec, df_2_vec, df_3_vec)
]

# Assign every movie of a split to a cluster using that split's own model.
predictions_1 = model_1.transform(df_1_vec)
predictions_2 = model_2.transform(df_2_vec)
predictions_3 = model_3.transform(df_3_vec)

In [None]:
# Drop the feature vectors so only the remaining columns (movie id and
# cluster assignment) are carried forward.
movieid_cluster_1, movieid_cluster_2, movieid_cluster_3 = [
    split_predictions.drop('features')
    for split_predictions in (predictions_1, predictions_2, predictions_3)
]

In [None]:
# Preview the movie -> cluster assignments of the third split.
movieid_cluster_3.show()

In [None]:
def _three_largest_clusters(split_predictions):
    """Return the three clusters with the most members, largest first."""
    cluster_sizes = split_predictions.groupBy('prediction').count()
    return cluster_sizes.sort('count', ascending=False).limit(3)


largest_cluster_1 = _three_largest_clusters(predictions_1)
largest_cluster_2 = _three_largest_clusters(predictions_2)
largest_cluster_3 = _three_largest_clusters(predictions_3)

In [None]:
# Print the three largest clusters of each split under its own heading.
for split_title, split_top_clusters in (
    ("First Split:", largest_cluster_1),
    ("Second Split:", largest_cluster_2),
    ("Third Split:", largest_cluster_3),
):
    print(split_title)
    split_top_clusters.show()

In [None]:
# Select the movies that belong to three chosen clusters of every split.
# NOTE(review): the cluster ids are hard-coded; presumably they were read off
# the largest_cluster_* tables of an earlier run -- re-check them after refitting.
def _movies_in_cluster(assignments, cluster_id):
    """Return the rows of ``assignments`` whose ``prediction`` equals ``cluster_id``.

    Uses numeric equality rather than ``Column.contains``: ``contains("3")``
    is a substring match on the stringified id and would also select
    clusters 13 and 23 (likewise ``"0"`` would select 10 and 20).
    """
    return assignments.filter(assignments['prediction'] == cluster_id)


first_cluster_1 = _movies_in_cluster(movieid_cluster_1, 11)
second_cluster_1 = _movies_in_cluster(movieid_cluster_1, 13)
third_cluster_1 = _movies_in_cluster(movieid_cluster_1, 14)

first_cluster_2 = _movies_in_cluster(movieid_cluster_2, 15)
second_cluster_2 = _movies_in_cluster(movieid_cluster_2, 22)
third_cluster_2 = _movies_in_cluster(movieid_cluster_2, 11)

first_cluster_3 = _movies_in_cluster(movieid_cluster_3, 16)
second_cluster_3 = _movies_in_cluster(movieid_cluster_3, 3)
third_cluster_3 = _movies_in_cluster(movieid_cluster_3, 0)

In [None]:
# Attach the genome tag relevance scores to every movie of a cluster. The
# inner join keeps only movies that have rows in `scores`.
def _with_tag_scores(cluster_movies):
    """Inner-join a cluster's movies with ``scores`` on ``movieId``."""
    return cluster_movies.join(scores, on=['movieId'], how='inner')


first_cluster_scores_1 = _with_tag_scores(first_cluster_1)
second_cluster_scores_1 = _with_tag_scores(second_cluster_1)
third_cluster_scores_1 = _with_tag_scores(third_cluster_1)

first_cluster_scores_2 = _with_tag_scores(first_cluster_2)
second_cluster_scores_2 = _with_tag_scores(second_cluster_2)
third_cluster_scores_2 = _with_tag_scores(third_cluster_2)

first_cluster_scores_3 = _with_tag_scores(first_cluster_3)
second_cluster_scores_3 = _with_tag_scores(second_cluster_3)
third_cluster_scores_3 = _with_tag_scores(third_cluster_3)

In [None]:
# Total relevance per tag inside each cluster, used to find the top tags.
# Naming: the fc / sc / tc prefix selects the split (first / second / third)
# and the numeric suffix selects the cluster within that split.
def _relevance_by_tag(cluster_scores):
    """Sum ``relevance`` per ``tagId``; the result column is ``sum(relevance)``."""
    per_tag = cluster_scores.groupBy('tagId')
    return per_tag.agg(func.sum('relevance'))


largest_tags_fc_1 = _relevance_by_tag(first_cluster_scores_1)
largest_tags_fc_2 = _relevance_by_tag(second_cluster_scores_1)
largest_tags_fc_3 = _relevance_by_tag(third_cluster_scores_1)

largest_tags_sc_1 = _relevance_by_tag(first_cluster_scores_2)
largest_tags_sc_2 = _relevance_by_tag(second_cluster_scores_2)
largest_tags_sc_3 = _relevance_by_tag(third_cluster_scores_2)

largest_tags_tc_1 = _relevance_by_tag(first_cluster_scores_3)
largest_tags_tc_2 = _relevance_by_tag(second_cluster_scores_3)
largest_tags_tc_3 = _relevance_by_tag(third_cluster_scores_3)

In [None]:
# For every split, show the three tags with the highest summed relevance in
# each of its three clusters.
for split_banner, split_tag_totals in (
    ("---------------------   First Split     --------------------------",
     (largest_tags_fc_1, largest_tags_fc_2, largest_tags_fc_3)),
    ("---------------------   Second Split    --------------------------",
     (largest_tags_sc_1, largest_tags_sc_2, largest_tags_sc_3)),
    ("---------------------   Third Split     --------------------------",
     (largest_tags_tc_1, largest_tags_tc_2, largest_tags_tc_3)),
):
    print(split_banner)
    for tag_totals in split_tag_totals:
        ranked = tag_totals.sort('sum(relevance)', ascending=False)
        ranked.show(3, False)
print("------------------------------------------------------------------")

In [None]:
# Number of joined (movie, tag relevance) rows for the first cluster of the first split.
first_cluster_scores_1.count()

In [None]:
# Rows of the first split's first cluster scored against tag 742. Numeric
# equality is used because Column.contains is a substring match on the
# stringified id and could also hit other ids containing "742".
first_cluster_scores_1.filter(first_cluster_scores_1['tagId'] == 742).count()

In [None]:
# Rows of the first split's first cluster scored against tag 646. Numeric
# equality is used because Column.contains is a substring match on the
# stringified id and could also hit other ids containing "646".
first_cluster_scores_1.filter(first_cluster_scores_1['tagId'] == 646).count()

In [None]:
# Look up the names of the top three tags of every (split, cluster) pair.
# NOTE(review): the tag ids are hard-coded; presumably they were copied from
# the "sum(relevance)" tables of an earlier run -- re-check them after refitting.
# The previously commented-out count lines were removed: they filtered on
# count_tags['tagId'], a column that count_tags does not have.
top_tag_ids_by_cluster = [
    ("---------------------First Split First Cluster------------------------", (742, 646, 468)),
    ("---------------------First Split Second Cluster------------------------", (742, 646, 188)),
    ("---------------------First Split Third Cluster------------------------", (742, 807, 792)),
    ("---------------------Second Split First Cluster------------------------", (742, 646, 323)),
    ("---------------------Second Split Second Cluster------------------------", (742, 807, 646)),
    ("---------------------Second Split Third Cluster------------------------", (742, 646, 1104)),
    ("---------------------Third Split First Cluster------------------------", (742, 646, 445)),
    ("---------------------Third Split Second Cluster------------------------", (742, 702, 1104)),
    ("---------------------Third Split Third Cluster------------------------", (742, 646, 188)),
]

print("---------------------TOP 3 Tag for each cluster and split------------------------")
for cluster_heading, cluster_tag_ids in top_tag_ids_by_cluster:
    print(cluster_heading)
    for wanted_tag_id in cluster_tag_ids:
        # Numeric equality, not Column.contains: contains() is a substring
        # match on the stringified id and can return unrelated tags.
        tags.filter(tags['tagId'] == wanted_tag_id).show()
