In [5]:
import os
import pyspark
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, DoubleType
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat, lit
from pyspark.sql.functions import udf
from pyspark.sql.types import *

In [17]:
'''additional function for fp growth'''
from pyspark.sql.functions import collect_set, count
from pyspark.ml.fpm import FPGrowth

In [7]:
'''
load pyspark
'''
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
spark = SparkSession.builder.appName("Spark_Processor").config("spark.redis.port", "6379").config("spark.jars", "spark-redis-branch-2.4/target/spark-redis_2.11-2.5.0-SNAPSHOT-jar-with-dependencies.jar").getOrCreate()
conf = pyspark.SparkConf()
sc = spark.sparkContext

In [26]:
read_likedMoviedf = spark.read.format("org.apache.spark.sql.redis").option("table", "likedMovie").option("key.column", "key").load()
read_likedMoviedf.show()

+-------+--------------------+------+------+
|movieId|                 key|userId|rating|
+-------+--------------------+------+------+
|   6503|User ID: 149977, ...|149977|   5.0|
|   4226|User ID: 730, Mov...|   730|   5.0|
| 122912|User ID: 137122, ...|137122|   4.5|
|   4226|User ID: 27592, M...| 27592|   5.0|
|   1222|User ID: 27813, M...| 27813|   5.0|
|   1293|User ID: 109671, ...|109671|   5.0|
|  72998|User ID: 55013, M...| 55013|   5.0|
|   2046|User ID: 35213, M...| 35213|   5.0|
|  83613|User ID: 70678, M...| 70678|   4.5|
|   4890|User ID: 132357, ...|132357|   5.0|
|  87306|User ID: 63360, M...| 63360|   5.0|
|    924|User ID: 139148, ...|139148|   4.5|
|   1213|User ID: 74028, M...| 74028|   5.0|
|   1125|User ID: 105083, ...|105083|   5.0|
|    538|User ID: 15598, M...| 15598|   5.0|
|   1580|User ID: 34169, M...| 34169|   5.0|
|    589|User ID: 95905, M...| 95905|   5.0|
|   1089|User ID: 85327, M...| 85327|   5.0|
|   3396|User ID: 110253, ...|110253|   5.0|
|  72998|U

In [27]:
# https://databricks.com/blog/2018/09/18/simplify-market-basket-analysis-using-fp-growth-on-databricks.html
df = read_likedMoviedf.groupBy('userId').agg(collect_set('movieId').alias('items'))

In [28]:
df.show()

+------+--------------------+
|userId|               items|
+------+--------------------+
|   148|[589, 2067, 750, ...|
|   463|              [1391]|
|   833|[79091, 91658, 32...|
|  1088|              [1270]|
|  1238|[4226, 91658, 122...|
|  1342|[40819, 4226, 122...|
|  1580|             [68135]|
|  1645|          [589, 377]|
|  1829|     [368, 932, 908]|
|  1959|   [48385, 849, 377]|
|  2122|[4226, 1222, 1208...|
|  2142|              [1608]|
|  2366|              [7162]|
|  2866|              [1213]|
|  3175|         [750, 1292]|
|  3794|[4226, 56145, 750...|
|  3918|[4701, 589, 4226,...|
|  3997|[1204, 49772, 668...|
|  4900|[4226, 1617, 1049...|
|  4935|              [4226]|
+------+--------------------+
only showing top 20 rows



In [29]:
fpGrowth = FPGrowth(itemsCol="items", minSupport=0.5, minConfidence=0.6)
model = fpGrowth.fit(df)

In [30]:
associationRules = model.associationRules

In [32]:
associationRules.show()

+----------+----------+----------+----+
|antecedent|consequent|confidence|lift|
+----------+----------+----------+----+
+----------+----------+----------+----+



In [33]:
df.show()

+------+--------------------+
|userId|               items|
+------+--------------------+
|   148|[589, 2067, 750, ...|
|   463|              [1391]|
|   833|[79091, 91658, 32...|
|  1088|              [1270]|
|  1238|[4226, 91658, 122...|
|  1342|[40819, 4226, 122...|
|  1580|             [68135]|
|  1645|          [589, 377]|
|  1829|     [368, 932, 908]|
|  1959|   [48385, 849, 377]|
|  2122|[4226, 1222, 1208...|
|  2142|              [1608]|
|  2366|              [7162]|
|  2866|              [1213]|
|  3175|         [750, 1292]|
|  3794|[4226, 56145, 750...|
|  3918|[4701, 589, 4226,...|
|  3997|[1204, 49772, 668...|
|  4900|[4226, 1617, 1049...|
|  4935|              [4226]|
+------+--------------------+
only showing top 20 rows



In [34]:
model.freqItemsets.show()

+-----+----+
|items|freq|
+-----+----+
+-----+----+

