In [2]:
#
# Copyright © 2019 Sunho Kim. All rights reserved.
#

In [3]:
cd ..

/gorani/gorani/backend/dataserver


In [4]:
from pyspark.sql import SparkSession, DataFrame

# Get (or reuse) the Spark session for this notebook and keep a handle on
# its underlying SparkContext.
spark = (
    SparkSession.builder
    .appName('Recommend Books')
    .getOrCreate()
)
sc = spark.sparkContext

In [5]:
# parameters
# book_number: target number of recommended books per user. Later cells
# subtract the user's existing recommendation count (old_rec) from it to
# work out how many new books still have to be picked.
book_number = 1

In [6]:
from gorani.spark import read_api_all, read_data_all

# Ratings from the 'rates' API table, restricted to rows whose kind is
# "recommended_book" (target_id is joined against book ids further down).
rate_df = read_api_all(spark, 'rates').where('kind = "recommended_book"')
rate_df.show()

# (user_id, book_id) pairs from the 'experienced_books' data table.
eb_df = read_data_all(spark, 'experienced_books')
eb_df.show()

# (user_id, book_id) pairs from the 'readable_books' data table.
rb_df = read_data_all(spark, 'readable_books')
rb_df.show()

# Recommendations already stored in the 'recommended_books' API table.
rcb_df = read_api_all(spark, 'recommended_books')
rcb_df.show()

# Book id -> cluster assignment from the 'book_cluster' data table.
cluster_df = read_data_all(spark, 'book_cluster')
cluster_df.show()

+--------------------+-------+---------+----------------+----+--------------------+--------------------+
|                  id|user_id|target_id|            kind|rate|          created_at|          updated_at|
+--------------------+-------+---------+----------------+----+--------------------+--------------------+
|ae142f49-b63e-443...|     10|        2|recommended_book|  -1|2019-05-25 21:13:...|2019-05-25 22:42:...|
+--------------------+-------+---------+----------------+----+--------------------+--------------------+

+-------+-------+
|user_id|book_id|
+-------+-------+
+-------+-------+

+-------+-------+
|user_id|book_id|
+-------+-------+
+-------+-------+

+---+-------+-------+----------+----------+
| id|user_id|book_id|created_at|updated_at|
+---+-------+-------+----------+----------+
+---+-------+-------+----------+----------+

+---+-------+
| id|cluster|
+---+-------+
|  5|      1|
|  6|      1|
|  1|      0|
|  2|      0|
|  4|      1|
|  3|      1|
+---+-------+



In [9]:
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType

# Net rating per (user, cluster): attach each rated book's cluster to the
# rating, sum the rate values per user and cluster, and clamp negative
# totals to 0 so a disliked cluster never gets a negative weight.
cluster_count_df = rate_df.join(cluster_df, rate_df['target_id'] == cluster_df['id'], 'inner')\
    .select(F.col('target_id').alias('book_id'), 'user_id', 'cluster', 'rate')\
    .groupBy('user_id', 'cluster').agg(F.sum('rate').alias('count'))\
    .select('user_id', 'cluster',
            F.when(F.col('count') < 0, F.lit(0))
             .otherwise(F.col('count').cast(IntegerType()))
             .alias('count'))

# Per-user total of the clamped counts; used later as the denominator when
# turning each cluster's count into a share.
cluster_count_sum_df = cluster_count_df.groupBy('user_id')\
    .agg(F.sum('count').alias('sum'))

# Join on the column *name*. cluster_count_sum_df is derived from
# cluster_count_df, so comparing df1['user_id'] == df2['user_id'] refers to
# the same underlying attribute on both sides and only works through Spark's
# self-join special-casing. Joining on 'user_id' is unambiguous and keeps a
# single user_id column, so no alias/drop is needed afterwards.
cluster_count_df = cluster_count_df.join(cluster_count_sum_df, 'user_id', 'left')

# Number of recommendations each user already has.
rcbn_df = rcb_df.groupBy('user_id')\
    .agg(F.count(F.lit(1)).alias('old_rec'))

# Users without any stored recommendation have no row in rcbn_df; the left
# join leaves old_rec null for them, which is normalised to 0.
cluster_count_df = cluster_count_df.join(rcbn_df, 'user_id', 'left')\
    .select('user_id', 'cluster', 'count', 'sum',
            F.coalesce(F.col('old_rec'), F.lit(0)).alias('old_rec'))

cluster_count_df.show()

+-------+-------+-----+---+-------+
|user_id|cluster|count|sum|old_rec|
+-------+-------+-----+---+-------+
|     10|      0|    0|  0|      0|
+-------+-------+-----+---+-------+



In [10]:
# Start from every (user, book) pair in rb_df, tagged with the book's cluster.
# The cluster table's own 'id' column duplicates book_id, so it is dropped.
df = cluster_df.join(
    rb_df,
    cluster_df['id'] == rb_df['book_id'],
    'inner',
).drop(F.col('id'))

# Anti-join: drop pairs that also appear in eb_df (experienced_books).
same_experienced_pair = (df['book_id'] == eb_df['book_id']) & (df['user_id'] == eb_df['user_id'])
df = df.join(eb_df, same_experienced_pair, 'left_anti')

# Anti-join: drop pairs the user has already rated as a recommended book.
same_rated_pair = (df['book_id'] == rate_df['target_id']) & (df['user_id'] == rate_df['user_id'])
candidate_df = df.join(rate_df, same_rated_pair, 'left_anti')
candidate_df.show()

+-------+-------+-------+
|cluster|user_id|book_id|
+-------+-------+-------+
+-------+-------+-------+



In [11]:
# Denominator for the cluster share; a zero per-user sum is replaced by 0.1
# so the division below never divides by zero.
safe_sum = F.when(F.col('sum') == 0, F.lit(0.1)).otherwise(F.col('sum'))

# How many recommendations the user is still missing.
missing_recs = F.lit(book_number) - F.col('old_rec')

# Each cluster's quota: (count / sum) * missing, truncated to an integer.
need_col = (F.col('count') / safe_sum * missing_recs)\
    .cast(IntegerType())\
    .alias('need')

need_cluster_df = cluster_count_df.select('user_id', 'cluster', 'old_rec', need_col)

need_cluster_df.show()
# Pull the quotas to the driver; the next cell iterates over them in Python.
need_cluster = need_cluster_df.collect()

+-------+-------+-------+----+
|user_id|cluster|old_rec|need|
+-------+-------+-------+----+
|     10|      0|      0|   0|
+-------+-------+-------+----+



In [16]:
from gorani.spark import write_api
from gorani.utils import uuid

# need_cluster holds one row per (user, cluster). Group the rows per user so
# the "top up to book_number" step runs once per user instead of once per
# cluster row (which over-filled users that have several clusters).
rows_by_user = {}
for row in need_cluster:
    rows_by_user.setdefault(row['user_id'], []).append(row)

out = []
for user_id, cluster_rows in rows_by_user.items():
    # old_rec is a per-user count repeated on each of the user's cluster rows.
    remaining = book_number - cluster_rows[0]['old_rec']
    if remaining <= 0:
        # The user already has enough recommendations; this also keeps a
        # negative value from ever reaching DataFrame.limit().
        continue

    # Only this user's candidates are eligible, both for the cluster quotas
    # and for the top-up below. The previous top-up sampled from every
    # user's candidates.
    user_candidate_df = candidate_df.where(F.col('user_id') == user_id)

    picked = []
    for row in cluster_rows:
        # Never take more than the cluster quota or than what is still missing.
        quota = min(row['need'] or 0, remaining - len(picked))
        if quota <= 0:
            continue
        picked += user_candidate_df.where(F.col('cluster') == row['cluster'])\
            .orderBy(F.rand())\
            .limit(quota)\
            .drop(F.col('cluster'))\
            .collect()

    # Integer truncation of the quotas (or a cluster running out of
    # candidates) can leave the user short; fill the gap with random
    # candidates from any cluster, skipping books already picked above.
    shortfall = remaining - len(picked)
    if shortfall > 0:
        fill_df = user_candidate_df
        picked_ids = [r['book_id'] for r in picked]
        if picked_ids:
            fill_df = fill_df.where(~F.col('book_id').isin(picked_ids))
        picked += fill_df\
            .orderBy(F.rand())\
            .limit(shortfall)\
            .drop(F.col('cluster'))\
            .collect()

    out += picked

if len(out) != 0:
    # Rows carry (user_id, book_id); add the id and timestamp columns before
    # handing the frame to write_api for the 'recommended_books' table.
    result_df = spark.createDataFrame(out)
    result_df = result_df.withColumn('id', uuid())\
        .withColumn('updated_at', F.current_timestamp())\
        .withColumn('created_at', F.current_timestamp())
    write_api('recommended_books', result_df)
    result_df.show()

print('success')

+---+
| id|
+---+
|  6|
+---+

success
