In [1]:
#
# Copyright © 2019 Sunho Kim. All rights reserved.
#

In [2]:
cd ..

/gorani/gorani/backend/dataserver


In [3]:
from pyspark.sql import SparkSession, DataFrame

# Create the Spark session for this notebook (or reuse an already running one)
# and keep a handle on its SparkContext for the RDD-level work further down.
spark = (
    SparkSession.builder
    .appName('Compute UWPP')
    .getOrCreate()
)
sc = spark.sparkContext

In [60]:
# Tunable parameters for the UWPP computation.
# A word counts as unknown for a user when its known-word score is <= uw_threshold;
# words with no recorded score are treated as score 0.
uw_threshold = 2
# Page size in words: book content is sliced into chunks of this many words, and
# each page's unknown-word count is divided by this value.
words_in_page = 250

In [18]:
from gorani.spark import read_data_all

# Load the 'books' dataset through the project helper, passing cache=True.
# The preview printed below lists the columns id, name and content
# (content is displayed as an array of words).
df = read_data_all(spark, 'books', cache=True)
df.show()

+---+--------------------+--------------------+
| id|                name|             content|
+---+--------------------+--------------------+
|  1|                  It|[table, title, pa...|
|  2|If It’s for My Da...|[table, cover, pr...|
|  4|If It’s for My Da...|[table, cover, yo...|
|  5|To Kill A Mocking...|[dedication, part...|
|  6|            Twilight|[first, sight, mo...|
|  3|If It’s for My Da...|[table, cover, yo...|
+---+--------------------+--------------------+



In [30]:
import pyspark.sql.functions as F
from pyspark.sql import types as T

def chunks(l, n):
    """Lazily yield consecutive slices of ``l`` holding at most ``n`` items each.

    The last slice is shorter when ``len(l)`` is not a multiple of ``n``;
    an empty ``l`` yields nothing.
    """
    offsets = range(0, len(l), n)
    yield from (l[start:start + n] for start in offsets)

def _slice_content(col, words_in_page):
    return list(chunks(col, words_in_page))

# Expose the slicer as a Spark UDF whose result is an array of string arrays
# (one inner array per page).
slice_content = F.udf(
    _slice_content,
    T.ArrayType(T.ArrayType(T.StringType())),
)

# explode() emits one output row per page, so a book's id repeats once per page.
page_df = df.select(
    'id',
    F.explode(
        slice_content('content', F.lit(words_in_page))
    ).alias('content'),
)
page_df.show()

+---+--------------------+------------+
| id|             content|original_len|
+---+--------------------+------------+
|  1|[table, title, pa...|         250|
|  1|[peter, buick, ri...|         250|
|  1|[final, showdown,...|         250|
|  1|[along, diagonal,...|         250|
|  1|[jerk, darkness, ...|         250|
|  1|[would, hear, som...|         250|
|  1|[life, thick, phl...|         250|
|  1|[hair, eye, slit,...|         250|
|  1|[shit, dismayed, ...|         250|
|  1|[indeed, therefor...|         250|
|  1|[although, forty-...|         250|
|  1|[time, found, imp...|         250|
|  1|[wa, without, muc...|         250|
|  1|[one, touch, wa, ...|         250|
|  1|[wa, said, tissue...|         250|
|  1|[wa, would, surpr...|         250|
|  1|[amazing, quiet, ...|         250|
|  1|[engineering, fir...|         250|
|  1|[said, tell, know...|         250|
|  1|[im, prize, even,...|         250|
+---+--------------------+------------+
only showing top 20 rows



In [88]:
from pyspark.sql import Row
from gorani.spark import write_data
df2 = read_data_all(spark, 'user_known_words')

# Collect a list of (user_id, {word: score}) pairs on the driver: key every
# record by user, group the (word, score) pairs, and turn each group into a dict.
user_known_maps = (
    df2.rdd
    .map(lambda rec: (rec['user_id'], (rec['word'], rec['score'])))
    .groupByKey()
    .mapValues(dict)
    .collect()
)

# Broadcast the lookup table and both parameters so the uwpp() tasks read them
# through the .value attribute of each broadcast handle.
user_known_maps_bd = sc.broadcast(user_known_maps)
uw_threshold_bd = sc.broadcast(uw_threshold)
words_in_page_bd = sc.broadcast(words_in_page)

def known(known_map, word):
    """Return the score stored for ``word`` in ``known_map``, or 0 if it is absent."""
    return known_map.get(word, 0)

def uwpp(row):
    """Build one Row per user for a single page of a book.

    For every (user_id, score map) pair in the broadcast lookup table, count
    the words on the page whose score is at or below the broadcast threshold
    and divide that count by the broadcast page size.

    Args:
        row: A page record providing 'id' (the book id) and 'content'
            (the words on the page).

    Returns:
        A list of Row(user_id, uwpp, book_id), one entry per user.
    """
    threshold = uw_threshold_bd.value
    page_size = words_in_page_bd.value
    page_words = row['content']
    book_id = row['id']

    # NOTE(review): the last page of a book can hold fewer than page_size
    # words, yet the divisor stays fixed - confirm this is intended.
    rows = []
    for user_id, score_map in user_known_maps_bd.value:
        unknown_count = sum(
            1 for word in page_words if known(score_map, word) <= threshold
        )
        rows.append(Row(user_id=user_id,
                        uwpp=unknown_count / page_size,
                        book_id=book_id))
    return rows
# Expand every page into one Row per user, then turn the resulting RDD into a DataFrame.
# NOTE(review): toDF() is given no schema here; presumably it infers one from the
# data and would fail on an empty RDD (e.g. no known-word records) - confirm.
uwpp_df = page_df.rdd.flatMap(uwpp).toDF()
# Average the per-page values for each (book_id, user_id) pair into 'auwpp'.
res_df = uwpp_df.groupBy('book_id', 'user_id').agg(F.mean('uwpp').alias('auwpp'))

# Hand the aggregated frame to the project's write_data helper under 'user_auwpps'.
write_data('user_auwpps', res_df)
print('success')

success
