In [1]:
#
# Copyright © 2019 Sunho Kim. All rights reserved.
#

In [2]:
# Step up one directory from where the notebook was launched -- presumably so
# that the `gorani` package imported further down resolves; confirm.
# The explicit %cd magic is used because the bare `cd ..` form only works while
# IPython automagic is enabled and no variable named `cd` exists.
# NOTE(review): a relative cd is not idempotent; re-running this cell moves up again.
%cd ..

/gorani/gorani/backend/dataserver


In [3]:
from pyspark.sql import SparkSession, DataFrame

# Obtain the Spark session for this notebook (getOrCreate reuses an active
# session when one exists) and keep a handle on its SparkContext.
spark = (
    SparkSession.builder
    .appName('Compute AUWPP')
    .getOrCreate()
)
sc = spark.sparkContext

In [4]:
# Tunable parameters for the AUWPP computation.
# A word counts as unknown for a user when its known-word score is <= this
# value; words with no recorded score are treated as score 0.
uw_threshold = 2
# Maximum number of words per page when a book's content is split into pages;
# also the divisor applied to each page's unknown-word count.
words_in_page = 250

In [5]:
from gorani.spark import read_data_all

# Load the 'books' data through the project helper; the output below shows
# id, name and content (a list of words) columns.
# NOTE(review): cache=True presumably caches the loaded DataFrame -- confirm in gorani.spark.
df = read_data_all(spark, 'books', cache=True)
df.show()

+---+--------------------+--------------------+
| id|                name|             content|
+---+--------------------+--------------------+
|  1|                  It|[table, title, pa...|
|  2|If It’s for My Da...|[table, cover, pr...|
|  4|If It’s for My Da...|[table, cover, yo...|
|  5|To Kill A Mocking...|[dedication, part...|
|  6|            Twilight|[first, sight, mo...|
|  3|If It’s for My Da...|[table, cover, yo...|
+---+--------------------+--------------------+



In [6]:
import pyspark.sql.functions as F
from pyspark.sql import types as T

def chunks(l, n):
    """Yield consecutive slices of ``l`` holding at most ``n`` items each.

    Slices are produced in order starting at index 0; the final slice is
    shorter when ``len(l)`` is not a multiple of ``n``.
    """
    offsets = range(0, len(l), n)
    yield from (l[start:start + n] for start in offsets)

def _slice_content(col, words_in_page):
    """Split a word list into pages of at most ``words_in_page`` words.

    Returns a list of pages (a list of lists), in the original word order.
    """
    pages = [page for page in chunks(col, words_in_page)]
    return pages

# Expose the pure-Python slicer as a Spark UDF returning array<array<string>>.
page_list_type = T.ArrayType(T.ArrayType(T.StringType()))
slice_content = F.udf(_slice_content, page_list_type)

# Explode each book's list of pages so that every page becomes its own row,
# keeping the book id alongside it.
pages_col = slice_content('content', F.lit(words_in_page))
page_df = df.select('id', F.explode(pages_col).alias('content'))
page_df.show()

+---+--------------------+
| id|             content|
+---+--------------------+
|  1|[table, title, pa...|
|  1|[peter, buick, ri...|
|  1|[final, showdown,...|
|  1|[along, diagonal,...|
|  1|[jerk, darkness, ...|
|  1|[would, hear, som...|
|  1|[life, thick, phl...|
|  1|[hair, eye, slit,...|
|  1|[shit, dismayed, ...|
|  1|[indeed, therefor...|
|  1|[although, forty-...|
|  1|[time, found, imp...|
|  1|[wa, without, muc...|
|  1|[one, touch, wa, ...|
|  1|[wa, said, tissue...|
|  1|[wa, would, surpr...|
|  1|[amazing, quiet, ...|
|  1|[engineering, fir...|
|  1|[said, tell, know...|
|  1|[im, prize, even,...|
+---+--------------------+
only showing top 20 rows



In [8]:
from pyspark.sql import Row
from gorani.spark import write_data
# Gather every user's known-word scores on the driver as a list of
# (user_id, {word: score}) pairs.
df2 = read_data_all(spark, 'user_known_words')
user_known_maps = (
    df2.rdd
    .map(lambda rec: (rec['user_id'], (rec['word'], rec['score'])))
    .groupByKey()
    .map(lambda pair: (pair[0], dict(pair[1])))
    .collect()
)

# Broadcast the per-user maps and the two parameters so the per-page function
# can read them on the executors.
user_known_maps_bd = sc.broadcast(user_known_maps)
uw_threshold_bd = sc.broadcast(uw_threshold)
words_in_page_bd = sc.broadcast(words_in_page)

def known(known_map, word):
    """Return the score stored for ``word`` in ``known_map``, or 0 if it is absent."""
    return known_map.get(word, 0)

def uwpp(row):
    """Build one Row per user holding that user's unknown-word ratio for a page.

    ``row`` must provide ``row['content']`` (the page's words) and
    ``row['id']`` (the book id). A word is counted as unknown when its score
    from ``known`` is at or below the broadcast threshold.

    NOTE(review): the count is divided by the configured page size rather
    than by the number of words actually on the page, so a shorter page is
    still divided by the full page size.
    """
    threshold = uw_threshold_bd.value
    page_size = words_in_page_bd.value
    page_words = row['content']
    book_id = row['id']

    rows = []
    for user_id, known_map in user_known_maps_bd.value:
        unknown_count = sum(
            1 for word in page_words if known(known_map, word) <= threshold
        )
        rows.append(Row(user_id=user_id,
                        uwpp=unknown_count / page_size,
                        book_id=book_id))
    return rows
# Expand every page into one (user, book, ratio) row per user.
uwpp_rows = page_df.rdd.flatMap(uwpp)
uwpp_df = uwpp_rows.toDF()

# Average the per-page ratios for each (book, user) pair.
mean_uwpp = F.mean('uwpp').alias('auwpp')
res_df = uwpp_df.groupBy('book_id', 'user_id').agg(mean_uwpp)
res_df.show()

# Hand the result to the project's writer under the 'user_auwpps' name.
write_data('user_auwpps', res_df)
print('success')

+-------+-------+------------------+
|book_id|user_id|             auwpp|
+-------+-------+------------------+
|      5|      1|0.9704294478527603|
|      1|      1|0.9767935222672084|
|      4|     10|0.8721188118811884|
|      6|      1|0.9787021276595733|
|      1|     10| 0.869122807017542|
|      4|      1|0.9712871287128707|
|      6|     10|0.8670000000000005|
|      2|     10|0.8658227848101266|
|      5|     10|0.8648834355828219|
|      2|      1| 0.970784810126582|
|      3|     10|0.8651809523809528|
|      3|      1|0.9739809523809522|
+-------+-------+------------------+

success
