In [1]:
# Environment bootstrap: make the bundled `gorani` package importable, then
# build the project helpers used throughout the notebook.
# NOTE(review): `sc` and `spark` are never defined in this notebook; presumably
# the kernel (e.g. a PySpark kernel) injects them -- confirm before running elsewhere.
import sys
# Add the archive to this process's import path ...
sys.path.append('gorani.zip')
# ... and register the same archive with the SparkContext so Spark tasks can import it too.
sc.addPyFile('gorani.zip')
# These imports must come after the two lines above, since `gorani` is loaded from the zip.
from gorani import firebase
# NOTE(review): the meaning of the 'spark' argument is defined inside gorani.firebase -- not visible here.
firebase.init('spark')
# Database handle; used at the end of the notebook to write results to the 'dataResult' collection.
mydb = firebase.db()
from gorani.gorani import Gorani
from gorani.transformer import Transformer
from gorani.utils import split_sentence
gorani = Gorani(mydb)
# `transformer` supplies parse_time / stem / get_sentence / get_username / booksDf used in later cells.
transformer = Transformer(gorani, spark, sc)

In [2]:
from pyspark.sql.functions import col
from pyspark.sql.functions import udf
from gorani.transformer import piper
import pyspark.sql.functions as F
from pyspark.sql.types import *

In [3]:
# Read the JSON log file and keep only the records whose scorePerc is
# non-negative.
df = spark.read.json("clean_logs.json").where(F.col('scorePerc') >= 0)

In [4]:
# One row per (log record, sentence). `id` is assigned before the explode so
# that every sentence row of a record carries the same value and the rows can
# be regrouped per record below.
indDf = (
    df.n(transformer.parse_time())
    .withColumn("id", F.monotonically_increasing_id())
    .withColumn('sentence', F.explode('sentences'))
    .withColumn('sid', F.col('sentence.sid'))
    # 1 when the sentence is flagged unknown, else 0, so it can be summed
    .withColumn('unknown', F.when(F.col('sentence.unknown') == True, 1).otherwise(0))
    .withColumn('wordCount', F.size('sentence.words'))
    .withColumn('uwordCount', F.size('sentence.unknownWords'))
)

# Columns that identify one original log record (plus the fields carried along).
record_keys = [
    'id', 'userId', 'eltime', 'sentences', 'scorePerc',
    'timeZ', 'classId', 'chapterId', 'bookId',
]

# Collapse back to one row per record with sentence/word totals, then derive
# wpm and discard outliers.
# NOTE(review): eltime is divided by 1000*60 before dividing wordCount, so it is
# presumably in milliseconds and wpm is words per minute -- confirm the unit.
senDf = (
    indDf
    .groupBy(*record_keys)
    .agg(
        F.sum('unknown').alias('usenCount'),
        F.sum('wordCount').alias('wordCount'),
        F.sum('uwordCount').alias('uwordCount'),
    )
    .drop('id')
    .withColumn('wpm', F.col('wordCount') / (F.col('eltime') / (1000 * 60)))
    .filter(F.col('wpm') < 1000)
    .filter(F.col('eltime') < 5 * 60 * 1000)
)

In [5]:
@F.pandas_udf(IntegerType())
def uscore(x,y):
    """Flag, row by row, whether the word in ``y`` occurs in the list in ``x``.

    Args:
        x: pandas Series whose elements are word collections (the notebook
           passes the ``sentence.unknownWords`` array column).
        y: pandas Series of single words, aligned with ``x``.

    Returns:
        pandas Series of ints: 1 where the word is contained in the
        corresponding collection, 0 otherwise. A missing (None) collection
        yields 0 instead of raising ``TypeError`` on the ``in`` test.
    """
    import pandas as pd
    # Distinct loop names: the original reused ``x``/``y`` inside the
    # comprehension, shadowing the parameters.
    return pd.Series([
        1 if words is not None and word in words else 0
        for words, word in zip(x, y)
    ])

In [11]:
from pyspark.sql.window import Window
import time

# Per (classId, userId, stemmed word): how many occurrences were listed among
# the sentence's unknown words (ucount) and how many were not (ncount).
vocabDf = (
    indDf
    # sid / wordCount are re-derived here; neither survives the groupBy below.
    .withColumn('sid', F.col('sentence.sid'))
    .withColumn('wordCount', F.size('sentence.words'))
    .withColumn('word', F.explode('sentence.words'))
    # The membership flag is computed on the raw word, before lower/stem.
    .withColumn('no', uscore('sentence.unknownWords', F.col('word')))
    .withColumn('yes', 1 - F.col('no'))
    .withColumn('word', F.lower(F.col('word')))
    .withColumn('word', transformer.stem('word'))
    .groupBy('classId', 'userId', 'word')
    .agg(F.sum('no').alias('ucount'), F.sum('yes').alias('ncount'))
    .select('classId', 'userId', 'word', 'ucount', 'ncount')
)

# Shared split criterion: ncount minus five times ucount.
known_margin = F.col('ncount') - 5 * F.col('ucount')

# Positive margin -> nwordDf; zero or negative margin -> uwordDf.
nwordDf = vocabDf.where(known_margin > 0).drop('ncount', 'ucount')

uwordDf = vocabDf.where(known_margin <= 0).drop('ncount', 'ucount')


#                 .groupBy('userId')\
#                 .agg(F.collect_list('word').alias('words'))


In [12]:
@F.pandas_udf(ArrayType(StringType()))
def split_setence_udf(sen):
    """Apply ``gorani.utils.split_sentence`` to every element of ``sen``.

    Args:
        sen: pandas Series of sentences.

    Returns:
        pandas Series holding, for each input element, whatever
        ``split_sentence`` returned for it (declared to Spark as an array
        of strings).
    """
    # Imported inside the function body, as in the original definition.
    import pandas as pd
    from gorani.utils import split_sentence
    return pd.Series(list(map(split_sentence, sen)))

from pyspark.sql.window import Window

# Distinct stemmed vocabulary of every book: one row per (bookId, word).
# Because duplicates are dropped first, totalCount is the number of distinct
# stemmed words in the book.
totalCountDf = (
    transformer.booksDf
    .withColumn('sentence', transformer.get_sentence('bookId', 'chapterId', 'sid'))
    .withColumn('sentence', split_setence_udf('sentence'))
    .withColumn('word', F.explode('sentence'))
    .drop('sentence')
    .withColumn('word', F.lower(F.col('word')))
    .withColumn('word', transformer.stem('word'))
    .drop('chapterId', 'sid')
    .dropDuplicates(['bookId', 'word'])
    .withColumn('totalCount', F.count('*').over(Window.partitionBy('bookId')))
)


def _book_share(bookWordsDf, userWordsDf, shareCol):
    """Fraction of each book's vocabulary that appears in a user's word list.

    Args:
        bookWordsDf: DataFrame with columns bookId, word, totalCount.
        userWordsDf: DataFrame with columns classId, userId, word.
        shareCol: name of the output column holding count / totalCount.

    Returns:
        DataFrame with columns classId, userId, bookId and ``shareCol``.
        Only (user, book) pairs sharing at least one word are present,
        because the join is an inner join.
    """
    return (
        bookWordsDf.join(userWordsDf, ['word'], 'inner')
        .groupBy('classId', 'userId', 'bookId', 'totalCount')
        .agg(F.count('*').alias('count'))
        .withColumn(shareCol, F.col('count') / F.col('totalCount'))
        .drop('totalCount', 'count')
    )


# The same computation applied to both word lists (previously two pasted copies).
npercDf = _book_share(totalCountDf, nwordDf, 'nperc')

upercDf = _book_share(totalCountDf, uwordDf, 'uperc')

# The full outer join keeps pairs present on only one side; the missing share
# is treated as 0. eperc is whatever remains of the book's vocabulary.
percDf = (
    npercDf.join(upercDf, ['classId', 'userId', 'bookId'], 'full')
    .withColumn('uperc', F.coalesce(F.col('uperc'), F.lit(0)))
    .withColumn('nperc', F.coalesce(F.col('nperc'), F.lit(0)))
    .withColumn('eperc', 1 - F.col('uperc') - F.col('nperc'))
)


In [28]:
from pyspark.sql.window import Window
import time

# Ordering used to rank users inside each (classId, bookId): highest uperc first.
window = Window.partitionBy('classId', 'bookId').orderBy(F.desc('uperc'))

# Class-level averages of the three shares for every book.
classPercDf = (
    percDf
    .groupBy('classId', 'bookId')
    .agg(*[F.avg(share).alias(share) for share in ('eperc', 'nperc', 'uperc')])
)

# For every (classId, bookId): usernames of the (at most) two users ranked
# first by uperc.
classStruggleDf = (
    percDf
    .select('classId', 'bookId', 'userId', F.row_number().over(window).alias('rank'))
    .where(F.col('rank') <= 2)
    .withColumn('username', transformer.get_username(F.col('userId')))
    .groupBy('classId', 'bookId')
    .agg(F.collect_list('username').alias('struggles'))
)


In [30]:
# Attach the struggling-user list to the class averages; the inner join keeps
# only (classId, bookId) pairs present in both frames.
classDf = classPercDf.join(classStruggleDf, on=['classId', 'bookId'], how='inner')

In [31]:
# Bring the joined per-(classId, bookId) result onto the driver as a list of
# Row objects; the following cell regroups it in plain Python.
rows = classDf.collect()

In [32]:
# Group the collected rows by classId in a single pass. The previous nested
# comprehension rescanned every row once per class. Entries keep the order in
# which they appear in `rows`.
out = {}
for row in rows:
    out.setdefault(row['classId'], []).append({
        'bookId': row['bookId'],
        'eperc': row['eperc'],
        'nperc': row['nperc'],
        'uperc': row['uperc'],
        'struggles': row['struggles'],
    })

# Kept so that anything else reading `classes` still finds the distinct class ids.
classes = list(out)

# One document per class in the 'dataResult' collection, holding that class's
# list of per-book entries under 'recommendedBooks'.
for classId, res in out.items():
    mydb.collection('dataResult').document(classId).set({'recommendedBooks': res})

In [None]:
from pyspark.ml.fpm import FPGrowth

# FPGrowth needs one row per transaction with an array column of items.
# uwordDf only has one row per (classId, userId, word) and no 'words' column,
# so build the basket first: one row per user with that user's words.
# collect_set (rather than collect_list) is used because FPGrowth requires the
# items inside a transaction to be unique, and the same word can occur for a
# user under more than one classId.
uwordBasketDf = uwordDf.groupBy('userId').agg(F.collect_set('word').alias('words'))

fpGrowth = FPGrowth(itemsCol="words", minSupport=0.8, minConfidence=0.6)
model = fpGrowth.fit(uwordBasketDf)
# Show the association rules, highest lift first (up to 1000 rows, untruncated).
model.associationRules.orderBy(F.col('lift').desc()).show(1000, False)