In [18]:
import sys
sys.path.append('gorani.zip')
sc.addPyFile('gorani.zip')
from gorani import firebase
firebase.init('spark')
mydb = firebase.db()
from gorani.gorani import Gorani
from gorani.transformer import Transformer
from gorani.utils import split_sentence
gorani = Gorani(mydb)
transformer = Transformer(gorani, spark, sc)

ValueError: The default Firebase app already exists. This means you called initialize_app() more than once without providing an app name as the second argument. In most cases you only need to call initialize_app() once. But if you do want to initialize multiple apps, pass a second argument to initialize_app() to give each app a unique name.

In [22]:
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

In [23]:
import os
df = spark.read.csv("logs.csv", header="true", quote='"', escape='"')

In [24]:
from pyspark.sql.functions import col, pandas_udf, PandasUDFType
from pyspark.sql.functions import udf, from_json
import pyspark.sql.functions as F
from pyspark.sql.types import *

PaginateSentence = StructType([
    StructField("sid", StringType()),
    StructField("words", ArrayType(StringType())),
    StructField("unknownIndice", ArrayType(IntegerType())),
    StructField("unknownWords", ArrayType(StringType())),
    StructField("unknown", BooleanType())
])

SubmitQuestionPayload = StructType([
    StructField("bookId", StringType()),
    StructField("chapterId", StringType()),
    StructField("questionId", StringType()),
    StructField("option", StringType()),
    StructField("right", BooleanType()),
    StructField("time", IntegerType())
])


@udf(ArrayType(PaginateSentence))
def get_sentences(bookId, chapterId, sids, wordUnknowns, sentenceUnknowns):
    out = []
    for sid in sids:
        wis = []
        wus = []
        unknown = False
        sentence = gorani.get_sentence(bookId, chapterId, sid) or ""
        words = split_sentence(sentence)
        for su in sentenceUnknowns:
            if su['sentenceId'] == sid:
                unknown = True
        for wu in wordUnknowns:
            if wu['sentenceId'] == sid:
                wi = wu['wordIndex']
                if words[wi] != wu['word']:
                    raise Exception(sentence + ' ' + sid + ' word mismatch: ' + words[wi]  + ',' + wu['word'])
                wis.append(wi)
                wus.append(words[wi])
        out.append({'sid': sid, 'words': words, 'unknownIndice': wis, 'unknownWords': wus,'unknown': unknown})
    return out

paginateDf = df.n(transformer.parse_paginate())\
    .n(transformer.filter_cheat())\
    .withColumn('sentences', get_sentences(col('bookId'), col('chapterId'), col('sids'), col('wordUnknowns'), col('sentenceUnknowns')))\
    .select('time', 'classId', 'userId', 'bookId', 'chapterId', 'sentences', 'eltime')


In [25]:
import pandas as pd
import math

qDf = df.filter(df['type'] == 'submit_question')\
    .withColumn('payload', from_json(col('payload'), SubmitQuestionPayload))\
    .withColumn('eltime', col('payload.time'))\
    .withColumn('qid', col('payload.questionId'))\
    .withColumn('chapterId', col('payload.chapterId'))\
    .withColumn('bookId', col('payload.bookId'))\
    .withColumn('option', col('payload.option'))\
    .withColumn('right', col('payload.right'))\
    .drop('payload')

rawDf = qDf.orderBy('time').dropDuplicates(['qid','userId','chapterId']).drop('time')\
    .withColumn('cr', F.when(col('right') == True, 1).otherwise(0)).groupBy('userId', 'bookId', 'chapterId')\
    .agg(F.sum(col('cr')).alias('score'), F.sum(col('eltime')).alias('time')).drop('cr')\
    .withColumn('total', transformer.get_questions_len(col('bookId'), col('chapterId')))\
    .withColumn('raw', col('score') / col('total'))

@pandas_udf(StructType(rawDf.schema.fields + [StructField('timeZ', FloatType()), StructField('scorePerc', FloatType())]), PandasUDFType.GROUPED_MAP)
def perc(df):
    sz = df['raw'].size-1
    df['scorePerc'] = df['raw'].rank(method='max').apply(lambda x: -1 if sz == 0 else (x-1)/sz)
    sd = df['time'].std()
    if sd == 0 or sd is None or math.isnan(sd):
        sd = 1
    df['timeZ'] = -(df['time'] - df['time'].mean())/sd
    return df

percDf = rawDf\
    .groupBy('chapterId').apply(perc)\
    .select('userId', 'timeZ', 'scorePerc', 'chapterId', 'bookId')


In [26]:
resDf = percDf.join(paginateDf, ['userId','chapterId','bookId'], 'inner')

In [27]:
import json
result = resDf.rdd.map(lambda row: row.asDict(recursive=True)).collect()
with open('clean_logs.json', 'w') as f:
    json.dump(result, f)