In [1]:
%%configure -f
{
   "conf":{
      "spark.jars.packages":"ml.combust.mleap:mleap-spark_2.11:0.16.0"
   }
}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
2,application_1610278166754_0003,pyspark,idle,Link,Link,


In [60]:
import pyspark.sql.functions as f
from pyspark.sql.types import *
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier, RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer, OneHotEncoderEstimator, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.evaluation import MulticlassMetrics

import pandas as pd
# Pandas display settings used when rendering frames in this notebook:
# row/column display limits and two-decimal float formatting.
for _option_name, _option_value in (
    ("display.max_rows", 500),
    ("display.max_columns", 100),
    ("display.float_format", "{:0.2f}".format),
):
    pd.set_option(_option_name, _option_value)

import json
import os
import shutil
import tarfile
import zipfile
from pprint import pprint

import boto3
from mleap.pyspark.spark_support import SimpleSparkSerializer

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# Load training data: read every column of the training.model_data table
# through the Spark session.
training_query = "select * from training.model_data"
df = spark.sql(training_query)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# Print a single record, one field per line, without truncating long values.
df.show(n=1, truncate=False, vertical=True)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

-RECORD 0---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 dataset_name                               | android-stackexchange-com                                                                                                                                                                                                                                                                                                                                                                                                                               
 post_id                  

In [5]:
# check NA
# Columns of these Spark types are excluded from the NaN/null check.
skipped_types = ("string", "timestamp", "array<string>")
checked_cols = [name for name, dtype in df.dtypes if dtype not in skipped_types]

# One count per checked column: number of rows where the value is NaN or null.
missing_exprs = [
    f.count(f.when(f.isnan(name) | f.isnull(name), name)).alias(name)
    for name in checked_cols
]
missing_counts = df.select(missing_exprs).toPandas()

# True when no checked column contains a NaN or null value.
missing_counts.transpose().sum().sum() == 0

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

True

In [6]:
# Potential target variables
# Each inner list is aggregated and shown as its own one-row table of means.
candidate_target_groups = [
    [
        "answer_accepted_1d_flag",
        "answer_accepted_7d_flag",
        "answer_accepted_14d_flag",
        "answer_accepted_30d_flag",
        "answer_accepted_flag",
    ],
    [
        "answer_1d_flag",
        "answer_7d_flag",
        "answer_14d_flag",
        "answer_30d_flag",
        "answer_flag",
    ],
    ["post_closed_flag"],
]

for flag_cols in candidate_target_groups:
    df.agg(*[f.mean(flag_col) for flag_col in flag_cols]).show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------------------+----------------------------+-----------------------------+-----------------------------+-------------------------+
|avg(answer_accepted_1d_flag)|avg(answer_accepted_7d_flag)|avg(answer_accepted_14d_flag)|avg(answer_accepted_30d_flag)|avg(answer_accepted_flag)|
+----------------------------+----------------------------+-----------------------------+-----------------------------+-------------------------+
|         0.14489200888418927|         0.28837452992207835|           0.3083902121153894|           0.3227118498424023|      0.37565655757954775|
+----------------------------+----------------------------+-----------------------------+-----------------------------+-------------------------+

+-------------------+-------------------+--------------------+--------------------+------------------+
|avg(answer_1d_flag)|avg(answer_7d_flag)|avg(answer_14d_flag)|avg(answer_30d_flag)|  avg(answer_flag)|
+-------------------+-------------------+--------------------+-

In [7]:
# Mean of answer_accepted_flag for each dataset_name.
accepted_rate_by_dataset = df.groupBy("dataset_name").agg(f.avg("answer_accepted_flag"))
accepted_rate_by_dataset.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+-------------------------+
|        dataset_name|avg(answer_accepted_flag)|
+--------------------+-------------------------+
|cs-stackexchange-com|       0.4482627578718784|
|datascience-stack...|       0.3252392907578848|
|softwareengineeri...|       0.5755453094520305|
|android-stackexch...|      0.24668830241054845|
|unix-stackexchang...|      0.44735514678683125|
|dba-stackexchange...|      0.46438227050623804|
|devops-stackexcha...|       0.3636882671995432|
|gamedev-stackexch...|       0.4996215479769036|
|       askubuntu-com|       0.2853066188245777|
|raspberrypi-stack...|       0.3082882018235415|
+--------------------+-------------------------+

In [8]:
# Mean of answer_flag for each dataset_name.
answer_rate_by_dataset = df.groupBy("dataset_name").agg(f.avg("answer_flag"))
answer_rate_by_dataset.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+------------------+
|        dataset_name|  avg(answer_flag)|
+--------------------+------------------+
|cs-stackexchange-com|0.7833876221498371|
|datascience-stack...|0.7485093362623568|
|softwareengineeri...|0.9534846604007803|
|android-stackexch...|0.7246341314065274|
|unix-stackexchang...|0.8246236450661913|
|dba-stackexchange...|0.8665947367115608|
|devops-stackexcha...|0.8321438766771339|
|gamedev-stackexch...|0.8586535758309725|
|       askubuntu-com| 0.742233687734043|
|raspberrypi-stack...|0.7819633920614245|
+--------------------+------------------+

In [17]:
# Features vs target
# `y` is the name of the target column; `x` is the ordered list of feature
# columns. The order of `x` matters: it is reused as the index when feature
# importances are displayed.
y = "answer_accepted_7d_flag"

x = [
    # posting time
    "post_hour", "post_dayofweek", "post_month", "post_year",
    # body / title markup and size
    "post_body_char_count", "post_body_nocode_char_count",
    "post_body_code_perc", "post_body_code_flag",
    "post_body_image_flag", "post_body_link_flag", "post_body_bold_flag",
    "post_title_upper_flag", "post_title_question_flag", "post_title_char_count",
    "post_tag_count",
    # body text statistics
    "post_body_sentence_count", "post_body_word_count", "post_body_word_distinct_count",
    "post_body_verb_perc", "post_body_noun_perc", "post_body_pronoun_perc",
    "post_body_adjective_perc", "post_body_adverb_perc",
    # title text statistics
    "post_title_word_count", "post_title_word_distinct_count",
    "post_title_verb_perc", "post_title_noun_perc", "post_title_pronoun_perc",
    "post_title_adjective_perc", "post_title_adverb_perc",
    "post_title_in_body_perc",
    # tag statistics
    "tag_post_count_max", "tag_post_count_30d_max", "tag_post_count_365d_max",
    "tag_age_days_max",
    "tag_post_count_avg", "tag_post_count_30d_avg", "tag_post_count_365d_avg",
    "tag_age_days_avg",
    # user profile and history
    "user_age_days",
    "user_website_flag", "user_location_flag", "user_about_me_flag",
    "user_badge_count", "user_badge_1_count", "user_badge_2_count", "user_badge_3_count",
    "user_post_count", "user_question_count", "user_answer_count",
    "user_first_post_flag", "user_first_question_flag",
    "user_answered_questions_count", "user_accepted_answers_count",
    "user_score", "user_question_score", "user_answer_score",
] + [
    # one indicator column per source site
    f"{site}_flag"
    for site in (
        "android_stackexchange_com",
        "askubuntu_com",
        "cs_stackexchange_com",
        "datascience_stackexchange_com",
        "dba_stackexchange_com",
        "devops_stackexchange_com",
        "gamedev_stackexchange_com",
        "raspberrypi_stackexchange_com",
        "softwareengineering_stackexchange_com",
        "unix_stackexchange_com",
    )
]

# Modelling frame: the feature columns plus the target, renamed to "y".
df_model = df.select(*x, y).withColumnRenamed(y, "y")


def _stat_by_target(frame, agg_exprs, stat_name):
    """Aggregate `frame` per value of "y" and return a features-by-class table.

    Parameters
    ----------
    frame : Spark DataFrame containing a "y" column.
    agg_exprs : list of aggregate Column expressions, one per feature, each
        aliased to the feature name.
    stat_name : suffix used in the output column labels (e.g. "mean").

    Returns
    -------
    pandas.DataFrame indexed by feature name, with one column per target
    class labelled "<class>_<stat_name>", ordered from the highest class
    value to the lowest.
    """
    per_class = (
        frame.groupBy("y")
        .agg(*agg_exprs)
        .toPandas()
        # Spark does not guarantee the order of grouped rows, so key the
        # result by the actual class value instead of by row position.
        .set_index("y")
        .sort_index(ascending=False)
    )
    per_class.index = [f"{int(target_value)}_{stat_name}" for target_value in per_class.index]
    return per_class.transpose()


x_mean_vs_y = _stat_by_target(
    df_model,
    [f.mean(c).alias(c) for c in x],
    "mean",
)

x_median_vs_y = _stat_by_target(
    df_model,
    [f.expr(f"percentile_approx({c}, 0.5)").alias(c) for c in x],
    "median",
)

# Both tables share the feature-name index; the column labels already carry
# the class value, so no positional renaming is needed.
x_vs_y = x_mean_vs_y.join(x_median_vs_y)
x_vs_y

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

                                            1_mean  0_mean  1_median  0_median
post_hour                                    12.78   12.65     13.00     13.00
post_dayofweek                                3.99    4.00      4.00      4.00
post_month                                    6.46    6.50      6.00      6.00
post_year                                  2015.36 2015.95   2015.00   2016.00
post_body_char_count                        895.05 1022.70    577.00    598.00
post_body_nocode_char_count                 444.95  506.18    322.00    373.00
post_body_code_perc                           0.32    0.27      0.12      0.00
post_body_code_flag                           0.57    0.48      1.00      0.00
post_body_image_flag                          0.08    0.09      0.00      0.00
post_body_link_flag                           0.23    0.25      0.00      0.00
post_body_bold_flag                           0.15    0.14      0.00      0.00
post_title_upper_flag                         0.89  

In [18]:
# 70/30 random split of the modelling frame with a fixed seed.
train, test = df_model.randomSplit([0.7, 0.3], seed=1)

# Print the row counts explicitly: a bare tuple that is not the last
# expression of the cell is evaluated (running three Spark jobs) and then
# discarded without being displayed.
print("Rows (all, train, test):", df.count(), train.count(), test.count())

# Target rate in each split, to confirm the two splits are comparable.
train.select(f.mean("y")).show()
test.select(f.mean("y")).show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------------+
|            avg(y)|
+------------------+
|0.2883552671974457|
+------------------+

+------------------+
|            avg(y)|
+------------------+
|0.2884194208716736|
+------------------+

In [19]:
# Baseline model
# Pipeline: index the target into "label", assemble the feature columns into
# a single "features" vector, then fit a GBT classifier with default settings.
indexer = StringIndexer(inputCol="y", outputCol="label")
vec_assembler = VectorAssembler(inputCols=x, outputCol="features")
gbt = GBTClassifier(seed=1)
pipeline = Pipeline(stages=[indexer, vec_assembler, gbt])

model = pipeline.fit(train)
prediction = model.transform(test)

# Score the held-out predictions with each metric in turn.
for metric_title, evaluator in (
    ("Accuracy:", MulticlassClassificationEvaluator(metricName="accuracy")),
    ("ROC AUC:", BinaryClassificationEvaluator(metricName="areaUnderROC")),
    ("PR AUC:", BinaryClassificationEvaluator(metricName="areaUnderPR")),
):
    print(metric_title, evaluator.evaluate(prediction))

# Confusion matrix expressed as a share of all test rows.
prediction_and_label = prediction.select("prediction", "label").rdd.map(tuple)
metrics = MulticlassMetrics(prediction_and_label)
confusion_share = metrics.confusionMatrix().toArray() / test.count()
print("\nConfusion matrix:\n", confusion_share)

print("\nFeature importance")
# stages[2] is the fitted GBT stage (third entry of the pipeline above).
baseline_importances = model.stages[2].featureImportances.toArray()
pd.DataFrame(baseline_importances, index=x).sort_values(0, ascending=False)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Accuracy: 0.7206073494047738
ROC AUC: 0.7017243978455301
PR AUC: 0.4660457005630915

Confusion matrix:
 [[0.67751655 0.03406403]
 [0.24532862 0.0430908 ]]

Feature importance
                                              0
post_year                                  0.12
user_accepted_answers_count                0.09
post_body_code_perc                        0.07
post_body_char_count                       0.06
post_title_question_flag                   0.05
post_body_word_distinct_count              0.05
askubuntu_com_flag                         0.04
user_question_count                        0.04
post_body_noun_perc                        0.04
user_age_days                              0.04
cs_stackexchange_com_flag                  0.04
softwareengineering_stackexchange_com_flag 0.03
gamedev_stackexchange_com_flag             0.03
dba_stackexchange_com_flag                 0.03
tag_post_count_365d_avg                    0.02
user_location_flag                         0.02
post_titl

In [18]:
# Grid search
# 3 x 3 x 3 hyper-parameter grid over the GBT stage of the pipeline.
grid_builder = ParamGridBuilder()
for gbt_param, candidate_values in (
    (gbt.stepSize, [0.05, 0.1, 0.15]),
    (gbt.maxDepth, [5, 8, 10]),
    (gbt.minInstancesPerNode, [10, 50, 100]),
):
    grid_builder = grid_builder.addGrid(gbt_param, candidate_values)
paramGrid = grid_builder.build()

# 5-fold cross-validation of the whole pipeline, selecting on area under the
# precision-recall curve.
pr_auc_evaluator = BinaryClassificationEvaluator(metricName="areaUnderPR")
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=pr_auc_evaluator,
    numFolds=5,
    seed=1,
)

cvModel = crossval.fit(train)
prediction = cvModel.transform(test)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Exception in thread cell_monitor-17:
Traceback (most recent call last):
  File "/mnt/notebook-env/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/mnt/notebook-env/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/mnt/notebook-env/lib/python3.7/site-packages/awseditorssparkmonitoringwidget-1.0-py3.7.egg/awseditorssparkmonitoringwidget/cellmonitor.py", line 178, in cell_monitor
    job_binned_stages[job_id][stage_id] = all_stages[stage_id]
KeyError: 1298



In [27]:
# Evaluate the cross-validated model on the held-out test predictions.
print("PR AUC:", crossval.getEvaluator().evaluate(prediction))

# Confusion matrix expressed as a share of all test rows.
metrics = MulticlassMetrics(prediction.select("prediction", "label").rdd.map(tuple))
print("\nConfusion matrix:\n", metrics.confusionMatrix().toArray()/test.count())

print("\nFeature importance")
# Read the importances from the tuned pipeline selected by cross-validation,
# not from the baseline `model`; the GBT classifier is the pipeline's last stage.
tuned_gbt = cvModel.bestModel.stages[-1]
pd.DataFrame(tuned_gbt.featureImportances.toArray(), index=x)\
.sort_values(0, ascending=False)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

PR AUC: 0.46062548171070317

Confusion matrix:
 [[0.67346305 0.03922247]
 [0.23801679 0.04929768]]

Feature importance
                                  0
post_year                     0.120
user_accepted_answers_count   0.097
post_body_char_count          0.086
tag_post_count_30d_max        0.077
user_question_count           0.071
post_body_code_perc           0.068
tag_post_count_365d_max       0.052
post_body_sent_count          0.032
tag_post_count_30d_avg        0.032
tag_post_count_365d_avg       0.029
post_title_question_flag      0.027
user_about_me_flag            0.026
user_age_months               0.026
post_tag_count                0.023
tag_post_count_max            0.023
tag_post_count_avg            0.021
user_age_days                 0.020
user_post_count               0.020
user_badge_count              0.019
post_body_nocode_char_count   0.019
post_hour                     0.013
user_badge_3_count            0.011
user_location_flag            0.011
user_answer_count

In [59]:
# Save model
# NOTE(review): this exports `model` (the baseline pipeline), not
# `cvModel.bestModel` — confirm that the baseline is the one meant to ship.
bundle_zip = "/tmp/model.zip"
bundle_dir = "/tmp/model"
bundle_tar = "/tmp/model.tar.gz"

# Start from a clean slate so the cell can be re-run: drop the bundle and the
# extraction directory left behind by a previous execution, otherwise stale
# files could end up in the archive.
if os.path.exists(bundle_zip):
    os.remove(bundle_zip)
shutil.rmtree(bundle_dir, ignore_errors=True)

# Serialize to an MLeap bundle, then repack the bundle contents as a tar.gz
# with bundle.json and root/ at the top level of the archive.
SimpleSparkSerializer().serializeToBundle(model, f"jar:file:{bundle_zip}", prediction)
with zipfile.ZipFile(bundle_zip) as zf:
    zf.extractall(bundle_dir)
with tarfile.open(bundle_tar, "w:gz") as tar:
    tar.add(os.path.join(bundle_dir, "bundle.json"), arcname='bundle.json')
    tar.add(os.path.join(bundle_dir, "root"), arcname='root')

# upload to S3
# MLeap format
s3 = boto3.resource('s3')
bucket_name = "rate-my-post-pre"
s3.Bucket(bucket_name).upload_file(bundle_tar, "model.tar.gz")

# Spark format
# overwrite() lets the cell be re-run; a plain save() raises when the target
# path already exists.
model.write().overwrite().save(f"s3://{bucket_name}/sparkmodel")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…