In [1]:
%%configure -f
{
   "conf":{
      "spark.jars.packages":"ml.combust.mleap:mleap-spark_2.11:0.16.0"
   }
}

In [2]:
import pyspark.sql.functions as f
from pyspark.sql.types import *
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier, RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer, OneHotEncoderEstimator, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.evaluation import MulticlassMetrics

import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.float_format', '{:0.3f}'.format)

import zipfile
import tarfile
import os
import json
from pprint import pprint

import sagemaker
from sagemaker import get_execution_role
from sagemaker.sparkml.model import SparkMLModel
import boto3
from mleap.pyspark.spark_support import SimpleSparkSerializer

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
4,application_1609708034480_0005,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# Load training data
df = spark.sql("select * from training.training_data")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# check NA
df.select([f.count(f.when(f.isnan(c[0]) | f.isnull(c[0]), c[0])).alias(c[0]) for c in df.dtypes if not c[1] in ("string", "timestamp", "array<string>")])\
.toPandas().transpose().sum().sum() == 0

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

True

In [5]:
# Potential target variables
df.agg(
      f.mean(("answer_accepted_1d_flag")),
      f.mean(("answer_accepted_7d_flag")),
      f.mean(("answer_accepted_14d_flag")),
      f.mean(("answer_accepted_30d_flag")),
      f.mean(("answer_accepted_flag"))).show()

df.agg(
      f.mean(("answer_1d_flag")),
      f.mean(("answer_7d_flag")),
      f.mean(("answer_14d_flag")),
      f.mean(("answer_30d_flag")),
      f.mean(("answer_flag")),).show()

df.agg(f.mean(("post_closed_flag"))).show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------------------+----------------------------+-----------------------------+-----------------------------+-------------------------+
|avg(answer_accepted_1d_flag)|avg(answer_accepted_7d_flag)|avg(answer_accepted_14d_flag)|avg(answer_accepted_30d_flag)|avg(answer_accepted_flag)|
+----------------------------+----------------------------+-----------------------------+-----------------------------+-------------------------+
|         0.14489200888418927|         0.28837452992207835|           0.3083902121153894|           0.3227118498424023|      0.37565655757954775|
+----------------------------+----------------------------+-----------------------------+-----------------------------+-------------------------+

+-------------------+-------------------+--------------------+--------------------+------------------+
|avg(answer_1d_flag)|avg(answer_7d_flag)|avg(answer_14d_flag)|avg(answer_30d_flag)|  avg(answer_flag)|
+-------------------+-------------------+--------------------+-

In [6]:
df.groupBy("dataset_name").agg(f.mean("answer_accepted_flag")).show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+-------------------------+
|        dataset_name|avg(answer_accepted_flag)|
+--------------------+-------------------------+
|cs-stackexchange-com|       0.4482627578718784|
|datascience-stack...|       0.3252392907578848|
|softwareengineeri...|       0.5755453094520305|
|android-stackexch...|      0.24668830241054845|
|unix-stackexchang...|      0.44735514678683125|
|dba-stackexchange...|      0.46438227050623804|
|devops-stackexcha...|       0.3636882671995432|
|gamedev-stackexch...|       0.4996215479769036|
|       askubuntu-com|       0.2853066188245777|
|raspberrypi-stack...|       0.3082882018235415|
+--------------------+-------------------------+

In [7]:
df.groupBy("dataset_name").agg(f.mean("answer_flag")).show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+------------------+
|        dataset_name|  avg(answer_flag)|
+--------------------+------------------+
|cs-stackexchange-com|0.7833876221498371|
|datascience-stack...|0.7485093362623568|
|softwareengineeri...|0.9534846604007803|
|android-stackexch...|0.7246341314065274|
|unix-stackexchang...|0.8246236450661913|
|dba-stackexchange...|0.8665947367115608|
|devops-stackexcha...|0.8321438766771339|
|gamedev-stackexch...|0.8586535758309725|
|       askubuntu-com| 0.742233687734043|
|raspberrypi-stack...|0.7819633920614245|
+--------------------+------------------+

In [8]:
# Features vs target
y = "answer_accepted_7d_flag"
x = [
"post_hour",
"post_dayofweek",
"post_month",    
"post_year",
"post_body_char_count",
"post_body_nocode_char_count",
"post_body_code_perc",
"post_body_code_flag",
"post_body_image_flag",
"post_body_link_flag",
"post_body_bold_flag",
"post_title_upper_flag",
"post_title_question_flag",
"post_title_char_count",
"post_tag_count",
"post_body_sent_count",
"post_body_word_count",
"post_title_word_count",
"tag_post_count_max",
"tag_post_count_30d_max",
"tag_post_count_365d_max",
"tag_post_count_avg",
"tag_post_count_30d_avg",
"tag_post_count_365d_avg",
"user_age_days",
"user_age_months",
"user_website_flag",
"user_location_flag",
"user_about_me_flag",
"user_badge_count",
"user_badge_1_count",
"user_badge_2_count",
"user_badge_3_count",
"user_post_count",
"user_question_count",
"user_answer_count",
"user_first_post_flag",
"user_first_question_flag",
"user_answered_questions_count",
"user_accepted_answers_count",
"user_score",
"user_question_score",
"user_answer_score"
]

df_model = df.select(*x, y)

x_mean_vs_y = \
df_model.groupBy(y)\
.agg(*[f.mean(c[0]).alias(c[0]) for c in df_model.dtypes if not c[1] in ("string", "timestamp", "array<string>")] )\
.toPandas().transpose()

x_median_vs_y = \
df_model.groupBy(y)\
.agg(*[f.expr(f"percentile_approx({c[0]}, 0.5)").alias(c[0]) for c in df_model.dtypes if not c[1] in ("string", "timestamp", "array<string>")] )\
.toPandas().transpose()

x_vs_y = x_mean_vs_y.merge(x_median_vs_y, 
                           left_index=True, 
                           right_index=True,
                          suffixes=("_mean", "_median"))
x_vs_y

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

                                0_mean   1_mean  0_median  1_median
answer_accepted_7d_flag          1.000    0.000     1.000     0.000
answer_accepted_7d_flag          1.000    0.000     1.000     0.000
answer_accepted_7d_flag          1.000    0.000     1.000     0.000
answer_accepted_7d_flag          1.000    0.000     1.000     0.000
post_body_bold_flag              0.145    0.137     0.000     0.000
post_body_char_count           895.109 1022.750   577.000   598.000
post_body_code_flag              0.566    0.476     1.000     0.000
post_body_code_perc              0.322    0.267     0.119     0.000
post_body_image_flag             0.000    0.000     0.000     0.000
post_body_link_flag              0.226    0.249     0.000     0.000
post_body_nocode_char_count    444.993  506.219   322.000   373.000
post_body_sent_count             4.630    5.256     4.000     4.000
post_body_word_count            80.791   91.012    59.000    68.000
post_dayofweek                   3.991    3.996 

In [9]:
train, test = df_model.randomSplit([0.7, 0.3], seed=1)
df.count(), train.count(), test.count()

train.select(f.mean(y)).show()
test.select(f.mean(y)).show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------------------+
|avg(answer_accepted_7d_flag)|
+----------------------------+
|         0.28837863407399833|
+----------------------------+

+----------------------------+
|avg(answer_accepted_7d_flag)|
+----------------------------+
|          0.2883649475837585|
+----------------------------+

In [10]:
# Baseline model
indexer = StringIndexer(inputCol=y, outputCol='label')
vec_assembler = VectorAssembler(inputCols=x, outputCol="features")
gbt = GBTClassifier(seed=1)
pipeline = Pipeline(stages=[indexer, vec_assembler, gbt])

model = pipeline.fit(train)
prediction = model.transform(test)

evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Accuracy:", evaluator.evaluate(prediction))

evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC")
print("ROC AUC:", evaluator.evaluate(prediction))

evaluator = BinaryClassificationEvaluator(metricName="areaUnderPR")
print("PR AUC:", evaluator.evaluate(prediction))

metrics = MulticlassMetrics(prediction.select("prediction", "label").rdd.map(tuple))
print("Confusion matrix:\n", metrics.confusionMatrix().toArray()/test.count())

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Accuracy: 0.7191611809375223
ROC AUC: 0.6769007572072855
PR AUC: 0.44600799571552063
Confusion matrix:
 [[0.68353644 0.02809861]
 [0.25274021 0.03562474]]

In [11]:
# Model parameters
pprint(model.stages[2].extractParamMap())

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

{Param(parent='GBTClassifier_611b50e3b828', name='maxBins', doc='Max number of bins for discretizing continuous features.  Must be at least 2 and at least number of categories for any categorical feature.'): 32,
 Param(parent='GBTClassifier_611b50e3b828', name='maxDepth', doc='Maximum depth of the tree. (Nonnegative) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 5,
 Param(parent='GBTClassifier_611b50e3b828', name='maxIter', doc='maximum number of iterations (>= 0)'): 20,
 Param(parent='GBTClassifier_611b50e3b828', name='maxMemoryInMB', doc='Maximum memory in MB allocated to histogram aggregation.'): 256,
 Param(parent='GBTClassifier_611b50e3b828', name='subsamplingRate', doc='Fraction of the training data used for learning each decision tree, in range (0, 1].'): 1.0,
 Param(parent='GBTClassifier_611b50e3b828', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split.  If a split causes the left or right child to h

In [12]:
# Grid search
paramGrid = ParamGridBuilder() \
.addGrid(gbt.stepSize, [0.05, 0.1, 0.15])\
.addGrid(gbt.maxDepth, [5, 8, 10])\
.addGrid(gbt.minInstancesPerNode, [10, 50, 100])\
.build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(metricName="areaUnderPR"),
                          numFolds=5,
                         seed=1)

cvModel = crossval.fit(train)
prediction = cvModel.transform(test)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Exception in thread cell_monitor-11:
Traceback (most recent call last):
  File "/mnt/notebook-env/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/mnt/notebook-env/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/mnt/notebook-env/lib/python3.7/site-packages/awseditorssparkmonitoringwidget-1.0-py3.7.egg/awseditorssparkmonitoringwidget/cellmonitor.py", line 154, in cell_monitor
    job_group_filtered_jobs = [job for job in jobs_data if job['jobGroup'] == str(statement_id)]
  File "/mnt/notebook-env/lib/python3.7/site-packages/awseditorssparkmonitoringwidget-1.0-py3.7.egg/awseditorssparkmonitoringwidget/cellmonitor.py", line 154, in <listcomp>
    job_group_filtered_jobs = [job for job in jobs_data if job['jobGroup'] == str(statement_id)]
KeyError: 'jobGroup'

An error was encountered:
Invalid status code '404' from http://localhost:8998/sessions/4/statements/11 with error payload: {"msg":"Session '4' no

In [13]:
print("PR AUC:", evaluator.evaluate(prediction))

# Feature importance
pd.DataFrame(model.stages[2].featureImportances.toArray(), index=x)\
.sort_values(0, ascending=False)

VBox()

An error was encountered:
Invalid status code '404' from http://localhost:8998/sessions/4 with error payload: {"msg":"Session '4' not found."}


In [None]:
# Save model
SimpleSparkSerializer().serializeToBundle(model, "jar:file:/tmp/model.zip", prediction)
with zipfile.ZipFile("/tmp/model.zip") as zf:
    zf.extractall("/tmp/model")
with tarfile.open("/tmp/model.tar.gz", "w:gz") as tar:
    tar.add("/tmp/model/bundle.json", arcname='bundle.json')
    tar.add("/tmp/model/root", arcname='root')

In [3]:
# upload to S3
s3 = boto3.resource('s3') 
file_name = "model.tar.gz"
bucket_name = "rate-my-post-pre-2"
s3.Bucket(bucket_name).upload_file('/tmp/model.tar.gz', file_name)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
# Deploy on Sagemaker
os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
sess = sagemaker.Session(boto3.session.Session())
role = get_execution_role()

schema = json.dumps(
{
    "input": [
        {
            "name": "post_hour",
            "type": "int"
        },
        {
            "name": "post_dayofweek",
            "type": "int"
        },
        {
            "name": "post_month",
            "type": "int"
        },
        {
            "name": "post_year",
            "type": "int"
        },
        {
            "name": "post_body_char_count",
            "type": "int"
        },
        {
            "name": "post_body_nocode_char_count",
            "type": "int"
        },
        {
            "name": "post_body_code_perc",
            "type": "double"
        },
        {
            "name": "post_body_code_flag",
            "type": "int"
        },
        {
            "name": "post_body_image_flag",
            "type": "int"
        },
        {
            "name": "post_body_link_flag",
            "type": "int"
        },
        {
            "name": "post_body_bold_flag",
            "type": "int"
        },
        {
            "name": "post_title_upper_flag",
            "type": "int"
        },
        {
            "name": "post_title_question_flag",
            "type": "int"
        },
        {
            "name": "post_title_char_count",
            "type": "int"
        },
        {
            "name": "post_tag_count",
            "type": "int"
        },
        {
            "name": "post_body_sent_count",
            "type": "int"
        },
        {
            "name": "post_body_word_count",
            "type": "int"
        },
        {
            "name": "post_title_word_count",
            "type": "int"
        },
        {
            "name": "tag_post_count_max",
            "type": "long"
        },
        {
            "name": "tag_post_count_30d_max",
            "type": "long"
        },
        {
            "name": "tag_post_count_365d_max",
            "type": "long"
        },
        {
            "name": "tag_post_count_avg",
            "type": "double"
        },
        {
            "name": "tag_post_count_30d_avg",
            "type": "double"
        },
        {
            "name": "tag_post_count_365d_avg",
            "type": "double"
        },
        {
            "name": "user_age_days",
            "type": "int"
        },
        {
            "name": "user_age_months",
            "type": "double"
        },
        {
            "name": "user_website_flag",
            "type": "int"
        },
        {
            "name": "user_location_flag",
            "type": "int"
        },
        {
            "name": "user_about_me_flag",
            "type": "int"
        },
        {
            "name": "user_badge_count",
            "type": "long"
        },
        {
            "name": "user_badge_1_count",
            "type": "long"
        },
        {
            "name": "user_badge_2_count",
            "type": "long"
        },
        {
            "name": "user_badge_3_count",
            "type": "long"
        },
        {
            "name": "user_post_count",
            "type": "long"
        },
        {
            "name": "user_question_count",
            "type": "long"
        },
        {
            "name": "user_answer_count",
            "type": "long"
        },
        {
            "name": "user_first_post_flag",
            "type": "int"
        },
        {
            "name": "user_first_question_flag",
            "type": "int"
        },
        {
            "name": "user_answered_questions_count",
            "type": "long"
        },
        {
            "name": "user_accepted_answers_count",
            "type": "long"
        },
        {
            "name": "user_score",
            "type": "long"
        },
        {
            "name": "user_question_score",
            "type": "long"
        },
        {
            "name": "user_answer_score",
            "type": "long"
        },
        {
            "name": "answer_accepted_7d_flag",
            "type": "int"
        }
    ],
    "output":
        {
            "name": "probability",
            "type": "double",
            "struct": "vector"
    }
}
)

model_name = "rate-my-post"
sparkml_model = SparkMLModel(model_data='s3://{}/{}'.format(bucket_name, file_name), 
                             role=role, 
                             sagemaker_session=sess, 
                             name=model_name,
                             env={'SAGEMAKER_SPARKML_SCHEMA' : schema})

endpoint_name = 'test'
sparkml_model.deploy(initial_instance_count=1, instance_type='ml.t2.large', endpoint_name=endpoint_name)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

---------------!<sagemaker.sparkml.model.SparkMLPredictor object at 0x7ff134c91a90>