In [1]:
import sys
sys.path.append('../../ml_utils')

import findspark
findspark.init()

from handyspark import *

import data_utils as du
import spark_utils as su
import sentiment_classifier
import timeit

from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


### create Spark environment

In [2]:
# spark_endpoint = "spark://spark.home.net:7077"
#base_dir = '/Users/administrator/'

spark_endpoint = "spark://lasvegas:7077"
base_dir = '/home/administrator/'

# these two jars must be added!
spark_jars = base_dir + 'Development/spark-2.4.4-bin-hadoop2.7/jars/spark-nlp_2.11-2.3.3.jar,' + base_dir + '/Development/spark-2.4.4-bin-hadoop2.7/jars/config-1.4.0.jar'

conf = SparkConf().setMaster(spark_endpoint) \
.set("spark.jars", spark_jars) \
.setAppName("Sentiment Analysis")

sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
#sc.getConf().getAll()

### Load and clean the data

In [3]:
log_entries_df = sqlContext.read.format('com.databricks.spark.csv') \
    .schema(su.feature_schema) \
    .options(header = 'false', inferschema = 'false', delimiter = '\t') \
    .load('./../../shared/data/swissid_authorize_logs_april_to_sept_2019.csv')

In [4]:
reduced_feature_set = True

In [5]:
reduced_df = su.clean_log_entries(log_entries_df, False, False, False, reduced_feature_set, False)

In [6]:
normal_df = reduced_df.filter("label_nr = 0.0")
anomaly_df = reduced_df.filter("label_nr = 1.0")

### Build a sampled dataframe having the same anomaly rate as the original dateset and collect the metrics data

In [7]:
iterations = 100
metrics = []
iter_values = []

In [8]:
sample_size_percentage = 0.041

start_overall = timeit.default_timer()

for i in range(iterations):

    x_df = normal_df.sample(False, sample_size_percentage).union(anomaly_df.sample(False, sample_size_percentage))
    
    preprocessed_df = sentiment_classifier.preprocess(x_df, False, reduced_feature_set)
    encoded_features_df = sentiment_classifier.vectorize(preprocessed_df, False)
    
    train_df, test_df = encoded_features_df.randomSplit([0.8, 0.2], seed=42)
    
    clf = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label_nr', maxDepth=6, maxBins=25)
    
    start_fitting = timeit.default_timer()
    model = clf.fit(train_df)
    stop_fitting = timeit.default_timer()
    
    fit_time = stop_fitting - start_fitting
    print("{0} Time to fit: {1:.2f} seconds".format(i, fit_time))
    
    
    predictions_df = model.transform(test_df)
    
    evaluator = BinaryClassificationEvaluator(labelCol='label_nr')
    auc = evaluator.evaluate(predictions_df)

    predictions_and_label = predictions_df.select("prediction", "label_nr").rdd
    class_metrics = MulticlassMetrics(predictions_and_label)

    tot = predictions_df.count()
    cm = class_metrics.confusionMatrix().toArray()
    t_pos = cm[0][0]
    f_pos = cm[0][1]
    f_neg = cm[1][0]
    t_neg = cm[1][1]

    acc = (t_pos + t_neg) / tot

    f1 = class_metrics.fMeasure(1.0)
    p = class_metrics.precision(1.0)
    r = class_metrics.recall(1.0)

    iter_values = [tot, r, p, acc, f1, t_pos, f_pos, f_neg, t_neg, auc, fit_time ]
    
    metrics.append(iter_values)
    
    iter_values = []

stop_overall = timeit.default_timer()

overall_runtime = stop_overall - start_overall
print("Time: {0:.2f} seconds\n".format(overall_runtime))
print("Time: {0:.2f} minutes\n".format(overall_runtime/60))

0 Time to fit: 42.19 seconds
1 Time to fit: 35.17 seconds
2 Time to fit: 35.38 seconds
3 Time to fit: 34.75 seconds
4 Time to fit: 34.95 seconds
5 Time to fit: 34.94 seconds
6 Time to fit: 34.90 seconds
7 Time to fit: 34.74 seconds
8 Time to fit: 34.73 seconds
9 Time to fit: 34.86 seconds
10 Time to fit: 34.91 seconds
11 Time to fit: 34.94 seconds
12 Time to fit: 34.65 seconds
13 Time to fit: 34.61 seconds
14 Time to fit: 34.46 seconds
15 Time to fit: 34.09 seconds
16 Time to fit: 33.81 seconds
17 Time to fit: 34.29 seconds
18 Time to fit: 34.00 seconds
19 Time to fit: 34.43 seconds
20 Time to fit: 34.37 seconds
21 Time to fit: 34.64 seconds
22 Time to fit: 34.60 seconds
23 Time to fit: 34.78 seconds
24 Time to fit: 34.32 seconds
25 Time to fit: 34.58 seconds
26 Time to fit: 34.57 seconds
27 Time to fit: 34.64 seconds
28 Time to fit: 34.47 seconds
29 Time to fit: 34.69 seconds
30 Time to fit: 34.78 seconds
31 Time to fit: 34.63 seconds
32 Time to fit: 34.76 seconds
33 Time to fit: 34.3

### Calculate Metrics

In [9]:
metrics_df = du.create_sentiment_metric_df(metrics, './sentiment_clf_overall_metrics.csv')
metrics_df.describe()

Unnamed: 0,total,tpr_recall,precision,accuracy,f1_score,t_pos,f_pos,f_neg,t_neg,auc,time_to_fit
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,39295.96,0.996348,0.997585,0.999972,0.996957,39112.04,0.44,0.67,182.81,0.997661,34.449345
std,87.616234,0.004632,0.003442,2.5e-05,0.00271,86.793488,0.624742,0.853454,7.393274,0.002721,0.859738
min,39075.0,0.978261,0.988764,0.999898,0.988827,38888.0,0.0,0.0,166.0,0.983734,33.677653
25%,39241.5,0.994536,0.994645,0.999949,0.994645,39051.75,0.0,0.0,178.0,0.997147,34.05552
50%,39303.0,1.0,1.0,0.999975,0.997275,39119.5,0.0,0.0,182.0,0.99734,34.417872
75%,39352.25,1.0,1.0,1.0,1.0,39165.0,1.0,1.0,187.0,1.0,34.631278
max,39609.0,1.0,1.0,1.0,1.0,39423.0,2.0,4.0,201.0,1.0,42.186243


In [10]:
# Stop the Spark Context
sc.stop()