In [1]:
import sys
# Add the shared helper directory to the module search path. The path is relative,
# so it resolves against the kernel's working directory.
# NOTE(review): data_utils, spark_utils and autoencoder (imported below) are
# presumably the modules that live there — confirm.
sys.path.append('../../ml_utils')

import findspark
# Locate the local Spark installation and make pyspark importable before the
# pyspark imports further down are executed.
findspark.init()

import timeit
import data_utils as du
import spark_utils as su
import autoencoder as aenc

import pandas as pd
import numpy as np
import pyspark
import keras
import tensorflow

from pyspark.sql import SQLContext, SparkSession

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
import os
import platform

# Per-platform defaults: the macOS workstation and the non-macOS host use a
# different Spark master URL and a different home directory.
if platform.system() == 'Darwin':
    default_spark_master = "spark://spark.home.net:7077"
    default_base_dir = '/Users/administrator/'
else:
    default_spark_master = "spark://lasvegas:7077"
    default_base_dir = '/home/administrator/'

# Both values can be overridden through environment variables so the notebook
# runs on other machines without editing code; when the variables are unset the
# values are exactly the defaults above.
spark_master = os.environ.get('NOTEBOOK_SPARK_MASTER', default_spark_master)

# Later cells build file paths with `base_dir + "Development/..."`, so the
# directory must end with a path separator; joining with '' guarantees that
# without duplicating an existing trailing separator.
base_dir = os.path.join(os.environ.get('NOTEBOOK_BASE_DIR', default_base_dir), '')

In [3]:
# Create (or reuse) the Spark session for this notebook. Hive support is enabled
# and the spark-deep-learning package is requested through spark.jars.packages.
spark = (
    SparkSession.builder
    .master(spark_master)
    .appName("Deep Learning")
    .enableHiveSupport()
    .config('spark.jars.packages', 'databricks:spark-deep-learning:1.5.0-spark2.4-s_2.11')
    .getOrCreate()
)

# Optional resource tuning, currently disabled:
#spark.conf.set("spark.executor.memory", '8g')
#spark.conf.set('spark.executor.cores', '3')
#spark.conf.set('spark.cores.max', '3')
#spark.conf.set("spark.driver.memory",'8g')
#spark.conf.set("spark.sql.shuffle.partitions", 6)

# Handles used by the following cells.
sc = spark.sparkContext
sqlContext = SQLContext(sc)

# Uncomment to inspect the effective Spark configuration:
#sc.getConf().getAll()

In [4]:
from sparkdl import KerasTransformer

# Report the library versions this run was executed with.
for label, module in (
    ("pyspark", pyspark),
    ("pandas", pd),
    ("numpy", np),
    ("Keras", keras),
    ("Tensorflow", tensorflow),
):
    print(label + ": ", module.__version__)

pyspark:  2.4.4
pandas:  0.25.3
numpy:  1.17.4
Keras:  2.2.4
Tensorflow:  1.13.1


In [5]:
# Start of the wall-clock measurement for the whole run; paired with
# overall_stop and reported in minutes near the end of the notebook.
overall_start = timeit.default_timer()

In [6]:
# Load the raw log statements from a tab-separated file without a header row.
# Column names and types come from the explicit schema, so nothing is inferred.
csv_reader = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .schema(su.feature_schema_all_string)
    .options(header='false', inferschema='false', delimiter='\t')
)

log_entries_df = csv_reader.load('./../../shared/data/swissid_authorize_logs_april_to_sept_2019.csv')

In [7]:
# filter 'bad' statements and select a subset of all features
# NOTE(review): the five positional booleans are opaque at this call site; their
# meaning is defined by su.clean_log_entries. Consider passing them as keyword
# arguments once the parameter names are confirmed.
reduced_df = su.clean_log_entries(log_entries_df, True, False, False, True, False)

In [8]:
#reduced_df.show(5)

In [9]:
# Every column of the cleaned frame except the label is used as an input feature.
#
# A new list is built here on purpose. `reduced_df.schema.names` is the list
# stored inside the DataFrame's cached schema object, so calling .remove() on it
# would alter the schema itself and make this cell raise ValueError when it is
# executed a second time. `reduced_df.columns` returns a fresh list on each call.
categorical_columns = [name for name in reduced_df.columns if name != 'label_nr']
#categorical_columns

In [10]:
# Start of the timing window for building and fitting the feature pipeline.
pipeline_start = timeit.default_timer()

In [11]:
# Build the feature pipeline for the selected columns and fit it on the cleaned data.
# NOTE(review): the pipeline stages are defined in su.build_scaled_features_pipeline;
# the later 'features' / 'feature_vec' columns are presumably its outputs — confirm.
pipeline = su.build_scaled_features_pipeline(categorical_columns)
pipeline_model = pipeline.fit(reduced_df)

In [12]:
# End of the pipeline timing window (covers pipeline construction and fit).
pipeline_stop = timeit.default_timer()

In [13]:
# Target location for the fitted pipeline model.
# NOTE(review): the ".hdf5" suffix is only part of the name; assuming pipeline_model
# is a Spark ML model, its writer saves a directory rather than an HDF5 file — confirm.
pipeline_model_path = base_dir + "Development/workspaces/datascience/masterarbeit/shared/models/autoencoder/spark_autoencoder_pipeline_model.hdf5"

# Persist the fitted pipeline, replacing any previously saved copy at that path.
pipeline_model.write().overwrite().save(pipeline_model_path)

In [14]:
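# To reuse the fitted pipeline later, load it as a PipelineModel (it was saved after fit()):
#from pyspark.ml import PipelineModel
#pipeline_model = PipelineModel.load(pipeline_model_path)

#df = pipeline_model.transform(df)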

In [15]:
# Run the fitted pipeline over the cleaned data, then keep only the two vector
# columns and the label for the following steps.
transformed_df = pipeline_model.transform(reduced_df)
features_df = transformed_df.select('features', 'feature_vec', 'label_nr')

# Preview the first rows.
features_df.show(5)

+--------------------+--------------------+--------+
|            features|         feature_vec|label_nr|
+--------------------+--------------------+--------+
|(10,[1,7,8,9],[0....|(10,[1,7,8,9],[1....|       2|
|(10,[1,7,8,9],[1....|(10,[1,7,8,9],[2....|       0|
|(10,[1,7,9],[1.68...|(10,[1,7,9],[2.0,...|       2|
|(10,[1,7,9],[1.68...|(10,[1,7,9],[2.0,...|       2|
|(10,[1,2,7,8,9],[...|(10,[1,2,7,8,9],[...|       2|
+--------------------+--------------------+--------+
only showing top 5 rows



In [16]:
# Keep only the rows whose label_nr equals 0.
# NOTE(review): per the variable name, label 0 presumably marks "normal" traffic — confirm.
normal_df = features_df.filter("label_nr = 0.0")
# count() triggers a Spark job; the value is displayed as the cell output.
normal_count = normal_df.count()
normal_count

4769169

In [17]:
# Draw roughly 16% of the label-0 rows, without replacement.
# A fixed seed makes the sample (and therefore the train/test contents derived
# from it) reproducible across kernel restarts; the subsequent randomSplit is
# seeded as well, so leaving this step unseeded made the whole split vary per run.
normal_sample = normal_df.sample(False, 0.16, seed=42)
normal_sample.count()

762170

In [18]:
# Split the index-labelled Scaled Feature Vectors into Training and Test DataFrames
# (weights 0.8 / 0.2, with a fixed seed for the split itself).

train_df, test_df = normal_sample.randomSplit([0.8, 0.2], seed=12345)
# Each count() triggers a Spark job; the pair is displayed as the cell output.
train_df.count(), test_df.count()

(609545, 152625)

In [19]:
# Bring the training rows to the driver and wrap them in a pandas DataFrame
# with one column per feature name.
# NOTE(review): which vector column su.convert_feature_vector_to_list reads is
# not visible here — confirm in spark_utils.
x_train = su.convert_feature_vector_to_list(train_df)
x_train_pdf = pd.DataFrame(x_train, columns=categorical_columns)

In [20]:
# Same conversion for the test rows; this frame is used as validation data
# when the auto-encoder is fitted below.
x_test = su.convert_feature_vector_to_list(test_df)
x_test_pdf = pd.DataFrame(x_test, columns=categorical_columns)

In [21]:
# Width of the input layer: the number of feature columns actually fed to the
# model. Deriving it from the training frame (instead of hard-coding 10) keeps
# the network in sync if the selected feature set changes.
input_dim = x_train_pdf.shape[1]
# Width of the encoded (bottleneck) layer.
encoding_dim = 4

model = aenc.create_sparse_auto_encoder(input_dim, encoding_dim)
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 10)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 44        
_________________________________________________________________
dense_2 (Dense)              (None, 10)                50        
Total params: 94
Trainable params: 94
Non-trainable params: 0
_________________________________________________________________


In [22]:
# Start of the timing window for training the auto-encoder.
auto_encoder_start = timeit.default_timer()

In [23]:
# Train the auto-encoder on the training frame, with the test frame as validation data.
# NOTE(review): 'RMSprop' is presumably the optimizer name and 2 the number of
# epochs (the training log shows "Epoch 1/2", "Epoch 2/2") — confirm in aenc.auto_encoder_fit.
history = aenc.auto_encoder_fit(model, x_train_pdf, x_test_pdf, 'RMSprop', 2)

Instructions for updating:
Use tf.cast instead.
Train on 609545 samples, validate on 152625 samples
Epoch 1/2
Epoch 2/2


In [24]:
# End of the auto-encoder training timing window.
auto_encoder_stop = timeit.default_timer()

In [25]:
# Directory that receives the trained auto-encoder artefacts.
autoencoder_dir = base_dir + "Development/workspaces/datascience/masterarbeit/shared/models/autoencoder/"

model_path = autoencoder_dir + "spark_sparse_autoencoder_model.hdf5"
history_path = autoencoder_dir + "spark_sparse_autoencoder_history"

# Persist the trained Keras model first, then its training history.
model.save(model_path)
du.save_model_history(history, history_path)

In [26]:
# End of the overall timing window (training and saving are done at this point;
# the prediction/evaluation cells below are not included).
overall_stop = timeit.default_timer()

In [27]:
# Convert the three measured intervals from seconds to minutes, then report them.
overall_minutes = (overall_stop - overall_start) / 60
auto_encoder_minutes = (auto_encoder_stop - auto_encoder_start) / 60
pipeline_minutes = (pipeline_stop - pipeline_start) / 60

print("Overall Time: {} minutes\n".format(overall_minutes))
print("Auto-Encoder Time: {} minutes\n".format(auto_encoder_minutes))
print("Pipeline Time: {} minutes\n".format(pipeline_minutes))

Overall Time: 4.367078569816667 minutes

Auto-Encoder Time: 0.7841705017000005 minutes

Pipeline Time: 1.5728168885166665 minutes



# Load the Model

In [28]:
# Preview the held-out test rows that are scored below.
test_df.show(5)

+--------------------+--------------------+--------+
|            features|         feature_vec|label_nr|
+--------------------+--------------------+--------+
|(10,[0,1,2],[6.07...|(10,[0,1,2],[1.0,...|       0|
|(10,[0,1,2],[6.07...|(10,[0,1,2],[1.0,...|       0|
|(10,[0,1,2],[6.07...|(10,[0,1,2],[1.0,...|       0|
|(10,[0,1,2,7],[6....|(10,[0,1,2,7],[1....|       0|
|(10,[0,1,2,7],[6....|(10,[0,1,2,7],[1....|       0|
+--------------------+--------------------+--------+
only showing top 5 rows



In [29]:
# Prepare the test rows as a Spark DataFrame in the layout expected by su.predict.
# NOTE(review): the exact output columns are defined in
# su.convert_feature_vector_to_X_df — confirm there.
test_values_df = su.convert_feature_vector_to_X_df(test_df, sqlContext)

In [30]:
# here we use the spark KerasTransformer
# The saved Keras model file at model_path is applied to the prepared test rows;
# the result is a Spark DataFrame of predictions.
predictions_df = su.predict(model_path, test_values_df)

Instructions for updating:
Use tf.compat.v1.graph_util.convert_variables_to_constants
Instructions for updating:
Use tf.compat.v1.graph_util.extract_sub_graph
INFO:tensorflow:Froze 4 variables.
INFO:tensorflow:Converted 4 variables to const ops.
INFO:tensorflow:Froze 0 variables.
INFO:tensorflow:Converted 0 variables to const ops.
Instructions for updating:
Use tf.compat.v1.graph_util.remove_training_nodes


In [31]:
# Unpack the prediction DataFrame into three driver-side sequences:
# the model outputs, the input features and the labels.
# NOTE(review): element types and ordering are defined by
# su.convert_prediction_df_to_lists — confirm there.
predictions, features, labels = su.convert_prediction_df_to_lists(predictions_df)

In [32]:
# Rebuild pandas frames from the prediction results.
# Note: x_test_pdf is reassigned here and replaces the frame of the same name
# that was built from x_test before training.
x_test_pdf = pd.DataFrame(features, columns=categorical_columns)
# Model outputs (reconstructions), using the same column names as the inputs.
y_pred = pd.DataFrame(predictions, columns=categorical_columns)

In [33]:
from keras.models import load_model
 
# load model
# Reload the auto-encoder from the file written earlier, so the evaluation below
# runs against the persisted model rather than the in-memory `model` object.
auto_encoder = load_model(model_path)

In [34]:
# An auto-encoder is scored against its own input, so the test frame is passed
# as both the input and the target.
score = auto_encoder.evaluate(x_test_pdf, x_test_pdf, verbose=1)

# Print each metric next to its value.
for metric, value in zip(auto_encoder.metrics_names, score):
    print('Test ' + metric + ':', value)

Test loss: 0.9387733298454687
Test mean_absolute_error: 0.3386950209623082
Test acc: 0.9244946764947009


In [35]:
from sklearn.metrics import classification_report

# Cut-off handed to aenc.classify.
# NOTE(review): presumably rows whose reconstruction error exceeds this value are
# assigned class 1 — confirm in aenc.classify. The value 10.0 is not derived
# anywhere in this notebook.
threshold = 10.0

# Returns the predicted class per row together with the per-row errors.
y_class, errors = aenc.classify(x_test_pdf, y_pred, threshold)

# The test rows come from the label_nr = 0 subset only, so class 1 has no true
# samples: its support is 0 and its metrics are reported as 0.00.
print(classification_report(labels, y_class))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99    152625
           1       0.00      0.00      0.00         0

    accuracy                           0.99    152625
   macro avg       0.50      0.49      0.50    152625
weighted avg       1.00      0.99      0.99    152625



  _warn_prf(average, modifier, msg_start, len(result))


In [36]:
# Shut down Spark at the end of the run.
# Stopping the session (instead of only the context) stops the underlying
# SparkContext as well and also releases the session itself, so `sc` is
# unusable after this call too.
spark.stop()