# Model Training

## 1. Setting Up Spark Context

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [2]:
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]"))

spark = SparkSession \
    .builder \
    .getOrCreate()

## 2. Download data from Object Store

In [3]:
import os
import getpass

def get_or_set_environment_variable(variable):
    try:
        var = os.environ[variable]
    except KeyError:
        var = getpass.getpass('Please enter value for {:}: '.format(variable))
    
    os.environ[variable] = var
    return var

ibm_api_key_id = get_or_set_environment_variable('IBM_API_KEY_ID')
ibm_cloud_store_bucket = get_or_set_environment_variable('IBM_OBJECT_STORE_BUCKET')

Please enter value for IBM_API_KEY_ID: ········
Please enter value for IBM_OBJECT_STORE_BUCKET: ········


### 2.1 Load Training Data

In [4]:
# The code was removed by Watson Studio for sharing.

{'train': ['desaster_detection_clean_train-0000.parquet'],
 'test': ['desaster_detection_clean_test-0000.parquet'],
 'label': ['desaster_detection_label-0000.parquet']}

In [5]:
def load_dataframe(files, **kargs):
    dfs = []
    for fn in files:
        body = client.get_object(Bucket=ibm_cloud_store_bucket,
                                 Key=fn)['Body']
        if not hasattr(body, "__iter__"):
            body.__iter__ = types.MethodType( __iter__, body )
        
        tfn = 'temp_{:}'.format(fn)
        with open(tfn, 'wb') as temp:
            temp.write(body.read())
        dfs.append(spark.read.options(**kargs).parquet(tfn))
    df = dfs.pop()
    for other in dfs:
        df = df.union(other)
    return df

df_train = load_dataframe(files['train'])
df_label = load_dataframe(files['label'])

### 2.2 Load Model Definitions

In [6]:
# The code was removed by Watson Studio for sharing.

{'LogisticRegression_count.ai.zip': 'spark',
 'LogisticRegression_tfidf.ai.zip': 'spark',
 'NaiveBayes_count.ai.zip': 'spark',
 'NaiveBayes_tfidf.ai.zip': 'spark',
 'Sequential_NN_w2v.ai.h5': 'keras'}

In [7]:
def download_model_files(files):
    temp_files = []
    for fn in files:
        body = client.get_object(Bucket=ibm_cloud_store_bucket,
                                 Key=fn)['Body']
        if not hasattr(body, "__iter__"):
            body.__iter__ = types.MethodType( __iter__, body )
        
        tfn = os.path.join(os.path.curdir, 'temp_{:}'.format(fn))
        with open(tfn, 'wb') as temp:
            temp.write(body.read())
            
        temp_files.append(tfn)
    return temp_files

model_temp_files = download_model_files(model_files.keys())
model_temp_files

['./temp_LogisticRegression_count.ai.zip',
 './temp_LogisticRegression_tfidf.ai.zip',
 './temp_NaiveBayes_count.ai.zip',
 './temp_NaiveBayes_tfidf.ai.zip',
 './temp_Sequential_NN_w2v.ai.h5']

In [8]:
import zipfile

def unzip_file(path):
    with zipfile.ZipFile(path, 'r') as zip_ref:
        zip_ref.extractall(os.curdir)
        extracted = zip_ref.namelist()[0]
    return os.path.join(os.curdir, extracted)

extracted_models = [unzip_file(path) for path in model_temp_files[:-1]]
extracted_models

['./LogisticRegression_count.ai/',
 './LogisticRegression_tfidf.ai/',
 './NaiveBayes_count.ai/',
 './NaiveBayes_tfidf.ai/']

#### 2.2.1 Logistic Regression

In [9]:
from pyspark.ml.classification import LogisticRegression

lrs = [LogisticRegression.load(t_file) for t_file in extracted_models[:2]]
lrs

[LogisticRegression_6c76ceaefa53, LogisticRegression_05e1e1ba89f0]

#### 2.2.2 Naive Bayes

In [10]:
from pyspark.ml.classification import NaiveBayes

nbs = [NaiveBayes.load(t_file) for t_file in extracted_models[2:4]]
nbs

[NaiveBayes_15f252f354da, NaiveBayes_f195de66bbed]

#### 2.2.3 Convolutional Neural Network

In [11]:
import tensorflow as tf
tf.__version__

'2.2.0-rc0'

In [12]:
from tensorflow import keras

model = keras.models.load_model(model_temp_files[-1])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 82)                8282      
_________________________________________________________________
leaky_re_lu (LeakyReLU)      (None, 82)                0         
_________________________________________________________________
dropout (Dropout)            (None, 82)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 82)                6806      
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 82)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 82)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 8

## 3. Training the models

#### 3.1 Transforming and Splitting Training Data

In [13]:
import pyspark.sql.functions as sfun

df_train = df_train.join(df_label.select('id', sfun.col('target').alias('label')), on='id', how='inner')
df_train.show()

+---+--------------------+--------------------+--------------------+--------------------+-----+
| id|                text|      features_count|      features_tfidf|        features_w2v|label|
+---+--------------------+--------------------+--------------------+--------------------+-----+
|  1|Our Deeds are the...|(2266,[16,80,201,...|(2500,[26,166,336...|[-7.7565892466476...|    1|
|  4|Forest fire near ...|(2266,[2,139,183,...|(2500,[191,974,20...|[-0.0055530799685...|    1|
|  5|All residents ask...|(2266,[212,318,32...|(2500,[294,691,11...|[0.01664723102426...|    1|
|  6|13,000 people rec...|(2266,[8,41,85,21...|(2500,[8,325,644,...|[-0.0023539206013...|    1|
|  7|Just got sent thi...|(2266,[0,85,130,1...|(2500,[178,198,32...|[0.01491489600286...|    1|
|  8|#RockyFire Update...|(2266,[2,41,85,21...|(2500,[191,325,64...|[-0.0123642495212...|    1|
| 10|#flood #disaster ...|(2266,[15,21,57,1...|(2500,[353,356,64...|[0.00932735649985...|    1|
| 13|I'm on top of the...|(2266,[2,25,18

In [14]:
df_training, df_validation = df_train.randomSplit([0.7, 0.3], seed=42)
print("Training Dataset Count: " + str(df_training.count()))
print("Test Dataset Count: " + str(df_validation.count()))

Training Dataset Count: 5355
Test Dataset Count: 2258


#### 3.2 Training Spark ML Models

In [15]:
spark_models = lrs + nbs

def training(model):
    print('Training', model)
    return model.fit(df_training)

trained = [training(model) for model in spark_models]
trained

Training LogisticRegression_6c76ceaefa53
Training LogisticRegression_05e1e1ba89f0
Training NaiveBayes_15f252f354da
Training NaiveBayes_f195de66bbed


[LogisticRegressionModel: uid = LogisticRegression_6c76ceaefa53, numClasses = 2, numFeatures = 2266,
 LogisticRegressionModel: uid = LogisticRegression_05e1e1ba89f0, numClasses = 2, numFeatures = 2500,
 NaiveBayes_15f252f354da,
 NaiveBayes_f195de66bbed]

#### 3.3 Training Keras Model

In [16]:
import numpy as np
X = np.array(df_training.select('features_w2v').collect())
X = X.reshape(-1, 100)

y = np.array(df_training.select('label').collect())
y = y.reshape(-1,)

X.shape

(5355, 100)

In [17]:
history = model.fit(X, y,
                    batch_size=124, epochs=50, validation_split=0.2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [18]:
import plotly.io as pio
pio.renderers.default = 'notebook_connected'

In [19]:
import plotly.express as px
import pandas as pd

loss_values = history.history['loss']
acc_values = history.history['accuracy']
history.history['epoche'] = range(1, len(loss_values)+1)

fig = px.line(pd.DataFrame(history.history),
              x='epoche', y=['loss', 'accuracy', 'val_loss', 'val_accuracy'])
fig.show()

## 4. Serializing Trained Models

In [20]:
!rm -rf *_*.ai

In [21]:
import os
import shutil

spark_models = lrs + nbs

def serialize_spark_model(model, name, feature):
    export_path = '{name:}_{feature:}_trained.ai'.format(name=name, feature=feature)
    model.save(export_path)
    return shutil.make_archive(base_name=export_path,
                               format='zip', base_dir=export_path)

spark_paths = [serialize_spark_model(model, str(model).split(':')[0], feature)
                for model, feature in zip(trained, ['count', 'tfidf'] * 2)]
spark_paths

['LogisticRegressionModel_count_trained.ai.zip',
 'LogisticRegressionModel_tfidf_trained.ai.zip',
 'NaiveBayes_15f252f354da_count_trained.ai.zip',
 'NaiveBayes_f195de66bbed_tfidf_trained.ai.zip']

In [22]:
def serialize_keras_model(model, name, feature):
    export_path = '{name:}_{feature:}_trained.ai.h5'.format(name=name, feature=feature)
    model.save(export_path)
    return export_path

keras_path = serialize_keras_model(model, 'Sequential_NN', 'w2v')
keras_path

'Sequential_NN_w2v_trained.ai.h5'

## 5. Serializing the Validation dataframe in Parquet Format

In [23]:
!rm -r ./desaster_detection_*

In [24]:
import glob

temp_parquet_file = os.path.join(os.path.curdir,
                                 'desaster_detection_validation_{}')
df_validation.write.parquet(temp_parquet_file.format('validation'), mode='overwrite')

glob.glob(temp_parquet_file.format('*'))

['./desaster_detection_validation_validation']

## 6. Uploading the Files to Object Cloud

In [25]:
def upload_model(client, path, model_key):
    with open(path, 'rb') as modelF:
        client.put_object(Bucket=ibm_cloud_store_bucket,
                          Body=modelF,
                          Key=model_key
                         )
    return model_key

client = ibm_boto3.client(service_name='s3',
    ibm_api_key_id=ibm_api_key_id,
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com')

models = {upload_model(client, path, model_key=path): 'spark'
          for path in spark_paths}
models[upload_model(client, keras_path, model_key = keras_path)] = 'keras'

models

{'LogisticRegressionModel_count_trained.ai.zip': 'spark',
 'LogisticRegressionModel_tfidf_trained.ai.zip': 'spark',
 'NaiveBayes_15f252f354da_count_trained.ai.zip': 'spark',
 'NaiveBayes_f195de66bbed_tfidf_trained.ai.zip': 'spark',
 'Sequential_NN_w2v_trained.ai.h5': 'keras'}

In [26]:
import json

client.put_object(Bucket=ibm_cloud_store_bucket,
                  Body=json.dumps(models),
                  Key='model_train_files.json')

{'ResponseMetadata': {'RequestId': '45d8c6a3-8ea4-40b0-82b0-a895f0d5e89b',
  'HostId': '',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Sun, 31 Jan 2021 16:21:26 GMT',
   'x-clv-request-id': '45d8c6a3-8ea4-40b0-82b0-a895f0d5e89b',
   'server': 'Cleversafe',
   'x-clv-s3-version': '2.5',
   'x-amz-request-id': '45d8c6a3-8ea4-40b0-82b0-a895f0d5e89b',
   'etag': '"9c0d62541008f2873813d5f2c4b00ac0"',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"9c0d62541008f2873813d5f2c4b00ac0"'}

In [27]:
def upload_parquet(client, path):
    parts = glob.glob(os.path.join(path, '*.parquet'))
    parquets = ['{:s}-{:04d}.parquet'.format(os.path.split(path)[-1], i)
                for i in range(len(parts))]
    for part, parquet in zip(parts, parquets):
        with open(part, 'rb') as parquetF:
            client.put_object(Bucket=ibm_cloud_store_bucket,
                          Body=parquetF,
                          Key=parquet
                         )
    return parquets

client = ibm_boto3.client(service_name='s3',
    ibm_api_key_id=ibm_api_key_id,
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com')


parquets = {}
for dataset in ('validation',):
    parquets[dataset] = upload_parquet(client, temp_parquet_file.format(dataset))

print(parquets)

{'validation': ['desaster_detection_validation_validation-0000.parquet']}


In [28]:
import json


parquets.update(files)
client.put_object(Bucket=ibm_cloud_store_bucket,
                  Body=json.dumps(parquets),
                  Key='validation_parquet_files.json')

{'ResponseMetadata': {'RequestId': '1967474a-f7a8-4fda-85b3-e9997551cd7e',
  'HostId': '',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Sun, 31 Jan 2021 16:21:27 GMT',
   'x-clv-request-id': '1967474a-f7a8-4fda-85b3-e9997551cd7e',
   'server': 'Cleversafe',
   'x-clv-s3-version': '2.5',
   'x-amz-request-id': '1967474a-f7a8-4fda-85b3-e9997551cd7e',
   'etag': '"b0bf149ce6f4f580db8b8e2ddfb57f55"',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"b0bf149ce6f4f580db8b8e2ddfb57f55"'}