# Model Training

## 1. Setting Up Spark Context

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [2]:
# Reuse an existing SparkContext if there is one; otherwise create it with
# the "local[*]" master.
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]"))

# Obtain (or create) the SparkSession used for all DataFrame work below.
spark = SparkSession.builder.getOrCreate()

## 2. Download data from Object Store

In [3]:
import os
import getpass

def get_or_set_environment_variable(variable):
    """Return the value of the environment variable named ``variable``.

    If the variable is not set, the value is requested through a hidden
    ``getpass`` prompt instead. In both cases the value is written back to
    ``os.environ`` before it is returned.
    """
    if variable in os.environ:
        value = os.environ[variable]
    else:
        value = getpass.getpass('Please enter value for {:}: '.format(variable))

    os.environ[variable] = value
    return value

# API key first, then bucket name; each is read from the environment or
# prompted for if it is missing.
ibm_api_key_id, ibm_cloud_store_bucket = map(
    get_or_set_environment_variable,
    ('IBM_API_KEY_ID', 'IBM_OBJECT_STORE_BUCKET'),
)

Please enter value for IBM_API_KEY_ID: ········
Please enter value for IBM_OBJECT_STORE_BUCKET: ········


In [4]:
import json
import os

import types
from botocore.client import Config
import ibm_boto3

# Stand-in `__iter__` that gets bound onto an object-store response body when
# that body has no `__iter__` attribute of its own (see the `hasattr` checks
# further down).
# NOTE(review): it returns 0 rather than an iterator, so it only makes
# `hasattr(body, "__iter__")` succeed; it cannot drive real iteration.
def __iter__(self): return 0

# S3-style client for the object store, authenticated with the API key
# collected above.
client = ibm_boto3.client(
    service_name='s3',
    ibm_api_key_id=ibm_api_key_id,
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com',
)

# Fetch the JSON manifest object from the bucket.
body = client.get_object(
    Bucket=ibm_cloud_store_bucket,
    Key='feature_eng_parquet_files.json',
)['Body']

# add missing __iter__ method
if not hasattr(body, "__iter__"):
    body.__iter__ = types.MethodType(__iter__, body)

# Parse the manifest and display it as the cell output.
files = json.load(body)
files

{'train': ['disaster_detection_clean_train-0000.parquet'],
 'test': ['disaster_detection_clean_test-0000.parquet'],
 'label': ['disaster_detection_label-0000.parquet']}

In [5]:
def load_dataframe(files, **kargs):
    """Download parquet objects from the bucket and load them as one DataFrame.

    Each key in ``files`` is fetched from ``ibm_cloud_store_bucket`` with the
    module-level ``client``, written to a local file named ``temp_<key>`` in
    the working directory, and read with ``spark.read``. The resulting
    DataFrames are combined with ``union`` in the order the keys were given.

    Parameters
    ----------
    files : iterable of str
        Object keys of the parquet files to load. Must not be empty.
    **kargs
        Forwarded to ``spark.read.options``.

    Returns
    -------
    The union of the DataFrames read from every file.

    Raises
    ------
    ValueError
        If ``files`` is empty.
    """
    files = list(files)
    if not files:
        raise ValueError('load_dataframe needs at least one file to load')

    dfs = []
    for fn in files:
        body = client.get_object(Bucket=ibm_cloud_store_bucket,
                                 Key=fn)['Body']
        # Keys may contain '/' prefixes; flatten them so the temp file is
        # always created directly in the working directory.
        tfn = 'temp_{:}'.format(fn.replace('/', '_'))
        try:
            with open(tfn, 'wb') as temp:
                temp.write(body.read())
        finally:
            # Release the underlying HTTP stream once the payload is copied.
            body.close()
        dfs.append(spark.read.options(**kargs).parquet(tfn))

    # Combine in input order: first file first.
    df = dfs[0]
    for other in dfs[1:]:
        df = df.union(other)
    return df

# Load the parquet file(s) listed under the manifest's 'train' key.
df_train = load_dataframe(files['train'])

In [6]:
# Show the first row of the training DataFrame.
df_train.first()

Row(id=1, text='Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all', features_count=SparseVector(2266, {16: 1.0, 80: 1.0, 201: 1.0, 451: 1.0, 1499: 1.0, 1917: 1.0}), features_tfidf=SparseVector(2500, {26: 3.5396, 166: 4.4269, 336: 4.9488, 689: 6.8583, 768: 4.7946, 1830: 6.5398, 2174: 5.942}), features_w2v=DenseVector([-0.0027, -0.0033, -0.0001, -0.0146, -0.0046, -0.0019, 0.0145, -0.0033, 0.0021, -0.0035, 0.0034, -0.0032, 0.0055, 0.0025, 0.0017, -0.0042, -0.0064, 0.0136, -0.0145, 0.0039, -0.0013, -0.0159, 0.0002, -0.0047, -0.0079, 0.0002, 0.003, 0.0003, 0.0128, 0.0077, 0.0042, 0.0006, -0.0179, 0.0065, 0.0165, 0.0014, 0.0083, 0.0085, -0.0005, -0.0023, -0.0029, 0.0114, -0.0027, -0.006, -0.0053, -0.0076, -0.007, -0.0069, 0.0033, 0.0037, -0.0085, -0.0051, -0.0048, 0.002, -0.0105, 0.0108, 0.0022, 0.0136, -0.0087, 0.0014, -0.0045, -0.0036, -0.0108, 0.0035, -0.0076, -0.0008, 0.004, -0.0032, 0.0056, -0.0102, -0.0083, 0.0029, 0.0036, -0.008, -0.003, -0.0052, 0.0015, -0.0037, 0

## 2. Model definition

We try 3 different models

* logistic regression,
* multinomial naive Bayes, and
* a fully connected (dense) feed-forward neural network

### 2.1 Logistic Regression

In [7]:
from pyspark.ml.classification import LogisticRegression

# One logistic-regression estimator per feature column, all with the same
# hyper-parameters.
lrs = [
    LogisticRegression(
        featuresCol=column,
        maxIter=20,
        regParam=0.3,
        elasticNetParam=0,
    )
    for column in ('features_count', 'features_tfidf')
]
lrs

[LogisticRegression_c8b0de18d229, LogisticRegression_20d068424298]

### 2.2 Naive Bayes

In [8]:
from pyspark.ml.classification import NaiveBayes

# One naive Bayes estimator per feature column, each with smoothing=1.
nbs = [
    NaiveBayes(featuresCol=column, smoothing=1)
    for column in ('features_count', 'features_tfidf')
]
nbs

[NaiveBayes_eadba888dbea, NaiveBayes_9056b7fbe3e8]

### 2.3 Feed-Forward Neural Network

In [9]:
import tensorflow as tf
# Display the installed TensorFlow version.
tf.__version__

'2.2.0-rc0'

In [10]:
from tensorflow.keras.layers import Dense, Dropout, LeakyReLU
from tensorflow.keras.models import Sequential
from tensorflow.keras import optimizers

# Input width of the network: the length of the 'features_w2v' vector in the
# first training row.
MAX_SEQUENCE_LENGTH = len(df_train.select('features_w2v').first()['features_w2v'])

# Two Dense(82) layers, each followed by LeakyReLU and 20% dropout, feeding a
# single sigmoid output unit.
model = Sequential([
    Dense(82, input_dim=MAX_SEQUENCE_LENGTH),
    LeakyReLU(alpha=0.01),
    Dropout(0.20),
    Dense(82),
    LeakyReLU(alpha=0.01),
    Dropout(0.20),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

## 3. Serializing the Models

In [11]:
# Remove anything matching *_*.ai in the working directory (the Spark export
# below writes to paths of the form <name>_<feature>.ai).
!rm -rf *_*.ai

In [12]:
import os
import shutil

# All Spark estimators to export: the logistic regressions followed by the
# naive Bayes estimators.
spark_models = lrs + nbs

def serialize_spark_model(model, name, feature):
    """Save a Spark ML object to ``<name>_<feature>.ai`` and zip that directory.

    The export goes through the overwrite-enabled writer, so re-running the
    cell replaces a previous export instead of failing because the target
    path already exists.

    Parameters
    ----------
    model
        Object exposing Spark ML's ``write()`` writer interface.
    name : str
        First part of the export name.
    feature : str
        Second part of the export name.

    Returns
    -------
    str
        Path of the zip archive produced by ``shutil.make_archive``
        (``<name>_<feature>.ai.zip``).
    """
    export_path = '{name:}_{feature:}.ai'.format(name=name, feature=feature)
    # overwrite() makes the export idempotent across notebook re-runs.
    model.write().overwrite().save(export_path)
    return shutil.make_archive(base_name=export_path,
                               format='zip', base_dir=export_path)

# Export every Spark estimator. The name is the part of str(estimator) before
# the first underscore; the feature tags repeat 'count', 'tfidf' to line up
# with the order of spark_models.
spark_paths = [
    serialize_spark_model(estimator, str(estimator).split('_')[0], tag)
    for estimator, tag in zip(spark_models, ['count', 'tfidf'] * 2)
]
spark_paths

['LogisticRegression_count.ai.zip',
 'LogisticRegression_tfidf.ai.zip',
 'NaiveBayes_count.ai.zip',
 'NaiveBayes_tfidf.ai.zip']

In [13]:
def serialize_keras_model(model, name, feature):
    """Save ``model`` to ``<name>_<feature>.ai.h5`` and return that path.

    The file is written by calling ``model.save`` with the generated path.
    """
    template = '{name:}_{feature:}.ai.h5'
    export_path = template.format(name=name, feature=feature)
    model.save(export_path)
    return export_path

# Save the Keras network and display the resulting file name.
keras_path = serialize_keras_model(model, 'Sequential_NN', 'w2v')
keras_path

'Sequential_NN_w2v.ai.h5'

## 4. Uploading the files to the Object Store

In [14]:
def upload_model(client, path, model_key):
    """Upload the file at ``path`` to the bucket under ``model_key``.

    The file is opened in binary mode and handed to ``client.put_object``
    as the body; the target bucket is the module-level
    ``ibm_cloud_store_bucket``. Returns ``model_key`` unchanged.
    """
    with open(path, 'rb') as payload:
        client.put_object(
            Bucket=ibm_cloud_store_bucket,
            Body=payload,
            Key=model_key,
        )
    return model_key

# Build the object-store client again with the same settings as the client
# used for the download step.
client = ibm_boto3.client(
    service_name='s3',
    ibm_api_key_id=ibm_api_key_id,
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com',
)

# Upload each Spark archive under its own file name and tag it 'spark'...
models = dict.fromkeys(
    (upload_model(client, archive, model_key=archive) for archive in spark_paths),
    'spark',
)
# ...then upload the Keras file and tag it 'keras'.
models[upload_model(client, keras_path, model_key=keras_path)] = 'keras'

models

{'LogisticRegression_count.ai.zip': 'spark',
 'LogisticRegression_tfidf.ai.zip': 'spark',
 'NaiveBayes_count.ai.zip': 'spark',
 'NaiveBayes_tfidf.ai.zip': 'spark',
 'Sequential_NN_w2v.ai.h5': 'keras'}

In [15]:
import json

# Store the {object key: framework} mapping as JSON next to the uploaded
# models; the response is shown as the cell output.
client.put_object(
    Bucket=ibm_cloud_store_bucket,
    Body=json.dumps(models),
    Key='model_def_files.json',
)

{'ResponseMetadata': {'RequestId': '07a41d92-bfb0-4b32-b6fd-4dc60fcaf2d4',
  'HostId': '',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Sat, 06 Feb 2021 21:48:13 GMT',
   'x-clv-request-id': '07a41d92-bfb0-4b32-b6fd-4dc60fcaf2d4',
   'server': 'Cleversafe',
   'x-clv-s3-version': '2.5',
   'x-amz-request-id': '07a41d92-bfb0-4b32-b6fd-4dc60fcaf2d4',
   'etag': '"6f57c0c7b085219d66ba44f66bbca793"',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"6f57c0c7b085219d66ba44f66bbca793"'}