# Model Evaluation

## 1. Setting Up Spark Context

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [2]:
# Reuse an already running SparkContext if there is one; otherwise start one
# configured with the "local[*]" master.
sc = SparkContext.getOrCreate(conf=SparkConf().setMaster("local[*]"))

# Session handle used by the DataFrame reads further down.
spark = SparkSession.builder.getOrCreate()

## 2. Download data from Object Store

In [4]:
import os
import getpass

def get_or_set_environment_variable(variable):
    """Return the value of an environment variable, prompting if it is unset.

    Parameters
    ----------
    variable : str
        Name of the environment variable.

    Returns
    -------
    str
        The value found in ``os.environ``, or the value typed at the hidden
        ``getpass`` prompt when the variable was not set.

    Notes
    -----
    The value is written back to ``os.environ`` in both cases, so a prompted
    value is only asked for once per process.
    """
    value = os.environ.get(variable)
    if value is None:
        # Not set yet: ask without echoing, since these are credentials.
        value = getpass.getpass('Please enter value for {:}: '.format(variable))

    os.environ[variable] = value
    return value

# Object-store API key and bucket name: taken from the environment when set,
# otherwise asked for at a hidden prompt and stored in os.environ.
ibm_api_key_id = get_or_set_environment_variable('IBM_API_KEY_ID')
ibm_cloud_store_bucket = get_or_set_environment_variable('IBM_OBJECT_STORE_BUCKET')

### 2.1 Loading Data for Evaluation

In [5]:
# The code was removed by Watson Studio for sharing.

{'validation': ['desaster_detection_validation_validation-0000.parquet'],
 'train': ['desaster_detection_clean_train-0000.parquet'],
 'test': ['desaster_detection_clean_test-0000.parquet'],
 'label': ['desaster_detection_label-0000.parquet']}

In [6]:
def load_dataframe(files, **kargs):
    """Download parquet objects from the bucket and return one Spark DataFrame.

    Every key in ``files`` is fetched from ``ibm_cloud_store_bucket`` with
    ``client.get_object``, written to a local file named ``temp_<key>`` in the
    working directory, and read with ``spark.read.options(**kargs).parquet``.
    The per-file frames are then combined with ``DataFrame.union`` in the
    order the keys were listed.

    Parameters
    ----------
    files : iterable of str
        Object keys of the parquet files to load.
    **kargs
        Reader options forwarded unchanged to ``spark.read.options``.

    Returns
    -------
    pyspark.sql.DataFrame
        The union of all loaded files.

    Raises
    ------
    IndexError
        If ``files`` yields no keys. The exception type is the one the
        previous ``dfs.pop()`` implementation raised, kept for compatibility.

    Notes
    -----
    ``DataFrame.union`` matches columns by position, so all files are
    expected to share the same column order.
    The ``temp_<key>`` files are left on disk: Spark reads lazily and still
    needs them when the returned frame is evaluated.
    """
    dfs = []
    for fn in files:
        body = client.get_object(Bucket=ibm_cloud_store_bucket,
                                 Key=fn)['Body']
        # Only body.read() is used, so the streaming body needs no __iter__
        # patch here (that patch relied on `types` and `__iter__` helpers
        # which are not defined in this function).
        tfn = 'temp_{:}'.format(fn)
        with open(tfn, 'wb') as temp:
            temp.write(body.read())
        dfs.append(spark.read.options(**kargs).parquet(tfn))

    if not dfs:
        raise IndexError('load_dataframe() needs at least one file to load')

    # Start from the first file so the rows follow the order of `files`.
    df = dfs[0]
    for other in dfs[1:]:
        df = df.union(other)
    return df

# Test split: the parquet file(s) listed under files['test'].
# NOTE(review): `files` is not assigned in any visible cell; presumably it is
# created by the hidden Watson Studio cell above. Confirm before re-running.
df_test = load_dataframe(files['test'])

### 2.2 Loading Trained Models

In [7]:
# json is needed by this cell; importing it here keeps the cell runnable on a
# fresh kernel instead of relying on an import made further down.
import json

# Object Storage client, authenticated with the API key read earlier.
client = ibm_boto3.client(service_name='s3',
    ibm_api_key_id=ibm_api_key_id,
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com')

# Index of the trained models stored in the bucket.
body = client.get_object(Bucket=ibm_cloud_store_bucket,
                         Key='model_train_files.json')['Body']

# json.load() only calls body.read(), so the streaming body is passed as-is;
# no __iter__ patch is required.
model_files = json.load(body)
model_files

{'LogisticRegressionModel_count_trained.ai.zip': 'spark',
 'LogisticRegressionModel_tfidf_trained.ai.zip': 'spark',
 'NaiveBayes_15f252f354da_count_trained.ai.zip': 'spark',
 'NaiveBayes_f195de66bbed_tfidf_trained.ai.zip': 'spark',
 'Sequential_NN_w2v_trained.ai.h5': 'keras'}

In [10]:
# Pick the third entry of the model index (insertion order of the JSON
# object). NOTE(review): the later cells load the result with
# NaiveBayesModel, so this index must point at a Naive Bayes archive.
fn = list(model_files.keys())[2]

body = client.get_object(Bucket=ibm_cloud_store_bucket,
                         Key=fn)['Body']

# The body is consumed with a single read(), so no __iter__ patch is needed.
tfn = os.path.join(os.path.curdir, 'temp_{:}'.format(fn))
with open(tfn, 'wb') as temp:
    temp.write(body.read())

tfn

'./temp_NaiveBayes_15f252f354da_count_trained.ai.zip'

In [11]:
import zipfile

def unzip_file(path, dest=os.curdir):
    """Extract a zip archive and return the path of its top-level entry.

    Parameters
    ----------
    path : str
        Location of the zip archive.
    dest : str, optional
        Directory to extract into. Defaults to the current directory, which
        is where this function has always extracted.

    Returns
    -------
    str
        ``dest`` joined with the top-level name of the archive's first
        member. A trailing ``/`` is kept when that member is a directory or
        lives inside one, so an archive holding one folder yields
        ``<dest>/<folder>/`` whether or not the zip stores an explicit
        directory entry.

    Raises
    ------
    IndexError
        If the archive has no members. The exception type matches what the
        previous ``namelist()[0]`` lookup raised.
    """
    with zipfile.ZipFile(path, 'r') as zip_ref:
        members = zip_ref.namelist()
        if not members:
            raise IndexError('zip archive {!r} contains no members'.format(path))
        zip_ref.extractall(dest)

    # Member names inside a zip use '/' as separator. Leading slashes are
    # dropped on extraction, so drop them here too before taking the first
    # path component.
    top_level, slash, _ = members[0].lstrip('/').partition('/')
    return os.path.join(dest, top_level + slash)

# Unpack the downloaded model archive; the returned path is what the model
# loader is pointed at in the next section.
extracted_model = unzip_file(tfn)
extracted_model

'./NaiveBayes_15f252f354da_count_trained.ai/'

### 2.3 Loading the Naive Bayes Model

In [13]:
from pyspark.ml.classification import NaiveBayesModel

# Load the fitted Naive Bayes model from the extracted archive path.
model = NaiveBayesModel.load(extracted_model)
model

NaiveBayes_15f252f354da

## 3. Predicting the Test Data

In [16]:
# Score the test split and keep only the columns needed for inspection and
# for the submission file.
prediction = (
    model
    .transform(df_test)
    .select('id', 'text', 'prediction')
)

# Preview of the first ten scored rows.
prediction.limit(10).toPandas()

Unnamed: 0,id,text,prediction
0,0,Just happened a terrible car crash,1.0
1,2,"Heard about #earthquake is different cities, s...",0.0
2,3,"there is a forest fire at spot pond, geese are...",1.0
3,9,Apocalypse lighting. #Spokane #wildfires,1.0
4,11,Typhoon Soudelor kills 28 in China and Taiwan,1.0
5,12,We're shaking...It's an earthquake,1.0
6,21,They'd probably still show more life than Arse...,0.0
7,22,Hey! How are you?,0.0
8,27,What a nice hat?,0.0
9,29,Fuck off!,0.0


## 4. Exporting the Data in Specified Format

In [29]:
from pyspark.sql.types import IntegerType
import pyspark.sql.functions as sfun

# Submission layout: the row id plus the prediction cast to an integer and
# exposed under the column name 'target'.
df_exp = prediction.select(
    'id',
    sfun.col('prediction').cast(IntegerType()).alias('target'),
)

df_exp.limit(10).toPandas()

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,1
4,11,1
5,12,1
6,21,0
7,22,0
8,27,0
9,29,0


In [30]:
# Collect the export frame into pandas and write it to submission.csv in the
# working directory, without the pandas index column.
df_exp.toPandas().to_csv('submission.csv', index=False)

## 5. Submitting the CSV to Kaggle

In [31]:
import os
import json

# Object Storage client, authenticated with the API key read earlier.
client = ibm_boto3.client(service_name='s3',
    ibm_api_key_id=ibm_api_key_id,
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com')

# The Kaggle API token is kept in the bucket rather than in the notebook.
body = client.get_object(Bucket=ibm_cloud_store_bucket,
                         Key='kaggle.json')['Body']

# json.load() only calls body.read(), so the streaming body is passed as-is;
# no __iter__ patch is required.
creds = json.load(body)

# Hand the credentials to the `kaggle` command through the environment so
# they never appear in a cell or its output.
os.environ['KAGGLE_USERNAME'] = creds['username']
os.environ['KAGGLE_KEY'] = creds['key']

In [32]:
!kaggle competitions submit -f submission.csv -m "Submitting from IBM cloud to Kaggle" nlp-getting-started

100%|██████████████████████████████████████| 22.2k/22.2k [00:00<00:00, 34.8kB/s]
Successfully submitted to Natural Language Processing with Disaster Tweets

In [36]:
!kaggle competitions submissions nlp-getting-started

fileName        date                 description                          status    publicScore  privateScore  
--------------  -------------------  -----------------------------------  --------  -----------  ------------  
submission.csv  2021-01-30 22:31:42  Submitting from IBM cloud to Kaggle  complete  0.78700      None          
submission.csv  2021-01-30 22:30:36  Submitting from IBM cloud to Kaggle  error     None         None          
