# Model Development
Below is a set of steps to prepare the pipeline for model development

### 1. Import libraries and load data from database.
* Import libraries to be used across the notebook

* Load dataset from database with `read_sql_table`

* Define feature and target variables X and Y




In [23]:
import pandas as pd
import numpy as np

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

import re

from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from sklearn.metrics import classification_report

from sklearn.model_selection import GridSearchCV

from sqlalchemy import create_engine

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [17]:
# Load data from database
engine = create_engine('sqlite:///..data/datasets.db')
preprocessed_dataset = pd.read_sql_table('labelled_messages', engine) 
X = preprocessed_dataset['message']
Y = preprocessed_dataset.drop(['message', 'original', 'genre', 'id'], axis=1)

### 2. Define a tokenization function to process the text data

In [24]:
def tokenize(text):
    tokens = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    
    tokens = text.split()
    
    tokens = [t for t in tokens if t not in stopwords.words("english")]
    
    # Reduce words to their stems
    tokens = [PorterStemmer().stem(t) for t in tokens]
    
    # Reduce words to their root form
    tokens = [WordNetLemmatizer().lemmatize(t) for t in tokens]
    
    return tokens

print(tokenize('Is the Hurricane over or is it not over'))

['Is', 'hurrican']


### 3. Build the machine learning pipeline
This machine pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset.

In [None]:
# We first use the RandomForestClassifier for the multi-output prediction
pipeline = Pipeline([
    ('vect1', CountVectorizer(tokenizer=tokenize)), # Adding a count vectorizer that utilizes the previously defined tokenize function
    ('tfidf1', TfidfTransformer()), # Using a term frequency inverse document frequency transformer to construct feature matrix
    ('clf1', MultiOutputClassifier(RandomForestClassifier()))
])

### 4. Train machine learning model for multi-output classification
- Split data into train and test sets
- Train the model with default parameters

In [None]:
# Split the dataset into training and testing datasets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

# Fit the previously defined pipeline with default parameter on the training dataset
pipeline.fit(X_train, Y_train, verbose=2)

# Predict on test data
Y_pred = pipeline.predict(X_test)

# Evaluate model accuracy
labels = np.unique(Y_pred)
accuracy = (Y_pred == Y_test).mean()

print("Labels:", labels)
print("Accuracy:", accuracy)

Labels: [0 1]
Accuracy: related                   0.817912
request                   0.885005
offer                     0.995163
aid_related               0.766890
medical_help              0.918240
medical_products          0.947106
search_and_rescue         0.970822
security                  0.983149
military                  0.970510
child_alone               1.000000
water                     0.950226
food                      0.938680
shelter                   0.930878
clothing                  0.983929
money                     0.979872
missing_people            0.988454
refugees                  0.967546
death                     0.961304
other_aid                 0.867686
infrastructure_related    0.934623
transport                 0.955375
buildings                 0.958028
electricity               0.977844
tools                     0.992667
hospitals                 0.987830
shops                     0.995475
aid_centers               0.988454
other_infrastructure      0.957

### 5. Evaluating model performance further
Report the f1 score, precision and recall for each output category of the dataset

In [None]:
y_pred_df = pd.DataFrame(Y_pred, columns=Y_test.columns)

# Printing out the classification report for every label in the dataset
for column in Y_test.columns:
    print(column, classification_report(Y_test[column],y_pred_df[column]))
    

related               precision    recall  f1-score   support

           0       0.68      0.41      0.51      1493
           1       0.84      0.94      0.89      4916

    accuracy                           0.82      6409
   macro avg       0.76      0.68      0.70      6409
weighted avg       0.80      0.82      0.80      6409

request               precision    recall  f1-score   support

           0       0.89      0.98      0.93      5257
           1       0.82      0.46      0.59      1152

    accuracy                           0.89      6409
   macro avg       0.86      0.72      0.76      6409
weighted avg       0.88      0.89      0.87      6409

offer               precision    recall  f1-score   support

           0       1.00      1.00      1.00      6378
           1       0.00      0.00      0.00        31

    accuracy                           1.00      6409
   macro avg       0.50      0.50      0.50      6409
weighted avg       0.99      1.00      0.99      640

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


water               precision    recall  f1-score   support

           0       0.95      1.00      0.97      5988
           1       0.88      0.28      0.43       421

    accuracy                           0.95      6409
   macro avg       0.92      0.64      0.70      6409
weighted avg       0.95      0.95      0.94      6409

food               precision    recall  f1-score   support

           0       0.95      0.99      0.97      5689
           1       0.85      0.55      0.67       720

    accuracy                           0.94      6409
   macro avg       0.90      0.77      0.82      6409
weighted avg       0.93      0.94      0.93      6409

shelter               precision    recall  f1-score   support

           0       0.93      0.99      0.96      5818
           1       0.82      0.32      0.46       591

    accuracy                           0.93      6409
   macro avg       0.88      0.66      0.71      6409
weighted avg       0.92      0.93      0.92      6409



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


transport               precision    recall  f1-score   support

           0       0.96      1.00      0.98      6112
           1       0.79      0.05      0.09       297

    accuracy                           0.96      6409
   macro avg       0.87      0.52      0.54      6409
weighted avg       0.95      0.96      0.94      6409

buildings               precision    recall  f1-score   support

           0       0.96      1.00      0.98      6106
           1       0.95      0.12      0.21       303

    accuracy                           0.96      6409
   macro avg       0.95      0.56      0.59      6409
weighted avg       0.96      0.96      0.94      6409

electricity               precision    recall  f1-score   support

           0       0.98      1.00      0.99      6266
           1       0.67      0.01      0.03       143

    accuracy                           0.98      6409
   macro avg       0.82      0.51      0.51      6409
weighted avg       0.97      0.98      0.9

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


floods               precision    recall  f1-score   support

           0       0.95      1.00      0.97      5878
           1       0.90      0.39      0.54       531

    accuracy                           0.95      6409
   macro avg       0.92      0.69      0.76      6409
weighted avg       0.94      0.95      0.94      6409

storm               precision    recall  f1-score   support

           0       0.95      0.99      0.97      5818
           1       0.79      0.45      0.57       591

    accuracy                           0.94      6409
   macro avg       0.87      0.72      0.77      6409
weighted avg       0.93      0.94      0.93      6409

fire               precision    recall  f1-score   support

           0       0.99      1.00      0.99      6338
           1       0.00      0.00      0.00        71

    accuracy                           0.99      6409
   macro avg       0.49      0.50      0.50      6409
weighted avg       0.98      0.99      0.98      6409

e

### 6. Improve the model by conducting hyper parameter tuning
Use grid search to find better parameters for the RandomForestClassifier

In [None]:
# Tuning the n_estimators of the random forest classifier
parameters = {
        'clf1__estimator__n_estimators': [50, 100, 200]
    }

cv = GridSearchCV(pipeline, param_grid=parameters, verbose=2)

cv.fit(X_train, Y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] END ...................clf1__estimator__n_estimators=50; total time= 5.1min
[CV] END ...................clf1__estimator__n_estimators=50; total time= 4.7min
[CV] END ...................clf1__estimator__n_estimators=50; total time= 4.7min
[CV] END ...................clf1__estimator__n_estimators=50; total time= 4.6min
[CV] END ...................clf1__estimator__n_estimators=50; total time= 4.7min
[CV] END ..................clf1__estimator__n_estimators=100; total time= 8.2min
[CV] END ..................clf1__estimator__n_estimators=100; total time= 8.2min
[CV] END ..................clf1__estimator__n_estimators=100; total time= 8.0min
[CV] END ..................clf1__estimator__n_estimators=100; total time= 8.0min
[CV] END ..................clf1__estimator__n_estimators=100; total time= 8.0min
[CV] END ..................clf1__estimator__n_estimators=200; total time=14.6min
[CV] END ..................clf1__estimator__n_est

GridSearchCV(estimator=Pipeline(steps=[('vect1',
                                        CountVectorizer(tokenizer=<function tokenize at 0x7f29739b58c0>)),
                                       ('tfidf1', TfidfTransformer()),
                                       ('clf1',
                                        MultiOutputClassifier(estimator=RandomForestClassifier()))]),
             param_grid={'clf1__estimator__n_estimators': [50, 100, 200]},
             verbose=2)

### 7. Evaluate the accuracy of the tuned model
Check the improvement in model accuracy after tuning the n_estimators

In [None]:
# Predict on test data
Y_pred = cv.predict(X_test)

labels = np.unique(Y_pred)
accuracy = (Y_pred == Y_test).mean()

print("Labels:", labels)
print("Accuracy:", accuracy)

Labels: [0 1]
Accuracy: related                   0.818068
request                   0.883913
offer                     0.995163
aid_related               0.775160
medical_help              0.916836
medical_products          0.947574
search_and_rescue         0.970822
security                  0.983305
military                  0.970354
child_alone               1.000000
water                     0.953659
food                      0.940084
shelter                   0.931971
clothing                  0.985177
money                     0.979560
missing_people            0.988454
refugees                  0.967546
death                     0.961929
other_aid                 0.867218
infrastructure_related    0.934467
transport                 0.955531
buildings                 0.957248
electricity               0.978000
tools                     0.992667
hospitals                 0.987830
shops                     0.995475
aid_centers               0.988454
other_infrastructure      0.957

### 8. Testing other algorithms and how well they perform against the RandomClassifier


In [21]:
# Changing the RandomClassifier to an AdaBoostClassifier
ada_pipeline = Pipeline([
    ('vect1', CountVectorizer(tokenizer=tokenize)),
    ('tfidf1', TfidfTransformer()),
    ('clf1', MultiOutputClassifier(AdaBoostClassifier()))
])

ada_parameters = {
        'vect1__ngram_range': ((1, 1), (1, 2)), # Tune the count vectorizer for a unigram or bigram model
        'clf1__estimator__learning_rate': [0.001, 0.0001],
        'clf1__estimator__n_estimators': [50, 100, 150]
    }

ada_cv = GridSearchCV(ada_pipeline, param_grid=ada_parameters, verbose=2)
ada_cv.fit(X_train, Y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END clf1__estimator__learning_rate=0.001, clf1__estimator__n_estimators=50, vect1__ngram_range=(1, 1); total time=  53.7s
[CV] END clf1__estimator__learning_rate=0.001, clf1__estimator__n_estimators=50, vect1__ngram_range=(1, 1); total time=  44.2s
[CV] END clf1__estimator__learning_rate=0.001, clf1__estimator__n_estimators=50, vect1__ngram_range=(1, 1); total time=  43.0s
[CV] END clf1__estimator__learning_rate=0.001, clf1__estimator__n_estimators=50, vect1__ngram_range=(1, 1); total time=  42.0s
[CV] END clf1__estimator__learning_rate=0.001, clf1__estimator__n_estimators=50, vect1__ngram_range=(1, 1); total time=  42.0s
[CV] END clf1__estimator__learning_rate=0.001, clf1__estimator__n_estimators=50, vect1__ngram_range=(1, 2); total time= 1.0min
[CV] END clf1__estimator__learning_rate=0.001, clf1__estimator__n_estimators=50, vect1__ngram_range=(1, 2); total time= 1.1min
[CV] END clf1__estimator__learning_rate=0.001, clf

GridSearchCV(estimator=Pipeline(steps=[('vect1',
                                        CountVectorizer(tokenizer=<function tokenize at 0x7fd20f0df440>)),
                                       ('tfidf1', TfidfTransformer()),
                                       ('clf1',
                                        MultiOutputClassifier(estimator=AdaBoostClassifier()))]),
             param_grid={'clf1__estimator__learning_rate': [0.001, 0.0001],
                         'clf1__estimator__n_estimators': [50, 100, 150],
                         'vect1__ngram_range': ((1, 1), (1, 2))},
             verbose=2)

In [26]:
# Predict on test data
ada_Y_pred = ada_cv.predict(X_test)

ada_labels = np.unique(ada_Y_pred)
ada_accuracy = (ada_Y_pred == Y_test).mean()

print("Labels:", ada_labels)
print("Accuracy:", ada_accuracy)

Labels: [0 1]
Accuracy: related                   0.661684
request                   0.721873
offer                     0.998820
aid_related               0.717152
medical_help              0.950039
medical_products          0.969709
search_and_rescue         0.977970
security                  0.990559
military                  0.995279
water                     0.949253
food                      0.906766
shelter                   0.918568
clothing                  0.993312
money                     0.990559
missing_people            0.992132
refugees                  0.982691
death                     0.974823
other_aid                 0.864280
infrastructure_related    0.972069
transport                 0.979544
buildings                 0.965775
electricity               0.994099
tools                     0.996066
hospitals                 0.996066
shops                     0.996853
aid_centers               0.994493
other_infrastructure      0.984658
weather_related           0.887

### 9. Trying a stochastic gradient descent classifier and comparing it to other models



In [53]:
# Change the last step of the pipline to utilize an SGDClassifier for the MultiOutputClassifier
sgd_pipeline = Pipeline([
    ('vect1', CountVectorizer(tokenizer=tokenize)),
    ('tfidf1', TfidfTransformer()),
    ('clf1', MultiOutputClassifier(SGDClassifier()))
])

sgd_parameters = {
        'vect1__ngram_range': ((1, 1), (1, 2)), # Tune the count vectorizer for a unigram or bigram model
        'clf1__estimator__loss':['hinge', 'log', 'modified_huber'], # Evaluate the different algrithms governing the loss function of the SGDClassifier
        'clf1__estimator__alpha':[0.0001,0.001] # Evaluate the best learning rate for the algorithm
    }

sgd_cv = GridSearchCV(sgd_pipeline, param_grid=sgd_parameters, verbose=2)
sgd_cv.fit(X_train, Y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END clf1__estimator__alpha=0.0001, clf1__estimator__loss=hinge, vect1__ngram_range=(1, 1); total time=  15.5s
[CV] END clf1__estimator__alpha=0.0001, clf1__estimator__loss=hinge, vect1__ngram_range=(1, 1); total time=  15.3s
[CV] END clf1__estimator__alpha=0.0001, clf1__estimator__loss=hinge, vect1__ngram_range=(1, 1); total time=  15.3s
[CV] END clf1__estimator__alpha=0.0001, clf1__estimator__loss=hinge, vect1__ngram_range=(1, 1); total time=  15.1s
[CV] END clf1__estimator__alpha=0.0001, clf1__estimator__loss=hinge, vect1__ngram_range=(1, 1); total time=  15.3s
[CV] END clf1__estimator__alpha=0.0001, clf1__estimator__loss=hinge, vect1__ngram_range=(1, 2); total time=  15.6s
[CV] END clf1__estimator__alpha=0.0001, clf1__estimator__loss=hinge, vect1__ngram_range=(1, 2); total time=  15.6s
[CV] END clf1__estimator__alpha=0.0001, clf1__estimator__loss=hinge, vect1__ngram_range=(1, 2); total time=  15.5s
[CV] END clf1__esti

GridSearchCV(estimator=Pipeline(steps=[('vect1',
                                        CountVectorizer(tokenizer=<function tokenize at 0x7f810a6efef0>)),
                                       ('tfidf1', TfidfTransformer()),
                                       ('clf1',
                                        MultiOutputClassifier(estimator=SGDClassifier()))]),
             param_grid={'clf1__estimator__alpha': [0.0001, 0.001],
                         'clf1__estimator__loss': ['hinge', 'log',
                                                   'modified_huber'],
                         'vect1__ngram_range': ((1, 1), (1, 2))},
             verbose=2)

In [54]:
# Predict on test data
Y_pred = sgd_cv.predict(X_test)

labels = np.unique(Y_pred)
accuracy = (Y_pred == Y_test).mean()

print("Labels:", labels)
print("Accuracy:", accuracy)

print("\nBest Parameters:", sgd_cv.best_params_)

Labels: [0 1 2]
Accuracy: related                   0.786389
request                   0.823761
offer                     1.000000
aid_related               0.822581
medical_help              0.945712
medical_products          0.969709
search_and_rescue         0.982297
security                  0.991345
military                  0.996459
water                     0.976397
food                      0.958301
shelter                   0.944532
clothing                  0.992132
money                     0.988198
missing_people            0.994099
refugees                  0.988198
death                     0.982297
other_aid                 0.867034
infrastructure_related    0.968922
transport                 0.981511
buildings                 0.968922
electricity               0.995673
tools                     0.996853
hospitals                 0.995279
shops                     0.995673
aid_centers               0.990952
other_infrastructure      0.983478
weather_related           0.9

### 9. Export the tuned model as a pickle file

In [30]:
import pickle

with open('picked_model', 'wb') as f:
  pickle.dump(sgd_cv, f)