# Machine Learning Pipeline

# Import Libraries and Load Data

In [2]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sqlalchemy import create_engine

In [3]:
import os
# Show where the kernel started, then move into the project's data/ folder so
# that the relative paths used later in the notebook (the SQLite database and
# '../models/model.pkl') resolve.
# The guard keeps the cell safe to re-run: without it, a second execution
# would try to enter data/data and raise FileNotFoundError.
print(os.getcwd())
if os.path.basename(os.getcwd()) != 'data':
    os.chdir(os.path.join(os.getcwd(), 'data'))

/Users/zhikaichen/Documents/data_science/DisasterResponse


In [4]:
# Read the cleaned messages table from the SQLite database in the current
# working directory.
engine = create_engine('sqlite:///disasterMessages.db')
df = pd.read_sql_table('DisasterMessagesTable', engine)

# Features are the raw message text; targets are every column from position 4
# onward (the category columns that follow id/message/original/genre).
# NOTE(review): the classification reports further down show a third label (2)
# for `related`, so the targets are not strictly binary - confirm this is
# intended.
X = df.loc[:, 'message']
Y = df[df.columns[4:]]

In [5]:
# Preview the first rows to check that the table loaded and to see the column
# layout that the positional target selection relies on.
df.head()

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Preprocess Data

We will convert all text to lowercase to ensure consistency and reduce the size of the vocabulary. Then we will tokenize the lowercased text into individual words, remove English stop words, and lemmatize the remaining tokens so that different inflected forms of the same word are grouped together.

In [6]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [7]:
# Fetch the NLTK resources that tokenize() relies on: the tokenizer models,
# the WordNet data used for lemmatization, and the stop-word lists.
nltk.download(['punkt', 'wordnet', 'stopwords'])

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/zhikaichen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zhikaichen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zhikaichen/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

### Tokenization Function

In [8]:
from nltk.corpus import stopwords

def tokenize(text):
    """
    Turn a raw message into a list of normalized word tokens.

    The text is lowercased, split with NLTK's ``word_tokenize``, filtered
    against the English stop-word list, and every remaining token is
    lemmatized with ``WordNetLemmatizer`` and whitespace-trimmed.
    No punctuation filtering is applied.

    Input:
        text: The string that needs to be tokenized
    Output:
        clean_tokens: list of lemmatized tokens, in their original order,
            with English stop words removed
    """
    # Build the stop-word set once per call. Previously
    # stopwords.words('english') was evaluated inside the loop for every
    # single token and then searched as a list, which dominated the runtime
    # when vectorizing the whole corpus.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Lowercase before tokenizing so the vocabulary is case-insensitive.
    tokens = word_tokenize(text.lower())

    return [
        lemmatizer.lemmatize(token).strip()
        for token in tokens
        if token not in stop_words
    ]

# Build ML Pipeline

This pipeline takes the message column as input and outputs a classification result for each of the 36 category columns in the dataset.

In [9]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

# Text-classification pipeline:
#   vect  - token counts built with the custom tokenize() function
#   tfidf - re-weights the counts with TF-IDF
#   clf   - a random forest wrapped in MultiOutputClassifier so that every
#           target column gets its own classifier
# random_state is fixed so that refitting on the same data reproduces the same
# forests; without it every run of the notebook reports different metrics.
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(RandomForestClassifier(random_state=42)))
])

# Train Pipeline

In [10]:
from sklearn.model_selection import train_test_split

# Split data into train and test sets (default test fraction).
# A fixed random_state keeps the split identical across kernel restarts, so
# the metrics reported below are comparable from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=42)

# Fit data on pipeline
pipeline.fit(X_train, y_train)

# Test Model

In [16]:
# Make predictions using the model on X_test; the result is later indexed
# positionally (one column per category) when the reports are printed.
y_pred = pipeline.predict(X_test)

In [17]:
# Generate a classification report for each of the categories
from sklearn.metrics import classification_report

# Print one report per category. y_pred is indexed positionally, so its
# columns are assumed to be in the same order as y_test's columns.
# zero_division=0 keeps precision/F1 at 0.0 for categories where the model
# never predicts a class, but without emitting an UndefinedMetricWarning in
# the middle of the report output.
for idx, column in enumerate(y_test.columns):
    print('Classification report for Category: {}'.format(column))
    print(classification_report(y_test[column], y_pred[:, idx], zero_division=0))

Classification report for Category: related
              precision    recall  f1-score   support

           0       0.71      0.26      0.38      1538
           1       0.80      0.97      0.88      4974
           2       0.58      0.17      0.26        42

    accuracy                           0.80      6554
   macro avg       0.70      0.46      0.50      6554
weighted avg       0.78      0.80      0.76      6554

Classification report for Category: request
              precision    recall  f1-score   support

           0       0.89      0.99      0.94      5421
           1       0.88      0.43      0.57      1133

    accuracy                           0.89      6554
   macro avg       0.89      0.71      0.76      6554
weighted avg       0.89      0.89      0.87      6554

Classification report for Category: offer
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6525
           1       0.00      0.00      0.00        29

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.97      1.00      0.99      6374
           1       0.80      0.07      0.12       180

    accuracy                           0.97      6554
   macro avg       0.89      0.53      0.55      6554
weighted avg       0.97      0.97      0.96      6554

Classification report for Category: security
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      6447
           1       0.00      0.00      0.00       107

    accuracy                           0.98      6554
   macro avg       0.49      0.50      0.50      6554
weighted avg       0.97      0.98      0.98      6554

Classification report for Category: military
              precision    recall  f1-score   support

           0       0.97      1.00      0.99      6359
           1       0.71      0.06      0.11       195

    accuracy                           0.97      6554
   macro avg       0.84      0.53     

# Hyperparameter Tuning

Use grid search with cross-validation to tune the number of trees in each random forest.

In [18]:
# Get a list of hyperparameters 
# The keys shown here (step__param, nested with double underscores) are the
# names to use in the grid-search parameter grid.
pipeline.get_params()

{'memory': None,
 'steps': [('vect',
   CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                   dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                   lowercase=True, max_df=1.0, max_features=None, min_df=1,
                   ngram_range=(1, 1), preprocessor=None, stop_words=None,
                   strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                   tokenizer=<function tokenize at 0x7f8aba790320>,
                   vocabulary=None)),
  ('tfidf',
   TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)),
  ('clf',
   MultiOutputClassifier(estimator=RandomForestClassifier(bootstrap=True,
                                                          ccp_alpha=0.0,
                                                          class_weight=None,
                                                          criterion='gini',
                                                          max_depth=No

In [19]:
from sklearn.model_selection import GridSearchCV

# Search space for the grid search: only the forest size is varied. The key
# follows the pipeline's nested naming (clf -> estimator -> n_estimators).
parameters = dict(clf__estimator__n_estimators=[50, 100])

# Wrap the pipeline in a grid search that uses GridSearchCV's default
# cross-validation and scoring.
cv = GridSearchCV(estimator=pipeline, param_grid=parameters)

In [20]:
# Run the grid search on the training split only; the test split is kept
# aside for the evaluation cells that follow.
cv.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        p

In [21]:
# Parameter combination that scored best during the grid search.
cv.best_params_

{'clf__estimator__n_estimators': 100}

# Test Tuned Model

In [22]:
# Predict through the fitted grid search object.
# NOTE: this rebinds y_pred, replacing the predictions of the untuned pipeline
# computed earlier in the notebook.
y_pred = cv.predict(X_test)

In [23]:
# Print one report per category for the grid-searched model. y_pred is
# indexed positionally, so its columns are assumed to follow the order of
# y_test's columns.
# zero_division=0 keeps precision/F1 at 0.0 for categories where the model
# never predicts a class, but without emitting an UndefinedMetricWarning in
# the middle of the report output.
for idx, column in enumerate(y_test.columns):
    print('Classification report for Category: {}'.format(column))
    print(classification_report(y_test[column], y_pred[:, idx], zero_division=0))

Classification report for Category: related
              precision    recall  f1-score   support

           0       0.73      0.27      0.39      1538
           1       0.81      0.97      0.88      4974
           2       0.38      0.07      0.12        42

    accuracy                           0.80      6554
   macro avg       0.64      0.44      0.46      6554
weighted avg       0.79      0.80      0.76      6554

Classification report for Category: request
              precision    recall  f1-score   support

           0       0.89      0.99      0.94      5421
           1       0.88      0.42      0.57      1133

    accuracy                           0.89      6554
   macro avg       0.89      0.70      0.75      6554
weighted avg       0.89      0.89      0.87      6554

Classification report for Category: offer
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6525
           1       0.00      0.00      0.00        29

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.98      1.00      0.99      6403
           1       1.00      0.01      0.03       151

    accuracy                           0.98      6554
   macro avg       0.99      0.51      0.51      6554
weighted avg       0.98      0.98      0.97      6554

Classification report for Category: missing_people
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      6481
           1       1.00      0.01      0.03        73

    accuracy                           0.99      6554
   macro avg       0.99      0.51      0.51      6554
weighted avg       0.99      0.99      0.98      6554

Classification report for Category: refugees
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      6362
           1       0.25      0.01      0.01       192

    accuracy                           0.97      6554
   macro avg       0.61      0.5

In [24]:
# Per-category accuracy: compare predictions with the true labels element by
# element, then average the matches down each category column.
accuracy = y_test.eq(y_pred).mean()
print(accuracy)

related                   0.798291
request                   0.889533
offer                     0.995575
aid_related               0.775099
medical_help              0.919591
medical_products          0.947208
search_and_rescue         0.974062
security                  0.983674
military                  0.970552
child_alone               1.000000
water                     0.952701
food                      0.928593
shelter                   0.929966
clothing                  0.984284
money                     0.977266
missing_people            0.989014
refugees                  0.970400
death                     0.959262
other_aid                 0.875954
infrastructure_related    0.932560
transport                 0.955294
buildings                 0.950259
electricity               0.982301
tools                     0.993592
hospitals                 0.988251
shops                     0.994812
aid_centers               0.987946
other_infrastructure      0.955752
weather_related     

# Export Model to Pickle File

In [25]:
import pickle
# Destination is relative to the current working directory (the data/ folder
# entered at the top of the notebook).
filename = '../models/model.pkl'

# Write through a context manager so the file handle is flushed and closed
# even if pickling fails; the bare open() used before was never closed.
# NOTE: the whole fitted GridSearchCV object is stored. Functions are pickled
# by reference, so code that loads this file must have `tokenize` importable
# under the same name.
with open(filename, 'wb') as model_file:
    pickle.dump(cv, model_file)