# ML Pipeline Preparation
### 1. Import libraries and load data from database

In [1]:
# import libraries
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import re
import pickle

# nltk
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

#sklearn
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report

[nltk_data] Downloading package punkt to /Users/zora/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# load data from database
engine = create_engine('sqlite:///../webapp/data/DisasterResponse.db')
df = pd.read_sql_table('messages_categories', engine)

# create X and y dataframes for pipeline
X = df['message'] # predictors
y = df.drop(['id', 'message', 'original', 'genre'], axis = 1).astype('int64') # labels

### 2. Write a tokenization function to process text data

In [3]:
def tokenize(text):
    text_norm = re.sub(r'[^a-zA-Z0-9]', ' ', text) # normalize: remove punctuation
    word_list = word_tokenize(text_norm) # tokenize 
    word_list_clean = [w for w in word_list if w not in stopwords.words('english')] # remove stopwords
    word_list_stemmed = [PorterStemmer().stem(w) for w in word_list_clean] # stemm words
    return word_list_stemmed

### 3. Build a machine learning pipeline

In [4]:
# build a pipeline with countvectorizer, tfidf and random forest classifier
pipeline_RF = Pipeline([
    ('vect', CountVectorizer(tokenizer = tokenize)),
    ('tfidf', TfidfTransformer()),
    ('multi-clf', MultiOutputClassifier(RandomForestClassifier()))
])

### 4. Train pipeline

In [5]:
# split dataset into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

# fit data
pipeline_RF.fit(X_train, y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x11c057ca0>)),
                ('tfidf', TfidfTransformer()),
                ('multi-clf',
                 MultiOutputClassifier(estimator=RandomForestClassifier()))])

### 5. Test model

In [6]:
# run prediction with test data
y_pred = pipeline_RF.predict(X_test)

In [7]:
# print precision, recall and f1-score
i = 0
for col in y_test:
    print('Evaluation for "{}": \n {} \n\n'.format(col, classification_report(y_test[col], y_pred[:,i])))
    i += 1

Evaluation for "related": 
               precision    recall  f1-score   support

           0       0.70      0.42      0.53      1862
           1       0.84      0.94      0.89      5946
           2       0.27      0.30      0.28        57

    accuracy                           0.81      7865
   macro avg       0.60      0.55      0.56      7865
weighted avg       0.80      0.81      0.80      7865
 


Evaluation for "request": 
               precision    recall  f1-score   support

           0       0.91      0.98      0.94      6555
           1       0.83      0.49      0.62      1310

    accuracy                           0.90      7865
   macro avg       0.87      0.74      0.78      7865
weighted avg       0.89      0.90      0.89      7865
 


Evaluation for "offer": 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      7838
           1       0.00      0.00      0.00        27

    accuracy                           1

  _warn_prf(average, modifier, msg_start, len(result))


Evaluation for "child_alone": 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      7865

    accuracy                           1.00      7865
   macro avg       1.00      1.00      1.00      7865
weighted avg       1.00      1.00      1.00      7865
 


Evaluation for "water": 
               precision    recall  f1-score   support

           0       0.96      1.00      0.98      7406
           1       0.84      0.35      0.49       459

    accuracy                           0.96      7865
   macro avg       0.90      0.67      0.73      7865
weighted avg       0.95      0.96      0.95      7865
 


Evaluation for "food": 
               precision    recall  f1-score   support

           0       0.95      0.99      0.97      7003
           1       0.87      0.61      0.72       862

    accuracy                           0.95      7865
   macro avg       0.91      0.80      0.84      7865
weighted avg       0.94      0.95      

Evaluation for "direct_report": 
               precision    recall  f1-score   support

           0       0.87      0.98      0.92      6349
           1       0.82      0.37      0.51      1516

    accuracy                           0.86      7865
   macro avg       0.84      0.67      0.71      7865
weighted avg       0.86      0.86      0.84      7865
 




### 6. Improve model

In [8]:
# define parameters for gridsearch
parameters_RF = {
    'vect__max_df': (.6, 1),
    'tfidf__norm': ('l1', 'l2'),
    'multi-clf__estimator__criterion': ('gini', 'entropy')
}

# build parameter grid and fit data
cv_RF = GridSearchCV(pipeline_RF, parameters_RF)
cv_RF.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('vect',
                                        CountVectorizer(tokenizer=<function tokenize at 0x11c057ca0>)),
                                       ('tfidf', TfidfTransformer()),
                                       ('multi-clf',
                                        MultiOutputClassifier(estimator=RandomForestClassifier()))]),
             param_grid={'multi-clf__estimator__criterion': ('gini', 'entropy'),
                         'tfidf__norm': ('l1', 'l2'),
                         'vect__max_df': (0.6, 1)})

### 7. Test tuned model

In [9]:
# run prediction with test data
y_pred = cv_RF.predict(X_test)

# print precision, recall and f1-score
i = 0
for col in y_test:
    print('Evaluation for "{}": \n {} \n\n'.format(col, classification_report(y_test[col], y_pred[:,i])))
    i += 1

Evaluation for "related": 
               precision    recall  f1-score   support

           0       0.68      0.42      0.52      1862
           1       0.84      0.94      0.88      5946
           2       0.26      0.30      0.28        57

    accuracy                           0.81      7865
   macro avg       0.59      0.55      0.56      7865
weighted avg       0.80      0.81      0.79      7865
 


Evaluation for "request": 
               precision    recall  f1-score   support

           0       0.91      0.98      0.94      6555
           1       0.83      0.50      0.62      1310

    accuracy                           0.90      7865
   macro avg       0.87      0.74      0.78      7865
weighted avg       0.89      0.90      0.89      7865
 


Evaluation for "offer": 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      7838
           1       0.00      0.00      0.00        27

    accuracy                           1

  _warn_prf(average, modifier, msg_start, len(result))


Evaluation for "clothing": 
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      7757
           1       0.91      0.09      0.17       108

    accuracy                           0.99      7865
   macro avg       0.95      0.55      0.58      7865
weighted avg       0.99      0.99      0.98      7865
 


Evaluation for "money": 
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      7692
           1       0.67      0.02      0.04       173

    accuracy                           0.98      7865
   macro avg       0.82      0.51      0.52      7865
weighted avg       0.97      0.98      0.97      7865
 


Evaluation for "missing_people": 
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      7785
           1       1.00      0.01      0.02        80

    accuracy                           0.99      7865
   macro avg       0.99      0.5

### 8. Further improvement
The RandomForest algorithm takes very long to fit the dataset so it is difficult to test many parameters with gridsearch. This is why I will try a support vector machine.

In [10]:
# build pipeline with count vecotrizer, tfidf and support vector machine
pipeline_SVC = Pipeline([
    ('vect', CountVectorizer(tokenizer = tokenize)),
    ('tfidf', TfidfTransformer()),
    ('multi-clf', MultiOutputClassifier(LinearSVC()))
])

# define parameters for gridsearch
parameters_SVC = {
    'vect__max_df': (.6, 1),
    'tfidf__norm': ('l1', 'l2'),
    'multi-clf__estimator__C': (.1, 1, 100)
}

# build parameter grid and fit data
cv_SVC = GridSearchCV(pipeline_SVC, parameters_SVC)
cv_SVC.fit(X_train, y_train.drop('child_alone', axis = 1))



GridSearchCV(estimator=Pipeline(steps=[('vect',
                                        CountVectorizer(tokenizer=<function tokenize at 0x11c057ca0>)),
                                       ('tfidf', TfidfTransformer()),
                                       ('multi-clf',
                                        MultiOutputClassifier(estimator=LinearSVC()))]),
             param_grid={'multi-clf__estimator__C': (0.1, 1, 100),
                         'tfidf__norm': ('l1', 'l2'),
                         'vect__max_df': (0.6, 1)})

In [13]:
# run prediction with test data
y_pred = cv_SVC.predict(X_test)
y_test = y_test.drop('child_alone', axis = 1)

# print precision, recall and f1-score
i = 0
for col in y_test:
    print('Evaluation for "{}": \n {} \n\n'.format(col, classification_report(y_test[col], y_pred[:,i])))
    i += 1

Evaluation for "related": 
               precision    recall  f1-score   support

           0       0.63      0.53      0.58      1862
           1       0.86      0.91      0.88      5946
           2       0.79      0.19      0.31        57

    accuracy                           0.81      7865
   macro avg       0.76      0.54      0.59      7865
weighted avg       0.80      0.81      0.81      7865
 


Evaluation for "request": 
               precision    recall  f1-score   support

           0       0.92      0.96      0.94      6555
           1       0.75      0.60      0.67      1310

    accuracy                           0.90      7865
   macro avg       0.84      0.78      0.80      7865
weighted avg       0.89      0.90      0.90      7865
 


Evaluation for "offer": 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      7838
           1       0.00      0.00      0.00        27

    accuracy                           1

Evaluation for "weather_related": 
               precision    recall  f1-score   support

           0       0.90      0.94      0.92      5695
           1       0.81      0.72      0.76      2170

    accuracy                           0.88      7865
   macro avg       0.86      0.83      0.84      7865
weighted avg       0.87      0.88      0.87      7865
 


Evaluation for "floods": 
               precision    recall  f1-score   support

           0       0.96      0.99      0.98      7246
           1       0.85      0.54      0.66       619

    accuracy                           0.96      7865
   macro avg       0.91      0.77      0.82      7865
weighted avg       0.95      0.96      0.95      7865
 


Evaluation for "storm": 
               precision    recall  f1-score   support

           0       0.96      0.97      0.97      7139
           1       0.69      0.59      0.64       726

    accuracy                           0.94      7865
   macro avg       0.82      0.78

The support vector machine yields slightly better results and is faster, so I would prefer this algorithm.

### 9. Export models as a pickle file

In [14]:
# save both models
pickle.dump(cv_RF, open('../webapp/models/ML_pipeline_RF.pkl', 'wb'))
pickle.dump(cv_SVC, open('../webapp/models/ML_pipeline_SVC.pkl', 'wb'))