# ML Pipeline Preparation
### 1. Import libraries and load data from database

In [43]:
# import libraries
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import re
import pickle

# nltk
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

#sklearn
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report

[nltk_data] Downloading package punkt to /Users/zora/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# load data from database
engine = create_engine('sqlite:///../webapp/data/DisasterResponse.db')
df = pd.read_sql_table('messages_categories', engine)

# create X and y dataframes for pipeline
X = df['message'] # predictors
y = df.drop(['id', 'message', 'original', 'genre'], axis = 1).astype('int64') # labels

### 2. Write a tokenization function to process text data

In [4]:
def tokenize(text):
    text_norm = re.sub(r'[^a-zA-Z0-9]', ' ', text) # normalize: remove punctuation
    word_list = word_tokenize(text_norm) # tokenize 
    word_list_clean = [w for w in word_list if w not in stopwords.words('english')] # remove stopwords
    word_list_stemmed = [PorterStemmer().stem(w) for w in word_list_clean] # stemm words
    return word_list_stemmed

### 3. Build a machine learning pipeline

In [5]:
# build a pipeline with countvectorizer, tfidf and random forest classifier
pipeline_RF = Pipeline([
    ('vect', CountVectorizer(tokenizer = tokenize)),
    ('tfidf', TfidfTransformer()),
    ('multi-clf', MultiOutputClassifier(RandomForestClassifier()))
])

### 4. Train pipeline

In [6]:
# split dataset into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

# fit data
pipeline_RF.fit(X_train, y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x124008040>)),
                ('tfidf', TfidfTransformer()),
                ('multi-clf',
                 MultiOutputClassifier(estimator=RandomForestClassifier()))])

### 5. Test model

In [7]:
# run prediction with test data
y_pred = pipeline_RF.predict(X_test)

In [8]:
# print precision, recall and f1-score
i = 0
for col in y_test:
    print('Evaluation for "{}": \n {} \n\n'.format(col, classification_report(y_test[col], y_pred[:,i])))
    i += 1

Evaluation for "related": 
               precision    recall  f1-score   support

           0       0.70      0.41      0.52      1795
           1       0.84      0.95      0.89      6021
           2       0.50      0.49      0.49        49

    accuracy                           0.82      7865
   macro avg       0.68      0.62      0.63      7865
weighted avg       0.81      0.82      0.80      7865
 


Evaluation for "request": 
               precision    recall  f1-score   support

           0       0.90      0.98      0.94      6490
           1       0.84      0.49      0.62      1375

    accuracy                           0.89      7865
   macro avg       0.87      0.73      0.78      7865
weighted avg       0.89      0.89      0.88      7865
 


Evaluation for "offer": 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      7833
           1       0.00      0.00      0.00        32

    accuracy                           1

  _warn_prf(average, modifier, msg_start, len(result))


Evaluation for "food": 
               precision    recall  f1-score   support

           0       0.95      0.99      0.97      6959
           1       0.84      0.61      0.71       906

    accuracy                           0.94      7865
   macro avg       0.90      0.80      0.84      7865
weighted avg       0.94      0.94      0.94      7865
 


Evaluation for "shelter": 
               precision    recall  f1-score   support

           0       0.94      0.99      0.97      7141
           1       0.85      0.36      0.50       724

    accuracy                           0.94      7865
   macro avg       0.89      0.68      0.73      7865
weighted avg       0.93      0.94      0.92      7865
 


Evaluation for "clothing": 
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      7739
           1       0.75      0.17      0.27       126

    accuracy                           0.99      7865
   macro avg       0.87      0.58      0

### 6. Improve model

In [9]:
# define parameters for gridsearch
parameters_RF = {
    'vect__max_df': (.6, 1),
    'tfidf__norm': ('l1', 'l2'),
    'multi-clf__estimator__criterion': ('gini', 'entropy')
}

# build parameter grid and fit data
cv_RF = GridSearchCV(pipeline_RF, parameters_RF)
cv_RF.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('vect',
                                        CountVectorizer(tokenizer=<function tokenize at 0x124008040>)),
                                       ('tfidf', TfidfTransformer()),
                                       ('multi-clf',
                                        MultiOutputClassifier(estimator=RandomForestClassifier()))]),
             param_grid={'multi-clf__estimator__criterion': ('gini',
                                                             'entropy')})

### 7. Test tuned model

In [39]:
# run prediction with test data
y_pred = cv_RF.predict(X_test)

# print precision, recall and f1-score
i = 0
for col in y_test:
    print('Evaluation for "{}": \n {} \n\n'.format(col, classification_report(y_test[col], y_pred[:,i])))
    i += 1

Evaluation for "related": 
               precision    recall  f1-score   support

           0       0.72      0.40      0.52      1795
           1       0.84      0.95      0.89      6021
           2       0.58      0.39      0.46        49

    accuracy                           0.82      7865
   macro avg       0.71      0.58      0.62      7865
weighted avg       0.81      0.82      0.80      7865
 


Evaluation for "request": 
               precision    recall  f1-score   support

           0       0.90      0.98      0.94      6490
           1       0.84      0.49      0.62      1375

    accuracy                           0.89      7865
   macro avg       0.87      0.74      0.78      7865
weighted avg       0.89      0.89      0.88      7865
 


Evaluation for "offer": 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      7833
           1       0.00      0.00      0.00        32

    accuracy                           1

Evaluation for "shops": 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      7829
           1       0.00      0.00      0.00        36

    accuracy                           1.00      7865
   macro avg       0.50      0.50      0.50      7865
weighted avg       0.99      1.00      0.99      7865
 


Evaluation for "aid_centers": 
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      7773
           1       0.00      0.00      0.00        92

    accuracy                           0.99      7865
   macro avg       0.49      0.50      0.50      7865
weighted avg       0.98      0.99      0.98      7865
 


Evaluation for "other_infrastructure": 
               precision    recall  f1-score   support

           0       0.95      1.00      0.98      7491
           1       0.00      0.00      0.00       374

    accuracy                           0.95      7865
   macro avg       0.48

### 8. Further improvement
The RandomForest algorithm takes very long to fit the dataset so it is difficult to test many parameters with gridsearch. This is why I will try a support vector machine.

In [40]:
# build pipeline with count vecotrizer, tfidf and support vector machine
pipeline_SVC = Pipeline([
    ('vect', CountVectorizer(tokenizer = tokenize)),
    ('tfidf', TfidfTransformer()),
    ('multi-clf', MultiOutputClassifier(LinearSVC()))
])

# define parameters for gridsearch
parameters_SVC = {
    'vect__max_df': (.6, 1),
    'tfidf__norm': ('l1', 'l2'),
    'multi-clf__estimator__C': (.1, 1, 100)
}

# build parameter grid and fit data
cv_SVC = GridSearchCV(pipeline_SVC, parameters_SVC)
cv_SVC.fit(X_train, y_train.drop('child_alone', axis = 1))



GridSearchCV(estimator=Pipeline(steps=[('vect',
                                        CountVectorizer(tokenizer=<function tokenize at 0x124008040>)),
                                       ('tfidf', TfidfTransformer()),
                                       ('multi-clf',
                                        MultiOutputClassifier(estimator=LinearSVC()))]),
             param_grid={'multi-clf__estimator__C': (0.1, 1, 100),
                         'vect__max_df': (0.6, 1)})

In [41]:
# run prediction with test data
y_pred = cv_SVC.predict(X_test)

# print precision, recall and f1-score
i = 0
for col in y_test:
    print('Evaluation for "{}": \n {} \n\n'.format(col, classification_report(y_test[col], y_pred[:,i])))
    i += 1

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Evaluation for "related": 
               precision    recall  f1-score   support

           0       0.71      0.40      0.51      1795
           1       0.84      0.95      0.89      6021
           2       0.00      0.00      0.00        49

    accuracy                           0.82      7865
   macro avg       0.52      0.45      0.47      7865
weighted avg       0.80      0.82      0.80      7865
 


Evaluation for "request": 
               precision    recall  f1-score   support

           0       0.91      0.97      0.94      6490
           1       0.82      0.56      0.66      1375

    accuracy                           0.90      7865
   macro avg       0.87      0.77      0.80      7865
weighted avg       0.90      0.90      0.89      7865
 


Evaluation for "offer": 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      7833
           1       0.00      0.00      0.00        32

    accuracy                           1

IndexError: index 35 is out of bounds for axis 1 with size 35

The support vector machine yields slightly better results and is faster, so I would prefer this algorithm.

### 9. Export models as a pickle file

In [44]:
# save both models
pickle.dump(cv_RF, open('../webapp/models/ML_pipeline_RF.sav', 'wb'))
pickle.dump(cv_SVC, open('../webapp/models/ML_pipeline_SVC.sav', 'wb'))