# ML Pipeline Preparation

### 1. Import libraries and load data from database.


In [79]:
# import libraries
import pickle
import re

import numpy as np
import pandas as pd

from sqlalchemy import create_engine

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import multilabel_confusion_matrix

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from sklearn.svm import SVC

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Toluwee\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Toluwee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Toluwee\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
# Read the DisasterResponse table from the local SQLite file into a DataFrame.
engine = create_engine('sqlite:///DisasterResponse.db')
df = pd.read_sql_table("DisasterResponse", engine)

# Features are the message text; targets are every column that is left after
# dropping the id, the text columns and the genre column.
X = df["message"]
y = df.drop(columns=["id", "message", "original", "genre"])

### 2. Write a tokenization function to process your text data

In [4]:
def tokenize(text):
    """Turn a raw message into a list of lemmatized, stop-word-free tokens.

    Args:
        text (str): Message text to process.

    Returns:
        list[str]: Lower-cased tokens with every non-alphanumeric character
        stripped, English stop words removed and the remaining words
        lemmatized.
    """
    # normalize case and replace every non-alphanumeric character with a space
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    # tokenize text
    tokens = word_tokenize(text)

    # Build these once per call instead of once per token: the stop-word list
    # is rebuilt on every stopwords.words() call, and membership tests on a
    # set are O(1) where the original list needed a linear scan.
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    # lemmatize and remove stop words
    return [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

### 3. Build a machine learning pipeline


In [5]:
# Base estimator wrapped by MultiOutputClassifier below. The seed is fixed so
# that re-running the notebook reproduces the same forest and the same scores.
clf = RandomForestClassifier(random_state=42)

# Text -> token counts (via the custom tokenizer) -> TF-IDF weights -> classifier.
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('model', MultiOutputClassifier(clf))
])

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [68]:
# Fix the seed so the split, and every metric computed from it, is identical on re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [69]:
# Fit all three pipeline steps (vectorizer, TF-IDF transform, multi-output model) on the training split.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x000002374856F1C0>)),
                ('tfidf', TfidfTransformer()),
                ('model',
                 MultiOutputClassifier(estimator=RandomForestClassifier()))])

### 5. Test your model


In [70]:
 # predict on test data
# y_pred is indexed by column in the evaluation loop below: one column of predictions per target label.
y_pred = pipeline.predict(X_test)

In [75]:
# Score each label column on its own: one classification report and one set
# of confusion matrices per column.
y_test_arr = y_test.to_numpy()  # convert once instead of once per column

for col, col_name in enumerate(y_test.columns):
    y_true = y_test_arr[:, col]
    y_hat = y_pred[:, col]

    # multilabel_confusion_matrix returns one 2x2 matrix per class found in
    # the column, so a three-class column prints three matrices.
    print(
        f"Classification report for classifier {clf} in predicting {col_name}:\n"
        f"{classification_report(y_true, y_hat, zero_division=0)}\n"
        f"{multilabel_confusion_matrix(y_true, y_hat)} \n"
    )

Classification report for classifier RandomForestClassifier() in predicting related:
              precision    recall  f1-score   support

           0       0.71      0.45      0.55      1525
           1       0.85      0.94      0.89      4978
           2       0.35      0.39      0.37        51

    accuracy                           0.82      6554
   macro avg       0.64      0.59      0.60      6554
weighted avg       0.81      0.82      0.81      6554

[[[4750  279]
  [ 843  682]]

 [[ 720  856]
  [ 298 4680]]

 [[6466   37]
  [  31   20]]] 

Classification report for classifier RandomForestClassifier() in predicting request:
              precision    recall  f1-score   support

           0       0.90      0.98      0.94      5434
           1       0.83      0.50      0.62      1120

    accuracy                           0.90      6554
   macro avg       0.87      0.74      0.78      6554
weighted avg       0.89      0.90      0.89      6554

[[[ 557  563]
  [ 112 5322]]



### 6. Model improvement
Use grid search to find better parameters. 

In [76]:
# List every tunable parameter of the pipeline. The double-underscore keys
# (e.g. 'vect__ngram_range') are the names used in the grid-search parameter grid.
pipeline.get_params()

{'memory': None,
 'steps': [('vect',
   CountVectorizer(tokenizer=<function tokenize at 0x000002374856F1C0>)),
  ('tfidf', TfidfTransformer()),
  ('model', MultiOutputClassifier(estimator=RandomForestClassifier()))],
 'verbose': False,
 'vect': CountVectorizer(tokenizer=<function tokenize at 0x000002374856F1C0>),
 'tfidf': TfidfTransformer(),
 'model': MultiOutputClassifier(estimator=RandomForestClassifier()),
 'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.int64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 1.0,
 'vect__max_features': None,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 1),
 'vect__preprocessor': None,
 'vect__stop_words': None,
 'vect__strip_accents': None,
 'vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vect__tokenizer': <function __main__.tokenize(text)>,
 'vect__vocabulary': None,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': True,
 'tfidf__sublinear_tf': Fal

In [82]:
# specify parameters for grid search: swap the final estimator itself.
# Every candidate has to be a classifier. A regressor (the original grid
# listed XGBRegressor) produces continuous predictions, which cannot match the
# integer class labels the pipeline is scored against.
parameters = {
    "model__estimator": [
        RandomForestClassifier(random_state=42),
        XGBClassifier(random_state=42),
        SVC(),
    ]
}

# create grid search object
model_cv = GridSearchCV(pipeline, param_grid=parameters)
model_cv.fit(X_train, y_train)

# GridSearchCV refits the best candidate on the whole training split by
# default, so predict() here uses that refit pipeline.
y_pred = model_cv.predict(X_test)
print("\nBest Parameters:", model_cv.best_params_)

### 7. Test your tuned model


In [None]:
# classification_report cannot score the whole label matrix in one call: the
# "related" column contains three classes, which makes y_test a
# multiclass-multioutput target. Report each label column separately instead.
best_clf = model_cv.best_params_["model__estimator"]  # estimator the search selected
y_test_arr = y_test.to_numpy()

for col, col_name in enumerate(y_test.columns):
    print(
        f"Classification report for classifier {best_clf} in predicting {col_name}:\n"
        f"{classification_report(y_test_arr[:, col], y_pred[:, col], zero_division=0)}\n"
    )

### 9. Export your model as a pickle file

In [None]:
# save the model to disk
filename = 'classifier.pkl'

# Persist the refit winner of the grid search (the original referenced an
# undefined name `model`). pickle stores the custom `tokenize` function by
# reference, so it must be importable wherever this file is loaded.
with open(filename, 'wb') as f:  # context manager guarantees the handle is closed
    pickle.dump(model_cv.best_estimator_, f)