# ML Pipeline Preparation

### In this notebook work we will work on creating ML Pipeline 
- Import Necessary Python Modules
- Load data from sqlite database created earlier
- **Build , Train and Evaluate** your model

In [1]:
###################    Import Python Modules    ###########################

#To Handle datasets
import pandas as pd
import numpy as np

#To handle Databases
from sqlalchemy import create_engine

import re
import pickle
import string 
import sys 

#To Handle text data using Natural Language ToolKit
import nltk
nltk.download(['punkt', 'wordnet','stopwords'])

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

#Sklearn Libraries for Ml Models
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\U6054057\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\U6054057\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\U6054057\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 1.Load Dataset from sqlite database
- Use `read_sql_table` to read data from DisasterResponse database

In [8]:
engine= create_engine('sqlite:///DisasterResponse.db')
#
df = pd.read_sql_table('DS_messages',engine)
X=df['message']
y=df[df.columns[4:]]
print(X.head(),y.head())

0    Weather update - a cold front from Cuba that c...
1              Is the Hurricane over or is it not over
2                      Looking for someone but no name
3    UN reports Leogane 80-90 destroyed. Only Hospi...
4    says: west side of Haiti, rest of the country ...
Name: message, dtype: object    related  request  offer  aid_related  medical_help  medical_products  \
0        1        0      0            0             0                 0   
1        1        0      0            1             0                 0   
2        1        0      0            0             0                 0   
3        1        1      0            1             0                 1   
4        1        0      0            0             0                 0   

   search_and_rescue  security  military  water  ...  aid_centers  \
0                  0         0         0      0  ...            0   
1                  0         0         0      0  ...            0   
2                  0         0        

## 2.Function to tokenize text

In [9]:
def tokenize(text):
    """
    INPUT - text - messages column from the table
    Returns tokenized text 
    
    1. Remove Punctuation and normalize text
    2. Tokenize text and remove stop words
    3.Use stemmer and Lemmatizer to Reduce words to its root form
    """
    # Remove Punctuations and normalize text by converting text into lower case
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    
    # Tokenize text and remove stop words
    tokens = word_tokenize(text)
    stop_words = stopwords.words("english")
    words = [w for w in tokens if w not in stop_words]
    
    #Reduce words to its stem/Root form
    stemmer = PorterStemmer()
    stemmed = [stemmer.stem(w) for w in words]
    
    #Lemmatizer - Reduce words to its root form
    lemmatizer = WordNetLemmatizer()
    lemm = [lemmatizer.lemmatize(w) for w in stemmed]
    
    return lemm

## 3.Build Model 

Here we have X as messages columns which is the input to the model and y as the 36 categories which results as output classification to our model. As we have multi features to classify we can make use of [MultiOutputClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html)


In [12]:
#create pipeline

pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(RandomForestClassifier()))
]) 
    

## 4.Train Model

In [13]:
#Split train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Fit pipeline
pipeline.fit(X_train, y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x0000025A971C2700>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=RandomForestClassifier()))])

## 5.Predict the model 
prints [classification_report](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html?highlight=classification_report#sklearn.metrics.classification_report) from sklearn library which returns Precision,Recall and f1-score of your model

In [15]:
# Predict on test set 
y_pred = pipeline.predict(X_test)
categories = y.columns.tolist()
# Test model on test set
print(classification_report(y_test, y_pred, target_names=categories, zero_division="warn"))

                        precision    recall  f1-score   support

               related       0.84      0.95      0.89      7939
               request       0.84      0.47      0.60      1811
                 offer       0.00      0.00      0.00        48
           aid_related       0.77      0.70      0.73      4378
          medical_help       0.70      0.07      0.13       848
      medical_products       0.83      0.09      0.16       530
     search_and_rescue       0.71      0.02      0.03       288
              security       0.20      0.01      0.01       188
              military       0.61      0.06      0.10       341
                 water       0.88      0.37      0.52       638
                  food       0.86      0.54      0.66      1204
               shelter       0.84      0.33      0.47       933
              clothing       0.89      0.05      0.09       163
                 money       0.78      0.03      0.05       250
        missing_people       1.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**Use KNN Classifier**

In [None]:
#Use KNN Classifier

#create pipeline

pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(KNeighborsClassifier()))
]) 

#Split train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Fit pipeline
pipeline.fit(X_train, y_train)

# Predict on test set 
y_pred = pipeline.predict(X_test)
categories = y.columns.tolist()

# Test model on test set
print(classification_report(y_test, y_pred, target_names=categories, zero_division="warn"))

## 6.Improve Model
**Use GridSearchCV**

In [27]:
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__ngram_range': ((1, 1), (1,2)),
    'vect__max_features': (None, 5000,10000),
    'tfidf__use_idf': (True, False)
}

cv = GridSearchCV(estimator=pipeline, param_grid=parameters, cv=3, verbose=3)

cv.fit(X_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
[CV] tfidf__use_idf=True, vect__max_df=0.5, vect__max_features=None, vect__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  tfidf__use_idf=True, vect__max_df=0.5, vect__max_features=None, vect__ngram_range=(1, 1), score=0.222, total=  41.5s
[CV] tfidf__use_idf=True, vect__max_df=0.5, vect__max_features=None, vect__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   41.5s remaining:    0.0s


[CV]  tfidf__use_idf=True, vect__max_df=0.5, vect__max_features=None, vect__ngram_range=(1, 1), score=0.242, total=  43.0s
[CV] tfidf__use_idf=True, vect__max_df=0.5, vect__max_features=None, vect__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.4min remaining:    0.0s


[CV]  tfidf__use_idf=True, vect__max_df=0.5, vect__max_features=None, vect__ngram_range=(1, 1), score=0.224, total=  44.0s
[CV] tfidf__use_idf=True, vect__max_df=0.5, vect__max_features=None, vect__ngram_range=(1, 2) 
[CV]  tfidf__use_idf=True, vect__max_df=0.5, vect__max_features=None, vect__ngram_range=(1, 2), score=0.223, total=  45.2s
[CV] tfidf__use_idf=True, vect__max_df=0.5, vect__max_features=None, vect__ngram_range=(1, 2) 
[CV]  tfidf__use_idf=True, vect__max_df=0.5, vect__max_features=None, vect__ngram_range=(1, 2), score=0.231, total=  44.4s
[CV] tfidf__use_idf=True, vect__max_df=0.5, vect__max_features=None, vect__ngram_range=(1, 2) 
[CV]  tfidf__use_idf=True, vect__max_df=0.5, vect__max_features=None, vect__ngram_range=(1, 2), score=0.220, total=  45.6s
[CV] tfidf__use_idf=True, vect__max_df=0.5, vect__max_features=5000, vect__ngram_range=(1, 1) 
[CV]  tfidf__use_idf=True, vect__max_df=0.5, vect__max_features=5000, vect__ngram_range=(1, 1), score=0.221, total=  42.6s
[CV] 

[CV]  tfidf__use_idf=True, vect__max_df=1.0, vect__max_features=None, vect__ngram_range=(1, 2), score=0.231, total=  45.3s
[CV] tfidf__use_idf=True, vect__max_df=1.0, vect__max_features=None, vect__ngram_range=(1, 2) 
[CV]  tfidf__use_idf=True, vect__max_df=1.0, vect__max_features=None, vect__ngram_range=(1, 2), score=0.220, total=  46.1s
[CV] tfidf__use_idf=True, vect__max_df=1.0, vect__max_features=5000, vect__ngram_range=(1, 1) 
[CV]  tfidf__use_idf=True, vect__max_df=1.0, vect__max_features=5000, vect__ngram_range=(1, 1), score=0.221, total=  43.1s
[CV] tfidf__use_idf=True, vect__max_df=1.0, vect__max_features=5000, vect__ngram_range=(1, 1) 
[CV]  tfidf__use_idf=True, vect__max_df=1.0, vect__max_features=5000, vect__ngram_range=(1, 1), score=0.235, total=  43.7s
[CV] tfidf__use_idf=True, vect__max_df=1.0, vect__max_features=5000, vect__ngram_range=(1, 1) 
[CV]  tfidf__use_idf=True, vect__max_df=1.0, vect__max_features=5000, vect__ngram_range=(1, 1), score=0.215, total=  44.3s
[CV] 

[CV]  tfidf__use_idf=False, vect__max_df=0.75, vect__max_features=5000, vect__ngram_range=(1, 1), score=0.236, total=  45.2s
[CV] tfidf__use_idf=False, vect__max_df=0.75, vect__max_features=5000, vect__ngram_range=(1, 1) 
[CV]  tfidf__use_idf=False, vect__max_df=0.75, vect__max_features=5000, vect__ngram_range=(1, 1), score=0.249, total=  44.6s
[CV] tfidf__use_idf=False, vect__max_df=0.75, vect__max_features=5000, vect__ngram_range=(1, 1) 
[CV]  tfidf__use_idf=False, vect__max_df=0.75, vect__max_features=5000, vect__ngram_range=(1, 1), score=0.235, total=  44.6s
[CV] tfidf__use_idf=False, vect__max_df=0.75, vect__max_features=5000, vect__ngram_range=(1, 2) 
[CV]  tfidf__use_idf=False, vect__max_df=0.75, vect__max_features=5000, vect__ngram_range=(1, 2), score=0.237, total=  45.3s
[CV] tfidf__use_idf=False, vect__max_df=0.75, vect__max_features=5000, vect__ngram_range=(1, 2) 
[CV]  tfidf__use_idf=False, vect__max_df=0.75, vect__max_features=5000, vect__ngram_range=(1, 2), score=0.250, t

[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed: 79.2min finished


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('vect',
                                        CountVectorizer(tokenizer=<function tokenize at 0x0000025A971C2700>)),
                                       ('tfidf', TfidfTransformer()),
                                       ('clf',
                                        MultiOutputClassifier(estimator=KNeighborsClassifier()))]),
             param_grid={'tfidf__use_idf': (True, False),
                         'vect__max_df': (0.5, 0.75, 1.0),
                         'vect__max_features': (None, 5000, 10000),
                         'vect__ngram_range': ((1, 1), (1, 2))},
             verbose=3)

In [28]:
# Predict on test set 
y_pred = pipeline.predict(X_test)
categories = y.columns.tolist()

# Test model on test set
print(classification_report(y_test, y_pred, target_names=categories, zero_division="warn"))

                        precision    recall  f1-score   support

               related       0.84      0.89      0.86      7939
               request       0.64      0.50      0.56      1811
                 offer       0.00      0.00      0.00        48
           aid_related       0.67      0.55      0.61      4378
          medical_help       0.52      0.11      0.19       848
      medical_products       0.60      0.12      0.20       530
     search_and_rescue       0.64      0.06      0.10       288
              security       0.40      0.01      0.02       188
              military       0.65      0.13      0.22       341
                 water       0.67      0.31      0.42       638
                  food       0.73      0.38      0.50      1204
               shelter       0.65      0.25      0.36       933
              clothing       0.60      0.15      0.24       163
                 money       0.53      0.09      0.16       250
        missing_people       0.60      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Save your model

In [None]:
import joblib 

joblib.dump(cv, 'DS_model.pkl')

with the help of this notebook we can build ML Pipeline in `train_classifie.py` Script

In [21]:
#Use KNN Classifier

from sklearn.neighbors import KNeighborsClassifier

#create pipeline

pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(KNeighborsClassifier()))
]) 

#Split train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Fit pipeline
pipeline.fit(X_train, y_train)

# Predict on test set 
y_pred = pipeline.predict(X_test)
categories = y.columns.tolist()

# Test model on test set
print(classification_report(y_test, y_pred, target_names=categories, zero_division="0"))
    

                        precision    recall  f1-score   support

               related       0.84      0.89      0.86      7939
               request       0.64      0.50      0.56      1811
                 offer       0.00      0.00      0.00        48
           aid_related       0.67      0.55      0.61      4378
          medical_help       0.52      0.11      0.19       848
      medical_products       0.60      0.12      0.20       530
     search_and_rescue       0.64      0.06      0.10       288
              security       0.40      0.01      0.02       188
              military       0.65      0.13      0.22       341
                 water       0.67      0.31      0.42       638
                  food       0.73      0.38      0.50      1204
               shelter       0.65      0.25      0.36       933
              clothing       0.60      0.15      0.24       163
                 money       0.53      0.09      0.16       250
        missing_people       0.60      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
