# ML Pipeline Preparation
Follow the instructions below to help you create your ML pipeline.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [55]:
# import libraries
import nltk
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger'])

import sqlite3

import re
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.pipeline import Pipeline 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
# from sklearn.pipeline import Pipeline, FeatureUnion
# from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


[nltk_data] Downloading package punkt to C:\Users\The
[nltk_data]     Godfather\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\The
[nltk_data]     Godfather\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\The Godfather\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [56]:
# load data from database
# NOTE: the SQLite loading path below is disabled; this cell reads a CSV
# export from a machine-specific absolute path instead.
# engine = create_engine('sqlite:///DisasterResponse.db')
# df = pd.read_sql_table('disasters', engine)
# 'id' and 'message' are forced to string dtype by the dtype mapping.
df = pd.read_csv(r"C:\Users\The Godfather\Desktop\disasters.csv", dtype = {'id':'str', 'message':'str'})
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [64]:
# Targets: the trailing 36 columns of the frame (one per category).
y = df.iloc[:, -36:]
# Features: the raw message text as an array.
X = df["message"].values

### 2. Write a tokenization function to process your text data

In [65]:
def tokenize(text):
    """
    Splits a message into word tokens and normalizes each of them.

    INPUTS:
        text - the message string to tokenize
    RETURNS:
        list of tokens, each lemmatized, then lower-cased and stripped
    """
    wordnet = WordNetLemmatizer()
    return [
        wordnet.lemmatize(word).lower().strip()
        for word in word_tokenize(text)
    ]

### 3. Build a machine learning pipeline
This machine learning pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset. You may find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

In [66]:
# Text-classification pipeline: token counts -> TF-IDF weighting ->
# a random forest wrapped for multi-output prediction.
pipeline = Pipeline(
    steps=[
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier())),
    ]
)

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [67]:
# Split with train_test_split's default settings (no random_state is set,
# so the split differs from run to run), then fit on the training part.
X_train, X_test, y_train, y_test = train_test_split(X, y)
pipeline.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip..._score=False, random_state=None, verbose=0,
            warm_start=False),
           n_jobs=None))])

### 5. Test your model
Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.

In [69]:
#predict using the fitted model
y_pred = pipeline.predict(X_test)

In [83]:
# Convert the true labels to an ndarray so columns can be taken via .T.
y_test = np.asarray(y_test)
# Walk the true and predicted columns in lockstep, one report per column.
for i, (true_col, pred_col) in enumerate(zip(y_test.T, y_pred.T)):
    print("------nth column : ", i)
    print(classification_report(true_col, pred_col))

------nth column :  0
              precision    recall  f1-score   support

           0       0.61      0.35      0.45      1535
           1       0.82      0.93      0.87      4973
           2       0.50      0.20      0.28        46

   micro avg       0.79      0.79      0.79      6554
   macro avg       0.64      0.49      0.53      6554
weighted avg       0.77      0.79      0.77      6554

------nth column :  1
              precision    recall  f1-score   support

           0       0.88      0.99      0.93      5442
           1       0.84      0.37      0.51      1112

   micro avg       0.88      0.88      0.88      6554
   macro avg       0.86      0.68      0.72      6554
weighted avg       0.88      0.88      0.86      6554

------nth column :  2
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      6513
           1       0.00      0.00      0.00        41

   micro avg       0.99      0.99      0.99      6554
   macro

              precision    recall  f1-score   support

           0       0.96      0.99      0.98      5958
           1       0.89      0.57      0.69       596

   micro avg       0.95      0.95      0.95      6554
   macro avg       0.92      0.78      0.83      6554
weighted avg       0.95      0.95      0.95      6554

------nth column :  33
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      6419
           1       0.76      0.14      0.24       135

   micro avg       0.98      0.98      0.98      6554
   macro avg       0.87      0.57      0.61      6554
weighted avg       0.98      0.98      0.98      6554

------nth column :  34
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      6191
           1       0.54      0.02      0.04       363

   micro avg       0.94      0.94      0.94      6554
   macro avg       0.74      0.51      0.50      6554
weighted avg       0.92     

### 6. Improve your model
Use grid search to find better parameters. 

In [84]:
RandomForestClassifier().get_params().keys()

dict_keys(['bootstrap', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])

In [86]:
# Hyper-parameter grid for the random forest wrapped by MultiOutputClassifier.
# Keys follow <pipeline step>__estimator__<forest parameter>.
parameters = {
    'clf__estimator__n_estimators': [10, 20],
    # 'auto' is an alias of 'sqrt' for RandomForestClassifier, so listing both
    # fits identical candidates twice; compare two distinct settings instead.
    'clf__estimator__max_features': ['sqrt', 'log2'],
    # Further forest parameters to try (note the required 'estimator__' part):
    # 'clf__estimator__max_depth': [20, 50, 100],
    # 'clf__estimator__min_samples_leaf': [1, 3, 5],
    }

cv = GridSearchCV(estimator=pipeline, param_grid=parameters, cv=None, n_jobs=-1, verbose=10)

### 7. Test your model
Show the accuracy, precision, and recall of the tuned model.  

Since this project focuses on code quality, process, and pipelines, there is no minimum performance metric needed to pass. However, make sure to fine-tune your models for accuracy, precision and recall to make your project stand out - especially for your portfolio!

In [87]:
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done   3 out of  12 | elapsed:  1.2min remaining:  3.6min
[Parallel(n_jobs=-1)]: Done   5 out of  12 | elapsed:  1.2min remaining:  1.7min
[Parallel(n_jobs=-1)]: Done   7 out of  12 | elapsed:  1.8min remaining:  1.3min
[Parallel(n_jobs=-1)]: Done   9 out of  12 | elapsed:  2.1min remaining:   41.2s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  2.5min finished


In [88]:
# Class labels that fix the row/column order of every confusion matrix.
# Defined here because no other cell defines it; the first target column
# contains a third value (2) besides 0 and 1, hence three labels.
labels1 = [0, 1, 2]
for i, (true_col, pred_col) in enumerate(zip(y_test.T, y_pred.T)):
    print("------nth column : ", i)
    print(confusion_matrix(true_col, pred_col, labels=labels1))
    

------nth column :  0
[[ 466 1068    1]
 [ 231 4736    6]
 [   3   37    6]]
------nth column :  1
[[5356   86    0]
 [ 653  459    0]
 [   0    0    0]]
------nth column :  2
[[6513    0    0]
 [  41    0    0]
 [   0    0    0]]
------nth column :  3
[[3403  444    0]
 [1206 1501    0]
 [   0    0    0]]
------nth column :  4
[[6015   14    0]
 [ 495   30    0]
 [   0    0    0]]
------nth column :  5
[[6205    6    0]
 [ 323   20    0]
 [   0    0    0]]
------nth column :  6
[[6352    1    0]
 [ 184   17    0]
 [   0    0    0]]
------nth column :  7
[[6444    1    0]
 [ 108    1    0]
 [   0    0    0]]
------nth column :  8
[[6337    8    0]
 [ 201    8    0]
 [   0    0    0]]
------nth column :  9
[[6554    0    0]
 [   0    0    0]
 [   0    0    0]]
------nth column :  10
[[6129   13    0]
 [ 338   74    0]
 [   0    0    0]]
------nth column :  11
[[5797   43    0]
 [ 421  293    0]
 [   0    0    0]]
------nth column :  12
[[5907   38    0]
 [ 440  169    0]
 [   0    0    

### 8. Try improving your model further. Here are a few ideas:
* try other machine learning algorithms

In [92]:
# Import here so this cell does not depend on a later cell having been run.
from sklearn.tree import DecisionTreeClassifier

DecisionTreeClassifier().get_params().keys()

dict_keys(['class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'presort', 'random_state', 'splitter'])

In [93]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [94]:
from sklearn.tree import DecisionTreeClassifier

In [95]:
# Rebuild the text pipeline with a decision tree as the wrapped estimator.
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(DecisionTreeClassifier()))
     ])

# defining parameters for the decision tree classifier (only max_depth is searched)
parameters_1 = {
    'clf__estimator__max_depth': [10,20]  
    }

cv = GridSearchCV(estimator=pipeline, param_grid=parameters_1, cv=None, n_jobs=-1,verbose=10)

In [96]:
cv.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Done   2 out of   6 | elapsed:   42.1s remaining:  1.4min
[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:   42.2s remaining:   42.2s
[Parallel(n_jobs=-1)]: Done   4 out of   6 | elapsed:   53.9s remaining:   26.9s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   55.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   55.1s finished


GridSearchCV(cv=None, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip..._leaf=0.0, presort=False, random_state=None,
            splitter='best'),
           n_jobs=None))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'clf__estimator__max_depth': [10, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=10)

In [97]:
y_pred = cv.predict(X_test)

In [102]:
# Work on a plain ndarray so true labels can be sliced by column position.
y_test = np.asarray(y_test)
# One classification report per predicted output column.
for i in range(y_pred.shape[1]):
    print("------nth column : ", i)
    print(classification_report(y_test[:, i], y_pred[:, i]))

------nth column :  0
              precision    recall  f1-score   support

           0       0.57      0.35      0.44      1548
           1       0.81      0.92      0.86      4966
           2       0.00      0.00      0.00        40

   micro avg       0.78      0.78      0.78      6554
   macro avg       0.46      0.42      0.43      6554
weighted avg       0.75      0.78      0.76      6554

------nth column :  1
              precision    recall  f1-score   support

           0       0.90      0.97      0.93      5398
           1       0.77      0.51      0.61      1156

   micro avg       0.89      0.89      0.89      6554
   macro avg       0.83      0.74      0.77      6554
weighted avg       0.88      0.89      0.88      6554

------nth column :  2
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      6521
           1       0.00      0.00      0.00        33

   micro avg       0.99      0.99      0.99      6554
   macro

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      6402
           1       0.68      0.36      0.47       152

   micro avg       0.98      0.98      0.98      6554
   macro avg       0.83      0.68      0.73      6554
weighted avg       0.98      0.98      0.98      6554

------nth column :  34
              precision    recall  f1-score   support

           0       0.96      0.99      0.97      6254
           1       0.35      0.11      0.17       300

   micro avg       0.95      0.95      0.95      6554
   macro avg       0.65      0.55      0.57      6554
weighted avg       0.93      0.95      0.94      6554

------nth column :  35
              precision    recall  f1-score   support

           0       0.87      0.95      0.91      5242
           1       0.69      0.43      0.53      1312

   micro avg       0.85      0.85      0.85      6554
   macro avg       0.78      0.69      0.72      6554
weighted avg       0.83     

### 9. Use the space below to show the script in `train_classifier.py`

In [None]:
# import libraries

import sys
import nltk
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger'])

import sqlite3

import re
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.pipeline import Pipeline 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
# from sklearn.pipeline import Pipeline, FeatureUnion
# from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer





def load_data(database_filepath, table_name='disasters'):
    """
    Loads the messages table from the SQLite database at database_filepath.

    INPUTS:
        database_filepath - path to the SQLite database file
        table_name - name of the table to read (defaults to 'disasters')
    RETURNS:
        X - inputs to be used for modeling (array of message strings)
        Y - labels for modeling (DataFrame holding the last 36 columns)
        category_names - Names of label categories
    """
    # Double any embedded quote so the identifier stays a single SQL token.
    quoted_table = '"{}"'.format(table_name.replace('"', '""'))

    # A sqlite3 connection used as a context manager only manages the
    # transaction, so close it explicitly once the table has been read.
    connection = sqlite3.connect(database_filepath)
    try:
        df = pd.read_sql_query('SELECT * FROM {}'.format(quoted_table), connection)
    finally:
        connection.close()

    # Generate data and labels
    X = df.message.values
    Y = df.iloc[:, -36:]

    # Get names of all categories
    category_names = Y.columns.tolist()

    return X, Y, category_names





def tokenize(text):
    """
    Splits a message into normalized, lemmatized word tokens.

    INPUTS:
        text - the message string to tokenize
    RETURNS:
        clean_tokens - list of lower-cased, lemmatized tokens
    """
    lemmatizer = WordNetLemmatizer()

    # Normalize case BEFORE lemmatizing: the WordNet lookup is case-sensitive,
    # so a capitalized token such as "Houses" would otherwise pass through
    # unlemmatized and only be lower-cased afterwards.
    clean_tokens = [
        lemmatizer.lemmatize(tok.lower().strip())
        for tok in word_tokenize(text)
    ]

    return clean_tokens




def build_model():
    """
    Creates the model pipeline wrapped in a grid search for parameter
    optimization. The search is only run when the returned object is fitted.

    RETURNS:
        cv - unfitted GridSearchCV over the text-classification pipeline
    """
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier()))
        ])

    # Keys follow <pipeline step>__estimator__<forest parameter> because the
    # forest is wrapped by MultiOutputClassifier.
    parameters = {
        'clf__estimator__n_estimators': [10, 20],
        # 'auto' is an alias of 'sqrt' for RandomForestClassifier, so listing
        # both fits identical candidates twice; use two distinct settings.
        'clf__estimator__max_features': ['sqrt', 'log2'],
        # Further forest parameters to try:
        # 'clf__estimator__max_depth': [20, 50, 100],
        # 'clf__estimator__min_samples_leaf': [1, 3, 5],
        }

    cv = GridSearchCV(estimator=pipeline, param_grid=parameters, n_jobs=-1, verbose=10)
    return cv
    



def evaluate_model(model, X_test, Y_test, category_names):
    """
    Evaluate the performance of the model using test data.

    Prints one classification report (precision, recall, f1-score) per
    category.

    INPUTS:
        model - the optimal model
        X_test - test features
        Y_test - test labels, one column per category
        category_names - the names of all categories, in column order
    """
    Y_pred = np.asarray(model.predict(X_test))
    # Accept either a DataFrame or an array for the true labels.
    Y_true = np.asarray(Y_test)

    for i, name in enumerate(category_names):
        print("------ category : ", name)
        print(classification_report(Y_true[:, i], Y_pred[:, i]))
    
    
    

def save_model(model, model_filepath):
    """
    Serialize the trained model to disk with pickle.

    INPUTS:
        model - the fitted model object to persist
        model_filepath - destination path of the pickle file
    """
    # Imported locally so the function does not rely on a module-level import.
    import pickle

    # The context manager guarantees the file is flushed and closed.
    with open(model_filepath, 'wb') as model_file:
        pickle.dump(model, model_file)


def main():
    """
    Command-line entry point: load data, train, evaluate and save the model.

    Expects exactly two command-line arguments (database path, then pickle
    path); with any other count it prints a usage message and does nothing
    else.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print(
            'Please provide the filepath of the disaster messages database '
            'as the first argument and the filepath of the pickle file to '
            'save the model to as the second argument. \n\nExample: python '
            'train_classifier.py ../data/DisasterResponse.db classifier.pkl'
        )
        return

    database_filepath, model_filepath = cli_args

    print('Loading data...\n    DATABASE: {}'.format(database_filepath))
    X, Y, category_names = load_data(database_filepath)
    # Keep 20% of the rows aside for evaluation.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

    print('Building model...')
    model = build_model()

    print('Training model...')
    model.fit(X_train, Y_train)

    print('Evaluating model...')
    evaluate_model(model, X_test, Y_test, category_names)

    print('Saving model...\n    MODEL: {}'.format(model_filepath))
    save_model(model, model_filepath)

    print('Trained model saved!')


if __name__ == '__main__':
    main()
