# ML Pipeline Preparation
Follow the instructions below to help you create your ML pipeline.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [2]:
# import standard libraries
import numpy as np
import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine 
import pickle
import time

In [3]:
# import ML libraries

import sklearn
import nltk

from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, log_loss
from sklearn.datasets import make_multilabel_classification
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


In [4]:
# load data from database
engine = create_engine('sqlite:///DisasterResponse.db')
df = pd.read_sql('SELECT * FROM master', engine)
df.head(3)

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
X = df['message'].to_string()
y = df.drop(['id', 'message','original','genre'], axis=1)

In [6]:
y.shape

(26216, 36)

In [7]:
X



In [8]:
def load_data():
    
    '''Load data and split into X matric and y vector'''
    
    engine = create_engine('sqlite:///DisasterResponse.db')
    df = pd.read_sql('SELECT * FROM master', engine)
    X = df["message"].values
    Y = df.drop(["id", "message", "original", "genre"], axis=1).values
    return X, y

### 2. Write a tokenization function to process your text data

In [9]:
def tokenize(text):
    """a tokenization function to process our text data, which is splitting text into words / tokens"""
    tokenizer = RegexpTokenizer(r'[a-zA-Z]{3,}')
    tokens = tokenizer.tokenize(text)
    lemmatizer = WordNetLemmatizer()
    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)
    return clean_tokens

In [10]:
tokenize(X)

['weather',
 'update',
 'cold',
 'front',
 'from',
 'cuba',
 'that',
 'the',
 'hurricane',
 'over',
 'not',
 'over',
 'looking',
 'for',
 'someone',
 'but',
 'name',
 'report',
 'leogane',
 'destroyed',
 'only',
 'hospi',
 'say',
 'west',
 'side',
 'haiti',
 'rest',
 'the',
 'country',
 'information',
 'about',
 'the',
 'national',
 'palace',
 'storm',
 'sacred',
 'heart',
 'jesus',
 'please',
 'need',
 'tent',
 'and',
 'water',
 'are',
 'sil',
 'would',
 'like',
 'receive',
 'the',
 'message',
 'thank',
 'you',
 'croix',
 'de',
 'bouquets',
 'have',
 'health',
 'i',
 'there',
 'nothing',
 'eat',
 'and',
 'water',
 'starving',
 'petionville',
 'need',
 'more',
 'information',
 'thomassin',
 'number',
 'the',
 'area',
 'named',
 'let',
 'together',
 'need',
 'food',
 'delma',
 'more',
 'information',
 'the',
 'number',
 'order',
 'comitee',
 'delmas',
 'rue',
 'street',
 'janvier',
 'need',
 'food',
 'and',
 'water',
 'klecin',
 'are',
 'are',
 'you',
 'going',
 'call',
 'you',
 'want',

In [11]:
def display_results(y_test, y_pred):
    labels = np.unique(y_pred)
    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
    accuracy = (y_pred == y_test).mean()

    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)

### 3. Build a machine learning pipeline
This machine pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset. You may find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

In [11]:
df.sample(5)

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
18938,22127,- Flooded after rain - Lao Cai urgently evacua...,,news,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7723,8692,Is still there help in food for the poor,Eske gen ed manje jiskaprezan pou ti malere yo,direct,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
11748,13556,ofyaesman I am fine because I live on the capi...,,social,1,0,0,1,0,1,...,0,0,1,1,0,0,0,0,1,1
215,259,Road off of 200/2 to south of central Leogane....,Deslandes ( LeoganeHaiti ) de puis le. 12 janv...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
24445,28265,A team from Telecommunication Sans Frontières ...,,news,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
def find_classifier(): 
    
    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3)
    
    classifiers = [
        KNeighborsClassifier(36),
        DecisionTreeClassifier(),
        RandomForestClassifier(36),
        ExtraTreeClassifier(),
        ExtraTreesClassifier(36),
        RadiusNeighborsClassifier(36)
        ]
    
    for classifier in classifiers:
        pipe = Pipeline(steps=[
            ('vect', CountVectorizer(tokenizer=tokenize)),
            ('tfidf', TfidfTransformer()),
            ('clf', classifier)])

        pipe.fit(X_train, y_train)   
        print(classifier)
        print("model score: %.3f" % pipe.score(X_test, y_test))

In [13]:
find_classifier()

KNeighborsClassifier(n_neighbors=36)
model score: 0.248
DecisionTreeClassifier()
model score: 0.254
RandomForestClassifier(n_estimators=36)
model score: 0.280
ExtraTreeClassifier()
model score: 0.197
ExtraTreesClassifier(n_estimators=36)
model score: 0.277
RadiusNeighborsClassifier(radius=36)
model score: 0.192


In [14]:
def build_model():
    """building model pipeline for feature prediction using the best score classifier aka clf,
        based on the output of the previous function"""
    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3)
    classifier = ExtraTreesClassifier(36) #use another classifier with best score accordingly
    # build pipeline
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(classifier, n_jobs=-1))
        ])
    
    # train classifier
    pipeline.fit(X_train,y_train)
    
    # predict on test data
    y_pred = pipeline.predict(X_test)
    
    #display_results(y_test, y_pred)
    #print(f"x_train:{X_train.shape}"),print(f"x_test: {X_test.shape}") , print(f"y_train: {y_train.shape}"), print(f"y_test:{y_test.shape}")
    print(classification_report(y_test, y_pred))
 

The recall means "how many of this class you find over the whole number of element of this class"

The precision will be "how many are correctly classified among that class"

The f1-score is the harmonic mean between precision & recall

The support is the number of occurence of the given class in your dataset

In [15]:
build_model()

              precision    recall  f1-score   support

           0       0.82      0.96      0.89      5916
           1       0.85      0.40      0.55      1359
           2       0.00      0.00      0.00        38
           3       0.77      0.63      0.69      3267
           4       0.62      0.09      0.15       607
           5       0.73      0.10      0.17       388
           6       0.40      0.05      0.08       211
           7       0.00      0.00      0.00       143
           8       0.58      0.05      0.10       279
           9       0.00      0.00      0.00         0
          10       0.86      0.32      0.47       469
          11       0.88      0.43      0.58       867
          12       0.83      0.27      0.41       695
          13       0.80      0.12      0.21       135
          14       0.44      0.02      0.04       175
          15       1.00      0.03      0.05        76
          16       0.67      0.05      0.09       246
          17       0.88    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 6. Improve your model
Use grid search to find better parameters. 

In [30]:
def grid_search():
    """Using grid search to find better parameters"""
    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3)
    
    classifier = ExtraTreesClassifier(36) #use another classifier with best score accordingly
        
    pipeline = Pipeline([
            ('vect', CountVectorizer(tokenizer=tokenize)),
            ('tfidf', TfidfTransformer()),
            ('clf', MultiOutputClassifier(classifier, n_jobs=-1))
            ])
    
    
    parameters = {'clf__estimator__criterion': ["gini", "entropy"],     
        'clf__estimator__n_jobs':[-1,1],
        'clf__estimator__max_features': ['auto', 'sqrt', 'log2'],
        'clf__estimator__max_depth' : [2,4,5,6,7,8]}
        
    cv = GridSearchCV(
        pipeline,
        parameters,
        n_jobs=1
    )
    
    cv.fit(X_train, y_train)
        
    #return cv
    print(cv.best_params_)    
    print(cv.best_score_)

In [31]:
start = time.time()
grid_search()
end = time.time()
print(end - start)

{'clf__estimator__criterion': 'gini', 'clf__estimator__max_depth': 2, 'clf__estimator__max_features': 'auto', 'clf__estimator__n_jobs': -1}
0.19562986126626175
1105.13197183609


In [34]:
def build_model2():
    """building model pipeline for feature prediction using best params defined with the grid_search function"""

    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3)
    classifier = ExtraTreesClassifier(36, criterion='gini', max_depth=2,max_features='auto', n_jobs=-1)
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(classifier))
        ])
    
    # train classifier
    pipeline.fit(X_train,y_train)
    
    # predict on test data
    y_pred = pipeline.predict(X_test)
    
    #display results
    #display_results(y_test, y_pred)
    #print(f"x_train:{X_train.shape}"),print(f"x_test: {X_test.shape}") , print(f"y_train: {y_train.shape}"), print(f"y_test:{y_test.shape}")
    print(classification_report(y_test, y_pred))
    display_results(y_test, y_pred)

In [33]:
build_model2()

              precision    recall  f1-score   support

           0       0.76      1.00      0.86      5940
           1       0.00      0.00      0.00      1380
           2       0.00      0.00      0.00        36
           3       0.00      0.00      0.00      3224
           4       0.00      0.00      0.00       617
           5       0.00      0.00      0.00       384
           6       0.00      0.00      0.00       212
           7       0.00      0.00      0.00       130
           8       0.00      0.00      0.00       251
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00       505
          11       0.00      0.00      0.00       857
          12       0.00      0.00      0.00       675
          13       0.00      0.00      0.00       120
          14       0.00      0.00      0.00       194
          15       0.00      0.00      0.00        99
          16       0.00      0.00      0.00       257
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Note: doesn't seem like this model is better than the original one.

## Export your model as a pickle file

In [14]:
X, y = load_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3)
classifier = ExtraTreesClassifier(36, criterion='gini', max_depth=2,max_features='auto', n_jobs=-1)

model = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(classifier, n_jobs=-1))
    ])
model.fit(X_train,y_train)

filename = 'classifier.pkl'
pickle.dump(model, open(filename, 'wb'))

In [15]:
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)

0.19529561347743166
