# ML Pipeline Preparation
Follow the instructions below to help you create your ML pipeline.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [6]:
# import standard libraries
import numpy as np
import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine 
import pickle
import time

In [7]:
# import ML libraries

import sklearn
import nltk

from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, log_loss
from sklearn.datasets import make_multilabel_classification
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


In [8]:
# Load data from database.
# Connect to the SQLite file DisasterResponse.db and read every row of the
# `master` table into a DataFrame.
engine = create_engine('sqlite:///DisasterResponse.db')
df = pd.read_sql('SELECT * FROM master', engine)
# Preview the first three rows.
df.head(3)

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Render the whole `message` column as ONE string (Series.to_string); this is
# the value later passed to tokenize() as a quick demo.
# NOTE(review): load_data() uses `.values` instead (one entry per message) --
# confirm the single-string form is intended only for exploration.
X = df['message'].to_string()
# Targets: every column that remains after dropping the four columns listed here.
y = df.drop(['id', 'message','original','genre'], axis=1)

In [12]:
# Check the dimensions (rows, columns) of the target frame.
y.shape

(26216, 36)

In [None]:
# Display the current value of X.
X

In [None]:
def load_data():
    """Load the messages table and split it into features and targets.

    Reads every row of the ``master`` table from the SQLite database
    ``DisasterResponse.db``.

    Returns:
        tuple: ``(X, Y)`` where ``X`` is a NumPy array holding the ``message``
        column and ``Y`` is a NumPy array holding every column other than
        ``id``, ``message``, ``original`` and ``genre``.
    """
    engine = create_engine('sqlite:///DisasterResponse.db')
    df = pd.read_sql('SELECT * FROM master', engine)
    X = df["message"].values
    Y = df.drop(["id", "message", "original", "genre"], axis=1).values
    # Return the locally built ``Y``; a lower-case ``y`` here would silently
    # pick up whatever module-level ``y`` exists (or raise NameError).
    return X, Y

### 2. Write a tokenization function to process your text data

In [None]:
def tokenize(text):
    """Split text into lower-cased, lemmatized word tokens.

    Only runs of three or more ASCII letters are kept as tokens; digits,
    punctuation and shorter words are discarded.

    Args:
        text (str): Raw message text.

    Returns:
        list[str]: The cleaned tokens, in order of appearance.
    """
    tokenizer = RegexpTokenizer(r'[a-zA-Z]{3,}')
    lemmatizer = WordNetLemmatizer()
    # Lower-case BEFORE lemmatizing: the WordNet lookup is case-sensitive, so a
    # capitalised token such as "Floods" would otherwise come back unchanged.
    return [lemmatizer.lemmatize(tok.lower()) for tok in tokenizer.tokenize(text)]

In [None]:
# Try the tokenizer on X and show the resulting token list.
tokenize(X)

In [None]:
def display_results(y_test, y_pred):
    """Print the predicted labels, confusion matrix(es) and overall accuracy.

    Args:
        y_test: True labels, either 1-D (single output) or 2-D with one
            column per output.
        y_pred: Predicted labels with the same shape as ``y_test``.

    Returns:
        None. Results are printed.
    """
    # Imported locally because ``confusion_matrix`` is not among the names the
    # notebook imports from sklearn.metrics.
    from sklearn.metrics import confusion_matrix

    y_test = np.asarray(y_test)
    y_pred = np.asarray(y_pred)
    labels = np.unique(y_pred)
    # Element-wise agreement, averaged over every sample (and output column).
    accuracy = (y_pred == y_test).mean()

    print("Labels:", labels)
    if y_test.ndim == 1:
        print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred, labels=labels))
    else:
        # confusion_matrix rejects multi-output targets, so report one matrix
        # per output column instead.
        for col in range(y_test.shape[1]):
            col_matrix = confusion_matrix(y_test[:, col], y_pred[:, col], labels=labels)
            print("Confusion Matrix (output %d):\n" % col, col_matrix)
    print("Accuracy:", accuracy)

### 3. Build a machine learning pipeline
This machine learning pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset. You may find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

In [13]:
# Show five randomly sampled rows of the dataframe.
df.sample(5)

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
4451,5079,I don't need this incompetent government's mes...,Mwn pa need mesaj gouvneman incomptan sa. Plea...,direct,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20214,23541,It has become clear that the RUF/AFRC leadersh...,,news,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8188,9189,ERT GLASS-CONDUCTEUR CAR AND TRUCK -OPERATOR-T...,ERT GLASS-CONDUCTEUR CAR AND TRUCK -OPERATOR-T...,direct,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13171,15739,Local administrative bodies are trained to qui...,,news,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4789,5450,OPDF Organ. pour Developpement Fort Royal Hait...,0. P. D. F 0RGARNISATI0NP0UR. LEDEVL0PPEMEN T....,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
def find_classifier():
    """Fit several candidate classifiers and print each one's test score.

    Loads the data with ``load_data()``, holds out 30% of it for testing, then
    for every candidate builds a CountVectorizer -> TfidfTransformer ->
    classifier pipeline, fits it on the training split and prints the
    classifier followed by the pipeline's score on the held-out split.
    """
    features, targets = load_data()
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=.3
    )

    candidates = (
        KNeighborsClassifier(36),
        DecisionTreeClassifier(),
        RandomForestClassifier(36),
        ExtraTreeClassifier(),
        ExtraTreesClassifier(36),
        RadiusNeighborsClassifier(36),
    )

    for candidate in candidates:
        # Same text-processing front end for every candidate; only the final
        # estimator changes.
        stages = [
            ('vect', CountVectorizer(tokenizer=tokenize)),
            ('tfidf', TfidfTransformer()),
            ('clf', candidate),
        ]
        model = Pipeline(steps=stages)
        model.fit(X_train, y_train)

        print(candidate)
        print("model score: %.3f" % model.score(X_test, y_test))

In [None]:
# Run the classifier comparison; scores are printed, nothing is returned.
find_classifier()

In [None]:
def build_model():
    """Train the ExtraTrees text pipeline and print a classification report.

    Loads the data with ``load_data()``, holds out 30% for testing, fits a
    CountVectorizer -> TfidfTransformer -> MultiOutputClassifier pipeline
    wrapping an ``ExtraTreesClassifier`` on the training split, and prints
    ``classification_report`` for the predictions on the held-out split.
    """
    messages, labels = load_data()
    X_train, X_test, y_train, y_test = train_test_split(
        messages, labels, test_size=.3
    )

    # Swap in a different estimator here if another one scored better.
    base_estimator = ExtraTreesClassifier(36)
    stages = [
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(base_estimator, n_jobs=-1)),
    ]
    pipeline = Pipeline(stages)

    # Fit on the training split, then predict the held-out messages.
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)

    print(classification_report(y_test, predictions))
 

Recall: of all the samples that truly belong to a class, the fraction that the model correctly assigns to it.

Precision: of all the samples that the model assigns to a class, the fraction that truly belong to it.

The f1-score is the harmonic mean of precision and recall.

The support is the number of occurrences of the given class in the true labels being evaluated.

In [None]:
# Train the pipeline and print its classification report.
build_model()

### 6. Improve your model
Use grid search to find better parameters. 

In [None]:
def grid_search():
    """Grid-search ExtraTrees hyper-parameters for the text pipeline.

    Loads the data with ``load_data()``, keeps 70% of it for the search, and
    runs ``GridSearchCV`` over the split criterion, ``max_features`` and
    ``max_depth`` of the ``ExtraTreesClassifier`` wrapped by the pipeline's
    ``MultiOutputClassifier``.

    Returns:
        None. The best parameter combination and its score are printed.
    """
    X, y = load_data()
    # Only the training split is used; the held-out part is discarded here.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=.3)

    classifier = ExtraTreesClassifier(n_estimators=36)  # swap in the best-scoring classifier if different

    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(classifier, n_jobs=-1)),
    ])

    # ``n_jobs`` is deliberately not searched: it only controls parallelism,
    # not model quality, and would double the number of fits. ``'auto'`` is
    # omitted because it was an alias of ``'sqrt'`` for classifiers and is
    # rejected by scikit-learn >= 1.3.
    parameters = {
        'clf__estimator__criterion': ["gini", "entropy"],
        'clf__estimator__max_features': ['sqrt', 'log2'],
        'clf__estimator__max_depth': [2, 4, 5, 6, 7, 8],
    }

    cv = GridSearchCV(
        pipeline,
        parameters,
        n_jobs=1,
    )

    cv.fit(X_train, y_train)

    print(cv.best_params_)
    print(cv.best_score_)

In [None]:
# Time the grid search: take wall-clock timestamps before and after the call
# and print the elapsed seconds.
start = time.time()
grid_search()
end = time.time()
print(end - start)

In [None]:
def build_model2():
    """Train the pipeline with the parameters chosen by ``grid_search``.

    Loads the data with ``load_data()``, holds out 30% for testing, fits a
    CountVectorizer -> TfidfTransformer -> MultiOutputClassifier pipeline
    around a tuned ``ExtraTreesClassifier``, then prints the classification
    report and the output of ``display_results`` for the held-out split.

    Returns:
        None. Results are printed.
    """
    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)

    # ``max_features='sqrt'`` is what ``'auto'`` meant for classifiers;
    # ``'auto'`` itself is rejected by scikit-learn >= 1.3.
    classifier = ExtraTreesClassifier(
        n_estimators=36,
        criterion='gini',
        max_depth=2,
        max_features='sqrt',
        n_jobs=-1,
    )
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(classifier)),
    ])

    # Train the classifier.
    pipeline.fit(X_train, y_train)

    # Predict on the held-out data.
    y_pred = pipeline.predict(X_test)

    # Report per-class metrics, then labels / confusion matrix / accuracy.
    print(classification_report(y_test, y_pred))
    display_results(y_test, y_pred)

In [None]:
# Train and evaluate the pipeline that uses the grid-search parameters.
build_model2()

Note: this tuned model does not appear to perform better than the original one.

## Export your model as a pickle file

In [None]:
# Rebuild the train/test split and fit the final pipeline that will be exported.
X, y = load_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3)
# ``max_features='sqrt'`` is what ``'auto'`` meant for classifiers; ``'auto'``
# itself is rejected by scikit-learn >= 1.3.
classifier = ExtraTreesClassifier(n_estimators=36, criterion='gini', max_depth=2, max_features='sqrt', n_jobs=-1)

model = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(classifier, n_jobs=-1))
    ])
model.fit(X_train,y_train)

filename = 'classifier.pkl'
# Use a context manager so the file is flushed and closed even if dumping fails.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Reload the exported pipeline and score it on the held-out split.
# NOTE: unpickling can execute arbitrary code -- only load model files that
# come from a trusted source.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.score(X_test, y_test)
print(result)