# ML Pipeline Preparation
Follow the instructions below to help you create your ML pipeline.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [1]:
# import libraries
import re
import numpy as np
import pandas as pd
import pickle
import nltk

from sqlalchemy import create_engine
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import classification_report, f1_score, accuracy_score, precision_score, recall_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from sklearn.multioutput import MultiOutputClassifier

In [2]:
# load data from database
engine = create_engine("sqlite:///C:\\Users\\DJ\\Documents\\git\\DSND_Term2\\project_files\\09a Disaster_response_pipeline\\workspace\\data\\disaster_response.db")
df = pd.read_sql_table('messages', con=engine)

In [3]:
df.head(2)

Unnamed: 0,id,message,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,1,0,0,1,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0


In [69]:
X = df['message']
y = df.iloc[:, 2:]

In [70]:
X.shape, y.shape

((26216,), (26216, 36))

In [71]:
y['related'].replace(2, y['related'].mode().iloc[0], inplace=True)

In [76]:
X = X.head(3000)
y = y.head(3000)

In [7]:
y.columns

Index(['related', 'request', 'offer', 'aid_related', 'medical_help',
       'medical_products', 'search_and_rescue', 'security', 'military',
       'child_alone', 'water', 'food', 'shelter', 'clothing', 'money',
       'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather', 'direct_report'],
      dtype='object')

In [9]:
type(X.head()), type(y)

(pandas.core.series.Series, pandas.core.frame.DataFrame)

In [10]:
X[0]

'Weather update - a cold front from Cuba that could pass over Haiti'

In [72]:
X.str.find('http').value_counts()

-1       25116
 0          36
 108        33
 117        26
 111        23
         ...  
 36          1
 1006        1
 190         1
 349         1
 3470        1
Name: message, Length: 207, dtype: int64

### 2. Write a tokenization function to process your text data

In [87]:
# Replace all urls with a urlplaceholder string
url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

In [141]:
def tokenize(text):  
    
    # Extract all the urls from the provided text 
    detected_urls = re.findall(url_regex, text)
    
    # Replace url with a url placeholder string
    for url in detected_urls:
        text = text.replace(url, urlplaceholder )
    
    # didn't remove punctuation, seems to cause error 'list out of range'
    # text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    
    # normalize text
    tokens = word_tokenize(text)
    
    # didn't remove stop words. slow down the process, also decreases model performance metrics
    # kokens = [w for w in tokens if w not in stopwords.words("english")]
    
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

print(X[0])
tokenize(X[0])

Weather update - a cold front from Cuba that could pass over Haiti


['weather',
 'update',
 '-',
 'a',
 'cold',
 'front',
 'from',
 'cuba',
 'that',
 'could',
 'pas',
 'over',
 'haiti']

In [122]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [123]:
X_train.shape, y_train.shape

((2250,), (2250, 36))

In [42]:
class StartingVerbExtractor(BaseEstimator, TransformerMixin):

    def starting_verb(self, text):
        sentence_list = nltk.sent_tokenize(text)
        for sentence in sentence_list:
            pos_tags = nltk.pos_tag(tokenize(sentence))
            first_word, first_tag = pos_tags[0]
            if first_tag in ['VB', 'VBP'] or first_word == 'RT':
                return True
        return False

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_tagged = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(X_tagged)

In [22]:
sentence = "kick ball, beat the opponent, play basketball, who i am, study hard"

In [23]:
pos_tags = nltk.pos_tag(tokenize(sentence))

In [24]:
pos_tags

[('kick', 'VB'),
 ('ball', 'NN'),
 ('beat', 'IN'),
 ('the', 'DT'),
 ('opponent', 'NN'),
 ('play', 'NN'),
 ('basketball', 'NN'),
 ('who', 'WP'),
 ('i', 'VBZ'),
 ('am', 'VBP'),
 ('study', 'RB'),
 ('hard', 'JJ')]

In [142]:
pipeline1 = Pipeline([
        ('features', FeatureUnion([

            ('text_pipeline', Pipeline([
                ('count_vectorizer', CountVectorizer(tokenizer=tokenize)),
                ('tfidf_transformer', TfidfTransformer())
            ])),

            ('starting_verb_transformer', StartingVerbExtractor())
        ])),

        ('classifier', MultiOutputClassifier(RandomForestClassifier(n_estimators=100)))
    ])

pipeline1.fit(X_train, y_train)


# Overall scores
y_pred = pipeline1.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='micro')
precision = precision_score(y_test, y_pred, average='micro')
recall = recall_score(y_test, y_pred, average='micro')
print("Random Forest (tuned)")
print("-"*27)
print(f"Accuracy: {accuracy}\nPrecision: {precision}\nRecall: {recall}\nF1 Score: {f1}")

Random Forest (tuned)
---------------------------
Accuracy: 0.21466666666666667
Precision: 0.832676303109943
Recall: 0.6428812986134596
F1 Score: 0.7255725190839696


In [138]:
pd.Series(X_train).apply(lambda x: len(x)).shape

(2250,)

In [144]:
class TextLenghExtractor(BaseEstimator, TransformerMixin):
    def character_count(self, text):
        return len(text)
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, x):
        count = pd.Series(x).apply(self.character_count)
        return pd.DataFrame(count)

len_extrac = TextLenghExtractor()
len_extrac.transform(X_train)

Unnamed: 0,message
1811,151
2687,90
641,82
1994,55
417,76
...,...
1546,74
2700,145
42,33
2079,176


In [153]:
class TextLenghExtractor(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = pd.Series(X).apply(lambda x: len(x))
        return pd.DataFrame(X)
    

len_extrac = TextLenghExtractor()
len_extrac.transform(X_train)

Unnamed: 0,message
1811,151
2687,90
641,82
1994,55
417,76
...,...
1546,74
2700,145
42,33
2079,176


In [None]:
pipeline2 = Pipeline([
    ('features', FeatureUnion([

        ('text_pipeline', Pipeline([
            ('vect', CountVectorizer(tokenizer=tokenize)),
            ('tfidf', TfidfTransformer())
        ])),
   
       ('text_length', TextLenghExtractor())
    ])),

    ('clf', MultiOutputClassifier(RandomForestClassifier(n_estimators=100)))
])

pipeline2.fit(X_train, y_train)


# Overall scores
y_pred = pipeline2.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='micro')
precision = precision_score(y_test, y_pred, average='micro')
recall = recall_score(y_test, y_pred, average='micro')
print("Random Forest (tuned)")
print("-"*27)
print(f"Accuracy: {accuracy}\nPrecision: {precision}\nRecall: {recall}\nF1 Score: {f1}")