# 1. Importing libraries

In [4]:
import pandas as pd
from sqlalchemy import create_engine, text

In [5]:
import re
import nltk
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger', 'omw-1.4'])

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [6]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import SVC

In [7]:
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator,TransformerMixin

# 2. Loading data

In [8]:
# Load the `response` table from the SQLite database into a DataFrame.
sql_path = r'data/DisasterResponse'
engine = create_engine('sqlite:///' + sql_path)
query = 'SELECT * FROM response'

# Open the connection in a context manager so it is closed as soon as the
# query has been read, instead of staying open for the life of the kernel.
with engine.connect() as connection:
    df = pd.read_sql_query(sql=text(query), con=connection)

df.head()

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Split input and output
# X: the raw message text used as the model input.
# y: every column from position 4 onward. Going by the df.head() preview, the
#    columns before that are id, message, original and genre, so y starts at
#    'related' and holds the category label columns.
X = df['message']
y = df.iloc[:, 4:]

# 2. Write a tokenization function to process the text data

In [12]:
def tokenize(text):
    """
    Turn a raw message into a list of normalized tokens.

    Steps: replace every URL with the literal token 'urlplaceholder',
    split the text with NLTK's word_tokenize, then lemmatize, lower-case
    and strip each token.

    Parameters
    ----------
    text : str
        The message to tokenize.

    Returns
    -------
    list of str
        One cleaned token per token produced by word_tokenize.
    """
    # Raw string: '\(' and '\)' are regex escapes, not valid Python string
    # escapes, so a plain string literal triggers an invalid-escape warning.
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

    # Substitute all URLs in one pass. Replacing each found URL with
    # str.replace() one after another can mangle a later URL that contains an
    # earlier one as a prefix (leaving e.g. 'urlplaceholder/rest' behind).
    text = re.sub(url_regex, 'urlplaceholder', text)

    # tokenize text
    tokens = word_tokenize(text)

    # initiate lemmatizer
    lemmatizer = WordNetLemmatizer()

    # lemmatize, normalize case, and remove leading/trailing white space
    return [lemmatizer.lemmatize(tok).lower().strip() for tok in tokens]

# 3. Training/Testing

In [13]:
# Split the dataset: 80% for training, 20% held out for testing.
# A fixed random_state makes the split, and therefore every score reported
# below, reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Build a machine learning pipeline
#   vect  : token counts per message, using the custom tokenize() function
#   tfidf : re-weights those counts with TF-IDF
#   clf   : MultiOutputClassifier wrapping a RandomForestClassifier, so one
#           forest is fitted per output column of y
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(RandomForestClassifier(verbose=True)))
])

In [15]:
# Training with X_train, y_train
# A single fit() call fits every step of the pipeline in order
# (vect -> tfidf -> clf).
pipeline.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   39.2s finished


In [None]:
# Predict X_test
# y_pred is later passed to get_score, which reads it column by column
# (y_pred[:, i]), one column per category.
y_pred = pipeline.predict(X_test)

In [17]:
# define the function to get the 'acc, f1, precision, recall' table
def get_score(y_true, y_pred, target_names):
    """
    Build a per-category table of accuracy, F1, precision and recall.

    Parameters
    ----------
    y_true : 2-D array
        Ground-truth labels, indexed positionally as y_true[:, i].
    y_pred : 2-D array
        Predicted labels, indexed positionally as y_pred[:, i].
    target_names : iterable of str
        Name of each output column, in the same order as the columns of
        y_true / y_pred.

    Returns
    -------
    pandas.DataFrame
        One row per target with the columns 'index' (the target name),
        'Accuracy', 'F1 Score', 'Precision' and 'Recall'. F1, precision and
        recall use average='weighted'.
    """
    # Collect plain dicts and build the DataFrame once at the end.
    # DataFrame.append was removed in pandas 2.0, and growing a frame row by
    # row copies it on every iteration.
    records = []
    for i, target in enumerate(target_names):
        true_col = y_true[:, i]
        pred_col = y_pred[:, i]
        records.append({
            'index': target,
            'Accuracy': accuracy_score(true_col, pred_col),
            'F1 Score': f1_score(true_col, pred_col, average='weighted'),
            'Precision': precision_score(true_col, pred_col, average='weighted'),
            'Recall': recall_score(true_col, pred_col, average='weighted'),
        })

    # Explicit column list keeps the layout stable even when there are no targets.
    return pd.DataFrame(
        records,
        columns=['index', 'Accuracy', 'F1 Score', 'Precision', 'Recall'],
    )

In [18]:
# get the test score
# y_test is passed as .values because get_score indexes its inputs
# positionally ([:, i]); y.columns supplies the name shown for each row.
res = get_score(y_test.values, y_pred, y.columns)
display(res)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,index,Accuracy,F1 Score,Precision,Recall
0,related,0.802632,0.765834,0.787374,0.802632
1,request,0.89016,0.873831,0.88986,0.89016
2,offer,0.994851,0.992284,0.989729,0.994851
3,aid_related,0.770786,0.76414,0.774588,0.770786
4,medical_help,0.92868,0.897534,0.906804,0.92868
5,medical_products,0.948894,0.926963,0.936899,0.948894
6,search_and_rescue,0.96987,0.955599,0.941742,0.96987
7,security,0.985126,0.977934,0.970846,0.985126
8,military,0.964531,0.947856,0.948396,0.964531
9,child_alone,1.0,1.0,1.0,1.0


In [19]:
# Display the mean score
# numeric_only=True averages only the metric columns and skips the string
# 'index' column that holds the category names.
res.mean(numeric_only=True)

Accuracy     0.945303
F1 Score     0.930682
Precision    0.936226
Recall       0.945303
dtype: float64

# 4. Improve model

In [13]:
# A look at the pipeline's parameters
# The double-underscore keys listed here (e.g. 'vect__min_df') are the names
# the grid-search parameter grid below uses to address nested parameters.
pipeline.get_params()

{'memory': None,
 'steps': [('vect',
   CountVectorizer(tokenizer=<function tokenize at 0x7f7c10880040>)),
  ('tfidf', TfidfTransformer()),
  ('clf',
   MultiOutputClassifier(estimator=RandomForestClassifier(verbose=True)))],
 'verbose': False,
 'vect': CountVectorizer(tokenizer=<function tokenize at 0x7f7c10880040>),
 'tfidf': TfidfTransformer(),
 'clf': MultiOutputClassifier(estimator=RandomForestClassifier(verbose=True)),
 'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.int64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 1.0,
 'vect__max_features': None,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 1),
 'vect__preprocessor': None,
 'vect__stop_words': None,
 'vect__strip_accents': None,
 'vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vect__tokenizer': <function __main__.tokenize(text)>,
 'vect__vocabulary': None,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': True,
 'tfidf__sub

In [111]:
# Using grid search to improve the existing model
# Each key walks down the nesting: 'clf' (pipeline step) -> 'estimator'
# (the RandomForestClassifier wrapped by MultiOutputClassifier) -> parameter.
# 3 values x 3 values = 9 parameter combinations to evaluate.
parameters = {'clf__estimator__n_estimators': [50, 100, 150],
              'clf__estimator__min_samples_split': [2, 3, 4]}

cv = GridSearchCV(pipeline, param_grid=parameters)

In [None]:
# Training with Grid Search
# The whole pipeline is refitted for every parameter combination and
# cross-validation split, so expect this to take far longer than the single
# pipeline.fit above.
cv.fit(X_train, y_train)

# 5. Add other features besides the TF-IDF

In [14]:
# Build a custom transformer which will extract the starting verb of a sentence
class StartingVerbExtractor(BaseEstimator, TransformerMixin):
    """
    Starting Verb Extractor class

    This class extracts the starting verb of a sentence,
    creating a new feature for the ML classifier
    """

    def starting_verb(self, text):
        """
        Report whether any sentence in `text` starts with a verb or a retweet marker.

        Parameters
        ----------
        text : str
            A single message.

        Returns
        -------
        bool
            True if the first token of at least one sentence is tagged 'VB'
            or 'VBP', or is the retweet marker 'rt'; False otherwise.
        """
        for sentence in nltk.sent_tokenize(text):
            pos_tags = nltk.pos_tag(tokenize(sentence))

            # A sentence can tokenize to nothing; indexing pos_tags[0] would
            # then raise IndexError, so skip it.
            if not pos_tags:
                continue

            first_word, first_tag = pos_tags[0]

            # tokenize() lower-cases every token, so a comparison against
            # upper-case 'RT' can never match; compare case-insensitively.
            if first_tag in ('VB', 'VBP') or first_word.lower() == 'rt':
                return True
        return False

    # Given it is a transformer we can return the self
    def fit(self, X, y=None):
        """Nothing is learned from the data; returns self."""
        return self

    def transform(self, X):
        """
        Apply starting_verb to every message in X.

        Returns a single-column DataFrame of booleans, one row per message.
        """
        X_tagged = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(X_tagged)

In [126]:
# Create the new pipeline with the starting verb extractor
# FeatureUnion runs both branches on the same input and concatenates their
# outputs side by side:
#   text_pipeline : token counts -> TF-IDF (same as the first pipeline)
#   trans         : the boolean starting-verb feature
pipeline_2 = Pipeline([
    ('features', FeatureUnion([

        ('text_pipeline', Pipeline([
            ('vect', CountVectorizer(tokenizer=tokenize)),
            ('tfidf', TfidfTransformer())
        ])),

        ('trans', StartingVerbExtractor())
    ])),

    ('clf', MultiOutputClassifier(RandomForestClassifier(verbose=True)))
])

In [None]:
# Train the starting-verb pipeline on the same training split as the baseline.
pipeline_2.fit(X_train, y_train)

In [None]:
# Predict the held-out messages with the starting-verb pipeline.
y_pred_2 = pipeline_2.predict(X_test)

In [130]:
# Mean of each metric across all categories for pipeline_2.
# numeric_only=True skips the string 'index' column of the score table.
get_score(y_test.values, y_pred_2, y.columns).mean(numeric_only=True)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  get_score(y_test.values, y_pred_2, y.columns).mean()


Accuracy     0.945965
F1 Score     0.931864
Precision    0.936558
Recall       0.945965
dtype: float64

# 6. Try SVM for classification

In [15]:
# Same text features as the first pipeline, but each output column is
# classified by a polynomial-kernel SVC instead of a random forest.
# probability=True turns on probability estimates for the SVC.
pipeline_3 = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(SVC(kernel='poly', probability=True, verbose=True)))
])

In [None]:
# Train the SVM pipeline on the same training split.
pipeline_3.fit(X_train, y_train)

In [None]:
# Predict the held-out messages with the SVM pipeline.
y_pred_3 = pipeline_3.predict(X_test)

In [None]:
# Mean of each metric across all categories for the SVM pipeline.
# numeric_only=True is required: the score table has a string 'index' column,
# which DataFrame.mean() cannot average in pandas 2.x. This also matches the
# other score-summary cells.
get_score(y_test.values, y_pred_3, y.columns).mean(numeric_only=True)

# 7. Export your model as a pickle file

In [20]:
# Save the fitted baseline `pipeline` to 'model.joblib'. This is the first
# pipeline, not the grid search `cv`, pipeline_2 or pipeline_3.
# NOTE(review): the pipeline holds a reference to the notebook-level
# tokenize function, so whatever loads this file presumably needs a matching
# tokenize definition available — confirm before deploying.
import joblib
joblib.dump(pipeline, 'model.joblib')

['model.joblib']