## **Import libraries and load data from database**

In [1]:
!pip install nltk
!pip install plotly



In [16]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import re
import pickle
import nltk

# import relevant functions/modules from the nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# import relevant functions/modules from the sklearn

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
database_filepath = "/content/disaster_response_db.db"
engine = create_engine('sqlite:///' + database_filepath)
table_name = "disaster_response_db_table"
df = pd.read_sql_table(table_name,engine)

  interactivity=interactivity, compiler=compiler, result=result)


In [23]:
df.head()

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
X = df['message']
Y = df.iloc[:,4:]

#### Tokenize text

In [7]:
def tokenize(text, url_place_holder_string = "urlplaceholder"):
    """Normalize, replace URLs, tokenize and lemmatize text string
    
    Args:
    text: string. String containing message for processing
       
    Returns:
    clean_tokens: list of strings. List containing processed input
    """
    # Convert text to lowercase and remove punctuation
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    #
    url_regex = 'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
    
    detected_urls = re.findall(url_regex, text)
    
    # Replace url with a url placeholder string
    for detected_url in detected_urls:
        text = text.replace(detected_url, url_place_holder_string)

    # Tokenize words
    tokens = word_tokenize(text)
    
    # Stem word tokens and remove stop words
    lemmatizer = nltk.WordNetLemmatizer()

    clean_tokens = [lemmatizer.lemmatize(w).lower().strip() for w in tokens]
    return clean_tokens

#### Build a machine learning pipeline

In [8]:
initial_pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer = tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier())),
    ])

#### Train the model

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)
model = initial_pipeline.fit(X_train, Y_train)

#### Test your model

In [10]:
Y_prediction_test = model.predict(X_test)

print(classification_report(Y_test.values, Y_prediction_test, target_names=Y.columns.values))

                        precision    recall  f1-score   support

               related       0.81      0.97      0.88      4938
               request       0.89      0.46      0.61      1104
                 offer       0.00      0.00      0.00        26
           aid_related       0.79      0.64      0.71      2714
          medical_help       0.67      0.05      0.10       542
      medical_products       0.84      0.09      0.17       347
     search_and_rescue       0.91      0.05      0.10       192
              security       0.50      0.01      0.01       133
              military       0.80      0.04      0.07       225
                 water       0.90      0.24      0.38       430
                  food       0.86      0.47      0.61       731
               shelter       0.87      0.26      0.40       598
              clothing       0.82      0.09      0.16       101
                 money       0.75      0.04      0.07       168
        missing_people       1.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Improve model

##### Use GridSearchCV for optimization of parameters

In [12]:
parameters = {'vect__min_df': [1, 5],
              'tfidf__use_idf':[True, False],
              'clf__estimator__n_estimators': [10, 20, 40], 
              'clf__estimator__min_samples_split':[2, 5, 10]}

cv = GridSearchCV(initial_pipeline, param_grid = parameters, scoring = 'f1_micro', verbose = 10, n_jobs=-1)

# Find best parameters
np.random.seed(81)
tuned_model = cv.fit(X_train, Y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [18]:
tuned_model.best_params_

{'clf__estimator__min_samples_split': 5,
 'clf__estimator__n_estimators': 40,
 'tfidf__use_idf': False,
 'vect__min_df': 5}

#### Test new improved model

In [13]:
Y_prediction_test = tuned_model.predict(X_test)

In [14]:
print(classification_report(Y_test.values, Y_prediction_test, target_names=Y.columns.values))

                        precision    recall  f1-score   support

               related       0.81      0.97      0.88      4938
               request       0.86      0.49      0.62      1104
                 offer       0.00      0.00      0.00        26
           aid_related       0.76      0.67      0.71      2714
          medical_help       0.64      0.11      0.18       542
      medical_products       0.81      0.14      0.24       347
     search_and_rescue       0.69      0.05      0.09       192
              security       0.00      0.00      0.00       133
              military       0.66      0.13      0.22       225
                 water       0.85      0.36      0.51       430
                  food       0.83      0.60      0.69       731
               shelter       0.84      0.43      0.57       598
              clothing       0.67      0.20      0.31       101
                 money       0.89      0.05      0.09       168
        missing_people       0.50      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### We see good improvement in the F1 score after tuning, which is what we were optimizing for.

#### Trying different ML algorithm: ADABoost Classifier

In [17]:
new_pipeline= Pipeline([
    ('vect', CountVectorizer(tokenizer = tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(AdaBoostClassifier()))
])

parameters2 = {'clf__estimator__learning_rate': [0.01, 0.02, 0.05],
               'clf__estimator__n_estimators': [10, 20, 40]}

cv2 = GridSearchCV(new_pipeline, param_grid = parameters2, scoring = 'f1_micro', verbose = 10, n_jobs=-1)

# Find best parameters
np.random.seed(81)
tuned_model2 = cv2.fit(X_train, Y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [19]:
tuned_model2.best_params_

{'clf__estimator__learning_rate': 0.01, 'clf__estimator__n_estimators': 10}

#### Test different ML model

In [20]:
Y_prediction_test = tuned_model2.predict(X_test)

In [21]:
print(classification_report(Y_test.values, Y_prediction_test, target_names=Y.columns.values))

                        precision    recall  f1-score   support

               related       0.76      1.00      0.86      4938
               request       0.62      0.40      0.49      1104
                 offer       0.00      0.00      0.00        26
           aid_related       0.80      0.20      0.32      2714
          medical_help       0.59      0.11      0.18       542
      medical_products       1.00      0.01      0.02       347
     search_and_rescue       0.70      0.19      0.30       192
              security       0.00      0.00      0.00       133
              military       0.00      0.00      0.00       225
                 water       0.58      0.87      0.70       430
                  food       0.76      0.70      0.73       731
               shelter       0.88      0.31      0.46       598
              clothing       1.00      0.01      0.02       101
                 money       0.63      0.17      0.27       168
        missing_people       0.58      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### The random forest model has better performance

In [22]:
filename = 'tuned_model.sav'
pickle.dump(tuned_model, open(filename, 'wb'))