In [1]:
# import libraries
import pandas as pd
import nltk
import numpy as np

from sqlalchemy import create_engine
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

In [2]:
# Download (or confirm as up to date) the NLTK data packages this notebook relies on.
NLTK_PACKAGES = ['punkt', 'stopwords', 'wordnet']
nltk.download(NLTK_PACKAGES)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Tati\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tati\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Tati\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# English stop-word list, loaded once here and consulted by tokenize()
# when it filters tokens.
stop_english = stopwords.words("english")

In [4]:
def tokenize(text):
    """Turn a raw message into a list of normalized, lemmatized tokens.

    Steps: lower-case the text, split it with ``word_tokenize``, drop English
    stop words, then lemmatize every remaining token first with the
    lemmatizer's default part of speech (noun) and then as a verb.

    Args:
        text (str): Message to process.

    Returns:
        list of str: Cleaned tokens, in the order they appear in ``text``.
    """
    # One lemmatizer for the whole call instead of two new instances per token.
    lemmatizer = WordNetLemmatizer()
    # Set membership is constant-time; the module-level list is scanned linearly.
    stop_words = set(stop_english)

    tokens = []
    for word in word_tokenize(text.lower()):
        if word in stop_words:
            continue
        # noun pass first, then verb pass on its result
        lemma = lemmatizer.lemmatize(lemmatizer.lemmatize(word), pos='v')
        # remove any leading/trailing whitespace from the final token
        tokens.append(lemma.strip())

    return tokens

In [5]:
# Read the whole Messages table from the SQLite database file into a DataFrame.
engine = create_engine('sqlite:///DisasterResponse.db')
df = pd.read_sql(sql='SELECT * FROM Messages', con=engine)

In [6]:
# Features are the message text; targets are every column that is not one of
# the identifier / text / genre columns listed here.
NON_LABEL_COLUMNS = ['id', 'message', 'original', 'genre']
X = df['message']
y = df.drop(NON_LABEL_COLUMNS, axis=1)

In [7]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=10,
)

In [8]:
# Fraction of training rows whose value is 0, per target column.
y_train.eq(0).mean()

related                   0.228556
request                   0.829155
offer                     0.995858
aid_related               0.584687
medical_help              0.919128
medical_products          0.949918
search_and_rescue         0.971935
security                  0.982125
military                  0.967793
water                     0.935313
food                      0.886921
shelter                   0.910082
clothing                  0.985286
money                     0.976621
missing_people            0.988447
refugees                  0.967193
death                     0.953569
other_aid                 0.868992
infrastructure_related    0.934823
transport                 0.953896
buildings                 0.949809
electricity               0.978147
tools                     0.993733
hospitals                 0.989046
shops                     0.995150
aid_centers               0.988501
other_infrastructure      0.955804
weather_related           0.718692
floods              

In [9]:

from sklearn.linear_model import SGDClassifier

# Base estimator: SGD-trained linear model with hinge loss and an L2 penalty.
# It runs exactly max_iter epochs because tol=None disables early convergence checks.
base_classifier = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
)

# Token counts -> TF-IDF weighting -> multi-output wrapper around the base estimator.
sgd = Pipeline(steps=[
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(base_classifier)),
])

In [10]:
# Show every parameter of the pipeline and its nested steps; the keys are the
# names a grid search can address (e.g. the 'clf' step's settings).
sgd.get_params()

{'memory': None,
 'steps': [('vect',
   CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                   dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                   lowercase=True, max_df=1.0, max_features=None, min_df=1,
                   ngram_range=(1, 1), preprocessor=None, stop_words=None,
                   strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                   tokenizer=<function tokenize at 0x000001BF457983A8>,
                   vocabulary=None)),
  ('tfidf',
   TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)),
  ('clf',
   MultiOutputClassifier(estimator=SGDClassifier(alpha=0.001, average=False,
                                                 class_weight=None,
                                                 early_stopping=False, epsilon=0.1,
                                                 eta0=0.0, fit_intercept=True,
                                                 l1_ratio=

In [11]:
import time

# Measure how long fitting on the training split plus predicting the test split takes.
# perf_counter() is a monotonic, high-resolution clock meant for elapsed-time
# measurements; time.time() follows the wall clock and can jump if it is adjusted.
start = time.perf_counter()

sgd.fit(X_train, y_train)

y_pred_sgd = sgd.predict(X_test)

end = time.perf_counter()
print(end - start)

15.988226413726807


In [12]:
# Wrap the raw prediction array in a DataFrame labelled with the target column names.
label_names = y.columns.tolist()
test_pred = pd.DataFrame(data=y_pred_sgd, columns=label_names)

In [13]:
# Replace the index inherited from the split with a fresh 0..n-1 index so the
# true labels line up row-for-row with test_pred, which was built from a bare
# array and therefore carries a default index.
test_y = y_test.reset_index(drop=True)

In [14]:
# Print a classification report for each target column in turn.
for label in y.columns.tolist():
    print(label)
    report = classification_report(test_y[label], test_pred[label], zero_division=0)
    print(report)

related
              precision    recall  f1-score   support

           0       0.79      0.06      0.11      1928
           1       0.77      0.99      0.87      5937

    accuracy                           0.77      7865
   macro avg       0.78      0.53      0.49      7865
weighted avg       0.77      0.77      0.68      7865

request
              precision    recall  f1-score   support

           0       0.88      0.99      0.93      6526
           1       0.89      0.34      0.50      1339

    accuracy                           0.88      7865
   macro avg       0.88      0.67      0.71      7865
weighted avg       0.88      0.88      0.86      7865

offer
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      7823
           1       0.00      0.00      0.00        42

    accuracy                           0.99      7865
   macro avg       0.50      0.50      0.50      7865
weighted avg       0.99      0.99      0.99      786

              precision    recall  f1-score   support

           0       0.95      0.99      0.97      7121
           1       0.89      0.49      0.63       744

    accuracy                           0.95      7865
   macro avg       0.92      0.74      0.80      7865
weighted avg       0.94      0.95      0.94      7865

cold
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      7710
           1       0.00      0.00      0.00       155

    accuracy                           0.98      7865
   macro avg       0.49      0.50      0.50      7865
weighted avg       0.96      0.98      0.97      7865

other_weather
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      7455
           1       0.00      0.00      0.00       410

    accuracy                           0.95      7865
   macro avg       0.47      0.50      0.49      7865
weighted avg       0.90      0.95      0.92      7865



In [15]:
# Accuracy per target column first, then the plain average over the columns.
per_label_accuracy = (test_pred == test_y).mean()
acc = per_label_accuracy.mean()
print('Total Accuracy: {:.4f}'.format(acc))

Total Accuracy: 0.9384


In [16]:
# Time a grid search over the SGD penalty type, followed by prediction on the test split.
# perf_counter() is a monotonic, high-resolution clock meant for elapsed-time
# measurements; time.time() follows the wall clock and can jump if it is adjusted.
start = time.perf_counter()

# Keys use the pipeline's step__estimator__param naming to reach the wrapped SGD model.
parameters = {
        'clf__estimator__penalty': ['l1', 'l2']}

cv = GridSearchCV(sgd, parameters)
cv.fit(X_train, y_train)
y_pred_cv = cv.predict(X_test)

end = time.perf_counter()
print(end - start)

199.2544505596161


In [17]:
# Parameter combination selected by the grid search.
# NOTE(review): the output recorded below lists an alpha entry, but the grid
# defined for `cv` only varies the penalty — presumably the output is left over
# from an earlier run; re-run to confirm.
cv.best_params_

{'clf__estimator__alpha': 0.001, 'clf__estimator__penalty': 'l1'}

In [18]:
# Per-column classification reports and the averaged accuracy for the grid-search predictions.
label_names = y.columns.tolist()
test_pred = pd.DataFrame(data=y_pred_cv, columns=label_names)
test_y = y_test.reset_index(drop=True)

for label in label_names:
    print(label)
    report = classification_report(test_y[label], test_pred[label], zero_division=0)
    print(report)

per_label_accuracy = (test_pred == test_y).mean()
acc = per_label_accuracy.mean()
print('Total Accuracy: {:.4f}'.format(acc))

related
              precision    recall  f1-score   support

           0       0.73      0.07      0.13      1928
           1       0.77      0.99      0.86      5937

    accuracy                           0.77      7865
   macro avg       0.75      0.53      0.50      7865
weighted avg       0.76      0.77      0.68      7865

request
              precision    recall  f1-score   support

           0       0.87      0.98      0.92      6526
           1       0.79      0.29      0.42      1339

    accuracy                           0.87      7865
   macro avg       0.83      0.63      0.67      7865
weighted avg       0.86      0.87      0.84      7865

offer
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      7823
           1       0.00      0.00      0.00        42

    accuracy                           0.99      7865
   macro avg       0.50      0.50      0.50      7865
weighted avg       0.99      0.99      0.99      786

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      7121
           1       0.89      0.67      0.77       744

    accuracy                           0.96      7865
   macro avg       0.93      0.83      0.87      7865
weighted avg       0.96      0.96      0.96      7865

cold
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      7710
           1       0.00      0.00      0.00       155

    accuracy                           0.98      7865
   macro avg       0.49      0.50      0.50      7865
weighted avg       0.96      0.98      0.97      7865

other_weather
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      7455
           1       0.00      0.00      0.00       410

    accuracy                           0.95      7865
   macro avg       0.47      0.50      0.49      7865
weighted avg       0.90      0.95      0.92      7865



In [134]:
import pickle

In [138]:
# save the model to disk
# The context manager flushes and closes the file even if pickling fails.
# NOTE(review): this stores the `sgd` pipeline, not the grid-searched `cv`
# object — confirm that is the model intended for export.
filename = 'finalized_model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(sgd, model_file)

In [71]:
# load the model from disk
# The context manager closes the file handle as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code while unpickling — only load
# files that were produced by a trusted source.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

0.23256320836966
