In [2]:
import sys

# Import libraries
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

# ML libraries
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

# NLP libraries
import re
import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download(['punkt', 'wordnet'])
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/alisurmeli/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/alisurmeli/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alisurmeli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# LOADING DATA

In [3]:
df=pd.read_sql_table('DisasterData_Clean', 'sqlite:///disaster_response_pipe.db')  


In [4]:
df.head()


Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# LOAD DATA FUNCTION


def load_data(database_filepath):
    
    
    engine = create_engine('sqlite:///{}'.format(database_filepath))
    df = pd.read_sql_table('DisasterData_Clean', engine)
    
    X=df['message']
    y=df.iloc[:,4:]
    category_names = y.columns.tolist()
    
    return X,y,category_names
    
    

# TOKENIZING DATA

In [6]:
a=df['message'][3]
print(a)


UN reports Leogane 80-90 destroyed. Only Hospital St. Croix functioning. Needs supplies desperately.


In [7]:
b=re.sub(r"[^a-zA-Z0-9]", " ", a.lower())

print(b)


un reports leogane 80 90 destroyed  only hospital st  croix functioning  needs supplies desperately 


In [8]:
c=word_tokenize(b)
print(c)


['un', 'reports', 'leogane', '80', '90', 'destroyed', 'only', 'hospital', 'st', 'croix', 'functioning', 'needs', 'supplies', 'desperately']


In [9]:
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words("english")



d=[lemmatizer.lemmatize(i) for i in c if i not in stop_words]
print(d)


['un', 'report', 'leogane', '80', '90', 'destroyed', 'hospital', 'st', 'croix', 'functioning', 'need', 'supply', 'desperately']


In [10]:
# TOKENIZING DATA FUNCTION


def tokenize_data(text):
    
    new_text=re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    words=word_tokenize(new_text)
    
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words("english")
    
    tokens= [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    
    return tokens

    

# BUILDING MODEL

In [11]:
X,y,category_names = load_data('disaster_response_pipe.db')



In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2)

X_train.shape, X_test.shape, y_train.shape, y_test.shape


((20944,), (5236,), (20944, 36), (5236, 36))

In [None]:
vect = CountVectorizer(tokenizer=tokenize_data)
tfidf = TfidfTransformer()
rfc = MultiOutputClassifier(estimator=RandomForestClassifier(),n_jobs=1)

X_train_vect=vect.fit_transform(X_train)
X_train_tfidf=tfidf.fit_transform(X_train_vect)

rfc.fit(X_train_tfidf,y_train)



In [95]:
X_test_vect=vect.transform(X_test)
X_test_tfidf=tfidf.transform(X_test_vect)

y_pred=rfc.predict(X_test_tfidf)



In [76]:
from sklearn.metrics import confusion_matrix

from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score


In [104]:
def build_model():
    
    pipeline=Pipeline([

    ('vect',CountVectorizer()),
    ('tfidf',TfidfTransformer()),
    ('clf', MultiOutputClassifier(estimator=RandomForestClassifier(random_state=101)))
        
    ])
    
    parameter_grid = {
        'tfidf__norm':['l2','l1'],
        'clf__estimator__min_samples_split':[2,3]
    
    }

    model=GridSearchCV(estimator=pipeline, param_grid=parameter_grid,n_jobs=1)
    model.fit(X_train, y_train)

    return model



In [105]:
def evaluate_model(model, X_test, y_test, category_names):
    y_pred = model.predict(X_test)
    for i, col in enumerate(category_names): 
        print('***********',col,'***********')
        print(classification_report(y_test.iloc[:,i], y_pred[:,i]))
        
        

In [108]:
def save_model(model, model_filepath):
    
    pickle.dump(model, open('/Users/alisurmeli/Documents/Python_Nanodegree/Project_Disaster Response/workspace/models/classifier.pkl','wb')           )
    
    

In [109]:
def main():
    
        database_filepath='/Users/alisurmeli/Documents/Python_Nanodegree/Project_Disaster Response/workspace/data/DisasterResponse.db'
        model_filepath='/Users/alisurmeli/Documents/Python_Nanodegree/Project_Disaster Response/classifier.pkl'
    
        print('Loading data...\n    DATABASE: {}'.format(database_filepath))
        X, y, category_names = load_data(database_filepath)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        
        print('Building model...')
        model = build_model()
        
        print('Training model...')
        model.fit(X_train, y_train)
        
        print('Evaluating model...')
        evaluate_model(model, X_test, y_test, category_names)

        print('Saving model...\n    MODEL: {}'.format(model_filepath))
        save_model(model, model_filepath)

        print('Trained model saved!')
        

In [110]:
if __name__ == '__main__':
    main()
    

Loading data...
    DATABASE: /Users/alisurmeli/Documents/Python_Nanodegree/Project_Disaster Response/disaster_response_pipe.db
Building model...






























Training model...






























Evaluating model...
*********** related ***********
              precision    recall  f1-score   support

           0       0.64      0.37      0.47      1258
           1       0.82      0.93      0.87      3936
           2       0.42      0.40      0.41        42

    accuracy                           0.79      5236
   macro avg       0.63      0.57      0.59      5236
weighted avg       0.77      0.79      0.77      5236

*********** request ***********
              precision    recall  f1-score   support

           0       0.89      0.98      0.93      4377
           1       0.83      0.38      0.52       859

    accuracy                           0.89      5236
   macro avg       0.86      0.68      0.73      5236
weighted avg       0.88      0.89      0.87      5236

*********** offer ***********
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5216
           1       0.00      0.00      0.00        20

    accuracy  

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       0.97      1.00      0.98      5015
           1       0.75      0.18      0.29       221

    accuracy                           0.96      5236
   macro avg       0.86      0.59      0.64      5236
weighted avg       0.96      0.96      0.95      5236

*********** other_aid ***********
              precision    recall  f1-score   support

           0       0.87      1.00      0.93      4536
           1       0.55      0.02      0.05       700

    accuracy                           0.87      5236
   macro avg       0.71      0.51      0.49      5236
weighted avg       0.83      0.87      0.81      5236

*********** infrastructure_related ***********
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      4919
           1       0.43      0.01      0.02       317

    accuracy                           0.94      5236
   macro avg       0.68      0.50      0.49    