Machine Learning Model 

In [1]:
# import libraries
import sqlite3
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.datasets import make_multilabel_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split,GridSearchCV
import nltk
import re
from sklearn.metrics import classification_report
from sklearn.utils.multiclass import type_of_target
import pickle 
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/xiaohanliu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/xiaohanliu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
def load_data(database_filepath):
    """Read the training data out of a SQLite database.

    Args:
        database_filepath: Path to the SQLite file; it is interpolated into a
            ``sqlite:///`` SQLAlchemy URL.

    Returns:
        A ``(X, y)`` tuple where ``X`` is the ``message`` column cast to
        ``str`` and ``y`` holds the columns at positions 4 through 39 of the
        ``df_new`` table.
    """
    db_url = 'sqlite:///{}'.format(database_filepath)
    engine = create_engine(db_url)

    # Rows containing a missing value in ANY column are discarded.
    frame = pd.read_sql_table('df_new', con=engine).dropna()

    messages = frame['message'].astype(str)
    labels = frame.iloc[:, 4:40]
    return messages, labels

In [3]:
def tokenize(text):
    """Turn raw text into a list of normalised, lemmatized tokens.

    The text is lower-cased, every character that is not an ASCII letter or
    digit is replaced by a space, the result is split with ``word_tokenize``
    and each token is lemmatized, lower-cased and stripped.

    Args:
        text: The string to tokenize.

    Returns:
        A list of cleaned token strings, in their original order.
    """
    normalised = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(word).lower().strip()
        for word in word_tokenize(normalised)
    ]

In [4]:
def build_model():
    """Assemble the (unfitted) text-classification pipeline.

    Returns:
        A ``Pipeline`` with three steps: ``vect`` (a ``CountVectorizer`` using
        ``tokenize``), ``tfidf`` (a ``TfidfTransformer``) and ``clf`` (a
        ``MultiOutputClassifier`` wrapping a ``KNeighborsClassifier``).
    """
    steps = [
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(KNeighborsClassifier())),
    ]
    return Pipeline(steps)

In [5]:
def evaluate_model(model, X_test, y_test, category_names=None):
    """Print a classification report for every output column.

    Args:
        model: An already-fitted estimator exposing ``predict``. It is used
            as given; it is not rebuilt or refitted here.
        X_test: Test inputs passed straight to ``model.predict``.
        y_test: True labels, one column per output. A DataFrame or any 2-D
            array-like is accepted.
        category_names: Optional labels to print above each report. When
            omitted, ``y_test.columns`` is used if present, otherwise the
            column positions.
    """
    # Convert both sides to plain arrays so positional column slicing works
    # whether y_test arrives as a DataFrame or as an ndarray.
    y_true = np.asarray(y_test)
    y_pred = np.asarray(model.predict(X_test))

    if category_names is None:
        category_names = getattr(y_test, 'columns', range(y_true.shape[1]))

    for col, name in enumerate(category_names):
        print('Category: {}'.format(name))
        print(classification_report(y_true[:, col], y_pred[:, col]))

In [6]:
def save_model(model, model_filepath):
    """Serialize ``model`` to ``model_filepath`` with pickle.

    Args:
        model: The object to pickle.
        model_filepath: Destination path; the file is opened in binary write
            mode, so an existing file is overwritten.
    """
    # The context manager guarantees the handle is flushed and closed even if
    # pickling raises part-way through.
    with open(model_filepath, 'wb') as model_file:
        pickle.dump(model, model_file)

In [7]:
def main(argv=None):
    """Train, evaluate and save the classifier from the command line.

    Args:
        argv: Optional sequence of two strings,
            ``[database_filepath, model_filepath]``. When omitted the values
            are taken from ``sys.argv[1:]``. Passing them explicitly is the
            way to call this from a notebook cell, where ``sys.argv``
            typically holds the kernel's own launch arguments rather than
            user-supplied paths.

    With any other number of arguments a usage message is printed and nothing
    is trained.
    """
    # Imported locally because the notebook's import cell does not bring in sys.
    import sys

    args = sys.argv[1:] if argv is None else list(argv)

    if len(args) == 2:
        database_filepath, model_filepath = args
        print('Loading data...\n    DATABASE: {}'.format(database_filepath))
        X, y = load_data(database_filepath)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

        print('Building model...')
        model = build_model()

        print('Training model...')
        model.fit(X_train, y_train)

        print('Evaluating model...')
        evaluate_model(model, X_test, y_test)

        print('Saving model...\n    MODEL: {}'.format(model_filepath))
        save_model(model, model_filepath)

        print('Trained model saved!')

    else:
        print('Please provide the filepath of the disaster messages database '
              'as the first argument and the filepath of the pickle file to '
              'save the model to as the second argument. \n\nExample: python '
              'train_classifier.py ../data/DisasterResponse.db classifier.pkl')


# The guard must sit at module level: nested inside main() it would never
# start the script and would recurse endlessly if main() were called.
if __name__ == '__main__':
    main()