# Import Packages

In [1]:
#Reading in Data
from sqlalchemy import create_engine
import pandas as pd

#Cleaning Data
import nltk
import re
import string

#Model Building
from sklearn.model_selection import train_test_split

# Read in Training Data

In [2]:
# Show up to 100 characters per column so tweet text is readable in displayed frames.
pd.set_option('display.max_colwidth', 100)

# Read the database password from a local file instead of embedding it in the notebook.
# The context manager closes the file handle deterministically, and rstrip() removes a
# trailing line break that would otherwise become part of the connection URL.
with open('C:/Users/Jake/Documents/Projects/jakes_db.txt') as pw_file:
    db_pw = pw_file.read().rstrip('\r\n')

# NOTE(review): a password containing URL-special characters (e.g. '@', '/') would need
# to be URL-encoded before being interpolated here - confirm this is not the case.
engine = create_engine('postgresql://postgres:%s@localhost:5432/disaster' % db_pw)
conn = engine.connect()  # left open on purpose: later cells may reuse `conn`

# Load the full labelled training table into a DataFrame.
train = pd.read_sql('SELECT * FROM train;', conn)

# English stop-word list from NLTK, used by the TF-IDF vectorizers further down.
stopwords = nltk.corpus.stopwords.words('english')

Create a holdout test set from training data.

In [3]:
# Hold back 20% of the labelled rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train['text'],
    train['target'],
    test_size=0.2,
    random_state=0,
)

# Train ML Models

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.metrics import precision_recall_fscore_support as score
from joblib import dump, load

## Random Forest

In [5]:
from sklearn.ensemble import RandomForestClassifier

Create a tokenizer class that splits each tweet into tokens and lemmatizes them.

In [6]:
class LemmaTokenizer:
    """Callable that tokenizes a text with NLTK and lemmatizes every token.

    An instance can be passed as the ``tokenizer`` argument of a scikit-learn
    text vectorizer, which calls it once per document.
    """

    def __init__(self):
        # Build the WordNet lemmatizer once and reuse it for every document.
        self.wnl = nltk.WordNetLemmatizer()

    def __call__(self, articles):
        """Return the list of lemmatized tokens for the text in ``articles``."""
        lemmatize = self.wnl.lemmatize
        return list(map(lemmatize, nltk.word_tokenize(articles)))

Create a pipeline that vectorizes the tweets with TF-IDF and fits the random forest model.

In [5]:
# Build the pipeline: TF-IDF features over lemmatized tokens feeding a random forest.
# NOTE(review): fitting this pipeline emits scikit-learn's "stop_words" inconsistency
# warning - likely because the NLTK stop words are not passed through LemmaTokenizer.
rf_pipe = Pipeline([
    ('tfidf_vec', TfidfVectorizer(tokenizer=LemmaTokenizer(), analyzer='word', stop_words = stopwords, strip_accents = 'ascii')),
    # random_state seeds the forest so grid-search scores and the saved model are
    # reproducible across runs (same seed value as the train/test split).
    ('clf',   RandomForestClassifier(criterion = 'gini', max_depth = None, min_samples_split = 10, max_features = 'log2', random_state = 0))
])

# TODO: try with TFIDF and stemming rather than lemmatizing

Build a parameter grid to tune the RF model and search for the best combination.

In [6]:
# Candidate settings: vectorizer options crossed with the number of trees.
rf_param_grid = dict(
    tfidf_vec__ngram_range=[(1, 1), (1, 2)],
    tfidf_vec__max_df=[0.75, 0.9],
    tfidf_vec__min_df=[10, 20],
    clf__n_estimators=[25, 100, 250],
)

# Exhaustive search over the grid on all available cores; training scores are
# kept as well so they can be compared with the validation scores.
rf_cv = GridSearchCV(
    estimator=rf_pipe,
    param_grid=rf_param_grid,
    n_jobs=-1,
    verbose=5,
    return_train_score=True,
)

rf_cv_models = rf_cv.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   30.5s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  5.4min finished
  'stop_words.' % sorted(inconsistent))


In [7]:
# Winning parameter combination, followed by its mean cross-validated score.
print(rf_cv_models.best_params_, rf_cv_models.best_score_, sep='\n')

{'clf__n_estimators': 250, 'tfidf_vec__max_df': 0.75, 'tfidf_vec__min_df': 10, 'tfidf_vec__ngram_range': (1, 1)}
0.7829228243021346


Print out the top 5 models (based on accuracy).

In [8]:
# Tabulate every grid-search candidate, best mean validation score first.
rf_gs_results = (
    pd.DataFrame(rf_cv_models.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
)
rf_gs_results.head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__n_estimators,param_tfidf_vec__max_df,param_tfidf_vec__min_df,param_tfidf_vec__ngram_range,params,split0_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
16,7.736946,0.064861,1.053868,0.028108,250,0.75,10,"(1, 1)","{'clf__n_estimators': 250, 'tfidf_vec__max_df': 0.75, 'tfidf_vec__min_df': 10, 'tfidf_vec__ngram...",0.791461,...,0.782923,0.007885,1,0.982348,0.980296,0.983785,0.981527,0.980501,0.981691,0.001281
20,8.872314,0.214473,1.284441,0.16638,250,0.9,10,"(1, 1)","{'clf__n_estimators': 250, 'tfidf_vec__max_df': 0.9, 'tfidf_vec__min_df': 10, 'tfidf_vec__ngram_...",0.788998,...,0.781117,0.004604,2,0.983169,0.97968,0.982553,0.981117,0.979475,0.981199,0.001484
8,5.291637,0.061668,0.945272,0.023495,100,0.75,10,"(1, 1)","{'clf__n_estimators': 100, 'tfidf_vec__max_df': 0.75, 'tfidf_vec__min_df': 10, 'tfidf_vec__ngram...",0.784893,...,0.780788,0.005943,3,0.981527,0.978654,0.979269,0.978859,0.979269,0.979516,0.001034
12,5.075155,0.036021,0.942256,0.040764,100,0.9,10,"(1, 1)","{'clf__n_estimators': 100, 'tfidf_vec__max_df': 0.9, 'tfidf_vec__min_df': 10, 'tfidf_vec__ngram_...",0.793924,...,0.780624,0.008034,4,0.981322,0.979475,0.982759,0.979064,0.979885,0.980501,0.001362
17,7.974532,0.069731,1.078106,0.009888,250,0.75,10,"(1, 2)","{'clf__n_estimators': 250, 'tfidf_vec__max_df': 0.75, 'tfidf_vec__min_df': 10, 'tfidf_vec__ngram...",0.787356,...,0.777504,0.006506,5,0.981938,0.97968,0.982348,0.979475,0.978859,0.98046,0.001407


Save all grid-search results and the best model for use in a later notebook that compares different classifiers.

In [9]:
# Folder that receives the random forest artifacts.
rf_model_dir = 'C:/Users/Jake/Documents/Projects/Disaster-Tweets/models/random_forest'
rf_results_path = rf_model_dir + '/rf_gs_results.xlsx'
rf_best_path = rf_model_dir + '/rf_best.joblib'

# Full grid-search table goes to a spreadsheet (row index omitted).
rf_gs_results.to_excel(rf_results_path, index=False)

# Persist the best estimator found by the search; the call's return value is the cell output.
dump(rf_cv_models.best_estimator_, rf_best_path)

['C:/Users/Jake/Documents/Projects/Disaster-Tweets/models/random_forest/rf_best.joblib']

In [8]:
# Reload the persisted random forest pipeline from disk.
rf_best_model_path = rf_model_dir + '/rf_best.joblib'
rf_best_model = load(rf_best_model_path)

Calculate the best model's precision, recall, and accuracy on the holdout test set.

In [9]:
# Score the reloaded random forest on the holdout split.
rf_predict = rf_best_model.predict(X_test)
precision, recall, fscore, support = score(y_test, rf_predict, average='binary')

# Accuracy: share of holdout predictions that equal the true label.
rf_accuracy = (rf_predict == y_test).sum() / len(rf_predict)

rf_metrics = [round(value, 3) for value in (precision, recall, rf_accuracy)]
print('Precision: {} / Recall: {} / Accuracy: {}'.format(*rf_metrics))

  'stop_words.' % sorted(inconsistent))


Precision: 0.81 / Recall: 0.654 / Accuracy: 0.801


## Gradient Boost

In [10]:
from sklearn.ensemble import GradientBoostingClassifier

In [11]:
# Build the pipeline: TF-IDF features over lemmatized tokens feeding gradient boosting.
# NOTE(review): fitting this pipeline emits scikit-learn's "stop_words" inconsistency
# warning - likely because the NLTK stop words are not passed through LemmaTokenizer.
gb_pipe = Pipeline([
    ('tfidf_vec', TfidfVectorizer(tokenizer=LemmaTokenizer(), analyzer='word', stop_words = stopwords, strip_accents = 'ascii')),
    # random_state seeds the boosted trees so grid-search scores and the saved model
    # are reproducible across runs (same seed value as the train/test split).
    ('clf',   GradientBoostingClassifier(random_state = 0))
])

Build a parameter grid to tune the GB model and search for the best combination.

In [12]:
# Candidate settings: vectorizer options crossed with boosting size and tree depth.
# Not currently searched (previously commented out):
#   tfidf_vec: norm, use_idf, sublinear_tf, smooth_idf
#   clf: loss, learning_rate, subsample, criterion
gb_param_grid = dict(
    tfidf_vec__ngram_range=[(1, 1), (1, 2)],
    tfidf_vec__max_df=[0.75, 0.9],
    tfidf_vec__min_df=[10, 20],
    clf__n_estimators=[50, 100, 200],
    clf__max_depth=[2, 5, 10],
)

# Exhaustive search over the grid on all available cores; training scores are
# kept as well so they can be compared with the validation scores.
gb_cv = GridSearchCV(
    estimator=gb_pipe,
    param_grid=gb_param_grid,
    n_jobs=-1,
    verbose=5,
    return_train_score=True,
)

gb_cv_models = gb_cv.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   31.4s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed: 12.3min
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed: 18.1min finished
  'stop_words.' % sorted(inconsistent))


In [13]:
# Winning parameter combination, followed by its mean cross-validated score.
print(gb_cv_models.best_params_, gb_cv_models.best_score_, sep='\n')

{'clf__max_depth': 10, 'clf__n_estimators': 200, 'tfidf_vec__max_df': 0.9, 'tfidf_vec__min_df': 10, 'tfidf_vec__ngram_range': (1, 2)}
0.7692939244663383


Print out the top 5 models (based on accuracy).

In [14]:
# Tabulate every grid-search candidate, best mean validation score first.
gb_gs_results = (
    pd.DataFrame(gb_cv_models.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
)
gb_gs_results.head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__max_depth,param_clf__n_estimators,param_tfidf_vec__max_df,param_tfidf_vec__min_df,param_tfidf_vec__ngram_range,params,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
69,18.656856,0.081469,0.869072,0.007993,10,200,0.9,10,"(1, 2)","{'clf__max_depth': 10, 'clf__n_estimators': 200, 'tfidf_vec__max_df': 0.9, 'tfidf_vec__min_df': ...",...,0.769294,0.008812,1,0.955665,0.955049,0.958539,0.95546,0.956486,0.95624,0.001241
65,18.688061,0.032719,0.871919,0.031845,10,200,0.75,10,"(1, 2)","{'clf__max_depth': 10, 'clf__n_estimators': 200, 'tfidf_vec__max_df': 0.75, 'tfidf_vec__min_df':...",...,0.768966,0.00497,2,0.958744,0.952997,0.957512,0.954433,0.956486,0.956034,0.002075
64,15.93168,0.065996,0.840712,0.030295,10,200,0.75,10,"(1, 1)","{'clf__max_depth': 10, 'clf__n_estimators': 200, 'tfidf_vec__max_df': 0.75, 'tfidf_vec__min_df':...",...,0.768801,0.008088,3,0.954639,0.95197,0.957718,0.956281,0.952791,0.95468,0.002131
45,10.847158,0.033341,0.850061,0.00765,5,200,0.9,10,"(1, 2)","{'clf__max_depth': 5, 'clf__n_estimators': 200, 'tfidf_vec__max_df': 0.9, 'tfidf_vec__min_df': 1...",...,0.768801,0.00551,3,0.888547,0.890394,0.893268,0.894704,0.889573,0.891297,0.002317
41,10.840908,0.042381,0.878142,0.03189,5,200,0.75,10,"(1, 2)","{'clf__max_depth': 5, 'clf__n_estimators': 200, 'tfidf_vec__max_df': 0.75, 'tfidf_vec__min_df': ...",...,0.768473,0.006253,5,0.893062,0.88711,0.893473,0.89532,0.886494,0.891092,0.00359


Save all grid-search results and the best model for use in a later notebook that compares different classifiers.

In [15]:
# Folder that receives the gradient boosting artifacts.
gb_model_dir = 'C:/Users/Jake/Documents/Projects/Disaster-Tweets/models/gradient_boost'
gb_results_path = gb_model_dir + '/gb_gs_results.xlsx'
gb_best_path = gb_model_dir + '/gb_best.joblib'

# Full grid-search table goes to a spreadsheet (row index omitted).
gb_gs_results.to_excel(gb_results_path, index=False)

# Persist the best estimator found by the search; the call's return value is the cell output.
dump(gb_cv_models.best_estimator_, gb_best_path)

['C:/Users/Jake/Documents/Projects/Disaster-Tweets/models/gradient_boost/gb_best.joblib']

In [16]:
# Reload the persisted gradient boosting pipeline from disk.
gb_best_model_path = gb_model_dir + '/gb_best.joblib'
gb_best_model = load(gb_best_model_path)

Calculate the best model's precision, recall, and accuracy on the holdout test set.

In [17]:
# Score the reloaded gradient boosting model on the holdout split.
gb_predict = gb_best_model.predict(X_test)
precision, recall, fscore, support = score(y_test, gb_predict, average='binary')

# Accuracy: share of holdout predictions that equal the true label.
gb_accuracy = (gb_predict == y_test).sum() / len(gb_predict)

gb_metrics = [round(value, 3) for value in (precision, recall, gb_accuracy)]
print('Precision: {} / Recall: {} / Accuracy: {}'.format(*gb_metrics))

  'stop_words.' % sorted(inconsistent))


Precision: 0.773 / Recall: 0.667 / Accuracy: 0.789


## K Nearest Neighbors