In [28]:
import datetime as dt
import pandas as pd
import re
import numpy as np
import nltk
import regex as reg
import matplotlib.pyplot as plt

from twitterscraper import query_tweets
from sklearn.decomposition import LatentDirichletAllocation as LDA
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import stop_words
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_multilabel_classification
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from bs4 import BeautifulSoup

pd.set_option('display.max_colwidth',-1)
pd.set_option('display.max_rows', 1500)
pd.set_option('display.max_columns', 500)

## EXPLORATORY DATA ANALYSIS ALL LABELED TWEETS FIRE & HURRICANE

In [13]:
df8 =pd.read_csv('df8_SUPER.csv')

In [14]:
df9=df8[['text','requesting_help','disaster']]

In [15]:
X = df9['text']
y= df9['requesting_help']

In [16]:
df9.groupby('disaster')['requesting_help'].value_counts()

disaster   requesting_help
fire       0                  427
           1                  75 
hurricane  0                  351
           1                  153
Name: requesting_help, dtype: int64

In [41]:
427+75+351+153

1006

In [42]:
75+153

228

In [44]:
# Baseline
1 - 228/1006

0.7733598409542743

## MODELING

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y, random_state=42)

## LOGISTIC REGRESSION WITH COUNT VECTORIZER (SEE LAB 5.02)

In [37]:
# Instantiate Pipline for Logistic Regression using Count Vectorizer
# Found this terrific website that explains the different solver options in LogisticRegression
# https://towardsdatascience.com/dont-sweat-the-solver-stuff-aea7cddc3451
pipe_a = Pipeline([
    ('cvec_a', CountVectorizer()),
    ('lr_a', LogisticRegression(solver= 'lbfgs'))
])

params_a = {
    'cvec_a__max_features': [2000, 3000, 4000, 5000],
    'cvec_a__min_df': [2, 3],
    'cvec_a__max_df': [.9, .95],
    'cvec_a__ngram_range': [(1,1), (1,4)],
    'cvec_a__stop_words':['english', None]
}

gs_a = GridSearchCV(pipe_a,params_a, cv=5, n_jobs=3)
    
gs_a.fit(X_train, y_train)
    
print(f'Gridsearch with Count Vectorizer for training data is {gs_a.score(X_train, y_train)}')
print()
print(f'Gridsearch with Count Vectorizer for test data is {gs_a.score(X_test, y_test)}')

Gridsearch with Count Vectorizer for training data is 0.993368700265252

Gridsearch with Count Vectorizer for test data is 0.8928571428571429


In [40]:
gs_a.best_params_

{'cvec_a__max_df': 0.9,
 'cvec_a__max_features': 2000,
 'cvec_a__min_df': 2,
 'cvec_a__ngram_range': (1, 1),
 'cvec_a__stop_words': None}

## LOGIST REGRESSION WITH TFIDF VECTORIZER

In [46]:
pipe_b = Pipeline([
    ('tfid_b', TfidfVectorizer()),
    ('lr_b', LogisticRegression(solver= 'lbfgs'))
])

params_b = {
    'tfid_b__max_features': [2000, 3000, 4000, 5000],
    'tfid_b__min_df': [2, 3],
    'tfid_b__max_df': [.9, .95],
    'tfid_b__ngram_range': [(1,1), (1,4)],
    'tfid_b__stop_words':['english', None]
}

gs_b = GridSearchCV(pipe_b,params_b, cv=5, n_jobs=3)
    
gs_b.fit(X_train, y_train)
    
print(f'Gridsearch with TFIDF Vectorizer for training data is {gs_b.score(X_train, y_train)}')
print()
print(f'Gridsearch with TFIDF Vectorizer for test data is {gs_b.score(X_test, y_test)}')

Gridsearch with TFIDF Vectorizer for training data is 0.8952254641909815

Gridsearch with TFIDF Vectorizer for test data is 0.8650793650793651


In [48]:
gs_b.best_params_

{'tfid_b__max_df': 0.9,
 'tfid_b__max_features': 2000,
 'tfid_b__min_df': 3,
 'tfid_b__ngram_range': (1, 1),
 'tfid_b__stop_words': 'english'}

## SVC WITH COUNT VECTORIZER

In [23]:
pipe_c = Pipeline([
    ('cvec_c', CountVectorizer()),
    ('svc', SVC(C=2,gamma='scale'))
])

params_c = {
    'cvec_c__max_features': [2000, 3000, 4000, 5000],
    'cvec_c__min_df': [2, 3],
    'cvec_c__max_df': [.9, .95],
    'cvec_c__ngram_range': [(1,1), (1,4)],
    'cvec_c__stop_words':['english', None]
}

gs_c = GridSearchCV(pipe_c,params_c, cv=5, n_jobs=3)
    
gs_c.fit(X_train, y_train)
    
print(f'Gridsearch with Count Vectorizer for training data is {gs_c.score(X_train, y_train)}')
print()
print(f'Gridsearch with Count Vectorizer for test data is {gs_c.score(X_test, y_test)}')

Gridsearch with Count Vectorizer for training data is 0.9960212201591512

Gridsearch with Count Vectorizer for test data is 0.9126984126984127


In [26]:
# Measure performance based on accuracy.

accuracy_score(y_test, y_pred)

0.7936507936507936

In [29]:
# Generate confusion matrix.
# Documentation here: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
# tn, fp  positive = asking for help
# fn, tp  negative = not asking for help
confusion_matrix(y_test, # True values.
                 y_pred)  # Predicted values.

array([[195,   0],
       [ 52,   5]])

In [25]:
y_pred = gs_f.predict(X_test)

In [24]:
preds

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

## SVC WITH TFIDF VECTORIZER

In [50]:
pipe_d = Pipeline([
    ('tfid_d', TfidfVectorizer()),
    ('svc', SVC(C=2,gamma='scale'))
])

params_d = {
    'tfid_d__max_features': [2000, 3000, 4000, 5000],
    'tfid_d__min_df': [2, 3],
    'tfid_d__max_df': [.9, .95],
    'tfid_d__ngram_range': [(1,1), (1,4)],
    'tfid_d__stop_words':['english', None]
}

gs_d = GridSearchCV(pipe_d,params_d, cv=5, n_jobs=3)
    
gs_d.fit(X_train, y_train)
    
print(f'Gridsearch with Count Vectorizer for training data is {gs_d.score(X_train, y_train)}')
print()
print(f'Gridsearch with Count Vectorizer for test data is {gs_d.score(X_test, y_test)}')

Gridsearch with Count Vectorizer for training data is 1.0

Gridsearch with Count Vectorizer for test data is 0.9087301587301587


## MULTINOMIAL NAIVE BAYES

In [20]:
pipe_e = Pipeline([
    ('cvec_e', CountVectorizer()),
    ('mnb', MultinomialNB())
])

params_e = {}

gs_e = GridSearchCV(pipe_e, params_e, cv=5, n_jobs=3)

gs_e.fit(X_train, y_train)

print(f'Gridsearch with Count Vectorizer for training data is {gs_e.score(X_train, y_train)}')
print(f'Gridsearch with Count Vectorizer for test data is {gs_e.score(X_test, y_test)}')

Gridsearch with Count Vectorizer for training data is 0.980106100795756
Gridsearch with Count Vectorizer for test data is 0.9007936507936508


In [19]:
X_train.shape

(754,)

## GAUSSIAN NAIVE BAYES

In [11]:
pipe_f = Pipeline([
    ('tfid_e', TfidfVectorizer()),
    ('mnb', MultinomialNB())
])

params_f = {}

gs_f = GridSearchCV(pipe_f, params_f, cv=5, n_jobs=3)

gs_f.fit(X_train, y_train)

print(f'Gridsearch with Tfidf Vectorizer for training data is {gs_f.score(X_train, y_train)}')
print(f'Gridsearch with Tfidf Vectorizer for training data is {gs_f.score(X_train, y_train)}')

Gridsearch with Tfidf Vectorizer for training data is 0.8355437665782494
Gridsearch with Tfidf Vectorizer for training data is 0.8355437665782494
