In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
import nltk
from nltk.corpus import reuters
from nltk import bigrams, trigrams
from collections import Counter, defaultdict

In [3]:
data = pd.read_csv('CleanedDisasterTweets.csv')
target = pd.read_csv('TargetDisasterTweets.csv')
tweets = data[['Text']].values.ravel()
target = target[['target']].values.ravel()

In [4]:
tweets_train, tweets_test, target_train, target_test = train_test_split(tweets, target, random_state = 0)

In [5]:
# CountVectorizer unigram Train features 
cv=CountVectorizer(analyzer='word', strip_accents = 'ascii', ngram_range = (1,1))
cv_train =cv.fit_transform(tweets_train)
df_cv_train = pd.DataFrame(cv_train.toarray(), columns=cv.get_feature_names())
df_cv_train.head()

Unnamed: 0,aa,aaaa,aaaaaaallll,aaarrrgghhh,aaemiddleaged,aampb,aampw,aashiqui,ab,aba,...,zonewolf,zoom,zotar,zouma,zourryart,zss,zumiez,zurich,zxathetis,zzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# TfidfVectorizer unigram Train features
tfid = TfidfVectorizer(sublinear_tf=True, strip_accents = 'ascii', ngram_range = (1,1))
tfid_train = tfid.fit_transform(tweets_train)
df_tfid_train = pd.DataFrame.sparse.from_spmatrix(tfid_train, columns = tfid.get_feature_names())
df_tfid_train.head()

Unnamed: 0,aa,aaaa,aaaaaaallll,aaarrrgghhh,aaemiddleaged,aampb,aampw,aashiqui,ab,aba,...,zonewolf,zoom,zotar,zouma,zourryart,zss,zumiez,zurich,zxathetis,zzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# uni Test features
cv_test = cv.transform(tweets_test)
tfid_test = tfid.transform(tweets_test)

In [8]:
print('CountVectorizer Train features shape:', cv_train.shape, '| TfidVectorizer Train features shape:', tfid_train.shape, 
      '\nCountVectorizer Test features shape: ', cv_test.shape, '| TfidVectorizer Test features shape: ', tfid_test.shape)

CountVectorizer Train features shape: (5627, 17778) | TfidVectorizer Train features shape: (5627, 17778) 
CountVectorizer Test features shape:  (1876, 17778) | TfidVectorizer Test features shape:  (1876, 17778)


In [9]:
# Possible KFold evaluation

# folds = list(range(2,11))
# for k in folds:
#     kfold = KFold(n_splits=k,random_state = 0, shuffle=False)

In [10]:
# Tuning the hyperparameter "C"
Cs = [10**(x) for x in [-2,-1,0,1,2,3,4]]
for c in Cs:
    log = LogisticRegression(solver = 'liblinear', C = c, random_state = 0)
    log.fit(cv_train, target_train)
    # evaluate model
    log_score = cross_val_score(log, cv_test, target_test, scoring = 'accuracy', n_jobs=-1).mean()
    # report performance
    print(f'Accuracy for C = {c}: {round(log_score, 4)}')    

Accuracy for C = 0.01: 0.6253
Accuracy for C = 0.1: 0.7585
Accuracy for C = 1: 0.7799
Accuracy for C = 10: 0.7671
Accuracy for C = 100: 0.7527
Accuracy for C = 1000: 0.7489
Accuracy for C = 10000: 0.7425


In [11]:
# LogisticRegression with CountVectorizer uni features 
max_c = 1.0
log = LogisticRegression(solver = 'liblinear', C = max_c, random_state = 0)
log.fit(cv_train, target_train)
log_cv_prediction = log.predict(cv_test)
labels = ['negative', 'positive']
print(classification_report(target_test, log_cv_prediction))
pd.DataFrame(confusion_matrix(target_test, log_cv_prediction), index=labels, columns=labels)

              precision    recall  f1-score   support

           0       0.80      0.89      0.84      1092
           1       0.81      0.68      0.74       784

    accuracy                           0.80      1876
   macro avg       0.80      0.79      0.79      1876
weighted avg       0.80      0.80      0.80      1876



Unnamed: 0,negative,positive
negative,968,124
positive,247,537


In [12]:
# LogisticRegression with TfidVectorizer uni features
max_c = 1.0
log = LogisticRegression(solver = 'liblinear', C = max_c, random_state = 0)

log.fit(tfid_train, target_train)
log_tfid_prediction = log.predict(tfid_test)

labels = ['negative', 'positive']
print(classification_report(target_test, log_tfid_prediction))
pd.DataFrame(confusion_matrix(target_test, log_tfid_prediction), index=labels, columns=labels)

              precision    recall  f1-score   support

           0       0.79      0.91      0.84      1092
           1       0.84      0.66      0.74       784

    accuracy                           0.80      1876
   macro avg       0.81      0.78      0.79      1876
weighted avg       0.81      0.80      0.80      1876



Unnamed: 0,negative,positive
negative,992,100
positive,268,516


In [13]:
# CountVectorizer unigram and bigram features
cv=CountVectorizer(analyzer='word', strip_accents = 'ascii', ngram_range = (1,2))
cv_train =cv.fit_transform(tweets_train)
df_cv_train = pd.DataFrame(cv_train.toarray(), columns=cv.get_feature_names())
df_cv_train.head()

Unnamed: 0,aa,aa ayyo,aa batteries,aa httptcobbmsrwow,aa near,aaaa,aaaa ok,aaaaaaallll,aaaaaaallll iuam,aaarrrgghhh,...,zourryart forgot,zss,zss vs,zumiez,zumiez httptcozwjpvku,zurich,zurich swiss,zxathetis,zxathetis are,zzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# TfidfVectorizer unigram and bigram Train features
tfid = TfidfVectorizer(sublinear_tf=True, strip_accents = 'ascii', ngram_range = (1,2))
tfid_train = tfid.fit_transform(tweets_train)
df_tfid_train = pd.DataFrame.sparse.from_spmatrix(tfid_train, columns = tfid.get_feature_names())
df_tfid_train.head()

Unnamed: 0,aa,aa ayyo,aa batteries,aa httptcobbmsrwow,aa near,aaaa,aaaa ok,aaaaaaallll,aaaaaaallll iuam,aaarrrgghhh,...,zourryart forgot,zss,zss vs,zumiez,zumiez httptcozwjpvku,zurich,zurich swiss,zxathetis,zxathetis are,zzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Uni and bi Test features
cv_test = cv.transform(tweets_test)
tfid_test = tfid.transform(tweets_test)

In [16]:
print('CountVectorizer Train features shape:', cv_train.shape, '| TfidVectorizer Train features shape:', tfid_train.shape, 
      '\nCountVectorizer Test features shape: ', cv_test.shape, '| TfidVectorizer Test features shape: ', tfid_test.shape)

CountVectorizer Train features shape: (5627, 59813) | TfidVectorizer Train features shape: (5627, 59813) 
CountVectorizer Test features shape:  (1876, 59813) | TfidVectorizer Test features shape:  (1876, 59813)


In [17]:
# LogisticRegression with CountVectorizer uni and bi features 
max_c = 1.0
log = LogisticRegression(solver = 'liblinear', C = max_c, random_state = 0)
log.fit(cv_train, target_train)
log_cv_prediction = log.predict(cv_test)
labels = ['negative', 'positive']
print(classification_report(target_test, log_cv_prediction))
pd.DataFrame(confusion_matrix(target_test, log_cv_prediction), index=labels, columns=labels)

              precision    recall  f1-score   support

           0       0.79      0.91      0.84      1092
           1       0.84      0.67      0.74       784

    accuracy                           0.81      1876
   macro avg       0.81      0.79      0.79      1876
weighted avg       0.81      0.81      0.80      1876



Unnamed: 0,negative,positive
negative,990,102
positive,262,522


In [18]:
# LogisticRegression with TfidVectorizer uni and bi features
max_c = 1.0
log = LogisticRegression(solver = 'liblinear', C = max_c, random_state = 0)

log.fit(tfid_train, target_train)
log_tfid_prediction = log.predict(tfid_test)

labels = ['negative', 'positive']
print(classification_report(target_test, log_tfid_prediction))
pd.DataFrame(confusion_matrix(target_test, log_tfid_prediction), index=labels, columns=labels)

              precision    recall  f1-score   support

           0       0.78      0.92      0.84      1092
           1       0.85      0.64      0.73       784

    accuracy                           0.80      1876
   macro avg       0.81      0.78      0.79      1876
weighted avg       0.81      0.80      0.79      1876



Unnamed: 0,negative,positive
negative,1001,91
positive,283,501


In [19]:
# CountVectorizer bigram Train features
cv=CountVectorizer(analyzer='word', strip_accents = 'ascii', ngram_range = (2,2))
cv_train =cv.fit_transform(tweets_train)

# TfidfVectorizer bigram Train features
tfid = TfidfVectorizer(sublinear_tf=True, strip_accents = 'ascii', ngram_range = (2,2))
tfid_train = tfid.fit_transform(tweets_train)

In [20]:
# Bi Test features
cv_test = cv.transform(tweets_test)
tfid_test = tfid.transform(tweets_test)

In [21]:
print('CountVectorizer Train features shape:', cv_train.shape, '| TfidVectorizer Train features shape:', tfid_train.shape, 
      '\nCountVectorizer Test features shape: ', cv_test.shape, '| TfidVectorizer Test features shape: ', tfid_test.shape)

CountVectorizer Train features shape: (5627, 42035) | TfidVectorizer Train features shape: (5627, 42035) 
CountVectorizer Test features shape:  (1876, 42035) | TfidVectorizer Test features shape:  (1876, 42035)


In [22]:
# LogisticRegression with CountVectorizer bi features 
max_c = 1.0
log = LogisticRegression(solver = 'liblinear', C = max_c, random_state = 0)
log.fit(cv_train, target_train)
log_cv_prediction = log.predict(cv_test)
labels = ['negative', 'positive']
print(classification_report(target_test, log_cv_prediction))
pd.DataFrame(confusion_matrix(target_test, log_cv_prediction), index=labels, columns=labels)

              precision    recall  f1-score   support

           0       0.69      0.97      0.81      1092
           1       0.90      0.40      0.56       784

    accuracy                           0.73      1876
   macro avg       0.79      0.68      0.68      1876
weighted avg       0.78      0.73      0.70      1876



Unnamed: 0,negative,positive
negative,1056,36
positive,469,315


In [23]:
# LogisticRegression with TfidVectorizer bi features
max_c = 1.0
log = LogisticRegression(solver = 'liblinear', C = max_c, random_state = 0)
log.fit(tfid_train, target_train)
log_tfid_prediction = log.predict(tfid_test)
labels = ['negative', 'positive']
print(classification_report(target_test, log_tfid_prediction))
pd.DataFrame(confusion_matrix(target_test, log_tfid_prediction), index=labels, columns=labels)

              precision    recall  f1-score   support

           0       0.70      0.97      0.81      1092
           1       0.91      0.41      0.56       784

    accuracy                           0.74      1876
   macro avg       0.80      0.69      0.69      1876
weighted avg       0.78      0.74      0.71      1876



Unnamed: 0,negative,positive
negative,1060,32
positive,464,320
