In [None]:
import nltk 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import re
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report 
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV 
from nltk.tokenize import punkt  
from nltk.corpus import stopwords 
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
!pip install contractions
import contractions 
import string
import seaborn as sns 

In [None]:
# Show up to 100 characters per cell so tweet text is readable in previews.
pd.options.display.max_colwidth = 100

# Read the labelled training tweets and preview the first 30 rows.
raw_ = pd.read_csv(filepath_or_buffer='text-data/train.csv')
raw_.head(n=30)

In [None]:
# Summarize the raw frame: column names, dtypes and non-null counts.
raw_.info()

In [None]:
# Hold out part of the labelled tweets for evaluation. Only the 'text' column is kept as
# the feature frame; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    raw_[['text']],
    raw_['target'],
    random_state=42,
)
print(X_train.shape, X_test.shape)
# Peek at rows 50-69 (by position) of the shuffled training split.
X_train.iloc[50:70]

In [None]:
# Boolean masks flagging the rows where a keyword / a location is present.
non_null_kw = raw_['keyword'].notnull()
non_null_loc = raw_['location'].notnull()
# First 30 distinct keywords among the rows that have one.
raw_.loc[non_null_kw, 'keyword'].unique()[:30]

In [None]:
# First 30 distinct locations among the rows that have one.
raw_.loc[non_null_loc, 'location'].unique()[:30]

In [None]:
# Number of tweets per label value in the 'target' column.
raw_['target'].value_counts()

In [None]:
from wordcloud import WordCloud

# Concatenate every raw training tweet into one string and render its word cloud.
all_words = ' '.join(X_train['text'])
wordcloud = WordCloud(max_font_size=110, width=800, height=500).generate(all_words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()


#### Data preprocessing and manipulation 


In [None]:
# create a data cleaning function that tokenizes, 
# removes english stopwords and punctuations and returns tokenized text in lowercase 

def clean_text_lm(text):
    '''Normalize one tweet into a space-separated string of lowercase lemmas.

    Steps, in order: expand contractions, drop non-ASCII characters, strip
    URL-like tokens, drop every character that is not an ASCII letter or
    whitespace (this removes punctuation and digits), lowercase, lemmatize
    each token with WordNet, discard tokens of two characters or fewer, and
    finally discard English stopwords.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. NaN from a missing cell)
        is treated as an empty document.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; may be empty.
    '''
    # Missing values arrive as float NaN / None; there is nothing to clean.
    if not isinstance(text, str):
        return ''

    # expand contracted sentences
    doc = contractions.fix(text)
    # a set gives constant-time membership tests for the stopword filter below
    eng_stop = set(stopwords.words('english'))
    wn = nltk.WordNetLemmatizer()  # Instantiate word lemmatizer

    # remove any non-ascii symbols
    doc_noascii = doc.encode("ascii", errors="ignore").decode()
    # remove any links (the pattern also matches bare "word.word" tokens)
    doc_nourl = re.sub(r'(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-&?=%.]+', r'', doc_noascii)
    # remove any remaining special characters: the negated character class needs its
    # brackets, and the flags must be passed by keyword (the 4th positional argument of
    # re.sub is `count`, not `flags`). Punctuation is covered by this step as well.
    doc_nospchar = re.sub(r'[^a-zA-Z\s]', r'', doc_nourl, flags=re.I | re.A)

    # convert text to lower case and strip white space if any
    doc_lower_nospc = doc_nospchar.lower().strip()

    # lemmatize and store in list format
    lem_text = [wn.lemmatize(word) for word in re.split(r'\W+', doc_lower_nospc)]
    # drop very short tokens (one or two characters), including empty strings
    lem_text = [word for word in lem_text if len(word) > 2]

    # join list into string with no stopwords
    return ' '.join(word for word in lem_text if word not in eng_stop)

# Vectorize the function to apply across a dataframe column element by element
cleaner = np.vectorize(clean_text_lm)

In [None]:
# store clean text in separate column in df
# Pass the 'text' Series (1-D) rather than a one-column DataFrame so the cleaner yields
# exactly one string per row, and build the column with assign() so a fresh frame is
# produced instead of writing in place on the frame returned by train_test_split.
X_train = X_train.assign(cleaned_text=cleaner(X_train['text']))
X_train.head()

In [None]:
# clean up text in test set
# Pass the 'text' Series (1-D) rather than a one-column DataFrame so the cleaner yields
# exactly one string per row, and build the column with assign() so a fresh frame is
# produced instead of writing in place on the frame returned by train_test_split.
X_test = X_test.assign(cleaned_text=cleaner(X_test['text']))
X_test.head()

In [None]:
# Clean every tweet of the full training frame.
raw_['cleaned_text'] = cleaner(raw_['text'])

# Build the cloud only from tweets labelled as disasters (target == 1); .loc selects
# rows and column in one step instead of chained indexing.
disaster_words = ' '.join(raw_.loc[raw_['target'] == 1, 'cleaned_text'])
wordcloud = WordCloud(width=850, height=500, max_font_size=110).generate(disaster_words)
plt.figure(figsize=(10,7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# Writing the image fails when the folder is missing, so create it first if needed.
os.makedirs('images', exist_ok=True)
wordcloud.to_file('images/disaster_wordcloud.jpg');

This word cloud shows word frequency in tweets that are associated with disasters; the larger a word is drawn, the more frequently it is used.  
The cleaned text is now devoid of special characters and stopwords; however, it is still not ready to be vectorized. The first step is to tokenize the text, i.e., convert each sentence into a list of words. Many words also share a meaning and differ only in form (e.g., search, searching, searched), so I used a lemmatizer (WordNetLemmatizer) to map such variants to their root word. 

#### Feature engineering
As we can see from prior modeling, the accuracy seems to be stagnant at ~77% even though the parameters were tuned using cross validation. Creating new features from the dataset might help with this issue. 

Utilized the guide on [AnalyticsVidhya](https://www.analyticsvidhya.com/blog/2021/04/a-guide-to-feature-engineering-in-nlp/) as a reference for this task.

In [None]:
# Hand-crafted numeric features, each computed per tweet from the raw (uncleaned) text.
# Keys are the new column names; insertion order is the order the columns are added.
text_features = {
    # Length of text in characters
    'doc_len': len,
    # Number of whitespace-separated tokens
    'word_count': lambda tweet: len(tweet.split()),
    # Number of all-caps tokens per tweet
    'CAPS_len': lambda tweet: sum(1 for token in tweet.split() if token.isupper()),
    # Number of sentences found by NLTK's sentence tokenizer
    'sent_len': lambda tweet: len(nltk.sent_tokenize(tweet)),
    # Number of '#' markers, each with its trailing run of letters/digits (possibly empty)
    'hashtag_count': lambda tweet: len(re.findall(r'(\#[A-Za-z0-9]*)', tweet)),
    # Number of distinct whitespace-separated tokens
    'unique_word_count': lambda tweet: len(set(tweet.split())),
}
for column, extractor in text_features.items():
    raw_[column] = raw_['text'].apply(extractor)

raw_.head()


In [None]:
# Default bin edges: 30 edges spread over 0-100.
bins = np.linspace(0, 100, 30)

# One panel per length feature, comparing the two classes.
length_features = ['doc_len', 'word_count', 'unique_word_count']

# create subplots, figsize to control size of the image
fig, ax = plt.subplots(1, 3, figsize=(10, 5))
for axis, feature in zip(ax, length_features):
    # hist() ignores values that fall outside the bin edges, so stretch the upper edge
    # to the feature's maximum whenever it exceeds 100 (doc_len is a character count).
    feature_bins = np.linspace(0, max(100, raw_[feature].max()), 30)
    for target, label in ((1, 'disaster'), (0, 'non-disaster')):
        axis.hist(raw_.loc[raw_['target'] == target, feature],
                  feature_bins, alpha=0.5, label=label)
    # Label each panel so the figure can be read on its own.
    axis.set_title(feature)
    axis.set_xlabel('value per tweet')
    axis.set_ylabel('number of tweets')
    axis.legend(loc='best')
fig.tight_layout()
plt.show()

In [None]:
# These features are small integer counts, so use unit-wide bins (edges 0, 1, ..., 21).
# np.linspace(0, 21, 3) would yield only three edges, i.e. two bins, which hides the
# shape of the distributions.
small_bins = np.arange(0, 22)

# One panel per count feature, comparing the two classes.
count_features = ['CAPS_len', 'hashtag_count', 'sent_len']

fig, ax = plt.subplots(1, 3, figsize=(10, 5))
for axis, feature in zip(ax, count_features):
    for target, label in ((1, 'disaster'), (0, 'non-disaster')):
        axis.hist(raw_.loc[raw_['target'] == target, feature],
                  small_bins, alpha=0.5, label=label)
    # Label each panel so the figure can be read on its own.
    axis.set_title(feature)
    axis.set_xlabel('count per tweet')
    axis.set_ylabel('number of tweets')
    axis.legend(loc='best')
fig.tight_layout()
plt.show()

Since we are dealing with disasters, we can try to derive the sentiment of each tweet based on commonly used words that are associated with disasters, such as death, hurricane, flood, etc. 
NLTK conveniently ships a sentiment analyzer (VADER) that allows us to perform this kind of analysis. 


In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Fetch the lexicon the analyzer needs (a no-op once it has been downloaded).
nltk.download('vader_lexicon')
sent_analyze = SentimentIntensityAnalyzer()
# Work on an independent copy of the first 100 training rows: columns are added to
# `sample` afterwards, and writing to a bare slice of X_train is ambiguous in pandas
# (SettingWithCopyWarning).
sample = X_train[:100].copy()
sample.head()

In [None]:
# Element-wise wrapper around the analyzer's scoring method.
polar_ = np.vectorize(sent_analyze.polarity_scores)

# Score each cleaned tweet and keep the result next to the text.
sample_scores = polar_(sample['cleaned_text'])
sample['polarity'] = sample_scores
sample.head(5)

Looking at the generated output, the negative and compound scores are reasonably close to what we would like them to be. For example, for entry 7037 the negative polarity is ~0.506 and the compound score is ~-0.62. However, the entry above it ("war on drus..") has a higher negative polarity rating, which may make these scores less useful for our use case. It is still worth a try. 

In [None]:
# Instantiate CountVectorizer, bag of words model (unigrams only)
CountVec = CountVectorizer(analyzer='word', ngram_range=(1, 1))
# Learn the vocabulary on the training tweets and encode them as dense counts
train_cmatrix = CountVec.fit_transform(X_train['cleaned_text']).toarray()

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces
# it (available since 1.0). Fall back to the old name only on releases that lack the new one.
_feature_names_fn = getattr(CountVec, 'get_feature_names_out', None) or CountVec.get_feature_names
vocabulary = _feature_names_fn()

# convert count matrix to dataframe, one column per vocabulary term
train_cmatrix_df = pd.DataFrame(train_cmatrix, columns=vocabulary)

# encode the test tweets with the vocabulary learned on the training set
test_cmatrix = CountVec.transform(X_test['cleaned_text']).toarray()
test_cmatrix_df = pd.DataFrame(test_cmatrix, columns=vocabulary)
train_cmatrix_df.shape, test_cmatrix_df.shape

In [None]:
# Unigram + bigram bag of words: learn the vocabulary on the training tweets only,
# then encode both splits with it as dense count matrices.
uni_bigram_vec = CountVectorizer(ngram_range=(1, 2), analyzer='word')
uni_bigram_vec.fit(X_train['cleaned_text'])
unibigram_train_cmatrix, unibigram_test_cmatrix = (
    uni_bigram_vec.transform(split['cleaned_text']).toarray()
    for split in (X_train, X_test)
)
unibigram_train_cmatrix.shape, unibigram_test_cmatrix.shape

In [None]:
# Bigram + trigram bag of words: learn the vocabulary on the training tweets only,
# then encode both splits with it as dense count matrices.
bi_trigram_vec = CountVectorizer(ngram_range=(2, 3), analyzer='word')
bi_trigram_vec.fit(X_train['cleaned_text'])
bi_trigram_train_cmatrix, bi_trigram_test_cmatrix = (
    bi_trigram_vec.transform(split['cleaned_text']).toarray()
    for split in (X_train, X_test)
)
bi_trigram_train_cmatrix.shape, bi_trigram_test_cmatrix.shape

#### Classification of tweets 

In [None]:
# Baseline: random forest with default hyperparameters on the unigram counts.
rf_clf = RandomForestClassifier(n_jobs=-1, random_state=42)
rf_clf.fit(train_cmatrix, y_train)
# fit() returns the estimator itself, so this name refers to the same object as rf_clf.
rf_model = rf_clf

# Predict the held-out tweets.
y_hat = rf_model.predict(test_cmatrix)

# Per-class precision/recall/f1, followed by the confusion matrix (rows = true class).
baseline_report = classification_report(y_test, y_hat)
print(baseline_report)
confusion_ = pd.DataFrame(
    confusion_matrix(y_test, y_hat),
    index=['non-disaster', 'disaster'],
)
print(confusion_)

#### Hyperparameter tuning 
While the classifier's default parameters yielded decent precision, recall and f1-scores, we should check which configuration provides the best possible score with the RandomForest classifier. To do this, we will use grid search cross-validation to determine these parameters. 

In [None]:
# Exhaustive 5-fold grid search over split criterion, forest size and tree depth,
# selecting the combination with the best mean accuracy.
params = dict(
    criterion=['gini', 'entropy'],
    n_estimators=[50, 100, 150],
    max_depth=[50, 75, 100, None],
)
rf = RandomForestClassifier(random_state=42)
rf_gs = GridSearchCV(estimator=rf, param_grid=params, scoring='accuracy', cv=5, n_jobs=-1)
rf_gs.fit(train_cmatrix, y_train)

# Predict the held-out tweets with the refitted best configuration.
rf_grid_pred = rf_gs.predict(test_cmatrix)
best_params = rf_gs.best_params_
print("Best params: {}\n Best Score: {}".format(best_params, rf_gs.best_score_))
print(classification_report(y_test, rf_grid_pred))


Looks like default parameters are best for the classification using RandomForest. Trying out other classifiers and vectorizing methods to see if we can get better results

In [None]:
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
# NOTE: this pipeline is assembled here, but the search below is run on the forest
# alone, using the pre-computed count matrix.
steps = [('Cvect',CountVec),
         ('rf',rf)]
pipeline = Pipeline(steps=steps)

# Candidate values the randomized search samples from.
rf_param_grid = {'criterion':['gini','entropy'],
          'n_estimators':np.arange(50, 250, 50),
          'max_depth':np.arange(20, 500, 25)
         }
# Find best params using RandomizedSearchCV.
# n_iter is the number of sampled configurations: with a single draw there is nothing
# to compare, so several are evaluated. random_state fixes which configurations are
# drawn so that re-running the notebook reproduces the same result.
rf_randomizedcv_roc_auc = RandomizedSearchCV(rf,
                                             param_distributions=rf_param_grid,
                                             n_iter=10,
                                             scoring="roc_auc",
                                             verbose=1,
                                             cv=5,
                                             n_jobs=-1,
                                             random_state=42)
rf_randomizedcv_roc_auc.fit(train_cmatrix,y_train)
print(rf_randomizedcv_roc_auc.best_score_)
print(rf_randomizedcv_roc_auc.best_estimator_)
# Forest refitted on the full training matrix with the best sampled configuration.
best_rf_clf = rf_randomizedcv_roc_auc.best_estimator_



In [None]:
# A classifier produces labels through predict(); RandomForestClassifier has no
# transform() method.
y_hat = best_rf_clf.predict(test_cmatrix)
print(classification_report(y_test, y_hat))
# Confusion matrix with the true classes on the rows, drawn as a heatmap.
confusion_ = pd.DataFrame(confusion_matrix(y_test, y_hat),index=['non-disaster','disaster'])
sns.heatmap(confusion_)

In [None]:
# Rerun RF classifier with optimized parameters (tree depth capped at 70).
rf_clf = RandomForestClassifier(n_jobs=-1, verbose=1, random_state=42, max_depth=70)

# Train on the unigram counts; fit() returns the estimator itself, so rf_model refers
# to the same object as rf_clf.
rf_clf.fit(train_cmatrix, y_train)
rf_model = rf_clf

# Predict the held-out tweets and report per-class metrics plus the confusion matrix.
y_hat = rf_model.predict(test_cmatrix)
optimized_report = classification_report(y_test, y_hat)
print(optimized_report)
confusion_ = pd.DataFrame(
    confusion_matrix(y_test, y_hat),
    index=['non-disaster', 'disaster'],
)
print(confusion_)

In [None]:
# uni-bigram model: retrain the forest on the unigram + bigram counts.
# NOTE: fit() returns the estimator itself, so rf_model_unibigram is the same object as
# rf_clf; any later refit of rf_clf also changes what this name points to.
rf_clf.fit(unibigram_train_cmatrix, y_train)
rf_model_unibigram = rf_clf

unibigram_y_hat = rf_model_unibigram.predict(unibigram_test_cmatrix)
unibigram_report = classification_report(y_test, unibigram_y_hat)
print(unibigram_report)
confusion_ = pd.DataFrame(
    confusion_matrix(y_test, unibigram_y_hat),
    index=['non-disaster', 'disaster'],
)
print(confusion_)

In [None]:
# bi-trigram model
from sklearn.base import clone

# Train an independent copy of the configured forest: fit() returns the estimator
# itself, so refitting rf_clf directly would overwrite the model trained before it.
rf_model_bitrigram = clone(rf_clf).fit(bi_trigram_train_cmatrix, y_train)
# Predict with the bi-trigram model (not the uni-bigram one) on the bi-trigram features.
bi_trigram_y_hat = rf_model_bitrigram.predict(bi_trigram_test_cmatrix)
print(classification_report(y_test, bi_trigram_y_hat))
confusion_ = pd.DataFrame(confusion_matrix(y_test, bi_trigram_y_hat),index=['non-disaster','disaster'])
print(confusion_)

In [None]:
from sklearn.base import clone
from sklearn.metrics import auc, roc_curve
from sklearn.preprocessing import label_binarize

# Calculate the y_score.
# The scoring model must have been trained on the same feature space it is scored on.
# rf_clf is refit on several different matrices in this notebook, so train a dedicated
# copy on the uni+bigram counts and score the matching uni+bigram test matrix.
roc_model = clone(rf_clf).fit(unibigram_train_cmatrix, y_train)
rf_y_score = roc_model.predict_proba(unibigram_test_cmatrix)

# Binarize the output. With two classes label_binarize returns a single column that
# flags the positive class (1), so n_classes is 1 here.
rf_y_test_bin = label_binarize(y_test, classes=[0,1])
n_classes = rf_y_test_bin.shape[1]

# That indicator column must be paired with P(class 1); column 0 of predict_proba is
# P(class 0) and would yield the mirrored curve.
positive_col = list(roc_model.classes_).index(1)

fpr = dict()
tpr = dict()
roc_auc = dict()

# create ROC curve for the positive (disaster) class
fpr[0], tpr[0], _ = roc_curve(rf_y_test_bin[:, 0], rf_y_score[:, positive_col])
roc_auc[0] = auc(fpr[0], tpr[0])
plt.plot(fpr[0], tpr[0], color='darkorange', lw=2)
print('AUC for Class {}: {}'.format(1, roc_auc[0]))

# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], color='maroon', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic Curves')
plt.show()



Drew inspiration for code from [laurenlizz22](https://laurenliz22.github.io/roc_curve_multiclass_predictions_random_forest_classifier) 

##### Analysis of results
Our optimized model was able to distinguish disaster from non-disaster tweets with ~<b>77%</b> accuracy.
<br>
From the confusion matrix, there seems to be a marked difference in the classification of true negatives and positives. However, we do not see any improvements in the precision, recall or f1-scores when compared to the default model. We also notice a drastic difference in recall scores between the 0 (non-disaster) and 1 (disaster) labels. One reason might be the class imbalance in the training dataset. 
<br>
It might be worthwhile to look into other classifiers as well as adding features to improve our classification. 


#### Using TF-IDF vectorizer and gradient boosting classifier 


In [None]:
gb = GradientBoostingClassifier(random_state=42)
# The input column is already cleaned, so the built-in word analyzer is used. A callable
# analyzer must return a sequence of tokens; clean_text_lm returns a single string, which
# the vectorizer would iterate character by character.
# min_df=2 drops terms seen in fewer than two tweets; max_df=0.9 drops terms present in
# more than 90% of them.
tf_vect = TfidfVectorizer(analyzer='word', min_df=2, max_df=0.9)
# Learn vocabulary and IDF weights on the training tweets, then encode both splits.
train_tfidf_matrix = tf_vect.fit_transform(X_train['cleaned_text']).toarray()
test_tfidf_matrix = tf_vect.transform(X_test['cleaned_text']).toarray()
gb.fit(train_tfidf_matrix,y_train)
gb_y_hat = gb.predict(test_tfidf_matrix)
print(classification_report(y_test, gb_y_hat))
confusion_ = pd.DataFrame(confusion_matrix(y_test, gb_y_hat),index=['non-disaster','disaster'])
print(confusion_)

In [None]:
# Candidate values the randomized search samples from.
gbm_param_grid = {
    'learning_rate': np.arange(0.05, 1, 0.05),
    'max_depth': np.arange(3, 10, 1),
    'n_estimators': np.arange(50, 200, 50)
}
# n_iter is the number of sampled configurations: with a single draw there is nothing
# to compare, so several are evaluated. random_state fixes which configurations are
# drawn so that re-running the notebook reproduces the same result.
gb_randomizedcv_roc_auc = RandomizedSearchCV(gb,
                                             param_distributions=gbm_param_grid,
                                             n_iter=10,
                                             scoring="roc_auc",
                                             verbose=1,
                                             cv=5,
                                             n_jobs=-1,
                                             random_state=42)
gb_randomizedcv_roc_auc.fit(train_tfidf_matrix,y_train)
# Predict the held-out tweets with the refitted best configuration.
grid_pred_gbm = gb_randomizedcv_roc_auc.predict(test_tfidf_matrix)
best_params_gbm = gb_randomizedcv_roc_auc.best_params_
# Report the score of this search object (the name used here must be the one defined above).
print("Best params: {}\n Best Score: {}".format(best_params_gbm,gb_randomizedcv_roc_auc.best_score_))
print(classification_report(y_test,grid_pred_gbm))