# Predicting Disaster Tweets
Flatiron School Data Science: Project 4

Advanced Machine Learning Topics
- **Author**: Zaid Shoorbajee
- **Instructor**: Morgan Jones
- **Pace**: Flex, 40 weeks

## Overview and Business Understanding
Lorem ipsum
## Data Understanding
Lorem ipsum
### Dataset and features
Lorem ipsum
### Target variable
Lorem ipsum
### Scoring
Lorem ipsum

In [1]:
import time
start_time = time.time()

In [2]:
import numpy as np
import pandas as pd

from nltk import FreqDist
from nltk.tokenize import \
    regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

from scipy.stats import ttest_ind
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import plot_confusion_matrix, f1_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

from keras import models
from keras import layers
from keras import regularizers
from keras.metrics import Precision, Recall, AUC
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.wrappers.scikit_learn import KerasRegressor
from keras.layers import LeakyReLU

import re
from collections import OrderedDict, defaultdict, Counter
import itertools
import string

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
%matplotlib inline

pd.options.display.max_colwidth = 150
seed = 7

In [3]:
# Need to revise headers


## Data Preparation

In [4]:
# Loading and previewing the dataset

df = pd.read_csv('./data/disaster_tweets/train.csv')
df.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1
5,8,,,#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires,1
6,10,,,"#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas",1
7,13,,,I'm on top of the hill and I can see a fire in the woods...,1
8,14,,,There's an emergency evacuation happening now in the building across the street,1
9,15,,,I'm afraid that the tornado is coming to our area...,1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [6]:
df.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

### Keyword and location columns

The `location` column doesn't have much usable information; in some cases it's just nonsense:

In [7]:
print('Sample of some of the location values')
df['location'].unique()[7:17].tolist()

Sample of some of the location values


['World Wide!!',
 'Paranaque City',
 'Live On Webcam',
 'milky way',
 'GREENSBORO,NORTH CAROLINA',
 'England.',
 'Sheffield Township, Ohio',
 'India',
 'Barbados',
 'Anaheim']

Therefore I can't use the `location` column without some pre-processing. I will fill the missing values with `location_missing` for now. Later, when I'm processing more of the text data, I'll extract some information from this column.

In [8]:
df['location'] = df['location'].fillna('location_missing')

The `keyword` column shows what was used to search for relevant tweets. This column can give us insight as to what kinds of tweets the keywords yield.

In [9]:
df['keyword'].value_counts(dropna=False)
# Counter(df['keyword'])[np.NaN]

NaN                      61
fatalities               45
deluge                   42
armageddon               42
harm                     41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: keyword, Length: 222, dtype: int64

`keyword` has 61 missing values

In [10]:
# Filling missing values with 'missing_keyword' so I can visualize.

df['keyword'] = df['keyword'].fillna('missing_keyword')

In [11]:
df['keyword'] = df['keyword'].apply(lambda x: x.replace('%20', ' '))

In [12]:
# df['keyword_count_0'] = \
#     df['keyword'].apply(
#         lambda x: Counter(df['keyword'][df['target'] == 0])[x]
#         )
# df['keyword_count_1'] = \
#     df['keyword'].apply(
#         lambda x: Counter(df['keyword'][df['target'] == 1])[x]
#         )

In [13]:
# fig, axes = plt.subplots(ncols=2, figsize=(15, 50))

# sns.countplot(
#     y=df[df['target']==0].sort_values(
#         by='keyword_count_0', ascending=False
#         )['keyword'],
#     color='blue',
#     ax=axes[0]
#     )
# sns.countplot(
#     y=df[df['target']==1].sort_values(
#         by='keyword_count_1', ascending=False
#         )['keyword'],
#     color='red',
#     ax=axes[1]
#     )

# axes[0].set_title('Non-disaster tweets')
# axes[1].set_title('Disaster tweets')

# plt.tight_layout()
# plt.show()

### Tweets and Target
The main feature is `text`, which is the full text of the given tweet. Each tweet is labeled in `target` as referring to a disaster (1) or not (0).
#### Duplicate tweets
Before proceeding, I'll check if there are any tweets that are duplicated.

In [14]:
df['text'].duplicated().sum()

110

There are numerous duplicate tweets, perhaps tweeted by different accounts. The most concerning part of this is that some of these duplicates have contradicting labels. Here are a few examples:

In [15]:
df[df['text'] == "He came to a land which was engulfed in tribal war and turned it into a land of peace i.e. Madinah. #ProphetMuhammad #islam"]

Unnamed: 0,id,keyword,location,text,target
3240,4656,engulfed,location_missing,He came to a land which was engulfed in tribal war and turned it into a land of peace i.e. Madinah. #ProphetMuhammad #islam,0
3243,4659,engulfed,Kuwait,He came to a land which was engulfed in tribal war and turned it into a land of peace i.e. Madinah. #ProphetMuhammad #islam,1
3248,4669,engulfed,Bahrain,He came to a land which was engulfed in tribal war and turned it into a land of peace i.e. Madinah. #ProphetMuhammad #islam,1
3251,4672,engulfed,location_missing,He came to a land which was engulfed in tribal war and turned it into a land of peace i.e. Madinah. #ProphetMuhammad #islam,0
3261,4684,engulfed,location_missing,He came to a land which was engulfed in tribal war and turned it into a land of peace i.e. Madinah. #ProphetMuhammad #islam,0
3266,4691,engulfed,location_missing,He came to a land which was engulfed in tribal war and turned it into a land of peace i.e. Madinah. #ProphetMuhammad #islam,0


In [16]:
df[df['text'] == ".POTUS #StrategicPatience is a strategy for #Genocide; refugees; IDP Internally displaced people; horror; etc. https://t.co/rqWuoy1fm4"]

Unnamed: 0,id,keyword,location,text,target
2830,4068,displaced,Pedophile hunting ground,.POTUS #StrategicPatience is a strategy for #Genocide; refugees; IDP Internally displaced people; horror; etc. https://t.co/rqWuoy1fm4,1
2831,4072,displaced,Pedophile hunting ground,.POTUS #StrategicPatience is a strategy for #Genocide; refugees; IDP Internally displaced people; horror; etc. https://t.co/rqWuoy1fm4,1
2832,4076,displaced,Pedophile hunting ground,.POTUS #StrategicPatience is a strategy for #Genocide; refugees; IDP Internally displaced people; horror; etc. https://t.co/rqWuoy1fm4,0
2833,4077,displaced,Pedophile hunting ground,.POTUS #StrategicPatience is a strategy for #Genocide; refugees; IDP Internally displaced people; horror; etc. https://t.co/rqWuoy1fm4,1


In [17]:
df[df['text'] == "Caution: breathing may be hazardous to your health."]

Unnamed: 0,id,keyword,location,text,target
4232,6012,hazardous,location_missing,Caution: breathing may be hazardous to your health.,1
4235,6017,hazardous,location_missing,Caution: breathing may be hazardous to your health.,0


In [18]:
df[df['text'] == "like for the music video I want some real action shit like burning buildings and police chases not some weak ben winston shit"]

Unnamed: 0,id,keyword,location,text,target
1221,1760,buildings burning,dallas,like for the music video I want some real action shit like burning buildings and police chases not some weak ben winston shit,1
1349,1950,burning buildings,dallas,like for the music video I want some real action shit like burning buildings and police chases not some weak ben winston shit,0


It seems that some of these tweets were labeled sloppily or are difficult to interpret. In any case, having identical tweets labeled differently will cause unwanted noise in the model. Therefore, I'll drop every copy of each duplicated tweet, which removes about 2% of the dataset (179 of 7,613 rows).

In [19]:
# Proportion of duplicate tweets

df['text'].duplicated().sum() / len(df)

0.0144489688690398

In [20]:
df = df.drop_duplicates(subset = 'text', keep=False)

#### Separating features from target

In [21]:
tweets = df.drop(columns='target')
target = df['target']

In [22]:
print('Distribution of Target')
print()
print(target.value_counts())
print()
print('Normalized:')
print(target.value_counts(normalize=True))

Distribution of Target

0    4284
1    3150
Name: target, dtype: int64

Normalized:
0    0.576271
1    0.423729
Name: target, dtype: float64


About 42% of the tweets have been labeled as actual disaster tweets. This is not a major imbalance, and thus I won't need to artificially rebalance the dataset.

#### Train-Test Split

In [23]:
# Hold out 20% of the rows; the remaining 80% becomes the training set

X_train, X_holdout, y_train, y_holdout = train_test_split(
    tweets, target, test_size=0.2, random_state=seed
)

# Halve the hold-out rows into a test set and a validation set

X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout, test_size=.5, random_state=seed
)

In [24]:
# Secret

In [82]:
X_train['keyword']

39                 ablaze
418              arsonist
4916               mayhem
2126               deaths
1054            body bags
              ...        
5848                 ruin
2600            destroyed
545             avalanche
1245    buildings on fire
4399            hijacking
Name: keyword, Length: 5947, dtype: object

In [27]:
ohe_keyword = OneHotEncoder(sparse=False, handle_unknown='ignore')
ohe_keyword.fit(X_train[['keyword']])

OneHotEncoder(handle_unknown='ignore', sparse=False)

In [88]:
pd.DataFrame(ohe_keyword.transform(X_train[['keyword']]), index=X_train.index, columns=ohe_keyword.get_feature_names())

Unnamed: 0,x0_ablaze,x0_accident,x0_aftershock,x0_airplane accident,x0_ambulance,x0_annihilated,x0_annihilation,x0_apocalypse,x0_armageddon,x0_army,...,x0_weapons,x0_whirlwind,x0_wild fires,x0_wildfire,x0_windstorm,x0_wounded,x0_wounds,x0_wreck,x0_wreckage,x0_wrecked
39,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
418,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4916,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1054,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5848,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2600,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1245,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# End Secret

### Cleaning and tokenizing the tweet text

In [None]:
# Saving a copy of untouched tweets
X_train_tweets_unprocessed = X_train.copy()['text'] 

X_test_tweets_unprocessed = X_test.copy()['text'] 

X_val_tweets_unprocessed = X_val.copy()['text'] 

In [None]:
# Make everything lowercase

X_train['text'] = X_train['text'].apply(lambda x: x.lower())

In [None]:
X_train.head(10)

### Dealing with URLs

Many tweets contain URLs, which, from an NLP standpoint, are essentially random strings and thus won't be useful as vectorized tokens. But before I remove them, I will turn the presence of a URL into a binary feature.

In [None]:
def binary_url(text):
    """
    Flag whether a string contains a URL.

    A URL is taken to be the substring 'http' followed by at least one
    non-whitespace character. Returns 1 when such a substring is present
    and 0 otherwise.
    """
    return 1 if re.search(r'http\S+', text) else 0

# Making a series indicating whether the tweet has a URL. I will use this later when extracting more meta-features.

has_url_Series_train = X_train['text'].apply(binary_url).rename('has_url')

In [None]:
# Stripping every URL out of the tweet text

url_pattern = re.compile(r'http\S+')
X_train['text'] = X_train['text'].apply(lambda tweet: url_pattern.sub('', tweet))

### Tokenizing tweets

#### I plan to make multiple tokenized versions of each tweet.
* Basic version: Any word with at least two letters. Strips the symbols for hashtags (#) and mentions(@)
* Basic version, excluding stop words
* Lemmatized version of basic version
* Lemmatized version of basic version, excluding stop words.

In [None]:
example_tweet = X_train['text'].loc[1245]
example_tweet

In [None]:
token_pattern = r"[a-zA-Z]+'?[a-zA-Z]+|\b[iIaA]\b"
# Pattern: Any word with at least two letters, optionally containing one
# apostrophe between letters (e.g. "don't").
# Also captures the one-letter English words "I" and "a".

tokenizer = RegexpTokenizer(token_pattern)

Let's see how well the tokenizer does with the pattern I used above.

In [None]:
print(tokenizer.tokenize(example_tweet))

That seems to work. However, 'amp' is an artifact of "&amp;", which is the HTML entity for an ampersand ("&").

I will replace any instance of "&amp;" with the word "and" in the original text.

In [None]:
X_train['text'] = X_train['text'].apply(lambda x: re.sub("&amp;", "and", x))

In [None]:
X_train['tokens'] = X_train['text'].apply(tokenizer.tokenize)

In [None]:
# Check most common tokens

top_20_tokens = FreqDist(X_train['tokens'].explode()).most_common(20)
top_20_tokens

#### All tweets have been tokenized. Now I will make a version without stop words.

To make a comprehensive list of stop words, I will combine the default lists from the NLTK and SpaCy libraries.

In [None]:
# Stop words from SpaCy's English defaults
sw_spacy = list(nlp.Defaults.stop_words)
# Stop words from NLTK's English corpus
sw_nltk = stopwords.words('english')
# Union of both sources with duplicates removed, stored as a list
stopword_list = list(set(sw_spacy + sw_nltk))

In [None]:
# Keep only the tokens that are not in the stop word list
X_train['tokens_no_sw'] = X_train['tokens'].apply(
    lambda toks: [tok for tok in toks if tok not in stopword_list]
)

In [None]:
# Check most common tokens (without stop words)

FreqDist(X_train['tokens_no_sw'].explode()).most_common(20)

"i'm" should also be considered a stop word

In [None]:
stopword_list.extend(["i'm"])

In [None]:
# Rebuild the stop-word-free tokens using the current stop word list
X_train['tokens_no_sw'] = X_train['tokens'].apply(
    lambda toks: [tok for tok in toks if tok not in stopword_list]
)

In [None]:
# Check most common tokens (without stop words)

top_20_tokens_no_sw = FreqDist(X_train['tokens_no_sw'].explode()).most_common(20)
top_20_tokens_no_sw

Previewing the resulting features:

In [None]:
X_train.head()

### Lemmatizing tweets

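Now I want to make a version of these tokenized tweets where each word is lemmatized. **Lemmatization** is the process of reducing a word to its dictionary base form (its *lemma*), so that inflected forms such as "fires" and "burning" are counted as "fire" and "burn".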

I will use the tokenizer I made above, then the SpaCy library's lemmatizer to do this. Essentially I am making my own tokenizing function, where the output is lemmatized tokens instead of just plain tokens.

In [None]:
def spacy_lemmatize(text):
    """
    Tokenize a string and return the lowercased lemma of every token.

    The text is split into sentences with SpaCy. Each sentence is reduced
    to the tokens matched by the NLTK regexp tokenizer, re-joined with
    spaces, and parsed by SpaCy again to obtain the lemmas. Lemmas from all
    sentences are returned in order as one flat list.
    """
    lemmas = []
    for sent in nlp(text).sents:
        cleaned_sentence = ' '.join(tokenizer.tokenize(sent.text))
        lemmas.extend(t.lemma_.lower() for t in nlp(cleaned_sentence))
    return lemmas

In [None]:
# Lemmatized tokens, including stop words

X_train['lemmas'] = X_train['text'].apply(spacy_lemmatize)

In [None]:
# Check most common lemmas

top_20_lemmas = FreqDist(X_train['lemmas'].explode()).most_common(20)
top_20_lemmas

In order to make the lemmatized tokens without stop words, I will also need to lemmatize the stop words.

This has to be done to the stop words while they are still within the string because  SpaCy uses grammatical context to lemmatize.

In [None]:
# Lemmatized version of stop words based on training set

# For every tweet, parse its re-joined tokens and collect the lemma of each
# token whose text is a stop word.
sw_lem_Series = X_train['tokens'].apply(
    lambda toks: [
        t.lemma_.lower()
        for t in nlp(' '.join(toks))
        if t.text in stopword_list
    ]
)

# Flatten the per-tweet lists and keep each lemma once
stopword_list_lem = list(set(itertools.chain.from_iterable(sw_lem_Series)))

In [None]:
# Lemmatized tokens, excluding stop words

X_train['lemmas_no_sw'] = X_train['lemmas'].apply(
    lambda x: [l for l in x if l not in stopword_list_lem]
    )

In [None]:
# Check most common lemmas (without stop words)

top_20_lemmas_no_sw = FreqDist(X_train['lemmas_no_sw'].explode()).most_common(20)
top_20_lemmas_no_sw

### Visualizing and comparing frequency
Here, I'll visualize the top 20 words in `X_train` for disaster tweets vs. non-disaster tweets. I should be able to see which words the classes have in common, as well as which processed version of the tweets is most different.

In [None]:
def plot_freqdict_classes(series, y, cutoff=20):
    """
    Draw side-by-side bar charts of the most frequent tokens in each class.

    Parameters
    ----------
    series : Series of token lists; its `name` appears in the plot titles.
    y : labels used to mask `series`; 0 is plotted on the left as
        non-disaster and 1 on the right as disaster.
    cutoff : number of most common tokens to show per class.

    Bars for tokens found in both classes' top lists are colored 'C6';
    all other bars are colored 'C0'.
    """
    fig, axes = plt.subplots(ncols=2, figsize=(10, 5))
    titles = (
        f'Top {cutoff} {series.name.upper()} (Non-Disaster)',
        f'Top {cutoff} {series.name.upper()} (Disaster)',
    )

    # Most common (token, count) pairs per class, reversed so the most
    # frequent token comes last.
    ranked = [
        FreqDist(series[y == label].explode()).most_common(cutoff)[::-1]
        for label in (0, 1)
    ]
    tokens_per_class = [[tok for tok, _ in pairs] for pairs in ranked]
    freqs_per_class = [[count for _, count in pairs] for pairs in ranked]
    shared_tokens = [t for t in tokens_per_class[0] if t in tokens_per_class[1]]

    # Legend handles: one swatch per bar color
    custom_bars = [Line2D([0], [0], color='C6', lw=10), Line2D([0], [0], color='C0', lw=10)]

    for ax, tokens, freqs, title in zip(axes, tokens_per_class, freqs_per_class, titles):
        bar_colors = ['C6' if tok in shared_tokens else 'C0' for tok in tokens]
        ax.barh(y=tokens, width=freqs, color=bar_colors)
        ax.set_xlabel('Frequency', size=10)
        ax.set_title(title)
        ax.legend(custom_bars, ['In common', 'Not in common'])
    axes[0].set_ylabel('Tokens', size=10)

    plt.tight_layout()
    plt.show()

In [None]:
plot_freqdict_classes(X_train['tokens'], y_train)
plot_freqdict_classes(X_train['tokens_no_sw'], y_train)
plot_freqdict_classes(X_train['lemmas'], y_train)
plot_freqdict_classes(X_train['lemmas_no_sw'], y_train)

I can tell from the charts above that the version of the tweets whose token frequencies differ most between the classes is the **lemmatized tweets with no stop words.** In the top 20 tokens from each class, there are only two shared tokens. In contrast, if stop words are not removed, most tokens are shared between classes.

### More linguistic feature engineering

I will use the SpaCy library to extract more linguistic features from the tweets.

### Vectorized part of speech (POS) tags

The SpaCy library is pre-trained to parse through sentences and identify each word's grammatical part of speech.

Here are some examples of what the tool can identify:

In [None]:
# POS VISUALIZATIONS
#
# 
# 
# 
# 
# 
#

In [None]:
# Pick five training tweets and rebuild each one from its regexp tokens,
# sentence by sentence, joining the cleaned sentences with '. '.
# The sample is seeded so the same examples appear on every run.
secret_sample = (
    X_train['text']
    .sample(5, random_state=seed)
    .apply(lambda x: [s.text for s in nlp(x).sents])
    .apply(lambda x: [' '.join(tokenizer.tokenize(s)) for s in x])
    .apply(lambda x: '. '.join(x))
)
secret_sample

In [None]:
# Rendering options for displaCy's dependency visualizer.
# 'color' must be a valid CSS color, so the hex value needs its leading '#'.
options_displacy = {'compact':True, 'distance':110, 'bg':'#3056ff', 'color':'#fff'}

# Render one dependency parse per sampled tweet
for tweet in secret_sample:
    displacy.render(nlp(tweet), style='dep', jupyter=True, options=options_displacy)

I plan to use this tool in order to vectorize the detailed [parts of speech](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html) of each tweet.

First, I'll convert each token into a string of its POS tag.

In [None]:
# Spacy POS

def spacy_pos(text):
    """
    Map a string to the part-of-speech tag (SpaCy's `pos_`) of each token.

    The text is split into sentences with SpaCy. Each sentence is reduced
    to the tokens matched by the regexp tokenizer, re-joined with spaces,
    and parsed by SpaCy to tag every token. Tags from all sentences are
    returned in order as one flat list.
    """
    tags = []
    for sent in nlp(text).sents:
        cleaned_sentence = ' '.join(tokenizer.tokenize(sent.text))
        tags += [t.pos_ for t in nlp(cleaned_sentence)]
    return tags

X_train['text_as_POS'] = X_train['text'].apply(spacy_pos)

In [None]:
X_train.head()

In order to make the POS vectors, I'm using scikit-learn's CountVectorizer in a slightly unorthodox way. I'm using it to count POS tags rather than tokens.

In [None]:
# Making dataframes of vectorized POS tags

pos_vectorizer = CountVectorizer(tokenizer=spacy_pos)
pos_vec_train = pos_vectorizer.fit_transform(X_train['text'])
pos_vec_df_train = pd.DataFrame(
        pos_vec_train.toarray(),
        columns=pos_vectorizer.get_feature_names(),
        index=X_train.index
)

In [None]:
pos_vec_df_train.head()

This matrix of parts of speech will hopefully be useful to the model, but I can take it even further. I'm going to look at disparities in the proportions of POS tags between each class.

In [None]:
# Divide each tweet's POS counts by that tweet's total count, so every row
# holds proportions instead of raw counts.
# NOTE(review): a tweet with no counted tags has a row sum of 0, which makes
# its whole row NaN here — confirm these rows are handled before modelling.
pos_vec_df_norm_train = pos_vec_df_train.div(pos_vec_df_train.sum(axis=1), axis=0)
# Suffix the column names so they can sit alongside the raw counts
pos_vec_df_norm_train.columns = pos_vec_df_train.columns + '_norm'
pos_vec_df_norm_train.head()

In [None]:
fig, ax = plt.subplots(figsize=(5,7))

# Overlay the per-class mean proportion of every POS tag on a single axis
pos_plot_0 = pos_vec_df_norm_train[y_train==0].mean().plot(
    kind='barh',
    ax=ax,
    color='blue', alpha=0.5,
    label='Non-disaster'
)
pos_plot_1 = pos_vec_df_norm_train[y_train==1].mean().plot(
    kind='barh',
    ax=ax,
    color='orange', alpha=0.5,
    label='Disaster'
)
ax.set_title('Average proportion of POS tags in each tweet')
# Take the tick labels from the frame that is actually plotted (minus its
# '_norm' suffix) so labels and bars always match, even when this cell is
# re-run after pos_vec_df_train has gained extra columns.
ax.set_yticklabels([col[:-len('_norm')] for col in pos_vec_df_norm_train.columns])
ax.legend()
plt.show()

Between the two classes, there are noticeable disparities with the following POS tags:

* ADP: adposition
* ADV: adverb
* AUX: auxiliary
* NOUN: Noun
* PRON: Pronoun
* PROPN: Proper noun

In [None]:
interesting_tags = ['ADP_norm', 'ADV_norm', 'AUX_norm', 'NOUN_norm', 'PRON_norm', 'PROPN_norm']

# Append the normalized columns for the tags listed above to the raw counts.
# Any previously joined copies are dropped first so this cell can be re-run:
# a plain join raises when the column names overlap.
pos_vec_df_train = (
    pos_vec_df_train
    .drop(columns=interesting_tags, errors='ignore')
    .join(pos_vec_df_norm_train[interesting_tags])
)

In [None]:
pos_vec_df_train.head()

In [None]:
# MAKE THIS A SEPARATE DF OR TACK ONTO META?
# CHANGE ORDER OF META AND LINGUISTIC?
#
# 
# 
# 
# 
# 
#


In [None]:
# VISUALIZE?
# PROPORTIONS?
#
# 
# 
# 
# 
# 
# #

### Vectorized named-entity recognition (NER) tags 
SpaCy has the capability of recognizing "named-entities" such as places, companies, dates, people, and more. Here are some examples of what the tool can identify.

In [None]:
# VISUALIZE NER EXAMPLES
#
# 
# 
#

For the purposes of identifying disaster tweets, here are the [NER tags](https://towardsdatascience.com/explorations-in-named-entity-recognition-and-was-eleanor-roosevelt-right-671271117218) I am interested in:

* **GPE**: Countries, cities, states.
* **LOC**: Non-GPE locations, mountain ranges, bodies of water.
* **NORP**: Nationalities or religious or political groups.
* **ORG**: Companies, agencies, institutions, etc.

I am again using scikit-learn's CountVectorizer to count the entities that SpaCy finds in each tweet.

In [None]:
# Spacy NER

def spacy_ner(text, ner_tags=['GPE', 'NORP', 'ORG', 'LOC']):
    """
    Return the named-entity labels that SpaCy finds in a string.

    The text is reduced to its regexp tokens and re-joined with spaces
    before being parsed. Only labels listed in `ner_tags` are kept; pass
    `ner_tags=None` (or any empty value) to keep every label SpaCy reports.
    Intended to be used as the tokenizer of an sklearn vectorizer.
    """
    doc = nlp(' '.join(tokenizer.tokenize(text)))
    labels = [ent.label_ for ent in doc.ents]
    if not ner_tags:
        return labels
    return [label for label in labels if label in ner_tags]

In [None]:
# Making dataframes of vectorized NER tags

ner_vectorizer = CountVectorizer(tokenizer=spacy_ner)
ner_vec_train = ner_vectorizer.fit_transform(X_train['text'])
ner_vec_df_train = pd.DataFrame(
        ner_vec_train.toarray(),
        columns=ner_vectorizer.get_feature_names(),
        index=X_train.index
)

In [None]:
ner_vec_df_train.head()

### Using NER to encode the `location` column

As noted earlier, the `location` column contains a lot of user-generated nonsense.

In [None]:
print('Sample of some of the location values')
df['location'].unique()[7:17].tolist()

But some of its data may be genuine. I can use named-entity recognition to discern if an entry is referring to an actual location. 

In [None]:
# Find any NER tags in the location column

location_NER_train = X_train['location'].apply(lambda x: [ent.label_ for ent in nlp(x).ents])
location_NER_train.head()

In [None]:
# Binarize based on if the location returns a GPE tag (countries, cities, states)
# Add this new feature to the NER training set 

ner_vec_df_train['location_GPE'] = location_NER_train.apply(lambda x: int('GPE' in x))
ner_vec_df_train.head()

## Meta-feature Engineering

To engineer more features, I'm going to use seemingly arbitrary information from each tweet. Here's the set of meta-features I plan to make:

* Has URL (binary)
* Character count
* Number of stop words
* Character count of non-stop-words / total character count
* Average length of tokens
* Number of tokens
* Number of unique tokens
* Proportion of stop words
* Proportion of words that are hashtags (#)
* Proportion of words that are mentions (@)

I drew inspiration for some of these features from [this Kaggle entry](https://www.kaggle.com/code/gunesevitan/nlp-with-disaster-tweets-eda-cleaning-and-bert/notebook).

With the meta-features that are token-oriented, I have options. I have four different versions of tokenized tweets:

* tokens
* tokens without stop words
* lemmas
* lemmas without stop words

In order to pick the version of tokens that will likely be most informative to the model, I will run statistical t-tests on each of them. I'm trying to answer the question: **Which version of tokens, when used to engineer a new feature, has the biggest disparity between classes?**

#### Average length of token
Based on the results of the t-tests below, `lemmas` is the best column to use to engineer this feature. It has the largest t-statistic when the two classes are compared.

In [None]:

for column in ['tokens','tokens_no_sw', 'lemmas', 'lemmas_no_sw']:

    # Keep only tweets with at least one token; the mean length of an
    # empty token list is undefined.
    tokens = X_train[column][X_train[column].apply(lambda x: x != [])]

    # Mean token length per tweet, computed separately for each class
    average_length_0 = \
        tokens[y_train==0].apply(lambda x: np.mean([len(t) for t in x]))
    average_length_1 = \
        tokens[y_train==1].apply(lambda x: np.mean([len(t) for t in x]))

    # Two-sample t-test between the classes; the column name is padded to
    # 20 characters so the printed results line up.
    t_test = ttest_ind(average_length_0, average_length_1)
    print(f'{column:{20}}{t_test}')

#### Number of tokens
`lemmas_no_sw` is the best column to use to engineer this feature.

In [None]:
for column in ['tokens','tokens_no_sw', 'lemmas', 'lemmas_no_sw']:

    # Number of tokens per tweet, computed separately for each class
    n_tokens_0 = \
        X_train[column][y_train==0].apply(len)
    n_tokens_1 = \
        X_train[column][y_train==1].apply(len)

    # Two-sample t-test between the classes; the column name is padded to
    # 20 characters so the printed results line up.
    t_test = ttest_ind(n_tokens_0, n_tokens_1)
    print(f'{column:{20}}{t_test}')

#### Number of unique tokens
`lemmas_no_sw` is the best column to use to engineer this feature.

In [None]:
for column in ['tokens','tokens_no_sw', 'lemmas', 'lemmas_no_sw']:

    # Number of distinct tokens per tweet, computed separately for each class
    unique_tokens_0 = \
        X_train[column][y_train==0].apply(lambda x: len(set(x)))
    unique_tokens_1 = \
        X_train[column][y_train==1].apply(lambda x: len(set(x)))

    # Two-sample t-test between the classes; the column name is padded to
    # 20 characters so the printed results line up.
    t_test = ttest_ind(unique_tokens_0, unique_tokens_1)
    print(f'{column:{20}}{t_test}')

Now I will make a new DataFrame composed of the meta-features I listed above.

In [None]:
# has_url_Series already exists
# Start a DF with it

# Per-tweet quantities shared by several of the features below
char_count_train = X_train['text'].apply(len)
token_count_train = X_train['tokens'].apply(len)
sw_count_train = X_train['tokens'].apply(
    lambda toks: sum(1 for w in toks if w in stopword_list)
)

# NOTE(review): a tweet with no tokens (or no text left after URL removal)
# divides by zero below, giving NaN or inf in the proportion columns, and
# np.mean of an empty list is NaN — confirm these rows are handled before
# modelling.
meta_features_df_train = pd.DataFrame(has_url_Series_train).assign(
    # Character count. Original tweet. No URL.
    character_count=char_count_train,
    # Stop word count. Original tweet. No URL.
    sw_count=sw_count_train,
    # Character count of non-stop-words / original character count. No URL.
    non_sw_char_proportion=(
        X_train['tokens_no_sw'].apply(lambda toks: len(''.join(toks)))
        / char_count_train
    ),
    # Average lemma length. Including stop words.
    mean_lemma_length=X_train['lemmas'].apply(
        lambda lemmas: np.mean([len(l) for l in lemmas])
    ),
    # Lemma count. No stop words.
    lemma_count_no_sw=X_train['lemmas_no_sw'].apply(len),
    # Number of unique lemmas. No stop words.
    unique_lemmas_no_sw=X_train['lemmas_no_sw'].apply(
        lambda lemmas: len(set(lemmas))
    ),
    # Proportion of stop words
    sw_proportion=sw_count_train / token_count_train,
    # Proportion of hashtags
    hashtag_proportion=(
        X_train['text'].apply(lambda x: len(re.findall(r'#{1}\w+', x)))
        / token_count_train
    ),
    # Proportion of mentions
    mention_proportion=(
        X_train['text'].apply(lambda x: len(re.findall(r'@{1}\w+', x)))
        / token_count_train
    ),
)

meta_features_df_train.head()

In [None]:
# VISUALIZING
#
# 
# 
# 
# 
# 
# 
#

In [None]:
color_non = '#2c2fbf'
color_disaster ='#f14848'
kwargs_histplot = {'kde':True, 'stat':"density", 'linewidth':0, 'bins':'auto'}

In [None]:
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(15,10))
fl_ax = axes.flatten()

# One panel per meta-feature, with both classes overlaid. Each panel gets a
# title and legend so the two distributions can be told apart.
for idx, ftr in enumerate(meta_features_df_train.columns):
    sns.histplot(
        meta_features_df_train[ftr][y_train==0], ax=fl_ax[idx],
        **kwargs_histplot, color=color_non, label='Non-disaster'
    )
    sns.histplot(
        meta_features_df_train[ftr][y_train==1], ax=fl_ax[idx],
        **kwargs_histplot, color=color_disaster, label='Disaster'
    )
    fl_ax[idx].set_title(ftr)
    fl_ax[idx].legend()

# Hide any panels of the grid that have no feature to show
for unused_ax in fl_ax[len(meta_features_df_train.columns):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

# sns.countplot(meta_features_df_train['has_url'][y_train==0], ax=fl_ax[0], color=color_non)
# sns.countplot(meta_features_df_train['has_url'][y_train==1], ax=fl_ax[0], color=color_disaster)

In [None]:
# END VISUALIZING
#
#
# 
# 
# 
# 
# 
#  #

## Vectorizing the tweets

I've engineered linguistic features and meta-features. I'm going to make the actual text of the tweets interpretable by a machine learning model. I'm using scikit-learn's TF-IDF vectorizer and the lemmatized tokens of the tweets.

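This vectorizer returns a matrix with one row per tweet and one column per token, where each value is the token's TF-IDF weight: its frequency in the tweet, scaled down by how common the token is across all tweets.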

In the same step, I'm going to combine the vectors with the other features I've engineered so far into a single DataFrame. **This is the DataFrame that the model will train on.**

In [None]:
tfidf = TfidfVectorizer(
    tokenizer=spacy_lemmatize, 
    stop_words=stopword_list_lem,
    max_features=500,
    # binary=True
    )
X_train_vec = tfidf.fit_transform(X_train['text'])
X_train_vec_df = pd.DataFrame(
    X_train_vec.toarray(), columns=tfidf.get_feature_names(), index=X_train.index
    )

X_train_combined_df = pd.concat(
    [
        X_train_vec_df, 
        pos_vec_df_train, 
        ner_vec_df_train, 
        meta_features_df_train,
    ],
    axis=1
    )

# Scaling all features

scaler = StandardScaler()
X_train_combined_df_scaled = scaler.fit_transform(X_train_combined_df)
X_train_combined_df_scaled = pd.DataFrame(X_train_combined_df_scaled, index=X_train.index, columns=X_train_combined_df.columns)

X_train_combined_df_scaled.head()

#### Apply all pre-processing steps to test and validation sets.
The function below runs the `test` and `val` sets through the exact same preprocessing steps that the `train` set has undergone.

By default, the function makes use of the exact transformer objects that have been fitted on `X_train`, in order to avoid data leakage.

In [None]:
def preprocess_tweets(
    df_to_process,
    tokenizer=tokenizer,
    stop_words=stopword_list, 
    lemmatizer=spacy_lemmatize,
    stop_words_lem=stopword_list_lem,
    pos_maker=spacy_pos,
    trained_pos_vectorizer=pos_vectorizer,
    trained_ner_vectorizer=ner_vectorizer,
    trained_tfidf=tfidf,
    trained_scaler=scaler,
    return_scaled=True,
    return_cleaned=False
):
    """Build the model feature matrix for a set of tweets using already-fitted transformers.

    The text is cleaned (lower-cased, URLs removed, ``&amp;`` replaced by ``and``),
    tokenized and lemmatized. POS, NER, meta and TF-IDF features are then built and
    concatenated column-wise in the order TF-IDF, POS, NER, meta-features.

    The default arguments are evaluated once, when this ``def`` is executed, so this
    cell must be re-run after re-fitting any of the default transformers.

    Parameters
    ----------
    df_to_process : pandas.DataFrame
        Must contain ``'text'`` and ``'location'`` columns. It is copied, not modified.
    tokenizer : object
        Provides ``tokenize(str)``; used to build the ``tokens`` column.
    stop_words : collection of str
        Stop words filtered out of ``tokens``.
    lemmatizer : callable
        Maps a cleaned tweet string to an iterable of lemma strings.
    stop_words_lem : collection of str
        Stop words filtered out of ``lemmas``.
    pos_maker : callable
        Applied to each cleaned tweet to fill the ``text_as_pos`` column.
    trained_pos_vectorizer, trained_ner_vectorizer, trained_tfidf : fitted vectorizers
        Only ``transform`` and ``get_feature_names`` are called on them.
    trained_scaler : fitted scaler
        Only ``transform`` is called, and only when ``return_scaled`` is True.
    return_scaled : bool, default True
        Return the scaled feature matrix instead of the unscaled one.
    return_cleaned : bool, default False
        Also return the cleaned tweet DataFrame.

    Returns
    -------
    pandas.DataFrame or dict
        The (scaled or unscaled) feature DataFrame, indexed like ``df_to_process``.
        When ``return_cleaned`` is True, a dict ``{'processed': features, 'cleaned': tweet_df}``.
    """
    index = df_to_process.index
    tweet_df = df_to_process.copy()

    # --- Text cleaning -------------------------------------------------------
    tweet_df['location'] = tweet_df['location'].fillna('location_missing')

    tweet_df['text'] = tweet_df['text'].apply(lambda x: x.lower())
    # `binary_url` must see the text before the URLs are stripped on the next step.
    has_url_Series = tweet_df['text'].apply(binary_url).rename('has_url')
    tweet_df['has_url'] = has_url_Series
    tweet_df['text'] = \
        tweet_df['text'].apply(lambda x: re.sub(pattern=r'http\S+', repl='', string=x))

    tweet_df['text'] = tweet_df['text'].apply(lambda x: re.sub("&amp;", "and", x))

    # --- Tokens and lemmas ---------------------------------------------------
    tweet_df['tokens'] = tweet_df['text'].apply(tokenizer.tokenize)
    tweet_df['tokens_no_sw'] = \
        tweet_df['tokens'].apply(lambda x: [w for w in x if w not in stop_words])

    tweet_df['lemmas'] = tweet_df['text'].apply(lemmatizer)
    tweet_df['lemmas_no_sw'] = \
        tweet_df['lemmas'].apply(lambda x: [l for l in x if l not in stop_words_lem])

    # --- Part-of-speech features ---------------------------------------------
    tweet_df['text_as_pos'] = tweet_df['text'].apply(pos_maker)
    # NOTE(review): the POS vectorizer is fed the cleaned `text` column, not
    # `text_as_pos`; presumably the fitted vectorizer does its own tagging --
    # confirm this matches how it was fitted on the training set.
    pos_vec = trained_pos_vectorizer.transform(tweet_df['text'])
    pos_vec_df = pd.DataFrame(
        pos_vec.toarray(), 
        columns=trained_pos_vectorizer.get_feature_names(),
        index=index
    )
    # Row-normalised tag counts (each count divided by the tweet's total count).
    pos_vec_df_norm = pos_vec_df.div(pos_vec_df.sum(axis=1), axis=0)
    pos_vec_df_norm.columns = pos_vec_df.columns + '_norm'
    interesting_tags = ['ADP_norm', 'ADV_norm', 'AUX_norm', 'NOUN_norm', 'PRON_norm', 'PROPN_norm']
    pos_vec_df = pos_vec_df.join(pos_vec_df_norm[interesting_tags])

    # --- Named-entity features -----------------------------------------------
    ner_vec = trained_ner_vectorizer.transform(tweet_df['text'])
    ner_vec_df = pd.DataFrame(
        ner_vec.toarray(),
        columns=trained_ner_vectorizer.get_feature_names(),
        index=index
    )
    # 1 when spaCy finds at least one GPE entity in the location string, else 0.
    ner_vec_df['location_GPE'] = tweet_df['location'].apply(lambda x: int('GPE' in [ent.label_ for ent in nlp(x).ents]))

    # --- Meta-features -------------------------------------------------------
    # Counts shared by several features below; computed once.
    # NOTE(review): tweets with zero tokens/characters/lemmas produce NaN or inf in
    # the ratio features. Left untouched here on the assumption that the
    # training-set features were built the same way -- confirm.
    char_counts = tweet_df['text'].apply(len)
    token_counts = tweet_df['tokens'].apply(len)
    sw_counts = tweet_df['tokens'].apply(lambda x: len([w for w in x if w in stop_words]))

    meta_features_df = pd.DataFrame(has_url_Series)
    # Character count. Original tweet. No URL.
    meta_features_df['character_count'] = char_counts
    # Stop word count. Original tweet. No URL.
    meta_features_df['sw_count'] = sw_counts
    # Character count of non-stop-words / original character count. No URL.
    meta_features_df['non_sw_char_proportion'] = \
        tweet_df['tokens_no_sw'].apply(lambda x: len(''.join(x))) / char_counts
    # Average lemma length. Including stop words.
    meta_features_df['mean_lemma_length'] = \
        tweet_df['lemmas'].apply(lambda x: np.mean([len(l) for l in x]))
    # Lemma count. No stop words.
    meta_features_df['lemma_count_no_sw'] = \
        tweet_df['lemmas_no_sw'].apply(len)
    # Number of unique lemmas. No stop words.
    meta_features_df['unique_lemmas_no_sw'] = \
        tweet_df['lemmas_no_sw'].apply(lambda x: len(set(x)))
    # Proportion of stop words
    meta_features_df['sw_proportion'] = sw_counts / token_counts
    # Proportion of hashtags
    meta_features_df['hashtag_proportion'] = \
        tweet_df['text'].apply(lambda x: len(re.findall(r'#{1}\w+', x))) / token_counts
    # Proportion of mentions
    meta_features_df['mention_proportion'] = \
        tweet_df['text'].apply(lambda x: len(re.findall(r'@{1}\w+', x))) / token_counts

    # --- TF-IDF and final assembly -------------------------------------------
    tweets_vec = trained_tfidf.transform(tweet_df['text'])
    tweets_vec_df = pd.DataFrame(
        tweets_vec.toarray(), columns=trained_tfidf.get_feature_names(), index=index
    )
    tweets_combined_df = pd.concat(
        [
            tweets_vec_df,
            pos_vec_df,
            ner_vec_df,
            meta_features_df
        ],
        axis=1
    )

    if return_scaled:
        # Scale only when the caller asked for it; the unscaled path skips this work.
        processed = pd.DataFrame(
            trained_scaler.transform(tweets_combined_df),
            index=index,
            columns=tweets_combined_df.columns
        )
    else:
        processed = tweets_combined_df

    if return_cleaned:
        return {'processed':processed, 'cleaned':tweet_df}
    return processed

In [None]:
# Run the held-out splits through the same fitted pipeline as the training data
# (test first, then validation).
X_test_combined_df_scaled, X_val_combined_df_scaled = (
    preprocess_tweets(X_test),
    preprocess_tweets(X_val),
)

In [None]:
###########################################################
###########################################################

In [None]:
# Visualizing NN

def plot_nn_curves(model_history):
    """Plot training and validation curves for six metrics of a fitted Keras model.

    Draws a 3x2 grid with one subplot each for loss, accuracy, precision, recall,
    AUC and F1. History keys are matched by substring, so ``'f1'`` also finds
    ``'get_f1'`` / ``'val_get_f1'``. Validation series are identified by Keras'
    ``val_`` key prefix; a series that is missing from the history (for example
    when the model was fitted without validation data) is simply not drawn.

    Parameters
    ----------
    model_history : object
        Must expose a ``history`` dict mapping metric names to per-epoch values,
        as returned by ``model.fit``.

    Returns
    -------
    None
    """
    history = model_history.history
    metrics = ['loss', 'accuracy', 'precision', 'recall', 'auc', 'f1']

    fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(18,12))
    for ax, metric in zip(axes.flatten(), metrics):
        matches = [key for key in history if metric in key]
        # Select by prefix rather than by position so the two curves can never be
        # two training series or be swapped.
        train_key = next((key for key in matches if not key.startswith('val_')), None)
        val_key = next((key for key in matches if key.startswith('val_')), None)

        if train_key is not None:
            ax.plot(history[train_key], label=metric)
        if val_key is not None:
            ax.plot(history[val_key], label=metric+'_val')

        ax.set_xlabel('epochs')
        ax.set_ylabel(metric)
        ax.set_title(f'{metric.upper()} Evaluation')
        if train_key is not None or val_key is not None:
            ax.legend()

    # Lay the figure out once, after every subplot has been drawn.
    plt.tight_layout()

In [None]:
# Number of feature columns in the scaled training matrix.
X_train_combined_df_scaled.shape[1]

In [None]:
# Feature count, used as the input dimension of the first layer of each network below.
n_input = X_train_combined_df_scaled.shape[1]

In [None]:
## Making an f1 scorer for Keras
## https://aakashgoel12.medium.com/how-to-add-user-defined-function-get-f1-score-in-keras-metrics-3013f979ce0d#:~:text=By%20default%2C%20f1%20score%20is,like%20accuracy%2C%20categorical%20accuracy%20etc.

import keras.backend as K

def get_f1(y_true, y_pred): #taken from old keras source code
    """Return the F1 score of ``y_pred`` against ``y_true`` as a Keras backend tensor.

    Inputs are clipped to [0, 1] and rounded before counting, and ``K.epsilon()``
    is added to every denominator so that none of the divisions can be by zero.
    """
    eps = K.epsilon()

    def _count(tensor):
        # Clip to [0, 1], round to 0/1, then sum the ones.
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    hits = _count(y_true * y_pred)
    actual_pos = _count(y_true)
    predicted_pos = _count(y_pred)

    prec = hits / (predicted_pos + eps)
    rec = hits / (actual_pos + eps)
    return 2*(prec*rec)/(prec+rec+eps)

# Metric objects handed to `model.compile(metrics=[...])` by the networks below.
# NOTE(review): the same instances are passed to both compile() calls -- confirm
# that sharing metric objects between the two models is intended.
precision = Precision()
recall = Recall()
auc = AUC()
f1 = get_f1 ## custom F1 function from this cell, used as a function-style metric

In [None]:
# Baseline network: three 100-unit ReLU hidden layers feeding one sigmoid output unit.
model = models.Sequential([
    layers.Dense(units=100, activation='relu', input_shape=(n_input,)),
    layers.Dense(units=100, activation='relu'),
    layers.Dense(units=100, activation='relu'),
    layers.Dense(units=1, activation='sigmoid'),
])

model.compile(
    optimizer='SGD',
    loss='binary_crossentropy',
    metrics=['accuracy', precision, recall, auc, f1],
)

# Fit on the scaled training features; the validation split is only monitored.
model_val = model.fit(
    x=np.array(X_train_combined_df_scaled),
    y=np.array(y_train),
    batch_size=32,
    epochs=100,
    validation_data=(X_val_combined_df_scaled, y_val),
)

In [None]:
# Training vs. validation curves for the fit in the previous cell.
plot_nn_curves(model_val)

In [None]:
# Deeper, regularised network: five 200-unit LeakyReLU hidden layers, each with an
# L2 kernel penalty and followed by 20% dropout, then one sigmoid output unit.
model = models.Sequential([
    layers.Dense(
        units=200,
        activation=LeakyReLU(0.005),
        input_shape=(n_input,),
        kernel_regularizer=regularizers.l2(0.005),
    ),
    layers.Dropout(0.2),
    layers.Dense(units=200, activation=LeakyReLU(0.005), kernel_regularizer=regularizers.l2(0.005)),
    layers.Dropout(0.2),
    layers.Dense(units=200, activation=LeakyReLU(0.005), kernel_regularizer=regularizers.l2(0.005)),
    layers.Dropout(0.2),
    layers.Dense(units=200, activation=LeakyReLU(0.005), kernel_regularizer=regularizers.l2(0.005)),
    layers.Dropout(0.2),
    layers.Dense(units=200, activation=LeakyReLU(0.005), kernel_regularizer=regularizers.l2(0.005)),
    layers.Dropout(0.2),
    layers.Dense(units=1, activation='sigmoid'),
])

model.compile(
    optimizer='SGD',
    loss='binary_crossentropy',
    metrics=['accuracy', precision, recall, auc, f1],
)

# Early stopping is currently disabled:
# early_stopping = [
#     EarlyStopping(monitor='val_loss', patience=15)
#     ]

model_val = model.fit(
    x=np.array(X_train_combined_df_scaled),
    y=np.array(y_train),
    batch_size=32,
    epochs=100,
    validation_data=(X_val_combined_df_scaled, y_val)
    # callbacks=early_stopping
)

In [None]:
# Training vs. validation curves for the regularised network fitted in the previous cell.
plot_nn_curves(model_val)

In [None]:
    model.summary()

In [None]:
##############################################
##############################################

In [None]:
from sklearn.metrics import \
    accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, classification_report, plot_confusion_matrix, confusion_matrix, ConfusionMatrixDisplay, plot_confusion_matrix

In [None]:
# Extra tweets loaded from a side file (presumably hand-written examples, per the
# file name -- confirm), run through the same fitted preprocessing pipeline.
made_up_tweets = pd.read_csv('./stuff_to_ignore/made_up_tweets.csv')
made_up_tweets_processed = preprocess_tweets(made_up_tweets)

In [None]:
# Predicted probabilities, flattened to one value per tweet. Inference is run once
# and the class labels are derived from the same probabilities (threshold 0.5).
probs = pd.Series(model.predict(made_up_tweets_processed).reshape(1,-1)[0]).rename('probs')
preds = probs.apply(lambda x: int(x >= 0.5)).rename('preds')

In [None]:
# Recall of the thresholded predictions against the labels of the made-up tweets.
recall_score(made_up_tweets['target'], preds)

In [None]:
# Per-class precision / recall / F1 for the same predictions.
print(classification_report(made_up_tweets['target'], preds))

In [None]:
# Confusion matrix for the same predictions (rows: true label, columns: predicted label).
confusion_matrix(made_up_tweets['target'], preds)

In [None]:
# Re-run preprocessing to recover the cleaned text columns, then attach the
# predicted labels and probabilities (joined on the index).
with_preds = (
    preprocess_tweets(made_up_tweets, return_scaled=False, return_cleaned=True)['cleaned']
    .loc[:, ['location', 'text', 'target']]
    .join([preds, probs])
)

In [None]:
# preds - target: -1 = predicted 0 for a target of 1 (labelled 'NEGATIVE', i.e. a false negative),
#  1 = predicted 1 for a target of 0 (labelled 'POSITIVE', i.e. a false positive), 0 = correct ('true').
with_preds['outcome'] = (with_preds['preds'] - with_preds['target']).map({-1:'NEGATIVE', 1:'POSITIVE', 0:'true'})

In [None]:
# with_preds

In [None]:
# Score the most recently fitted network on the test split; the result is the loss
# followed by the metrics given to compile().
model.evaluate(X_test_combined_df_scaled, y_test)

In [None]:
################

In [None]:
# Total time:
t = time.time() - start_time
# Round to whole seconds *before* splitting into minutes and seconds; formatting
# `t % 60` with `.0f` could otherwise print "60 seconds" (e.g. for t = 59.5).
minutes, seconds = divmod(round(t), 60)
print(f'Notebook run time: {minutes:.0f} minutes and {seconds:.0f} seconds')

In [None]:
####################