In [75]:
import numpy as np
import pandas as pd
from nltk import FreqDist
from nltk.tokenize import \
    regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.metrics import plot_confusion_matrix
import re
from collections import OrderedDict, defaultdict, Counter
import itertools
import string
import matplotlib.pyplot as plt
%matplotlib inline

pd.options.display.max_colwidth = 150
seed = 7

In [76]:
# Loading dataset

In [77]:
df = pd.read_csv('./data/disaster_tweets/train.csv')
df.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1
5,8,,,#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires,1
6,10,,,"#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas",1
7,13,,,I'm on top of the hill and I can see a fire in the woods...,1
8,14,,,There's an emergency evacuation happening now in the building across the street,1
9,15,,,I'm afraid that the tornado is coming to our area...,1


In [78]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [79]:
df.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [80]:
# Keyword and location columns

In [81]:
# Location column doesn't seem to have usable information. In some cases it's nonsense.

In [82]:
print('Sample of some of the location values')
df['location'].unique()[7:17].tolist()

Sample of some of the location values


['World Wide!!',
 'Paranaque City',
 'Live On Webcam',
 'milky way',
 'GREENSBORO,NORTH CAROLINA',
 'England.',
 'Sheffield Township, Ohio',
 'India',
 'Barbados',
 'Anaheim']

In [83]:
Counter(df['keyword']).most_common(20)

[(nan, 61),
 ('fatalities', 45),
 ('armageddon', 42),
 ('deluge', 42),
 ('body%20bags', 41),
 ('damage', 41),
 ('harm', 41),
 ('sinking', 41),
 ('collided', 40),
 ('evacuate', 40),
 ('fear', 40),
 ('outbreak', 40),
 ('siren', 40),
 ('twister', 40),
 ('windstorm', 40),
 ('collision', 39),
 ('derailment', 39),
 ('earthquake', 39),
 ('explosion', 39),
 ('famine', 39)]

In [84]:
# Features and Target

In [85]:
tweets = df[['text']]
target = df['target']

In [86]:
print('Distribution of Target')
print
print(target.value_counts())
print()
print(target.value_counts(normalize=True))

Distribution of Target
0    4342
1    3271
Name: target, dtype: int64

0    0.57034
1    0.42966
Name: target, dtype: float64


In [87]:
# Train Test Split

In [88]:
X_train, X_test, y_train, y_test = train_test_split(
    tweets, target, test_size=.25, random_state=seed
)

In [89]:
# Cleaning and tokenizing the tweet text

In [90]:
# Saving a copy of untouched tweets
X_train_tweets_unprocessed = X_train.copy()['text'] 

In [91]:
X_train['text'] = X_train['text'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['text'] = X_train['text'].apply(lambda x: x.lower())


In [92]:
X_train.head(10)

Unnamed: 0,text
1489,@masochisticmage + catastrophe! it caused people to get reckless and the bottom line is that at least three of your friends will have +
5973,#nochilllukehammings\nim screaming
7589,omg earthquake
3788,it's never a good sign when you pull up to work &amp; there's five ambulances &amp; a fire truck in the bay. wompppp at least it's friday
825,my mic and controllers aren't working one second
4569,my baby girls car wreak this afternoon thank god no serious injuries and she was wearing her seatbelt!!!... http://t.co/njqv45nds2
6371,look at the previous battles. citizens were committing suicide so to not be under american control. the bomb was the only way. @nbcnews
4648,@mistresspip i'm amazed you have not been inundated mistress.
1812,maj muzzamil pilot offr of mi-17 crashed near mansehra today. http://t.co/kl4r1ccwct
492,christian attacked by muslims at the temple mount after waving israeli flag via pamela geller - ... http://t.co/f5miuhqaby


In [93]:
# Remove URLs

In [94]:
# Preserve if tweet has a URL. Will use this later.

def binary_url(text):
    """
    Returns 1 if a string contains a URL, else returns 0.
    """
    search = re.search(pattern=r'http\S+', string=text)
    return int(bool(search))

has_url_Series_train = X_train['text'].apply(binary_url).rename('has_url')

In [95]:
X_train['text'] = X_train['text'].apply(
    lambda x: re.sub(pattern=r'http\S+', repl='', string=x)
    )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['text'] = X_train['text'].apply(


In [96]:
# Tokenizing

In [97]:
example_tweet = X_train['text'].loc[2372]
example_tweet

"such activities of govt can't derail us from our aim &amp; we still remain peaceful and unite for #freesikhpoliticalprisnors &amp; @bapusuratsingh"

In [98]:
X_train['text'].sample(5)

2159                                    walmart is taking steps to keep children safe in hot vehicles. take a look at the innovative car seat here! 
2162    @mayoroflondon pls reduce cyclist deaths with a compulsory highway code test as with every other vehicle that uses a road. #notrocketscience
6532                                                         update: i survived. no canoe. \n\nmay have been the fastest feed out in history though.
255                                                                                                         are souls punished withåêannihilation?  
159                               experts in france begin examining airplane debris found on reunion island: french air accident experts on wedn... 
Name: text, dtype: object

In [99]:
token_pattern = r"[a-zA-Z]+'?[a-zA-Z]+"
# Pattern: Any alphanumeric word with at least two characters, including up to one apostrophy

tokenizer = RegexpTokenizer(token_pattern)

In [100]:
tokenizer.tokenize(example_tweet)

['such',
 'activities',
 'of',
 'govt',
 "can't",
 'derail',
 'us',
 'from',
 'our',
 'aim',
 'amp',
 'we',
 'still',
 'remain',
 'peaceful',
 'and',
 'unite',
 'for',
 'freesikhpoliticalprisnors',
 'amp',
 'bapusuratsingh']

In [101]:
# seems to work
# But "amp" is an artifact of "&amp;" which is the HTML entity for "&"
# Replace it with "and" in original text
# Then tokenize

In [102]:
X_train['text'] = X_train['text'].apply(lambda x: re.sub("&amp;", "and", x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['text'] = X_train['text'].apply(lambda x: re.sub("&amp;", "and", x))


In [103]:
X_train['tokens'] = X_train['text'].apply(tokenizer.tokenize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['tokens'] = X_train['text'].apply(tokenizer.tokenize)


In [104]:
# Remove stopwords
# Get comprehensive list
# Combine stopwords from two libraries: NLTK and SpaCy

In [105]:
sw_spacy = list(nlp.Defaults.stop_words)
sw_nltk = stopwords.words('english')
stopword_list = list(set(sw_spacy + sw_nltk))

In [106]:
X_train['tokens_no_sw'] = \
    X_train['tokens'].apply(
        lambda x: [w for w in x if not w in stopword_list]
    )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['tokens_no_sw'] = \


In [107]:
# Check most common words

In [108]:
FreqDist(X_train['tokens_no_sw'].explode()).most_common(20)

[('like', 264),
 ('fire', 200),
 ("i'm", 184),
 ('new', 176),
 ('news', 164),
 ('people', 151),
 ('video', 119),
 ('disaster', 115),
 ('police', 110),
 ('emergency', 108),
 ('time', 101),
 ('body', 95),
 ('suicide', 90),
 ('california', 90),
 ('storm', 90),
 ('burning', 89),
 ('rt', 88),
 ('crash', 87),
 ('world', 84),
 ('man', 83)]

In [109]:
# "i'm" should also be considered a stop word

In [110]:
stopword_list.extend(["i'm"])

In [111]:
X_train['tokens_no_sw'] = \
    X_train['tokens'].apply(
        lambda x: [w for w in x if not w in stopword_list]
    )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['tokens_no_sw'] = \


In [112]:
top_20_no_sw = FreqDist(X_train['tokens_no_sw'].explode()).most_common(20)
top_20_no_sw

[('like', 264),
 ('fire', 200),
 ('new', 176),
 ('news', 164),
 ('people', 151),
 ('video', 119),
 ('disaster', 115),
 ('police', 110),
 ('emergency', 108),
 ('time', 101),
 ('body', 95),
 ('suicide', 90),
 ('california', 90),
 ('storm', 90),
 ('burning', 89),
 ('rt', 88),
 ('crash', 87),
 ('world', 84),
 ('man', 83),
 ('bomb', 82)]

In [113]:
X_train.head()

Unnamed: 0,text,tokens,tokens_no_sw
1489,@masochisticmage + catastrophe! it caused people to get reckless and the bottom line is that at least three of your friends will have +,"[masochisticmage, catastrophe, it, caused, people, to, get, reckless, and, the, bottom, line, is, that, at, least, three, of, your, friends, will,...","[masochisticmage, catastrophe, caused, people, reckless, line, friends]"
5973,#nochilllukehammings\nim screaming,"[nochilllukehammings, im, screaming]","[nochilllukehammings, im, screaming]"
7589,omg earthquake,"[omg, earthquake]","[omg, earthquake]"
3788,it's never a good sign when you pull up to work and there's five ambulances and a fire truck in the bay. wompppp at least it's friday,"[it's, never, good, sign, when, you, pull, up, to, work, and, there's, five, ambulances, and, fire, truck, in, the, bay, wompppp, at, least, it's,...","[good, sign, pull, work, there's, ambulances, fire, truck, bay, wompppp, friday]"
825,my mic and controllers aren't working one second,"[my, mic, and, controllers, aren't, working, one, second]","[mic, controllers, working, second]"


In [114]:
# Linguistic Feature engineering 

In [115]:
# Using SpaCy to lemmatize our text
# Using SpaCy to count NER tags and POS tags
# NER tags I'm interested in:
## GPE, LOC, NORP, FAC, EVENT, ORG

In [116]:
# Lemmatize

def spacy_lemmatize(tokens):
    doc = nlp(' '.join(tokens))
    return [t.lemma_ for t in doc]

X_train['lemmas'] = X_train['tokens'].apply(spacy_lemmatize)
X_train['lemmas_no_sw'] = X_train['lemmas'].apply(
    lambda x: [l for l in x if l not in stopword_list]
    )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['lemmas'] = X_train['tokens'].apply(spacy_lemmatize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['lemmas_no_sw'] = X_train['lemmas'].apply(


In [117]:
X_train.head()

Unnamed: 0,text,tokens,tokens_no_sw,lemmas,lemmas_no_sw
1489,@masochisticmage + catastrophe! it caused people to get reckless and the bottom line is that at least three of your friends will have +,"[masochisticmage, catastrophe, it, caused, people, to, get, reckless, and, the, bottom, line, is, that, at, least, three, of, your, friends, will,...","[masochisticmage, catastrophe, caused, people, reckless, line, friends]","[masochisticmage, catastrophe, it, cause, people, to, get, reckless, and, the, bottom, line, be, that, at, least, three, of, your, friend, will, h...","[masochisticmage, catastrophe, cause, people, reckless, line, friend]"
5973,#nochilllukehammings\nim screaming,"[nochilllukehammings, im, screaming]","[nochilllukehammings, im, screaming]","[nochilllukehamming, I, m, scream]","[nochilllukehamming, I, scream]"
7589,omg earthquake,"[omg, earthquake]","[omg, earthquake]","[omg, earthquake]","[omg, earthquake]"
3788,it's never a good sign when you pull up to work and there's five ambulances and a fire truck in the bay. wompppp at least it's friday,"[it's, never, good, sign, when, you, pull, up, to, work, and, there's, five, ambulances, and, fire, truck, in, the, bay, wompppp, at, least, it's,...","[good, sign, pull, work, there's, ambulances, fire, truck, bay, wompppp, friday]","[it, be, never, good, sign, when, you, pull, up, to, work, and, there, be, five, ambulance, and, fire, truck, in, the, bay, wompppp, at, least, it...","[good, sign, pull, work, ambulance, fire, truck, bay, wompppp, friday]"
825,my mic and controllers aren't working one second,"[my, mic, and, controllers, aren't, working, one, second]","[mic, controllers, working, second]","[my, mic, and, controller, be, not, work, one, second]","[mic, controller, work, second]"


In [118]:
# Spacy POS

def helper_untokenize(token_list):
    """
    Helper function.
    Takes in a list of tokens and combines them as a string.
    Instead of having to repeatedly type `lambda x: ' '.join()`
    """
    return ' '.join(token_list)

def helper_spacy_pos(text):
    """
    Helper function.
    Takes in a string and returns a list of part-of-speech tokens.
    """
    doc = nlp(text)
    pos_tokens = [t.pos_ for t in doc]
    return pos_tokens

def make_pos_tokens_df(text_column):
    """
    Takes in a Series of texts.
    Uses SpaCy to turn the text into part-of-speech tokens.
    Uses sklearn's CountVectorizer to count POS tags for each text. 
    """
    cvec = CountVectorizer(tokenizer=helper_spacy_pos)
    pos_vectorized = cvec.fit_transform(text_column)
    pos_vectorized_df = pd.DataFrame(
        pos_vectorized.toarray(),
        columns=cvec.get_feature_names(),
        index=text_column.index
        )
    return pos_vectorized_df

In [119]:
pos_vectorized_df_train = make_pos_tokens_df(X_train['tokens'].apply(helper_untokenize))
pos_vectorized_df_train.head()

Unnamed: 0,ADJ,ADP,ADV,AUX,CCONJ,DET,INTJ,NOUN,NUM,PART,PRON,PROPN,PUNCT,SCONJ,VERB,X
1489,2,1,2,2,1,1,0,5,1,1,2,0,0,1,3,0
5973,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0
7589,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
3788,2,4,1,2,2,1,0,6,1,0,4,2,0,1,2,0
825,0,0,0,1,1,0,0,3,1,1,1,0,0,0,1,0


In [120]:
# Spacy NER

def helper_spacy_ner(
    text, 
    ner_tags=['GPE', 'LOC', 'NORP', 'EVENT', 'ORG', 'FAC']
    ):
    """
    Helper function.
    Takes in a string and returns a list of named-entity recognition tags.
    Also takes in a specific list of NER tags to look for.
    To look for all NER tags supported by SpaCy, set `ner_tags=None`.
    """
    doc = nlp(text)
    ents = doc.ents
    if ner_tags:
        tags = [ent.label_ for ent in doc.ents if ent.label_ in ner_tags]
        return tags
    else:
        tags = [ent.label_ for ent in doc.ents]
        return tags

def make_ner_tokens_df(text_column):
    """
    Takes in a Series of texts.
    Uses SpaCy to turn the text a list of named entity recognition tags.
    Uses sklearn's CountVectorizer to count NER tags for each text.
    """
    cvec = CountVectorizer(tokenizer=helper_spacy_ner)
    ner_vectorized = cvec.fit_transform(text_column)
    ner_vectorized_df = pd.DataFrame(
        ner_vectorized.toarray(),
        columns=cvec.get_feature_names(),
        index=text_column.index
    )
    return ner_vectorized_df

In [121]:
ner_vectorized_df_train = make_ner_tokens_df(X_train['tokens'].apply(helper_untokenize))
ner_vectorized_df_train.head()

Unnamed: 0,EVENT,FAC,GPE,LOC,NORP,ORG
1489,0,0,0,0,0,0
5973,0,0,0,0,0,0
7589,0,0,0,0,0,0
3788,0,0,0,1,0,0
825,0,0,0,0,0,0


In [48]:
# Meta Feature engineering

# Has URL
# Character count
# Length of tokens
# Number of tokens
# Number of unique tokens
# hashtags? menntions?

In [167]:
# Length of tokens: Which version of tokens have the biggest disparity between classes?

In [123]:
from scipy.stats import ttest_ind

In [162]:
# Average length of token

for column in ['tokens','tokens_no_sw', 'lemmas', 'lemmas_no_sw']:
    average_length_0 = \
        X_train[column][y_train==0].apply(lambda x: np.mean([len(t) for t in x])).dropna()
    average_length_1 = \
        X_train[column][y_train==1].apply(lambda x: np.mean([len(t) for t in x])).dropna()
    t_test = ttest_ind(average_length_0, average_length_1)
    print(f'{column:{20}}{t_test}')

tokens              Ttest_indResult(statistic=-10.888796396358687, pvalue=2.4174752089914336e-27)
tokens_no_sw        Ttest_indResult(statistic=-7.252232336056516, pvalue=4.645366502067098e-13)
lemmas              Ttest_indResult(statistic=-11.611871549172264, pvalue=7.948479235164981e-31)
lemmas_no_sw        Ttest_indResult(statistic=-8.02385912841762, pvalue=1.2340040655766092e-15)


  return _methods._mean(a, axis=axis, dtype=dtype,


In [164]:
# lemma with stopwords

In [163]:
# Number of tokens

for column in ['tokens','tokens_no_sw', 'lemmas', 'lemmas_no_sw']:
    average_length_0 = \
        X_train[column][y_train==0].apply(len).dropna()
    average_length_1 = \
        X_train[column][y_train==1].apply(len).dropna()
    t_test = ttest_ind(average_length_0, average_length_1)
    print(f'{column:{20}}{t_test}')

tokens              Ttest_indResult(statistic=-1.678955081768617, pvalue=0.09321554140560237)
tokens_no_sw        Ttest_indResult(statistic=-11.994417135481633, pvalue=9.38714082821458e-33)
lemmas              Ttest_indResult(statistic=-0.664126802322163, pvalue=0.5066359869017423)
lemmas_no_sw        Ttest_indResult(statistic=-11.913837989508382, pvalue=2.418054692616517e-32)


In [165]:
# Tokens no stopwords

In [166]:
for column in ['tokens','tokens_no_sw', 'lemmas', 'lemmas_no_sw']:
    average_length_0 = \
        X_train[column][y_train==0].apply(lambda x: len(set(x))).dropna()
    average_length_1 = \
        X_train[column][y_train==1].apply(lambda x: len(set(x))).dropna()
    t_test = ttest_ind(average_length_0, average_length_1)
    print(f'{column:{20}}{t_test}')

tokens              Ttest_indResult(statistic=-1.8296863997564934, pvalue=0.06734898548241941)
tokens_no_sw        Ttest_indResult(statistic=-11.408711773799919, pvalue=7.946716409714468e-30)
lemmas              Ttest_indResult(statistic=-1.1392513127629718, pvalue=0.254646164360224)
lemmas_no_sw        Ttest_indResult(statistic=-11.392836921104399, pvalue=9.497750995196684e-30)


In [None]:
# Tokens no stopwords

In [None]:
# has_url_Series already exists
# Start a DF with it

engineered_features_df_train = pd.DataFrame(has_url_Series_train)

# Character count. Original tweet. No URL.
engineered_features_df_train['character_count'] = \
    X_train['text'].apply(len)

# Token count. No stop words.
engineered_features_df_train['token_count'] = \
    X_train['tokens_no_sw'].apply(len)

# Number of unique lemma
engineered_features_df_train['unique_tokens'] = \
    X_train['tokens'].apply(lambda x: len(set(x)))

# Number of lemmas
# Unique lemmas
# Average lemma length

# Average token length. Including stop words.
engineered_features_df_train['mean_token_length'] = \
    X_train['tokens'].apply(lambda x: np.mean([len(t) for t in x]))