In [1]:
import numpy as np
import pandas as pd
from nltk import FreqDist
from nltk.tokenize import \
    regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.metrics import plot_confusion_matrix
import re
from collections import OrderedDict, defaultdict, Counter
import itertools
import string
import matplotlib.pyplot as plt
%matplotlib inline

pd.options.display.max_colwidth = 150
seed = 7

In [2]:
# Loading dataset

In [3]:
df = pd.read_csv('./data/disaster_tweets/train.csv')
df.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1
5,8,,,#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires,1
6,10,,,"#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas",1
7,13,,,I'm on top of the hill and I can see a fire in the woods...,1
8,14,,,There's an emergency evacuation happening now in the building across the street,1
9,15,,,I'm afraid that the tornado is coming to our area...,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [5]:
df.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [6]:
# Keyword and location columns

In [7]:
# Location column doesn't seem to have usable information. In some cases it's nonsense.

In [8]:
print('Sample of some of the location values')
df['location'].unique()[7:17].tolist()

Sample of some of the location values


['World Wide!!',
 'Paranaque City',
 'Live On Webcam',
 'milky way',
 'GREENSBORO,NORTH CAROLINA',
 'England.',
 'Sheffield Township, Ohio',
 'India',
 'Barbados',
 'Anaheim']

In [9]:
Counter(df['keyword']).most_common(20)

[(nan, 61),
 ('fatalities', 45),
 ('armageddon', 42),
 ('deluge', 42),
 ('body%20bags', 41),
 ('damage', 41),
 ('harm', 41),
 ('sinking', 41),
 ('collided', 40),
 ('evacuate', 40),
 ('fear', 40),
 ('outbreak', 40),
 ('siren', 40),
 ('twister', 40),
 ('windstorm', 40),
 ('collision', 39),
 ('derailment', 39),
 ('earthquake', 39),
 ('explosion', 39),
 ('famine', 39)]

In [10]:
# Features and Target

In [11]:
tweets = df[['text']]
target = df['target']

In [12]:
print('Distribution of Target')
print
print(target.value_counts())
print()
print(target.value_counts(normalize=True))

Distribution of Target
0    4342
1    3271
Name: target, dtype: int64

0    0.57034
1    0.42966
Name: target, dtype: float64


In [13]:
# Train Test Split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    tweets, target, test_size=.25, random_state=seed
)

In [15]:
# Cleaning and tokenizing the tweet text

In [16]:
# Saving a copy of untouched tweets
X_train_tweets_unprocessed = X_train.copy()['text']

In [17]:
X_train['text'] = X_train['text'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['text'] = X_train['text'].apply(lambda x: x.lower())


In [18]:
X_train.head(10)

Unnamed: 0,text
1489,@masochisticmage + catastrophe! it caused people to get reckless and the bottom line is that at least three of your friends will have +
5973,#nochilllukehammings\nim screaming
7589,omg earthquake
3788,it's never a good sign when you pull up to work &amp; there's five ambulances &amp; a fire truck in the bay. wompppp at least it's friday
825,my mic and controllers aren't working one second
4569,my baby girls car wreak this afternoon thank god no serious injuries and she was wearing her seatbelt!!!... http://t.co/njqv45nds2
6371,look at the previous battles. citizens were committing suicide so to not be under american control. the bomb was the only way. @nbcnews
4648,@mistresspip i'm amazed you have not been inundated mistress.
1812,maj muzzamil pilot offr of mi-17 crashed near mansehra today. http://t.co/kl4r1ccwct
492,christian attacked by muslims at the temple mount after waving israeli flag via pamela geller - ... http://t.co/f5miuhqaby


In [19]:
# Remove URLs

In [20]:
# Preserve if tweet has a URL. Will use this later.

def binary_url(text):
    """
    Returns 1 if a string contains a URL, else returns 0.
    """
    search = re.search(pattern=r'http\S+', string=text)
    return int(bool(search))

has_url_Series = X_train['text'].apply(binary_url)

In [21]:
X_train['text'] = X_train['text'].apply(
    lambda x: re.sub(pattern=r'http\S+', repl='', string=x)
    )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['text'] = X_train['text'].apply(


In [22]:
# Tokenizing

In [23]:
example_tweet = X_train['text'].loc[2372]
example_tweet

"such activities of govt can't derail us from our aim &amp; we still remain peaceful and unite for #freesikhpoliticalprisnors &amp; @bapusuratsingh"

In [24]:
X_train['text'].sample(5)

7296                my brother-n-law riooooos got the call to head up north and fight the wild fires. dudes a beast atû_ 
2487                                                                    a new favorite: desolate 2 by r3do  on #soundcloud
3549                      a memorial to the millions who perished in the holodomor has been erected in the u.s. capital.  
866     another day another excellent @_dangerousbeans porridge. seriously people. blood orange in porridge is phenomenal.
5231            i added a video to a @youtube playlist  gta 5 funny moments - 'obliteration!' (gta 5 online funny moments)
Name: text, dtype: object

In [25]:
token_pattern = r"[a-zA-Z]+'?[a-zA-Z]+"
# Pattern: Any word with at least two non-numeric letters, including words with apostrophes

tokenizer = RegexpTokenizer(token_pattern)

In [26]:
tokenizer.tokenize(example_tweet)

['such',
 'activities',
 'of',
 'govt',
 "can't",
 'derail',
 'us',
 'from',
 'our',
 'aim',
 'amp',
 'we',
 'still',
 'remain',
 'peaceful',
 'and',
 'unite',
 'for',
 'freesikhpoliticalprisnors',
 'amp',
 'bapusuratsingh']

In [27]:
# seems to work

In [28]:
X_train['text_tokenized'] = X_train['text'].apply(tokenizer.tokenize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['text_tokenized'] = X_train['text'].apply(tokenizer.tokenize)


In [30]:
# Remove stopwords
# Get comprehensive list
# Combine stopwords from two libraries: NLTK and SpaCy