Twitter has become an important communication channel in times of emergency.
The ubiquitousness of smartphones enables people to announce an emergency they’re observing in real-time. Because of this, more agencies are interested in programatically monitoring Twitter (i.e. disaster relief organizations and news agencies).
But, it’s not always clear whether a person’s words are actually announcing a disaster.


In [1]:
import pandas as pd
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [2]:
# What is the shape of the data
print('Input has {} rows and {} columns'.format(len(data),len(data.columns)))

Input has 7613 rows and 5 columns


In [3]:
# How many disaster and non disaster are there?
print('Out {} rows, {} are disaster and {} are not disaster'.format(len(data),len(data[data['target']==1]),len(data[data['target']==0])))

Out 7613 rows, 3271 are disaster and 4342 are not disaster


# Clearning Data

In [4]:
import nltk
import string
import re
ps = nltk.PorterStemmer()
stopwords = nltk.corpus.stopwords.words('english')
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    token = re.split('\W+',text)
    text = [ps.stem(word) for word in token if word not in stopwords]
    return text

In [5]:
data['cleaned_text']= data['text'].apply(lambda x: clean_text(x))
data.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[deed, reason, earthquak, may, allah, forgiv, us]"
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[forest, fire, near, la, rong, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,"[resid, ask, shelter, place, notifi, offic, ev..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[13000, peopl, receiv, wildfir, evacu, order, ..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[got, sent, photo, rubi, alaska, smoke, wildfi..."


# Create train set and test set

In [6]:
X= data['text']
y= data.target.to_numpy()
print(X)
print(y)

0       Our Deeds are the Reason of this #earthquake M...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       Just got sent this photo from Ruby #Alaska as ...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609    @aria_ahrary @TheTawniest The out of control w...
7610    M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611    Police investigating after an e-bike collided ...
7612    The Latest: More Homes Razed by Northern Calif...
Name: text, Length: 7613, dtype: object
[1 1 1 ... 1 1 1]


In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=46)
print(X_train)

7502        @raineishida lol...Im just a nervous wreck :P
1368    @POTUS Would you please explain what you are g...
2366    @shantaeskyy GM! I pray any attack of the enem...
7102    After a violent afternoon storm more severe we...
3130    Not being able to touch anything or anyone in ...
                              ...                        
2112    I had no issues uploading DEATH TO SMOOCHY or ...
5652    UD: Rescue (Structural Collapse) - Scott Road ...
2619    There's a #fly loose in my workspace with two ...
517     #WeLoveLA #NHLDucks Avalanche Defense: How The...
6518    It's going on three years that we have been se...
Name: text, Length: 5329, dtype: object


# Apply CountVectorizer

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(analyzer=clean_text)
X_counts_train = count_vect.fit_transform(X_train)
X_counts_test = count_vect.transform(X_test)
print(X_counts_train.shape)
print(count_vect.get_feature_names()[:10])

(5329, 14935)
['', '0', '0011', '001116', '0025', '010217', '0104', '010401', '012624', '02']


In [9]:
# Apply CountVectorizer to small sample
X_train_sample= X_train[0:20]
count_vect_sample = CountVectorizer(analyzer=clean_text)
X_counts_sample = count_vect_sample.fit_transform(X_train_sample)
print(X_counts_sample.shape)
print(count_vect_sample.get_feature_names())

(20, 198)
['', '2', '2011', '213924', '25', '300w', '30stm', '4', '40', '4x4', '53inch', 'abl', 'accid', 'acdelco', 'affect', 'afternoon', 'agenc', 'air', 'airplan', 'amp', 'annihil', 'anyon', 'anyth', 'around', 'attack', 'audienc', 'august', 'b1g', 'bar', 'bickleton', 'binladen', 'bless', 'block', 'bmurph1019', 'build', 'burn', 'bush', 'camp', 'cant', 'chang', 'chicago', 'chonc', 'co2', 'combo', 'comment', 'corner', 'cree', 'cross', 'crowd', 'croydon', 'curv', 'day', 'dec', 'deploy', 'derail', 'destini', 'detail', 'deton', 'disea', 'due', 'electrocut', 'emerg', 'enemi', 'equip', 'evacu', 'even', 'excus', 'explain', 'famili', 'fatal', 'feel', 'final', 'fire', 'flood', 'fog', 'forest', 'full', 'get', 'gm', 'go', 'goblu', 'good', 'got', 'great', 'gt', 'h20', 'hailyoutsey', 'head', 'high', 'home', 'hors', 'httpstcocvkqigr1az', 'httptco6peeip4y7w', 'httptco9vd6x4wdoy', 'httptcodywwnbbyvj', 'httptcogxyivswki7', 'httptcohpzhe0cjvf', 'httptcomnsy1qr7bq', 'httptcomtmoia0oo0', 'httptconn4ztcmsr

In [10]:
X_counts_df = pd.DataFrame(X_counts_sample.toarray())
X_counts_df.columns = count_vect_sample.get_feature_names()
X_counts_df

Unnamed: 0,Unnamed: 1,2,2011,213924,25,300w,30stm,4,40,4x4,...,what,wheavenli,wild,wildfir,without,work,would,wreck,wrestleon,û
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Training model



In [11]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_counts_train, y_train)
print(clf.score(X_counts_train,y_train))
print(clf.score(X_counts_test,y_test))

0.9283167573653593
0.7985989492119089


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf_train = tfidf_vect.fit_transform(X_train)
X_tfidf_test  = tfidf_vect.transform(X_test)
print(X_tfidf_train.shape)
print(tfidf_vect.get_feature_names()[:10])

(5329, 14935)
['', '0', '0011', '001116', '0025', '010217', '0104', '010401', '012624', '02']


In [13]:
clf_tfidf = MultinomialNB()
clf_tfidf.fit(X_tfidf_train, y_train)
print(clf_tfidf.score(X_tfidf_train,y_train))
print(clf_tfidf.score(X_tfidf_test,y_test))

0.903921936573466
0.8021015761821366


# N- Grams Vectorization


In [18]:
import nltk
import string
import re
ps = nltk.PorterStemmer()
stopwords = nltk.corpus.stopwords.words('english')
def clean_text_NGram(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    token = re.split('\W+',text)
    text = " ".join([ps.stem(word) for word in token if word not in stopwords])
    return text

In [19]:
data['cleaned_text_NGram']= data['text'].apply(lambda x: clean_text_NGram(x))
data.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text,cleaned_text_NGram
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[deed, reason, earthquak, may, allah, forgiv, us]",deed reason earthquak may allah forgiv us
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[forest, fire, near, la, rong, sask, canada]",forest fire near la rong sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,"[resid, ask, shelter, place, notifi, offic, ev...",resid ask shelter place notifi offic evacu she...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[13000, peopl, receiv, wildfir, evacu, order, ...",13000 peopl receiv wildfir evacu order califor...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[got, sent, photo, rubi, alaska, smoke, wildfi...",got sent photo rubi alaska smoke wildfir pour ...


In [20]:
X= data['cleaned_text_NGram']
y= data.target.to_numpy()
print(X)
print(y)

0               deed reason earthquak may allah forgiv us
1                    forest fire near la rong sask canada
2       resid ask shelter place notifi offic evacu she...
3       13000 peopl receiv wildfir evacu order califor...
4       got sent photo rubi alaska smoke wildfir pour ...
                              ...                        
7608    two giant crane hold bridg collaps nearbi home...
7609    ariaahrari thetawniest control wild fire calif...
7610    m194 0104 utc5km volcano hawaii httptcozdtoyd8ebj
7611    polic investig ebik collid car littl portug eb...
7612    latest home raze northern california wildfir a...
Name: cleaned_text_NGram, Length: 7613, dtype: object
[1 1 1 ... 1 1 1]


In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=46)
print(X_train)

7502                     raineishida lolim nervou wreck p
1368    potu would pleas explain go volcano amp bush f...
2366    shantaeskyy gm pray attack enemi 2 derail ur d...
7102    violent afternoon storm sever weather head chi...
3130     abl touch anyth anyon penney without electrocut 
                              ...                        
2112    issu upload death smoochi awaken clip youtub r...
5652    ud rescu structur collaps scott road ypre road...
2619    there fli loos workspac two bore cat forse ter...
517     welovela nhlduck avalanch defens match vs st l...
6518    go three year separ sometim let man know leav ...
Name: cleaned_text_NGram, Length: 5329, dtype: object


In [30]:

from sklearn.feature_extraction.text import CountVectorizer
ngram_vect = CountVectorizer(ngram_range=(1,2))
X_ngram_train = ngram_vect.fit_transform(X_train)
X_ngram_test = ngram_vect.transform(X_test)
print(X_ngram_train.shape)
print(ngram_vect.get_feature_names()[:10])

(5329, 52809)
['0011', '0011 utc', '001116', '001116 utc20150805', '0025', '0025 updat', '010217', '010217 okinawa', '0104', '0104 utc']


In [31]:
clf_ngram = MultinomialNB()
clf_ngram.fit(X_ngram_train, y_train)
print(clf_ngram.score(X_ngram_train,y_train))
print(clf_ngram.score(X_ngram_test,y_test))

0.9787952711578157
0.8003502626970228


In [33]:
from sklearn.linear_model import LogisticRegression
clf_log = LogisticRegression(random_state=0, C=0.5).fit(X_tfidf_train, y_train)
print(clf_log.score(X_tfidf_train,y_train))
print(clf_log.score(X_tfidf_test,y_test))

0.8588853443422781
0.7994746059544658
