# Data Wrangling

In [1]:
import numpy as np 
import pandas as pd 
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
wn = nltk.WordNetLemmatizer()
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("disaster.csv")

In [3]:
df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [5]:
df.loc[df['location'].notnull(), df.columns]

Unnamed: 0,id,keyword,location,text,target
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0
...,...,...,...,...,...
7575,10826,wrecked,TN,On the bright side I wrecked http://t.co/uEa0t...,0
7577,10829,wrecked,#NewcastleuponTyne #UK,@widda16 ... He's gone. You can relax. I thoug...,0
7579,10831,wrecked,"Vancouver, Canada",Three days off from work and they've pretty mu...,0
7580,10832,wrecked,London,#FX #forex #trading Cramer: Iger's 3 words tha...,0


Location doesn't seem to give us insight to the content of the tweet other than location which is not we want
<br> There are also a lot of missing values so we will drop the 'location' column

In [6]:
df2 = df[['id','keyword', 'text', 'target']]

In [7]:
dupe = df2[df2.duplicated(subset = 'text')].sort_values('text')
dupe

Unnamed: 0,id,keyword,text,target
4299,6105,hellfire,#Allah describes piling up #wealth thinking it...,0
4312,6123,hellfire,#Allah describes piling up #wealth thinking it...,1
6366,9098,suicide%20bomb,#Bestnaijamade: 16yr old PKK suicide bomber wh...,1
6373,9107,suicide%20bomb,#Bestnaijamade: 16yr old PKK suicide bomber wh...,1
6392,9135,suicide%20bomb,#Bestnaijamade: 16yr old PKK suicide bomber wh...,1
...,...,...,...,...
3461,4952,exploded,that exploded &amp; brought about the\nbeginni...,0
6103,8714,sinking,that horrible sinking feeling when youÛªve be...,0
6094,8702,sinking,that horrible sinking feeling when youÛªve be...,0
6123,8739,sinking,that horrible sinking feeling when youÛªve be...,1


It seems there are duplicates that have different target values which is confusing

In [8]:
#Go through each distinct duplicated row and correct the target values
dupes = dupe.drop_duplicates('text').sort_index()
dupes.at[1156, 'target'] = 0
dupes.at[1251, 'target'] = 0
dupes.at[2832, 'target'] = 1
dupes.at[2841, 'target'] = 1
dupes.at[4320, 'target'] = 0
dupes.at[4381, 'target'] = 0
dupes.at[5073, 'target'] = 1
dupes.at[5641, 'target'] = 1
dupes.at[6616, 'target'] = 0

Only 9 values had to be changed

In [9]:
#We will drop duplicates then add back the corrected distinct rows 
nodupe = pd.concat([df2.drop_duplicates(subset = 'text', keep = False), dupes], axis = 0)
nodupe.sort_index(inplace = True)

In [10]:
nodupe[['keyword']] = nodupe[['keyword']].fillna('none')
nodupe

Unnamed: 0,id,keyword,text,target
0,1,none,Our Deeds are the Reason of this #earthquake M...,1
1,4,none,Forest fire near La Ronge Sask. Canada,1
2,5,none,All residents asked to 'shelter in place' are ...,1
3,6,none,"13,000 people receive #wildfires evacuation or...",1
4,7,none,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...
7608,10869,none,Two giant cranes holding a bridge collapse int...,1
7609,10870,none,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,none,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,none,Police investigating after an e-bike collided ...,1


In [11]:
nodupe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7503 entries, 0 to 7612
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       7503 non-null   int64 
 1   keyword  7503 non-null   object
 2   text     7503 non-null   object
 3   target   7503 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 293.1+ KB


In [12]:
keywords = nodupe['keyword'].unique()
keywords

array(['none', 'ablaze', 'accident', 'aftershock', 'airplane%20accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew%20up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown%20up', 'body%20bag', 'body%20bagging',
       'body%20bags', 'bomb', 'bombed', 'bombing', 'bridge%20collapse',
       'buildings%20burning', 'buildings%20on%20fire', 'burned',
       'burning', 'burning%20buildings', 'bush%20fires', 'casualties',
       'casualty', 'catastrophe', 'catastrophic', 'chemical%20emergency',
       'cliff%20fall', 'collapse', 'collapsed', 'collide', 'collided',
       'collision', 'crash', 'crashed', 'crush', 'crushed', 'curfew',
       'cyclone', 'damage', 'danger', 'dead', 'death', 'deaths', 'debris',
       'deluge', 'deluged', 'demolish', 'demolished', 'demolition',
       'derail', '

Lets get rid of the random '%20' that shows up in some keywords

In [13]:
for word in keywords:
    old = word
    new = word.replace('%20', ' ')
    nodupe[['keyword']] = nodupe[['keyword']].replace(old, new) 

In [14]:
nodupe['keyword'].unique()

array(['none', 'ablaze', 'accident', 'aftershock', 'airplane accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown up', 'body bag', 'body bagging', 'body bags',
       'bomb', 'bombed', 'bombing', 'bridge collapse',
       'buildings burning', 'buildings on fire', 'burned', 'burning',
       'burning buildings', 'bush fires', 'casualties', 'casualty',
       'catastrophe', 'catastrophic', 'chemical emergency', 'cliff fall',
       'collapse', 'collapsed', 'collide', 'collided', 'collision',
       'crash', 'crashed', 'crush', 'crushed', 'curfew', 'cyclone',
       'damage', 'danger', 'dead', 'death', 'deaths', 'debris', 'deluge',
       'deluged', 'demolish', 'demolished', 'demolition', 'derail',
       'derailed', 'derailment', 'de

In [15]:
nodupe['text'].head()

0    Our Deeds are the Reason of this #earthquake M...
1               Forest fire near La Ronge Sask. Canada
2    All residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    Just got sent this photo from Ruby #Alaska as ...
Name: text, dtype: object

Lets clean the text column now. 

In [16]:
def clean(df):
    
    punc = string.punctuation
    def remove_punctuation(text):
        no_punc=[words for words in text if words not in punc]
        return ''.join(no_punc)
    df['no_punc'] = df['text'].apply(lambda x: remove_punctuation(x))

    stopword = stopwords.words('english')
    def remove_stopwords(text):
        return ' '.join([word for word in text.split() if word not in (stopword)])
    df['textnostop'] = df['no_punc'].apply(remove_stopwords)

    def lemmatize(text):
        return ''.join([wn.lemmatize(word).lower() for word in text])    
    df['lemmatized'] = df['textnostop'].apply(lemmatize)

    def remove_nums(text):
        return ''.join([word for word in text if not any(num.isdigit() for num in word)])    
    df['no_nums'] = df['lemmatized'].apply(remove_nums)

In [17]:
def no_punc_nums(df):   
    punc = string.punctuation
    def remove_punctuation(text):
        no_punc=[words for words in text if words not in punc]
        return ''.join(no_punc)
    df['no_punc'] = df['text'].apply(lambda x: remove_punctuation(x))

    def remove_nums(text):
        return ''.join([word for word in text if not any(num.isdigit() for num in word)])    
    df['no_nums'] = df['no_punc'].apply(remove_nums)

In [18]:
nodupe1 = nodupe.copy()
clean(nodupe1)
nodupe1.head()

Unnamed: 0,id,keyword,text,target,no_punc,textnostop,lemmatized,no_nums
0,1,none,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...,Our Deeds Reason earthquake May ALLAH Forgive us,our deeds reason earthquake may allah forgive us,our deeds reason earthquake may allah forgive us
1,4,none,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask Canada,Forest fire near La Ronge Sask Canada,forest fire near la ronge sask canada,forest fire near la ronge sask canada
2,5,none,All residents asked to 'shelter in place' are ...,1,All residents asked to shelter in place are be...,All residents asked shelter place notified off...,all residents asked shelter place notified off...,all residents asked shelter place notified off...
3,6,none,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfires evacuation orde...,13000 people receive wildfires evacuation orde...,13000 people receive wildfires evacuation orde...,people receive wildfires evacuation orders ca...
4,7,none,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...,Just got sent photo Ruby Alaska smoke wildfire...,just got sent photo ruby alaska smoke wildfire...,just got sent photo ruby alaska smoke wildfire...


In [19]:
nodupe2 = nodupe.copy()
no_punc_nums(nodupe2)
nodupe2.head()

Unnamed: 0,id,keyword,text,target,no_punc,no_nums
0,1,none,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...,Our Deeds are the Reason of this earthquake Ma...
1,4,none,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask Canada,Forest fire near La Ronge Sask Canada
2,5,none,All residents asked to 'shelter in place' are ...,1,All residents asked to shelter in place are be...,All residents asked to shelter in place are be...
3,6,none,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfires evacuation orde...,people receive wildfires evacuation orders in...
4,7,none,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...,Just got sent this photo from Ruby Alaska as s...


Each function seems have properly adjusted the texts. Lets extract the final product and initialize a clean data frame. 
<br> We will also seperate our target column to use as our "Y"  

In [20]:
data1 = nodupe1[['id', 'keyword', 'no_nums']]
data1.columns = ['ID', 'Keyword', 'Text']
data1.head()

Unnamed: 0,ID,Keyword,Text
0,1,none,our deeds reason earthquake may allah forgive us
1,4,none,forest fire near la ronge sask canada
2,5,none,all residents asked shelter place notified off...
3,6,none,people receive wildfires evacuation orders ca...
4,7,none,just got sent photo ruby alaska smoke wildfire...


In [21]:
data2 = nodupe2[['no_nums']]
data2.columns = ['text']
data2.head()

Unnamed: 0,text
0,Our Deeds are the Reason of this earthquake Ma...
1,Forest fire near La Ronge Sask Canada
2,All residents asked to shelter in place are be...
3,people receive wildfires evacuation orders in...
4,Just got sent this photo from Ruby Alaska as s...


For potential EDA

In [22]:
target = nodupe['target']
target

0       1
1       1
2       1
3       1
4       1
       ..
7608    1
7609    1
7610    1
7611    1
7612    1
Name: target, Length: 7503, dtype: int64