# Data Wrangling

In [1]:
import numpy as np, pandas as pd, inflect, string, re, nltk, contractions,tqdm, unicodedata
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, wordnet as wn  
from nltk.stem import LancasterStemmer, WordNetLemmatizer
wn = nltk.WordNetLemmatizer()
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from collections import Counter 
from bs4 import BeautifulSoup  
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load Data
df = pd.read_csv("disaster.csv")[['keyword','location', 'text', 'target']]
df.head()

Unnamed: 0,keyword,location,text,target
0,,,Our Deeds are the Reason of this #earthquake M...,1
1,,,Forest fire near La Ronge Sask. Canada,1
2,,,All residents asked to 'shelter in place' are ...,1
3,,,"13,000 people receive #wildfires evacuation or...",1
4,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   keyword   7552 non-null   object
 1   location  5080 non-null   object
 2   text      7613 non-null   object
 3   target    7613 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 238.0+ KB


In [4]:
# Lets check out what Non-Null rows look like
df.loc[df['location'].notnull(), df.columns]

Unnamed: 0,keyword,location,text,target
31,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
32,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0
33,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
34,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0
35,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0
...,...,...,...,...
7575,wrecked,TN,On the bright side I wrecked http://t.co/uEa0t...,0
7577,wrecked,#NewcastleuponTyne #UK,@widda16 ... He's gone. You can relax. I thoug...,0
7579,wrecked,"Vancouver, Canada",Three days off from work and they've pretty mu...,0
7580,wrecked,London,#FX #forex #trading Cramer: Iger's 3 words tha...,0


Location doesn't seem to give us insight to the content of the tweet other than location which is not we want
<br> Only 5080 out of 7613 rows were returned so there are a lot of null values. We will also drop the 'location' column

In [5]:
# Drop 'location'
df2 = df[['keyword', 'text', 'target']]

In [6]:
# Check for duplicate 'text' rows
dupe = df2[df2.duplicated(subset = 'text')].sort_values('text')
dupe

Unnamed: 0,keyword,text,target
4299,hellfire,#Allah describes piling up #wealth thinking it...,0
4312,hellfire,#Allah describes piling up #wealth thinking it...,1
6366,suicide%20bomb,#Bestnaijamade: 16yr old PKK suicide bomber wh...,1
6373,suicide%20bomb,#Bestnaijamade: 16yr old PKK suicide bomber wh...,1
6392,suicide%20bomb,#Bestnaijamade: 16yr old PKK suicide bomber wh...,1
...,...,...,...
3461,exploded,that exploded &amp; brought about the\nbeginni...,0
6103,sinking,that horrible sinking feeling when youÛªve be...,0
6094,sinking,that horrible sinking feeling when youÛªve be...,0
6123,sinking,that horrible sinking feeling when youÛªve be...,1


It seems there are duplicates that have different target values which is confusing

In [7]:
#Go through each distinct set of duplicated rows and correct the target values
dupes = dupe.drop_duplicates('text').sort_index()
dupes.at[1156, 'target'] = 1
dupes.at[1251, 'target'] = 1
dupes.at[2832, 'target'] = 1
dupes.at[2841, 'target'] = 1
dupes.at[4320, 'target'] = 1
dupes.at[4381, 'target'] = 1
dupes.at[5073, 'target'] = 1
dupes.at[5641, 'target'] = 1
dupes.at[6616, 'target'] = 1

Only 9 values had to be changed

In [8]:
#We will drop duplicates then add back the corrected distinct rows 
nodupe = pd.concat([df2.drop_duplicates(subset = 'text', keep = False), dupes], axis = 0)
nodupe.sort_index(inplace = True)
nodupe[['keyword']] = nodupe[['keyword']].fillna('none')
nodupe

Unnamed: 0,keyword,text,target
0,none,Our Deeds are the Reason of this #earthquake M...,1
1,none,Forest fire near La Ronge Sask. Canada,1
2,none,All residents asked to 'shelter in place' are ...,1
3,none,"13,000 people receive #wildfires evacuation or...",1
4,none,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...
7608,none,Two giant cranes holding a bridge collapse int...,1
7609,none,@aria_ahrary @TheTawniest The out of control w...,1
7610,none,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,none,Police investigating after an e-bike collided ...,1


In [9]:
nodupe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7503 entries, 0 to 7612
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   keyword  7503 non-null   object
 1   text     7503 non-null   object
 2   target   7503 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 234.5+ KB


In [10]:
# Look at the values in the 'keyword' column
keywords = nodupe['keyword'].unique()
keywords

array(['none', 'ablaze', 'accident', 'aftershock', 'airplane%20accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew%20up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown%20up', 'body%20bag', 'body%20bagging',
       'body%20bags', 'bomb', 'bombed', 'bombing', 'bridge%20collapse',
       'buildings%20burning', 'buildings%20on%20fire', 'burned',
       'burning', 'burning%20buildings', 'bush%20fires', 'casualties',
       'casualty', 'catastrophe', 'catastrophic', 'chemical%20emergency',
       'cliff%20fall', 'collapse', 'collapsed', 'collide', 'collided',
       'collision', 'crash', 'crashed', 'crush', 'crushed', 'curfew',
       'cyclone', 'damage', 'danger', 'dead', 'death', 'deaths', 'debris',
       'deluge', 'deluged', 'demolish', 'demolished', 'demolition',
       'derail', '

Lets get rid of the random '%20' that shows up in some keywords

In [11]:
for word in keywords:
    old = word
    new = word.replace('%20', ' ')
    nodupe[['keyword']] = nodupe[['keyword']].replace(old, new) 

nodupe['keyword'].unique()

array(['none', 'ablaze', 'accident', 'aftershock', 'airplane accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown up', 'body bag', 'body bagging', 'body bags',
       'bomb', 'bombed', 'bombing', 'bridge collapse',
       'buildings burning', 'buildings on fire', 'burned', 'burning',
       'burning buildings', 'bush fires', 'casualties', 'casualty',
       'catastrophe', 'catastrophic', 'chemical emergency', 'cliff fall',
       'collapse', 'collapsed', 'collide', 'collided', 'collision',
       'crash', 'crashed', 'crush', 'crushed', 'curfew', 'cyclone',
       'damage', 'danger', 'dead', 'death', 'deaths', 'debris', 'deluge',
       'deluged', 'demolish', 'demolished', 'demolition', 'derail',
       'derailed', 'derailment', 'de

In [12]:
# Check out 'text' values
nodupe['text'].sample(5, random_state = 0).values

array(['#earthquake (EMSC): MD 2.9 OFF COAST OF NORTHERN CALIFORNIA http://t.co/6AiMd1uway G http://t.co/9cgbJwmhII',
       '#np Avenged Sevenfold - Hail To The King',
       "Murfreesboro peeps- I'm hearing Walmart on S Rutherford is on lockdown with a hostage is that true or a rumor?",
       '[55436] 1950 LIONEL TRAINS SMOKE LOCOMOTIVES WITH MAGNE-TRACTION INSTRUCTIONS http://t.co/xEZBs3sq0y http://t.co/C2x0QoKGlY',
       'Man! What I would give to be in CA right now to help with the wild fires.'],
      dtype=object)

Lets clean the text column now

In [13]:
# Preprocessing 'text'
def replace_contractions(text):
    """Replace contractions in text"""
    return contractions.fix(text)

def remove_URL(sample):
    """Remove URLs from text"""
    return re.sub(r"http\S+", "", sample)

def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = [unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore') for word in words]
    return new_words

def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = map(str.lower, words)
    return new_words

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = map(lambda word: re.sub(r'[^\w\s]', '', word), filter(lambda word: word != '', words))
#     new_words = [re.sub(r'[^\w\s]', '', word) for word in words if word != '']
    return new_words

def remove_numbers(words):
    """Replace all interger occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    new_words = [word for word in words if not any(num.isdigit() for num in word)]
    return new_words

def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = filter(lambda word: (word not in stopwords.words('english')), words)
    return new_words

def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = map(lambda word: lemmatizer.lemmatize(word, pos='v'), words)
    return lemmas

def normalize(words):
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    words = remove_numbers(words)
    words = remove_stopwords(words)
    words = lemmatize_verbs(words)
    return words

def preprocess(sample):
    sample = remove_URL(sample)
    sample = replace_contractions(sample)
    # Tokenize
    words = nltk.word_tokenize(sample)

    # Normalize
    words = normalize(words)
    words = ' '.join([word for word in words])
    return words



sample = "The Blood tEst for Down's syndrome hailed 567 http://bbc.in/1BO3eWQ"               
print(preprocess(sample))

blood test syndrome hail


In [14]:
texts = nodupe[['text']].copy()
texts['pre_processed'] = texts['text'].apply(preprocess)
texts.head()

Unnamed: 0,text,pre_processed
0,Our Deeds are the Reason of this #earthquake M...,deeds reason earthquake may allah forgive us
1,Forest fire near La Ronge Sask. Canada,forest fire near la ronge sask canada
2,All residents asked to 'shelter in place' are ...,residents ask shelter place notify officer e...
3,"13,000 people receive #wildfires evacuation or...",people receive wildfires evacuation order cal...
4,Just got sent this photo from Ruby #Alaska as ...,get send photo ruby alaska smoke wildfires p...


Each function seems to have properly adjusted the texts. <br>   

We will make more dataframes for potential EDA

In [15]:
def clean(df):
    
    punc = string.punctuation
    def remove_punctuation(text):
        no_punc=[words for words in text if words not in punc]
        return ''.join(no_punc)
    df['no_punc'] = df['text'].apply(lambda x: remove_punctuation(x))

    stopword = stopwords.words('english')
    def remove_stopwords(text):
        return ' '.join([word for word in text.split() if word not in (stopword)])
    df['textnostop'] = df['no_punc'].apply(remove_stopwords)

    def lemmatize(text):
        return ''.join([wn.lemmatize(word).lower() for word in text])    
    df['lemmatized'] = df['textnostop'].apply(lemmatize)

    def remove_nums(text):
        return ''.join([word for word in text if not any(num.isdigit() for num in word)])    
    df['no_nums'] = df['lemmatized'].apply(remove_nums)

In [16]:
def no_punc_nums(df):   
    
    punc = string.punctuation
    def remove_punctuation(text):
        no_punc=[words for words in text if words not in punc]
        return ''.join(no_punc)
    df['no_punc'] = df['text'].apply(lambda x: remove_punctuation(x))

    def remove_nums(text):
        return ''.join([word for word in text if not any(num.isdigit() for num in word)])    
    df['no_nums'] = df['no_punc'].apply(remove_nums)

In [17]:
nodupe1 = nodupe.copy()
clean(nodupe1)
eda1 = nodupe1[['keyword', 'no_nums']]
eda1.columns = ['Keyword', 'Text']
eda1.head()

Unnamed: 0,Keyword,Text
0,none,our deeds reason earthquake may allah forgive us
1,none,forest fire near la ronge sask canada
2,none,all residents asked shelter place notified off...
3,none,people receive wildfires evacuation orders ca...
4,none,just got sent photo ruby alaska smoke wildfire...


In [18]:
nodupe2 = nodupe.copy()
no_punc_nums(nodupe2)
eda2 = nodupe2[['no_nums']]
eda2.columns = ['text']
eda2.head()

Unnamed: 0,text
0,Our Deeds are the Reason of this earthquake Ma...
1,Forest fire near La Ronge Sask Canada
2,All residents asked to shelter in place are be...
3,people receive wildfires evacuation orders in...
4,Just got sent this photo from Ruby Alaska as s...


In [19]:
# Seperate the 'target' for our 'Y'
target = nodupe['target']

In [20]:
# texts.to_csv('Cleaned.csv')

# data1.to_csv('CleanedDisasterTweets.csv')
 
# target.to_csv('TargetDisasterTweets.csv')

# data2.to_csv('reg_disaster_text.csv')