In [252]:
import pandas as pd
import numpy as np
import regex as re
import pickle
import string
from langdetect import detect

pd.set_option('display.max_colwidth', -1)

pd.set_option('display.max_rows', 1000)
#pd.set_option('display.max_columns', 500)

## Data cleaning hurricane tweets

### Read in all hurricane tweets from pickle

In [278]:
hurricanes = pd.read_csv("../Data/hurricane_tweets.csv")
floods = pd.read_csv("../Data/df_floods.csv")
fires = pd.read_csv("../Data/all_fires.csv")

#### Merge dataframes into one

In [279]:
# merge dataframes together
df = pd.concat([hurricanes, floods, fires])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [280]:
df.drop(columns = "Unnamed: 0", inplace = True)

In [281]:
df["disaster"].value_counts()

fire         40382
hurricane    36436
floods       896  
Name: disaster, dtype: int64

## Clean text columns

In [282]:
df = df[["text", "disaster"]]

In [283]:
df["text"] = df['text'].astype(str)

In [306]:
#df[df["text"].str.contains("blog")].head(100)

In [307]:
# this code was adapted from this stackoverflow answer
# https://stackoverflow.com/questions/8376691/how-to-remove-hashtag-user-link-of-a-tweet-using-regular-expression
def strip_all_entities(text):
    entity_prefixes = ['@','#']
    for separator in  string.punctuation:
        if separator not in entity_prefixes :
            text = text.replace(separator,'')
    words = []
    for word in text.split():
        word = word.strip()
        if word:
            if word[0] not in entity_prefixes:
                words.append(word)
    return ' '.join(words)

In [308]:
# lowercase text
df["text"] = df["text"].str.lower()

# remove picture URLs
df['text'] = df['text'].map(lambda x: re.sub('pic.twitter.com\/*', ' ', x))

# remove URLs
df['text'] = df['text'].map(lambda x: re.sub('http[s]?:\/\/[^\s]*', ' ', x))

# remove blog/map type
df['text'] = df['text'].map(lambda x: re.sub('blog*maps*', '', x))

# remove hashtags and AT users
df['text'] = df['text'].apply(strip_all_entities)

# remove single quotations
df["text"] = df["text"].map(lambda x: re.sub("'", "", x))
df["text"] = df["text"].map(lambda x: re.sub("'", "", x))


# remove spaces
df['text'] = df['text'].map(lambda x: re.sub('\n', ' ', x))

# remove characters that are not word characters or digits
df["text"] = df["text"].map(lambda x: re.sub("[^\w\d]", " ", x))

# remove all characters that are not letters
df['text'] = df['text'].map(lambda x: re.sub("[^a-zA-Z]", " ", x))

# remove multiple spaces
df['text'] = df['text'].map(lambda x: re.sub("\s{2,6}", " ", x))

In [309]:
df["text"]

0        offically tropical storm dorian where is it going tropical depression hurricane dorian track the latest on the storms track in the above youtube link rpmn ewuls                             
1        tropical storm dorian projected path spaghetti models t o l wer                                                                                                                              
2        futura tormenta tropical pasando por el sur de puerto rico xdbjsyke o                                                                                                                        
3        blogmapsinfo jyqlmzsi                                                                                                                                                                        
4        blogmapsinfo sj w kpc                                                                                                                                                                        
     

In [312]:
# remove tweets with this url type
df = df[~df["text"].str.contains("blogmapsinfo")]

In [316]:
## filter for English tweets only
#df["language"] = df["text"].apply(detect)

In [318]:
# drop duplicate rows
df.drop_duplicates(subset='text', keep='first', inplace=True)

In [319]:
df["text"]

0        offically tropical storm dorian where is it going tropical depression hurricane dorian track the latest on the storms track in the above youtube link rpmn ewuls                             
1        tropical storm dorian projected path spaghetti models t o l wer                                                                                                                              
2        futura tormenta tropical pasando por el sur de puerto rico xdbjsyke o                                                                                                                        
5        to become a this week a system located hundreds of miles from the lesser antilles is expected to become tomorrow it is also forecast to become later this week start preparing now kwjsv xpix
6        to become a this week a system located hundreds of miles from the lesser antilles is expected to become tomorrow it is also forecast to become later this week start preparing now msrpq mrrz
     