In [1]:
import pandas as pd
import numpy as np
import regex as re
import pickle
import string
from langdetect import detect

pd.set_option('display.max_colwidth', -1)

pd.set_option('display.max_rows', 1000)
#pd.set_option('display.max_columns', 500)

## Data cleaning hurricane tweets

### Read in all hurricane tweets from pickle

In [2]:
hurricanes = pd.read_csv("../Data/hurricane_tweets.csv")
floods = pd.read_csv("../Data/df_floods.csv")
fires = pd.read_csv("../Data/all_fires.csv")

  interactivity=interactivity, compiler=compiler, result=result)


#### Merge dataframes into one

In [3]:
# merge dataframes together
df = pd.concat([hurricanes, floods, fires], sort = False)

In [4]:
df.drop(columns = "Unnamed: 0", inplace = True)

In [5]:
df["text"] = df['text'].astype(str)

In [225]:
# for text in hurricanes["text"]:
#     try:
#         detect(text)
#     except:
#         noLang

In [156]:
# for key, valu in onlyText.items():
#         try:
#             if detect(val[0]) !="en":
#                 foreignLangs[key]= val
#                 foreignLangs[key].append(detect(val[0]))

#         except:
#             noLang[key] = val

In [157]:
# hurricanes["language"] = hurricanes["text"].apply(detect)

## Clean text columns

In [6]:
df = df[["text", "disaster"]]

In [7]:
df.iloc[0,0]

"OFFICALLY TROPICAL STORM DORIAN Where is it Going? Tropical Depression 5 Hurricane Dorian Track 2019 https://youtu.be/SKCqARFvsQw\xa0 The latest on the STORM'S TRACK!  in the above YOUTUBE LINK!!! @FlyRts @FearRTs @GFXCoach #dorian #florida #hurricane #hurricanedorian #tropicalstormdorianpic.twitter.com/RpMN7ewuLs"

In [8]:
df.iloc[6,0]

'#TDFIVE TO BECOME A #Hurricane THIS WEEK\n\nA system, located hundreds of miles from the Lesser Antilles, is expected to become #TropicalStormDorian tomorrow.  It is also forecast to become #HurricaneDorian later this week!  Start preparing now!\n\n#apexwx #tropics #Atlantic #stormpic.twitter.com/MsRpq4mRRZ'

In [9]:
#df[df["text"].str.contains("blog")].head(100)

In [9]:
# this code was adapted from this stackoverflow answer
# https://stackoverflow.com/questions/8376691/how-to-remove-hashtag-user-link-of-a-tweet-using-regular-expression
def strip_all_entities(text):
    entity_prefixes = ['@','#']
    for separator in  string.punctuation:
        if separator not in entity_prefixes :
            text = text.replace(separator,'')
    words = []
    for word in text.split():
        word = word.strip()
        if word:
            if word[0] not in entity_prefixes:
                words.append(word)
    return ' '.join(words)

In [10]:
# lowercase text
df["text"] = df["text"].str.lower()



# remove URLs
df['text'] = df['text'].map(lambda x: re.sub('http[s]?:\/\/[^\s]*', ' ', x))


# remove URL cutoffs
df['text'] = df['text'].map(lambda x: re.sub('\\[^\s]*', ' ', x))



# remove spaces
df['text'] = df['text'].map(lambda x: re.sub('\n', ' ', x))


# remove picture URLs
df['text'] = df['text'].map(lambda x: re.sub('pic.twitter.com\/[^\s]*', ' ', x))

# remove blog/map type
df['text'] = df['text'].map(lambda x: re.sub('blog\/maps\/info\/[^\s]*', '', x))



# remove hashtags and AT users
df['text'] = df['text'].apply(strip_all_entities)



# remove single quotations
df["text"] = df["text"].map(lambda x: re.sub("'", "", x))
df["text"] = df["text"].map(lambda x: re.sub("'", "", x))




# remove characters that are not word characters or digits
df["text"] = df["text"].map(lambda x: re.sub("[^\w\d]", " ", x))

# remove all characters that are not letters
df['text'] = df['text'].map(lambda x: re.sub("[^a-zA-Z]", " ", x))

# remove multiple spaces
df['text'] = df['text'].map(lambda x: re.sub("\s{2,6}", "", x))

In [11]:
df["text"]

0        offically tropical storm dorian where is it going tropical depressionhurricane dorian trackthe latest on the storms track in the above youtube link                                          
1        tropical storm dorian projected path spaghetti models                                                                                                                                        
2        futura tormenta tropical pasando por el sur de puerto rico                                                                                                                                   
3        blogmapsinfo                                                                                                                                                                                 
4        blogmapsinfo                                                                                                                                                                                 
     

In [12]:
# remove tweets with this url type
df = df[~df["text"].str.contains("blogmapsinfo")]

In [13]:
# drop duplicate rows
df.drop_duplicates(subset='text', keep='first', inplace=True)

In [14]:
df['text'] = df['text'].map(lambda x: re.sub("\s{2,6}", "", x))

In [15]:
# drop row with only one space
df = df[~(df["text"]== " ")]

In [16]:
# drop empty row
df = df[~(df["text"]== "")]

In [17]:
range(len(df))

range(0, 62922)

## Detect languages of tweets

In [21]:
# this code was used to test for errors that would prevent the detect function from running
# languages = []
# for i in range(101,150):
#     try:
#         languages.append(detect(df.iloc[i, 0]))
#     except:
#         print(f"error in row {i}")

In [22]:
# apply detect function on text column
df["languages"] = df["text"].apply(detect)

In [27]:
df.shape

(62921, 3)

In [28]:
df_en.shape

(59586, 3)

In [25]:
## Select for tweets that are English only
## this dropped 3_335 rows 
df_en = df[df["languages"] == "en"]

## Continue cleaning on english column

Here we are removing multiple copies of the same letter. For example "thanksssssssss" is updated to "thanks".

In [55]:
df_en.loc[:, "text"] = df_en['text'].map(lambda x: re.sub("w{2,10}", "w", x))

In [54]:
df_en.loc[:, "text"] = df_en['text'].map(lambda x: re.sub("s{3,10}", "s", x))

In [53]:
df_en.loc[:, "text"] = df_en['text'].map(lambda x: re.sub("a{3,10}", "a", x))

In [56]:
df_en.loc[:, "text"] = df_en['text'].map(lambda x: re.sub("t{3,10}", "t", x))

In [58]:
df_en.loc[:, "text"] = df_en['text'].map(lambda x: re.sub("x{3,10}", "x", x))

In [63]:
df_en.loc[:, "text"] = df_en['text'].map(lambda x: re.sub("m{3,10}", "m", x))

In [64]:
df_en.loc[:, "text"] = df_en['text'].map(lambda x: re.sub("l{3,10}", "l", x))

## Write to CSV

In [67]:
# write to csf
df_en.to_csv("../Data/all_tweets_clean2.csv", index = False)