In [21]:
# Imports
import pandas as pd
from bs4 import BeautifulSoup
import regex as re

from nltk.tokenize import RegexpTokenizer
import nltk as nltk
from nltk.corpus import stopwords

In [22]:
# Read in data scraped from Twitter
firetweets = pd.read_csv('./data/firetweets.csv')

In [23]:
# Read in data from CrisisLex.org
tweets = pd.read_csv('./data/all_disaster_tweets.csv')

In [24]:
# Get shape of first DataFrame
firetweets.shape

(6260, 2)

In [25]:
# Get shape of second DataFrame
tweets.shape

(27943, 2)

In [26]:
# Show first few rows of tweets
firetweets.head()

Unnamed: 0,date_created,message
0,2019-11-06 04:00:21,California #artist Danielle Nelisse paints #abstract landscape #paintings of #daniellenelisse #wildfires | thank yo… https://t.co/SZgfT1Yw25
1,2019-11-06 03:58:48,Many low income people lost their homes in Baja California. Goal here is to collect money to buy material so we can… https://t.co/myI6Fy2pI7
2,2019-11-06 03:58:37,RT @EpochTimes: “Get your act together Governor. You don’t see close to the level of burn in other states.”\n\nTrump renewed his threat to cu…
3,2019-11-06 03:57:50,RT @EpochTimes: “Get your act together Governor. You don’t see close to the level of burn in other states.”\n\nTrump renewed his threat to cu…
4,2019-11-06 03:54:22,Yikes\nhttps://t.co/gs4D0VP4HH #wildfires #Africa


In [27]:
# Get rid of the date column; The date column could be useful for those aiding in humanitarian efforts
firetweets.drop('date_created', axis=1, inplace=True)

In [28]:
# Rename column
firetweets.rename({'message':'tweet text'}, axis=1, inplace=True)

In [29]:
# Assign the 0 label to fires
firetweets['disaster type'] = 0

In [30]:
# Combine two data sources
all_tweets = pd.concat([firetweets, tweets])

In [31]:
# New size
all_tweets.shape

(34203, 2)

In [32]:
# Data types are string and float
all_tweets.dtypes

tweet text       object 
disaster type    float64
dtype: object

In [33]:
# Convert date_created column to datetime possibly for visualization
#tweets['date_created'] = pd.to_datetime(tweets['date_created'])

In [34]:
# Re-used function from Project 3 that will clean DataFrame column
def clean_text_column(df_column):
    
    words_list = []
    
    # Remove punctuation with the exception of hashtags; convert to lowercase
    df_column = pd.Series([re.sub("[^a-zA-Z#]", " ", BeautifulSoup(text).get_text().lower()) for text in df_column])
    
    # Instantiate tokenizer
    tokenizer = RegexpTokenizer('\s+', gaps=True)
    
    # Tokenize each line of the series and append each line (less stopwords) to words_list
    for line in df_column:
        test = tokenizer.tokenize(line)
        words_list.append(' '.join([word for word in test if word not in stopwords.words('english')]))
    
    # Return the cleaned pandas series
    return pd.Series(words_list)

In [35]:
# Check out the first 10 tweets
pd.set_option('display.max_colwidth', -1)
all_tweets['tweet text'].head(10)

0    California #artist Danielle Nelisse paints #abstract landscape #paintings of #daniellenelisse #wildfires | thank yo… https://t.co/SZgfT1Yw25  
1    Many low income people lost their homes in Baja California. Goal here is to collect money to buy material so we can… https://t.co/myI6Fy2pI7  
2    RT @EpochTimes: “Get your act together Governor. You don’t see close to the level of burn in other states.”\n\nTrump renewed his threat to cu…
3    RT @EpochTimes: “Get your act together Governor. You don’t see close to the level of burn in other states.”\n\nTrump renewed his threat to cu…
4    Yikes\nhttps://t.co/gs4D0VP4HH #wildfires #Africa                                                                                             
5    RT @chooselovetoday: Community #Poll RE-TWEET #TheTruthCommunity #truthers #truth #WildFires #DirectedEnergyWeapons #HAARP #WeatherModifica…  
6    RT @3MSafety: Many people are exposed to smoke and ash from #wildfires and want to use an N95 particulate r

In [37]:
# Check null values
all_tweets.isnull().sum()

tweet text       0 
disaster type    10
dtype: int64

In [38]:
# Drop locations with null values
all_tweets.dropna(inplace=True)

In [39]:
# Clean the Tweet messages
clean_text_column(all_tweets['tweet text'])

0        california #artist danielle nelisse paints #abstract landscape #paintings #daniellenelisse #wildfires thank yo https co szgft yw
1        many low income people lost homes baja california goal collect money buy material https co myi fy pi                            
2        rt epochtimes get act together governor see close level burn states trump renewed threat cu                                     
3        rt epochtimes get act together governor see close level burn states trump renewed threat cu                                     
4        yikes https co gs vp hh #wildfires #africa                                                                                      
                            ...                                                                                                          
34188    donations queensland flood appeal fall short expectations http co gyfdbazq                                                      
34189    emergency payments flood 

In [40]:
# Clean the firetweets
clean_text_column(firetweets['tweet text'])

0       california #artist danielle nelisse paints #abstract landscape #paintings #daniellenelisse #wildfires thank yo https co szgft yw
1       many low income people lost homes baja california goal collect money buy material https co myi fy pi                            
2       rt epochtimes get act together governor see close level burn states trump renewed threat cu                                     
3       rt epochtimes get act together governor see close level burn states trump renewed threat cu                                     
4       yikes https co gs vp hh #wildfires #africa                                                                                      
                           ...                                                                                                          
6255    rt chalicegarden urgent horse trailer please help animals sos need evac enough trailers barbi twins revengebunny                
6257    rt ahmalcolm california #wildfire

In [41]:
# Overwrite original thread title words with relevant words only
tweets['tweet text'] = clean_text_column(tweets['tweet text'])

In [42]:
# Overwrite original thread title words with relevant words only; more on this in the preprocessing and visualizations step
firetweets['tweet text'] = clean_text_column(firetweets['tweet text'])

In [43]:
# Show cleaned data
all_tweets.head()

Unnamed: 0,tweet text,disaster type
0,California #artist Danielle Nelisse paints #abstract landscape #paintings of #daniellenelisse #wildfires | thank yo… https://t.co/SZgfT1Yw25,0.0
1,Many low income people lost their homes in Baja California. Goal here is to collect money to buy material so we can… https://t.co/myI6Fy2pI7,0.0
2,RT @EpochTimes: “Get your act together Governor. You don’t see close to the level of burn in other states.”\n\nTrump renewed his threat to cu…,0.0
3,RT @EpochTimes: “Get your act together Governor. You don’t see close to the level of burn in other states.”\n\nTrump renewed his threat to cu…,0.0
4,Yikes\nhttps://t.co/gs4D0VP4HH #wildfires #Africa,0.0


In [44]:
# Check data types again
all_tweets.dtypes

tweet text       object 
disaster type    float64
dtype: object

In [45]:
# Change the disaster type column to an integer to prepare for model preparation
all_tweets['disaster type'] = all_tweets['disaster type'].astype(int)

In [46]:
# Create .csv file for cleaned data
all_tweets.to_csv('./data/clean.csv', index=False)

In [47]:
# Create .csv file for cleaned data
firetweets.to_csv('./data/firetweets_clean.csv', index=False)