In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from sklearn import svm
from numpy.random import RandomState
from scipy.stats import ttest_ind
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics
from datetime import timedelta  
from sklearn.utils import shuffle
import os

In [2]:
# Functions for importing & cleaning relevant tweets
def lower(s):
    return s.lower()

def tweet_imports(filename):
    imp = pd.read_pickle(filename)
    imp = imp.drop_duplicates()
    imp['tweet_clean'] = imp['tweet'].str.replace('http\S+|www.\S+|pic.twitter.com\S+', '', case=False)
    imp['tweet_clean'] =imp['tweet_clean'].replace('[^A-Za-z0-9 ]+','',regex=True)
    imp['tweet_clean'] = imp['tweet_clean'].apply(lower)#map(lambda x: x.lower(), imp['tweet_clean'])
    imp['date'] = pd.to_datetime(imp['date'])
    return imp

In [3]:
# Collect all tweets from every user into 2 groups of affirming and denying tweets, add label (1 vs. -1)
all_affirm_tweets = []
all_deny_tweets = []

for filename in os.listdir(os.getcwd()+'/affirm_tweets/'):
    tweets_per_user = pd.read_pickle(os.getcwd()+'/affirm_tweets/'+filename)
    all_affirm_tweets.append(tweets_per_user)
for filename in os.listdir(os.getcwd()+'/deny_tweets/'):
    tweets_per_user = pd.read_pickle(os.getcwd()+'/deny_tweets/'+filename)
    all_deny_tweets.append(tweets_per_user)
    
affirm_tweets = pd.concat(all_affirm_tweets)
affirm_tweets['label'] = [1]*affirm_tweets.shape[0]
deny_tweets = pd.concat(all_deny_tweets)
deny_tweets['label'] = [-1]*deny_tweets.shape[0]
print(affirm_tweets.shape)
print(deny_tweets.shape)

affirm_tweets.drop_duplicates(subset ="id", keep = 'first', inplace = True)
deny_tweets.drop_duplicates(subset ="id", keep = 'first', inplace = True)
print(affirm_tweets.shape)
print(deny_tweets.shape)

affirm_tweets.to_pickle('all_affirm_tweets.pkl')
deny_tweets.to_pickle('all_deny_tweets.pkl')

(230802, 12)
(290084, 12)
(230802, 12)
(290084, 12)


In [4]:
cleaned_affirm_tweets = tweet_imports('all_affirm_tweets.pkl')
cleaned_deny_tweets = tweet_imports('all_deny_tweets.pkl')

In [12]:
cleaned_affirm_tweets

Unnamed: 0,id,date,time,tweet,mentions,replies_count,retweets_count,likes_count,hashtags,username,search_term,label,tweet_clean
0,1177060829901885441,2019-09-25,20:22:25,#IPCC just released the #SROCC - a new report ...,[],1,40,75,"['#ipcc', '#srocc']",350,ice,1,ipcc just released the srocc a new report on ...
1,1176786922687148032,2019-09-25,02:14:00,The #IPCC special report on ocean and ice is o...,[],5,83,112,"['#ipcc', '#srocc']",350,ice,1,the ipcc special report on ocean and ice is ou...
2,1164031877910618114,2019-08-20,21:30:01,Unusually warm water surrounding one of the la...,[],1,62,93,[],350,ice,1,unusually warm water surrounding one of the la...
3,1152577659710427136,2019-07-20,06:55:02,This is one of the hottest summers on record. ...,[],0,17,26,['#abolishice'],350,ice,1,this is one of the hottest summers on record t...
4,1146328119336460288,2019-07-03,01:01:36,Antarctic ice has taken a nosedive.\n\nThe amo...,[],6,106,115,[],350,ice,1,antarctic ice has taken a nosedivethe amount o...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,1175370661985828865,2019-09-21,04:26:17,As #GretaThunberg led the Climate Strike in Ne...,[],0,0,2,"['#gretathunberg', '#nantichaocharoenchai', '#...",yv4ca,climate,1,as gretathunberg led the climate strike in new...
2,1169913415294259200,2019-09-06,03:01:08,The world is running out of time 🤜a race we ca...,[],0,1,5,"['#apclimateweek', '#apyouth4climate']",yv4ca,climate,1,the world is running out of time a race we can...
3,1169535040792850437,2019-09-05,01:57:37,Will you Fight 🤜for the future! \n\nJoin us to...,[],14,149,277,"['#climateemergency', '#apclimateweek', '#apyo...",yv4ca,climate,1,will you fight for the future join us to empow...
4,1169518364760276997,2019-09-05,00:51:21,"""Everyone has a responsible role towards tackl...",['earthdaynetwork'],0,3,6,"['#apclimateweek', '#apyouth4climate']",yv4ca,climate,1,everyone has a responsible role towards tackli...


# Import manually labelled & influential tweets

In [16]:
# TODO: replace w/ own manually labeled tweets
test_tweets = pd.concat([cleaned_affirm_tweets,cleaned_deny_tweets])
test_tweets = test_tweets.sample(n = 500) 
print(len(test_tweets), len(test_tweets[test_tweets['label']==1]), len(test_tweets[test_tweets['label']==-1]))
x = test_tweets[test_tweets['label']==-1]
print(len(x.drop_duplicates('tweet_clean')))

500 238 262
261


In [8]:
# train1 = tweet_imports("Datasets/Training Data/influential_tweets_filter_1.csv")
# train2 = tweet_imports("Datasets/Training Data/influential_tweets_filter_2.csv")
# train3 = tweet_imports("Datasets/Training Data/influential_tweets_filter_3.csv")
# all_train = pd.concat([train1, train2, train3])
all_train = pd.concat([cleaned_affirm_tweets, cleaned_deny_tweets])

# Allison et al. had: (403432, 220063, 183369)
print(len(all_train), len(all_train[all_train['label']==1]), len(all_train[all_train['label']==-1]))

520886 230802 290084


In [17]:
# # Select tweets that contain disasters from the training data 
# disaster_train = all_train[all_train['tweet_clean'].str.contains("michael|florence|wildfire|blizzard|fire|hurricane|bomb|cyclone|storm|snow|blaze")==True]
# disaster_train = disaster_train[disaster_train['tweet_clean'].str.contains("climate|change|global|warming")==True]
sub_train = all_train[all_train['tweet_clean'].str.contains("climate|change|global|warming")==True]

# Remove from training data the tweets that are already in the test data
#dis_set = disaster_train[['tweet_clean', 'label']].copy()
sub_set = sub_train[['tweet_clean', 'label']].copy()
test_tweets = test_tweets[['tweet_clean', 'label']].copy()
#dis_set['identifier'] = 0
sub_set['identifier'] = 0
test_tweets['identifier'] = 1
#dis_set = pd.concat([dis_set, test_tweets])
sub_set = pd.concat([sub_set,test_tweets])
#dis_set.drop_duplicates(keep=False)
sub_set.drop_duplicates(keep='first')
#disaster_train = dis_set[dis_set['identifier']==0]
sub_train = sub_set[sub_set['identifier']==0]
#print(len(disaster_train))
print(len(sub_train))

299709


# Import & clean downloaded Twitter data

In [10]:
def clean_tweets_restrict(filename, start_date, end_date):
    
    # Unlabeled tweets
    tweets = tweet_imports("Datasets/Twint Output/" + filename)
     
    # Constrain to relevant dates
    print(min(tweets['date']), max(tweets['date']))
    
    begin_tweets = pd.to_datetime(start_date) - timedelta(weeks = 2)
    end_tweets = pd.to_datetime(end_date) + timedelta(weeks = 2)
    print("Two weeks before:", begin_tweets, "Two weeks after:", end_tweets)
    
    tweets = tweets[tweets['date'] >= begin_tweets]
    tweets = tweets[tweets['date'] <= end_tweets]

    # Remove tweets to label that were already seen in train/valid/test for forming predictions
    tweet_dis_overlap = tweets.merge(disaster_train, on=['tweet_clean'])
    tweets = tweets[(~tweets.tweet_clean.isin(tweet_dis_overlap.tweet_clean))]
    tweet_test_overlap = tweets.merge(test_tweets, on=['tweet_clean'])
    tweets = tweets[(~tweets.tweet_clean.isin(tweet_test_overlap.tweet_clean))]

    # Combine pre-labeled tweets
    pre_labelled_tweets = pd.concat((tweet_dis_overlap, tweet_test_overlap), axis=0)
    print(len(tweets), len(pre_labelled_tweets))
    
    return((tweets, pre_labelled_tweets))

In [11]:
def count_tweets_restrict(filename, start_date, end_date, outfile):
    
    tweets, pre_labelled_tweets = clean_tweets_restrict(filename, start_date, end_date)
    
    # Split to pre- and post
    pre = tweets[tweets['date'] <= start_date]
    post = tweets[tweets['date'] > start_date]
    pre.reset_index(inplace=True)
    post.reset_index(inplace=True)
    print("Total tweets to label", len(tweets), "Prior tweets to label", len(pre), "Post tweets to label", len(post))
    
    # Merge pre- and post- tweets by same user to see if user sentiments change
    pre_users = pd.DataFrame(pre['user_id'].unique())
    post_users = pd.DataFrame(post['user_id'].unique())
    merge = pd.merge(pre_users, post_users, how='inner')
    print("Number of users tweeting before and after", len(merge))

    pre_tweets = pre.loc[pre['user_id'].isin(merge.iloc[:,0])]
    post_tweets = post.loc[post['user_id'].isin(merge.iloc[:,0])]
    print("Num tweets before", len(pre_tweets), "Num tweets after", len(post_tweets))
    
    tweets.to_csv('Datasets/Event Tweets/' + outfile + '.csv', sep=',')
    pre_labelled_tweets.to_csv('Datasets/Event Tweets/Prelabelled_' + outfile + '.csv', sep=',')
    
    return((tweets, pre, post, merge))

In [12]:
def count_tweets_restrict_combine(filename1, filename2, start_date, end_date, outfile):
    
    # Unlabeled tweets
    tweets1, pre_labelled_tweets1 = clean_tweets_restrict(filename1, start_date, end_date)
    tweets2, pre_labelled_tweets2 = clean_tweets_restrict(filename2, start_date, end_date)
    print(len(tweets1))
    print(len(tweets2))
    tweets = pd.merge(tweets1, tweets2, how='outer')
    pre_labelled_tweets = pd.merge(pre_labelled_tweets1, pre_labelled_tweets2, how='outer')
    
    # Split to pre- and post
    pre = tweets[tweets['date'] <= start_date]
    post = tweets[tweets['date'] > start_date]
    pre.reset_index(inplace=True)
    post.reset_index(inplace=True)
    print("Total tweets", len(tweets), "Prior tweets", len(pre), "Post tweets", len(post))
    
    # Merge pre- and post- tweets by same user to see if user sentiments change
    pre_users = pd.DataFrame(pre['user_id'].unique())
    post_users = pd.DataFrame(post['user_id'].unique())
    merge = pd.merge(pre_users, post_users, how='inner')
    print("Number of users tweeting before and after", len(merge))

    pre_tweets = pre.loc[pre['user_id'].isin(merge.iloc[:,0])]
    post_tweets = post.loc[post['user_id'].isin(merge.iloc[:,0])]
    print("Num tweets before", len(pre_tweets), "Num tweets after", len(post_tweets))
    
    tweets.to_csv('Datasets/Event Tweets/' + outfile + '.csv', sep=',')
    pre_labelled_tweets.to_csv('Datasets/Event Tweets/Prelabelled_' + outfile + '.csv', sep=',')
    
    return((tweets, pre, post, merge))

In [13]:
# January 2018 bomb cyclone (Jan 2 - Jan 6): https://en.wikipedia.org/wiki/January_2018_North_American_blizzard
blizzard_tweets, blizzard_pre, blizzard_post, blizzard_merge = count_tweets_restrict('blizzard_geo_tweets_v2.csv', '1/2/18', '1/6/18', 'blizzard')

(Timestamp('2017-11-21 00:00:00'), Timestamp('2018-01-19 00:00:00'))
('Two weeks before:', Timestamp('2017-12-19 00:00:00'), 'Two weeks after:', Timestamp('2018-01-20 00:00:00'))
(14957, 1610)
('Total tweets to label', 14957, 'Prior tweets to label', 2614, 'Post tweets to label', 12343)
('Number of users tweeting before and after', 330)
('Num tweets before', 634, 'Num tweets after', 1479)


In [14]:
# California Mendocino Wildfires (July 27 - Sep 18): https://en.wikipedia.org/wiki/Mendocino_Complex_Fire
summerfire_tweets, summerfire_pre, summerfire_post, summerfire_merge = count_tweets_restrict_combine('summerfire_geo_tweets.csv', 
                                                                                             'summerfire_geo_tweets_v2.csv', 
                                                                                             '7/27/18', '9/18/18',
                                                                                                    'summerfire')

(Timestamp('2018-05-31 00:00:00'), Timestamp('2018-09-26 00:00:00'))
('Two weeks before:', Timestamp('2018-07-13 00:00:00'), 'Two weeks after:', Timestamp('2018-10-02 00:00:00'))
(2710, 929)
(Timestamp('2018-06-04 00:00:00'), Timestamp('2018-10-01 00:00:00'))
('Two weeks before:', Timestamp('2018-07-13 00:00:00'), 'Two weeks after:', Timestamp('2018-10-02 00:00:00'))
(2808, 956)
2710
2808
('Total tweets', 3035, 'Prior tweets', 173, 'Post tweets', 2862)
('Number of users tweeting before and after', 36)
('Num tweets before', 49, 'Num tweets after', 95)


In [15]:
# Hurricane Florence (Aug 31 - Sep 19): https://en.wikipedia.org/wiki/Hurricane_Florence
florence_tweets, florence_pre, florence_post, florence_merge = count_tweets_restrict('florence_geo_tweets.csv', '8/31/18', '9/19/18',
                                                                                    'florence')

(Timestamp('2018-07-03 00:00:00'), Timestamp('2018-09-30 00:00:00'))
('Two weeks before:', Timestamp('2018-08-17 00:00:00'), 'Two weeks after:', Timestamp('2018-10-03 00:00:00'))
(6413, 1032)
('Total tweets to label', 6413, 'Prior tweets to label', 778, 'Post tweets to label', 5635)
('Number of users tweeting before and after', 122)
('Num tweets before', 193, 'Num tweets after', 497)


In [16]:
# Hurricane Michael (Oct 7 - Oct 16): https://en.wikipedia.org/wiki/Hurricane_Michael
michael_tweets, michael_pre, michael_post, michael_merge = count_tweets_restrict('michael_geo_tweets.csv', '10/07/18', '10/16/18',
                                                                                'michael')

(Timestamp('2018-09-05 00:00:00'), Timestamp('2018-10-28 00:00:00'))
('Two weeks before:', Timestamp('2018-09-23 00:00:00'), 'Two weeks after:', Timestamp('2018-10-30 00:00:00'))
(13035, 3126)
('Total tweets to label', 13035, 'Prior tweets to label', 1912, 'Post tweets to label', 11123)
('Number of users tweeting before and after', 281)
('Num tweets before', 880, 'Num tweets after', 1351)


In [17]:
# California Camp wildfires (Nov 8 - 25): https://en.wikipedia.org/wiki/Camp_Fire_(2018)
winterfire_tweets, winterfire_pre, winterfire_post, winterfire_merge = count_tweets_restrict_combine(
    'winterfire_geo_tweets.csv', 
    'winterfire_geo_tweets_v2.csv', 
    '11/08/18', '11/25/18', 'winterfire')

(Timestamp('2018-09-10 00:00:00'), Timestamp('2018-12-08 00:00:00'))
('Two weeks before:', Timestamp('2018-10-25 00:00:00'), 'Two weeks after:', Timestamp('2018-12-09 00:00:00'))
(5375, 151)
(Timestamp('2018-09-10 00:00:00'), Timestamp('2018-12-08 00:00:00'))
('Two weeks before:', Timestamp('2018-10-25 00:00:00'), 'Two weeks after:', Timestamp('2018-12-09 00:00:00'))
(6081, 143)
5375
6081
('Total tweets', 6654, 'Prior tweets', 55, 'Post tweets', 6599)
('Number of users tweeting before and after', 14)
('Num tweets before', 19, 'Num tweets after', 43)


# Create train / validation / test split

In [18]:
#randomly select tweets that go to validation set or training set
sub_train = shuffle(sub_train,random_state=123)
sub_train = sub_train.drop_duplicates('tweet_clean')
num_tweets = len(sub_train)

print(num_tweets)
infl_val_pos = sub_train[sub_train['label']==1]
infl_val_neg = sub_train[sub_train['label']==-1]
print(len(infl_val_pos), len(infl_val_neg))

labeled_tweets = test_tweets[test_tweets['label']!=0]
test_tweets_shuffle = shuffle(labeled_tweets,random_state=456)
manual_pos = test_tweets_shuffle[test_tweets_shuffle['label']==1]
manual_neg = test_tweets_shuffle[test_tweets_shuffle['label']==-1]
print(len(manual_pos), len(manual_neg))

train_pct = .9

276273
125790 150483
238 262


In [19]:
training_data = sub_train[:int(num_tweets*train_pct)]

val_tweets = sub_train[int(num_tweets*train_pct):]

# concatenate test tweets
half_test_num = 250

test_tweets_shuffle = test_tweets_shuffle.drop_duplicates('tweet_clean')
print(len(test_tweets_shuffle))
test_tweets_pos = test_tweets_shuffle[test_tweets_shuffle['label']==1]
print(len(test_tweets_pos))
test_tweets_neg = test_tweets_shuffle[test_tweets_shuffle['label']==-1]
print(len(test_tweets_neg))

test_tweets = shuffle(pd.concat([test_tweets_neg[:half_test_num],test_tweets_pos[:half_test_num]]),random_state=0)

print("Num total tweets", num_tweets,
      "\n Num train tweets", len(training_data), 
      "\n Num validation tweets", len(val_tweets), 
      "\n Num test tweets", len(test_tweets))

499
238
261
Num total tweets 276273 
 Num train tweets 248645 
 Num validation tweets 27628 
 Num test tweets 488


In [20]:
# Basic stats on training data from celebrities
train_pos = training_data[training_data['label']==1]
train_neg = training_data[training_data['label']==-1]
print("Num train tweets", len(training_data), "Num positive tweets", len(train_pos), 
      "Num negative tweets", len(train_neg))

# Check number of manual labelled tweets and tweets to label
val_tweets_pos = val_tweets[val_tweets['label']==1]
val_tweets_neg = val_tweets[val_tweets['label']==-1]
print("Num val tweets", len(val_tweets), "Num positive tweets", len(val_tweets_pos), 
      "Num negative tweets", len(val_tweets_neg))

# Check number of manual labelled tweets and tweets to label
test_tweets_pos = test_tweets[test_tweets['label']==1]
test_tweets_neg = test_tweets[test_tweets['label']==-1]
print("Num test tweets", len(test_tweets), "Num positive tweets", len(test_tweets_pos), 
      "Num negative tweets", len(test_tweets_neg))

test_tweets = pd.concat([test_tweets_pos, test_tweets_neg])

Num train tweets 248645 Num positive tweets 113434 Num negative tweets 135211
Num val tweets 27628 Num positive tweets 12356 Num negative tweets 15272
Num test tweets 488 Num positive tweets 238 Num negative tweets 250


In [21]:
# Remove test tweets from training & validation sets
test_train_overlap = training_data.merge(test_tweets, on=['tweet_clean'])
#print(len(test_train_overlap))
training_data = training_data[(~training_data.tweet_clean.isin(test_train_overlap.tweet_clean))]
#print(len(training_data))

test_val_overlap = val_tweets.merge(test_tweets, on=['tweet_clean'])
#print(len(test_val_overlap))
val_tweets = val_tweets[(~val_tweets.tweet_clean.isin(test_val_overlap.tweet_clean))]
#print(len(val_tweets))

In [22]:
# Basic stats on training data from celebrities
train_pos = training_data[training_data['label']==1]
train_neg = training_data[training_data['label']==-1]
print("Num train tweets", len(training_data), "Num positive tweets", len(train_pos), 
      "Num negative tweets", len(train_neg))

# Check number of manual labelled tweets and tweets to label
val_tweets_pos = val_tweets[val_tweets['label']==1]
val_tweets_neg = val_tweets[val_tweets['label']==-1]
print("Num val tweets", len(val_tweets), "Num positive tweets", len(val_tweets_pos), 
      "Num negative tweets", len(val_tweets_neg))

# Check number of manual labelled tweets and tweets to label
test_tweets_pos = test_tweets[test_tweets['label']==1]
test_tweets_neg = test_tweets[test_tweets['label']==-1]
print("Num test tweets", len(test_tweets), "Num positive tweets", len(test_tweets_pos), 
      "Num negative tweets", len(test_tweets_neg))

Num train tweets 248378 Num positive tweets 113298 Num negative tweets 135080
Num val tweets 27607 Num positive tweets 12347 Num negative tweets 15260
Num test tweets 488 Num positive tweets 238 Num negative tweets 250


# Export relevant data

In [24]:
training_data.to_csv('dedup_training_data.csv', sep=',')
val_tweets.to_csv('dedup_val_data.csv', sep=',')
test_tweets.to_csv('dedup_test_data.csv', sep=',')