# Evacuation Tweet Preview

```sh
search_tweets.py --credential-file=creds.yml --query='("evacuate" OR "evacuating" OR "leaving" OR "leave" OR "escape" OR "escaping") -people -residents lang:en -is:retweet -has:links -is:nullcast' --start-time=2017-09-04T00:00 --end-time=2017-09-17T00:00 --results-per-call=500 --expansions="geo.place_id,author_id" --tweet-fields="id,text,public_metrics,created_at,entities,geo,in_reply_to_user_id,lang,referenced_tweets" --user-fields="id,created_at,description,location,public_metrics,verified,username,name" --place-fields="geo,name,full_name" --filename-prefix="evacuate"
```
code: https://github.com/twitterdev/search-tweets-python/tree/v2

In [18]:
from preprocess import load_tweets, aggregate_tweet_info, geolocate, convert_located_tweets_to_dataframe

`evacuate.json`: ~974k tweets

In [2]:
# Load the raw search results and print the size of each top-level collection
# (the recorded output lists tweets, users and places).
evac_data = load_tweets('evacuate.json')
for k, v in evac_data.items():
    print(len(v), k)
# Aggregate the collections into per-tweet records, then geolocate them.
evac_tweets = aggregate_tweet_info(evac_data)
# NOTE(review): the recorded progress bar suggests this step takes ~2 hours — confirm before re-running.
geolocate(evac_tweets) # in-place: mutates evac_tweets, nothing is returned/used
# Flatten the located tweets into a DataFrame and persist it as a gzipped CSV.
df_vac_tweets = convert_located_tweets_to_dataframe(evac_tweets)
df_vac_tweets.to_csv('evacuation_reformatted.csv.gz', index=False)

974002it [00:20, 48542.47it/s]


969796 tweets
676655 users
9052 places


100%|██████████| 969796/969796 [2:12:17<00:00, 122.18it/s]  


In [3]:
# Preview the first three rows of the reformatted tweet frame.
df_vac_tweets.head(3)

Unnamed: 0,tweet_id,user_id,username,user_location,tweet_time,text,tweet_place_type,tweet_place_name,tweet_lat,tweet_lon,loc_lat,loc_lon,loc_country,loc_state,loc_county,loc_city
0,909205251361665024,50491692,SparKLeShiNes,Bay Area,2017-09-16 23:59:58+00:00,Kevin she's not leaving you...,,,,,37.878695,-122.370941,United States,California,,
1,909205250975961088,361525569,__sxmiraa,"Kennington, London",2017-09-16 23:59:58+00:00,Leave out in 2017 is childishhhh pls pls,,"Canterbury, England",51.276756,1.090136,51.27597,1.07561,United Kingdom,England,Kent,Canterbury
2,909205248681631744,788780861881782272,sofiabertox1,"Glasgow, Scotland",2017-09-16 23:59:58+00:00,Am a needy bastard take it or leave it😘 but in...,,,,,55.857809,-4.242511,United Kingdom,Scotland,Lanarkshire,Glasgow


In [1]:
from preprocess import load_tweets_csv, tweet_summary, is_none

In [2]:
# Reload the geolocated tweets from the gzipped CSV written by the export cell above.
# From here on `evac_tweets` is used as a DataFrame (see .groupby/.iterrows below).
evac_tweets = load_tweets_csv('evacuation_reformatted.csv.gz')

In [3]:
# Print geolocation coverage counts for tweets and users (helper from preprocess).
tweet_summary(evac_tweets)

969796 tweets
    51128 with original geo info
    416736 geolocated
    273718 from US
676655 users
    486719 have location in profile
    303672 geolocated
202149 US users
    9950 without state info
    70400 without county info
    71105 without city info


In [4]:
def get_fl_users(tweets):
    """Return one row per username for users geolocated to Florida, USA.

    The frame is collapsed with ``groupby('username').first()``, so the result
    is indexed by username. Note that ``GroupBy.first`` keeps the first
    *non-null* value of each column, so the fields of one user row may
    originate from different tweets of that user.

    Parameters
    ----------
    tweets : pandas.DataFrame
        Must provide the columns ``username``, ``loc_country`` and ``loc_state``.

    Returns
    -------
    pandas.DataFrame
        Per-user rows whose country is 'United States' and state is 'Florida'.
    """
    per_user = tweets.groupby('username').first()
    in_us = per_user.loc_country == 'United States'
    in_florida = per_user.loc_state == 'Florida'
    return per_user[in_us & in_florida]

```
22340 FL users
8529 without county info
8596 without city info
Miami                 2529
Tampa                 1631
Orlando               1598
Jacksonville           586
Tallahassee            446
```

In [9]:
# Collapse tweets to one row per Florida user and report how many lack
# county / city information, then show the 50 most common cities.
fl_users = get_fl_users(evac_tweets)
n_missing_county = sum(is_none(county) for county in fl_users.loc_county)
n_missing_city = sum(is_none(city) for city in fl_users.loc_city)
print(f"""{len(fl_users)} FL users
{n_missing_county} without county info
{n_missing_city} without city info""")
fl_users.loc_city.value_counts().head(50)

25188 FL users
9234 without county info
9270 without city info


Miami                 3576
Tampa                 2097
Orlando               2096
Jacksonville           745
Tallahassee            466
Fort Lauderdale        384
St Petersburg          322
Gainesville            241
Sarasota               220
West Palm Beach        196
Fort Myers             191
Naples                 161
Pensacola              159
Boca Raton             153
Jacksonville Beach     149
Clearwater             137
Miami Beach            136
Hollywood              125
Coral Gables           125
Lakeland               123
Edgewood               121
Cape Coral             120
Daytona Beach          117
Jupiter                113
Pembroke Pines         102
Bradenton               99
Port St Lucie           95
Melbourne               93
Ocala                   92
Oviedo                  84
Kissimmee               82
Venus                   82
St Augustine            78
Delray Beach            71
Homestead               68
Boynton Beach           67
Largo                   66
S

In [17]:
# For each of the five most common cities, print the county recorded on the
# first matching user row (one county per line).
print('\n'.join(
    f"{fl_users[fl_users.loc_city == city_name].loc_county.iloc[0]}"
    for city_name in ('Miami', 'Tampa', 'Orlando', 'Jacksonville', 'Tallahassee')
))

Miami-Dade County
Hillsborough County
Orange County
Duval County
Leon County


In [5]:
from tqdm import tqdm
import re
from preprocess import load_tweets, aggregate_tweet_info

def to_dict(list_of_raw_tweets):
    """Index raw tweet records by their ``'id'`` field.

    Parameters
    ----------
    list_of_raw_tweets : iterable of mapping
        Each record must be subscriptable with the key ``'id'``.

    Returns
    -------
    dict
        ``{tweet['id']: tweet}``; if an id occurs more than once, the last
        record with that id is kept (at the position of its first occurrence).
    """
    tweets_by_id = {}
    for record in list_of_raw_tweets:
        tweets_by_id[record['id']] = record
    return tweets_by_id

In [6]:
# Re-load the raw JSON and index the aggregated tweets by tweet id, so the
# author's profile fields can be looked up per tweet (raw_tweets[tweet_id]['user']).
raw_tweets = to_dict(aggregate_tweet_info(load_tweets('evacuate.json')))

974002it [00:18, 52800.00it/s]


In [7]:
# Keyword -> county for the five most common Florida cities. The final
# 'Florida' entry is a state-level fallback: it assigns no county and no city.
# Order matters: the first matching keyword wins, so city names must precede
# the state name.
fl_cities = {
    'Miami': 'Miami-Dade County',
    'Tampa': 'Hillsborough County',
    'Orlando': 'Orange County',
    'Jacksonville': 'Duval County',
    'Tallahassee': 'Leon County',
    'Florida': None
}

# Compile the patterns once instead of once per (tweet, keyword) pair.
# For every keyword:
#   * the first pattern runs on the lower-cased profile text and matches the
#     keyword at the start OR at the end of a word (e.g. "southflorida");
#   * the second pattern is a case-sensitive substring match on the raw text
#     (e.g. "xFloridaX").
# re.escape keeps a keyword with regex metacharacters from changing the pattern.
fl_city_patterns = {
    city: (
        re.compile(fr'(\b{re.escape(city.lower())})|({re.escape(city.lower())}\b)'),
        re.compile(re.escape(city)),
    )
    for city in fl_cities
}

# Fill in Florida locations for tweets that have no city yet, based on the
# author's profile location and description.
# NOTE(review): rows already geolocated to another state/country but lacking a
# city are also overwritten when the profile mentions a keyword — confirm that
# this is intended.
for idx, tweet in tqdm(evac_tweets.iterrows(), total=len(evac_tweets)):
    if not is_none(tweet.loc_city):
        continue
    usr = raw_tweets[tweet.tweet_id]['user']
    # Treat missing or null profile fields as empty text instead of raising.
    uloc = (usr.get('location') or '') + ' ' + (usr.get('description') or '')
    uloc_lower = uloc.lower()
    for city, (lower_pattern, exact_pattern) in fl_city_patterns.items():
        if lower_pattern.search(uloc_lower) or exact_pattern.search(uloc):
            evac_tweets.at[idx, 'loc_country'] = 'United States'
            evac_tweets.at[idx, 'loc_state'] = 'Florida'
            evac_tweets.at[idx, 'loc_county'] = fl_cities[city]
            evac_tweets.at[idx, 'loc_city'] = city if city != 'Florida' else None
            break

969796it [02:03, 7861.87it/s]


In [8]:
# Keep only tweets whose (possibly keyword-enriched) state is Florida.
fl_tweets = evac_tweets[evac_tweets.loc_state == 'Florida']
len(fl_tweets) # NOTE(review): an earlier inline note said 33982, but the recorded output is 38792 — presumably from a run before the enrichment changed

38792

### Extract info of positive tweets

In [10]:
import pandas as pd

In [11]:
# Load the manually labeled tweets. Ids are read as strings rather than numbers
# (presumably so they match the tweet_id values used as the join key below — confirm).
labeled_tweets = pd.read_csv('labeled_5000.csv', dtype={'tweet_id': str, 'user_id': str})

In [12]:
# Attach the manual label to each Florida tweet (left join on tweet_id;
# tweets without an annotation get a missing label).
# Any existing 'label' column is dropped first so that re-running this cell
# does not fail on overlapping column names.
fl_tweets = (
    fl_tweets
    .drop(columns='label', errors='ignore')
    .join(labeled_tweets.set_index('tweet_id')[['label']], on='tweet_id')
)

In [13]:
# Label distribution among Florida tweets; rows without a label are not
# counted (value_counts drops missing values by default).
fl_tweets.label.value_counts()

neutral     2005
positive    1727
negative    1267
Name: label, dtype: int64

In [None]:
# Manual inspection: list users placed in Miami whose profile location does
# not itself mention "miami", together with their profile description.
cnt = 0
for idx, tweet in fl_users.iterrows():
    # Only users geolocated to Miami are of interest.
    if tweet.loc_city != 'Miami':
        continue
    # Skip users whose profile location already says Miami.
    if not is_none(tweet.user_location) and 'miami' in tweet.user_location.lower():
        continue
    cnt += 1
    print('%03d' % cnt, tweet.tweet_id, raw_tweets[tweet.tweet_id]['user']['description'])

In [20]:
from classification import tokenize, train, tweets_before_landfall
import numpy as np

In [21]:
# Row-aligned mask over fl_tweets: True where the joined label is present
# (as judged by is_none), False for unlabeled tweets.
labeled_mask = np.array([not is_none(label) for label in fl_tweets.label])

In [22]:
# Train on the labeled Florida tweets selected by tweets_before_landfall.
# `vec` and `clf` are reused below to predict labels for the unlabeled tweets.
# NOTE(review): the meaning of the positional `False` argument is defined in
# classification.train and is not visible in this notebook — confirm.
X,y,vec,metas,clf,preds,probas = train(tweets_before_landfall(fl_tweets[labeled_mask]), False)

4139it [00:01, 2895.73it/s]


Counter({'positive': 1579, 'neutral': 1434, 'negative': 1126})
Counter({'positive': 1682, 'neutral': 1244, 'negative': 1213})
              precision    recall  f1-score   support

    negative       0.80      0.87      0.83      1126
     neutral       0.79      0.68      0.73      1434
    positive       0.81      0.86      0.83      1579

    accuracy                           0.80      4139
   macro avg       0.80      0.80      0.80      4139
weighted avg       0.80      0.80      0.80      4139

auc=0.927 (positive vs others)
auc=0.950 (negative vs others)
auc=0.913 (weighted ovr)


In [23]:
# predictions on unlabeled data
unlabeled_fl_tweets = fl_tweets[~labeled_mask]
# Index labels of the unlabeled rows, in the same order as the predictions.
idxs = unlabeled_fl_tweets.index.tolist()
# iterrows yields (index, row) pairs, so tokenize receives the whole row.
u_preds = clf.predict(vec.transform(
    tokenize(row)
    for row_idx, row in tqdm(unlabeled_fl_tweets.iterrows(), total=len(unlabeled_fl_tweets))
))
# Index labels (in fl_tweets) of the unlabeled tweets predicted as positive.
pred_pos_idxs = np.array(idxs)[u_preds == 'positive']

33793it [01:15, 447.28it/s]


In [24]:
# Index labels (in fl_tweets) of the tweets manually annotated as 'positive'.
pos_idxs = fl_tweets.index[fl_tweets.label == 'positive'].to_numpy()

In [40]:
# Metadata of all positive tweets: the annotated ones first, followed by the
# predicted ones, with shorter column names for export.
pos_tweet_metas = (
    fl_tweets
    .loc[np.append(pos_idxs, pred_pos_idxs), ['tweet_time', 'loc_city', 'loc_county', 'username']]
    .rename(columns={'tweet_time': 'time', 'loc_city': 'city', 'loc_county': 'county'})
)
# The first len(pos_idxs) rows are the manually annotated tweets; the rest
# were labeled by the classifier.
pos_tweet_metas['annotated'] = np.arange(len(pos_tweet_metas)) < len(pos_idxs)
pos_tweet_metas.annotated.value_counts()

False    2318
True     1727
Name: annotated, dtype: int64

In [42]:
#fl_tweets[labeled_mask].to_csv('evacuation_FL_labeled_5000.csv', index=False)
# Export the positive-tweet metadata ordered by the 'time' column, without the index.
pos_tweet_metas.sort_values(by=['time']).to_csv('irma_positive_tweet_metadata.csv', index=False)

Positive tweets: 1727 labeled + 2318 predicted = 4045 in total, from 3518 users.

Of these, 1533 have an unknown city and 1529 have an unknown county.

In [43]:
# Number of positive-tweet rows without a city (as judged by is_none).
sum(is_none(city) for city in pos_tweet_metas.city)

1533