In [1]:
# python libraries
import tweepy
import csv
import pandas as pd

# twitter keys (personal ones can't be published) 
CONSUMER_KEY = ''
CONSUMER_SECRET = ''

ACCESS_KEY = ''
ACCESS_SECRET = ''

In [2]:
# accessing twitter API using tweepy
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_KEY, ACCESS_SECRET)

# twitter API has rate limit: only restricted number of calls can be made in 15min window,
# if limit is reached, tweepy is waiting for next window and outputs notification
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

In [3]:
def get_tweets(tweet_IDs, API):
    '''
    This function uses tweepy's "statuses_lookup" 
    (https://docs.tweepy.org/en/v3.10.0/api.html#API.statuses_lookup)
    to return full tweet objects for up to 100 tweets per request.

    adapted from https://stackoverflow.com/questions/44581647/retrieving-a-list-of-tweets-using-tweet-id-in-tweepy
    '''

    full_tweets = []
    tweet_count = len(tweet_IDs)
    
    try:
        # "//" = real floor division
        for i in range(((tweet_count-1) // 100) + 1):
            # catch the last group if it is less than 100 tweets
            end_loc = min((i+1) * 100, tweet_count)
            full_tweets.extend(
                # separating input tweet IDs into chunks of 100
                API.statuses_lookup(id_=tweet_IDs[(i*100):end_loc], trim_user=True)
            )

            if i % 100 == 0:
                print(str(i)+' chunks (~'+str(i*100)+' tweets) analyzed...')

        return full_tweets
    
    except tweepy.TweepError:
        print('something went wrong, quitting...')

In [4]:
# import tweet IDs from labelled data set as list to download text and meta-info
df_tweet_ids = pd.read_csv('data/ENCASEH2020/hatespeech_labels.csv')
print(df_tweet_ids.info())

tweet_ids = df_tweet_ids['tweet_id'].to_list()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99799 entries, 0 to 99798
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   tweet_id  99799 non-null  int64 
 1   label     99799 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.5+ MB
None


In [5]:
results = get_tweets(tweet_ids, api)

0 chunks (~0 tweets) analyzed...
100 chunks (~10000 tweets) analyzed...
200 chunks (~20000 tweets) analyzed...
300 chunks (~30000 tweets) analyzed...
400 chunks (~40000 tweets) analyzed...
500 chunks (~50000 tweets) analyzed...
600 chunks (~60000 tweets) analyzed...
700 chunks (~70000 tweets) analyzed...
800 chunks (~80000 tweets) analyzed...


Rate limit reached. Sleeping for: 197


900 chunks (~90000 tweets) analyzed...


In [6]:
# save all status attributes (json object) into one dataframe
# https://stackoverflow.com/questions/47925828/how-to-create-a-pandas-dataframe-using-tweepy
# FIXME: text of tweets sometimes randomly contains EoL characters, this should actually be ignored in json-file!

temp = [status._json for status in results]
df_full_tweets = pd.json_normalize(temp)
print(df_full_tweets.info())
df_full_tweets.to_csv('data/full_tweets_all.csv', encoding='utf-8')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54280 entries, 0 to 54279
Columns: 186 entries, created_at to retweeted_status.withheld_in_countries
dtypes: bool(4), float64(39), int64(4), object(139)
memory usage: 75.6+ MB
None


In [7]:
# save text and geo-information of tweets in csv-file
csv_file_path = 'data/full_tweets_geo.csv'
header = [
    'tweet_id',
    'coordinates',
    'place_full_name', 'place_country_code', 'place_bb_coord', 'created_at', 'text'
]

with open(csv_file_path, 'w', errors='replace', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    # write the header
    csv_writer.writerow(header)

    for tweet in results:
        if tweet:
            csv_writer.writerow([
                tweet.id, 
                tweet.coordinates,
                tweet.place.full_name if tweet.place else None,
                tweet.place.country_code if tweet.place else None,
                tweet.place.bounding_box.coordinates if tweet.place else None,
                tweet.created_at,
                tweet.text.encode('utf-8')
            ])

In [8]:
# analyse results
# FIXME!

df_geo = pd.read_csv('data/full_tweets_geo.csv')
print('# labelled tweets:',len(tweet_ids))
print(df_geo.head())
print(df_geo.info())

print(df_geo['place_country_code'].value_counts())

# labelled tweets: 99799
             tweet_id coordinates place_full_name place_country_code  \
0  848337741813358592         NaN             NaN                NaN   
1  850344984742174720         NaN             NaN                NaN   
2  848668638869671939         NaN             NaN                NaN   
3  848338236770582529         NaN             NaN                NaN   
4  847542736651767809         NaN             NaN                NaN   

  place_bb_coord           created_at  \
0            NaN  2017-04-02 00:54:13   
1            NaN  2017-04-07 13:50:17   
2            NaN  2017-04-02 22:49:05   
3            NaN  2017-04-02 00:56:11   
4            NaN  2017-03-30 20:15:09   

                                                text  
0  b'fucks sake go away stupid anon \xe2\x80\x94 ...  
1  b'Carlos Correa had gyalchester as his walkup ...  
2  b'Damn dean just put Corbin to sleep. That Mat...  
3  b'Dick Tracy Meets Gruesome - the 2017 re-boot...  
4  b'what idiot call