In [None]:
# --- Standard library ---
# For saving the response data in CSV format
import csv
# For parsing the dates received from twitter in readable formats
import datetime
# For reading the API token interactively without echoing it
import getpass
# For dealing with json responses we receive from the API
import json
# For saving access tokens and for file management when creating and adding to the dataset
import os
# To add wait time between requests
import time
import unicodedata

# --- Third-party ---
# For parsing the dates received from twitter in readable formats
import dateutil.parser
# For displaying the data after
import pandas as pd
# For sending GET requests from the API
import requests

In [None]:
# Make the Twitter API bearer token available as the TOKEN environment variable.
# The token is a secret (like a password) and must not be written into this notebook.
# A TOKEN that is already exported in the environment is kept as it is; otherwise the
# token is asked for interactively (the input is not echoed and not stored in the file).
# For non-interactive runs, export TOKEN before starting the kernel.
if not os.environ.get('TOKEN'):
    os.environ['TOKEN'] = getpass.getpass('Twitter API bearer token: ')

In [None]:
# authentication helper
def auth():
    """Return the bearer token stored in the TOKEN environment variable.

    Returns:
        The value of TOKEN, or None when the variable is not set.
    """
    token = os.environ.get('TOKEN')
    return token

In [None]:
# request header builder
def create_headers(bearer_token):
    """Build the HTTP headers carrying the bearer token.

    Args:
        bearer_token: token placed after the "Bearer " prefix.

    Returns:
        A dict with a single "Authorization" entry.
    """
    return {"Authorization": f"Bearer {bearer_token}"}

In [None]:
# endpoint and query-parameter builder
def create_url(keyword, max_results = 10):
    """Assemble the endpoint URL and query parameters for a search request.

    Args:
        keyword: search query string, sent as the 'query' parameter.
        max_results: value sent as the 'max_results' parameter (default 10).

    Returns:
        A tuple (search_url, query_params). query_params contains a
        'next_token' entry initialised to an empty dict as a placeholder.
    """
    # Replace with another endpoint to collect data from a different source.
    search_url = "https://api.twitter.com/2/tweets/search/recent"

    # Requested expansions and fields; adapt these to the endpoint in use.
    expansions = ['author_id', 'in_reply_to_user_id', 'geo.place_id']
    tweet_fields = ['id', 'text', 'author_id', 'in_reply_to_user_id', 'geo',
                    'conversation_id', 'created_at', 'lang', 'public_metrics',
                    'referenced_tweets', 'reply_settings', 'source']
    user_fields = ['id', 'name', 'username', 'created_at', 'description',
                   'public_metrics', 'verified']
    place_fields = ['full_name', 'id', 'country', 'country_code', 'geo',
                    'name', 'place_type']

    # 'start_time' / 'end_time' are intentionally not sent; per the original
    # notes they then default to 7 days ago and 30 seconds ago respectively.
    query_params = dict([
        ('query', keyword),
        ('max_results', max_results),
        ('expansions', ','.join(expansions)),
        ('tweet.fields', ','.join(tweet_fields)),
        ('user.fields', ','.join(user_fields)),
        ('place.fields', ','.join(place_fields)),
        ('next_token', {}),
    ])
    return (search_url, query_params)

In [None]:
# build method to create connection to targeted endpoint (API)
def connect_to_endpoint(url, headers, params, next_token = None, timeout = 30):
    """Send one GET request to the API and return the decoded JSON body.

    Args:
        url: endpoint URL to call.
        headers: HTTP headers for the request (e.g. from create_headers).
        params: query parameters (e.g. the dict returned by create_url).
            The dict is not modified.
        next_token: pagination token for the page to fetch; None requests
            the first page.
        timeout: seconds to wait for the server before giving up, so a
            stalled connection cannot block the notebook indefinitely.

    Returns:
        The JSON-decoded response body.

    Raises:
        Exception: with (status_code, response text) as arguments when the
            response status is not 200.
    """
    # Work on a shallow copy so the caller's params dict is left untouched;
    # a None token is not added to the query string by requests.
    query = dict(params)
    query['next_token'] = next_token
    response = requests.request("GET", url, headers = headers, params = query, timeout = timeout)
    print("Endpoint Response Code: " + str(response.status_code))
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

In [None]:
# Search configuration for this run.
# The topic (query text plus language filter) is edited by hand between runs of this
# notebook to build pools for different topics, e.g. "covid lang:en".
# For a quick trial run, max_results = 10 and max_tweets = 30 were used instead.
topic = "research lang:en"
max_results, max_tweets = 100, 20000

In [None]:
# Request inputs: the search keyword and the authorization header built from the stored token
keyword = topic
bearer_token = auth()
headers = create_headers(bearer_token)

In [None]:
# Prepare the web service call: url is the (endpoint, query parameters) tuple from create_url
url = create_url(keyword, max_results = max_results)

In [None]:
# Sanity check: display the (endpoint, query parameters) tuple that will be sent
url

('https://api.twitter.com/2/tweets/search/recent',
 {'expansions': 'author_id,in_reply_to_user_id,geo.place_id',
  'max_results': 100,
  'next_token': {},
  'place.fields': 'full_name,id,country,country_code,geo,name,place_type',
  'query': 'research lang:en',
  'tweet.fields': 'id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source',
  'user.fields': 'id,name,username,created_at,description,public_metrics,verified'})

In [None]:
# Collect tweets for the configured search term.
# One call returns a single page of results, so the predefined helpers are called in a loop,
# passing the pagination token of each response on to the next request.
# A pause between calls keeps the request rate low to avoid getting blocked by the Twitter API.
# The loop ends when Twitter returns no further page for the current search settings or when
# at least max_tweets tweets were collected (the cap keeps the pools of the different topics
# balanced to a certain degree).
# The pages are gathered in a list and combined once at the end into the dataframe df.

wait_seconds = 30  # pause between two requests and before a retry

# Transient network failures that justify repeating the same request. Low-level protocol
# errors surface through requests as ConnectionError / ChunkedEncodingError; the built-in
# ConnectionError also covers ConnectionResetError.
retryable_errors = (
    requests.exceptions.ConnectionError,
    requests.exceptions.ChunkedEncodingError,
    requests.exceptions.Timeout,
    ConnectionError,
)

pages = []         # one dataframe per non-empty page
n_collected = 0    # number of tweets gathered so far
next_token = None  # no pagination token for the first request

try:
    while True:
        # perform web service call
        try:
            json_response = connect_to_endpoint(url[0], headers, url[1], next_token)
        except retryable_errors as ce:
            # wait and re-try the same page - hope it was only a one time error
            print("Caught Connection Error, ConnectionResetError, or ProtocolError")
            print(ce)
            time.sleep(wait_seconds)
            continue

        # a response without results carries no 'data' entry
        df_tmp = pd.DataFrame(json_response.get('data', []))
        if not df_tmp.empty:
            pages.append(df_tmp)
            n_collected += len(df_tmp)
        print("Log: df has now a length of " + str(n_collected))

        # the last page has no 'next_token' in its metadata
        next_token = json_response.get('meta', {}).get('next_token')

        finished = False
        if not next_token:
            print("Log: No next token available, all tweets regarding the topic " + topic + " received for the last 7 days.")
            finished = True
        if n_collected >= max_tweets:
            print("Log: Maximal number of " + str(max_tweets) + " tweets we want to collect reached.")
            finished = True
        if finished:
            break

        # only wait if the loop continues
        time.sleep(wait_seconds)
finally:
    # Build df even if the loop was interrupted or failed, so pages fetched so far are kept.
    # As before, every page keeps its own row index (index values repeat across pages).
    df = pd.concat(pages) if pages else pd.DataFrame()


Endpoint Response Code: 200
Log: First run, set df with df_tmp
Log: df has now a length of 100
Endpoint Response Code: 200
Log: df_tmp appended to df
Log: df has now a length of 200
Endpoint Response Code: 200
Log: df_tmp appended to df
Log: df has now a length of 300
Endpoint Response Code: 200
Log: df_tmp appended to df
Log: df has now a length of 400
Endpoint Response Code: 200
Log: df_tmp appended to df
Log: df has now a length of 500
Endpoint Response Code: 200
Log: df_tmp appended to df
Log: df has now a length of 600
Endpoint Response Code: 200
Log: df_tmp appended to df
Log: df has now a length of 700
Endpoint Response Code: 200
Log: df_tmp appended to df
Log: df has now a length of 800
Endpoint Response Code: 200
Log: df_tmp appended to df
Log: df has now a length of 900
Endpoint Response Code: 200
Log: df_tmp appended to df
Log: df has now a length of 1000
Endpoint Response Code: 200
Log: df_tmp appended to df
Log: df has now a length of 1100
Endpoint Response Code: 200
Log: 

In [None]:
# Inspect the collected tweets (the notebook shows a truncated view of the data frame)
df

Unnamed: 0,lang,text,author_id,referenced_tweets,conversation_id,created_at,id,source,reply_settings,public_metrics,in_reply_to_user_id,geo,withheld
0,en,RT @Sig_thesisFF: The Siggraph Thesis Fast For...,394573933,"[{'type': 'retweeted', 'id': '1478446669670846...",1478448529433579524,2022-01-04T19:29:32.000Z,1478448529433579524,Twitter for iPhone,everyone,"{'retweet_count': 1, 'reply_count': 0, 'like_c...",,,
1,en,RT @JRNYcrypto: Its crazy to think NFTs are be...,1113106191687802884,"[{'type': 'retweeted', 'id': '1478433112329572...",1478448524308144132,2022-01-04T19:29:31.000Z,1478448524308144132,Twitter for iPhone,everyone,"{'retweet_count': 135, 'reply_count': 0, 'like...",,,
2,en,"@Al_Red_Mac @walrozt First, Alastair you were ...",53937582,"[{'type': 'replied_to', 'id': '147840120440958...",1478335219337998336,2022-01-04T19:29:30.000Z,1478448521007218694,Twitter for iPhone,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",460555579,,
3,en,RT @bodHOST: Discover the research by Microsof...,47515772,"[{'type': 'retweeted', 'id': '7382137218397839...",1478448519639977989,2022-01-04T19:29:29.000Z,1478448519639977989,Twitter Web App,everyone,"{'retweet_count': 1, 'reply_count': 0, 'like_c...",,,
4,en,A fuller picture of who really drove the riot ...,33888038,,1478448517547012102,2022-01-04T19:29:29.000Z,1478448517547012102,Twitter Web App,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,en,RT @arc_nt: The ARC North Thames Early Career ...,2176215350,"[{'type': 'retweeted', 'id': '1478380790039912...",1478410472957296643,2022-01-04T16:58:18.000Z,1478410472957296643,Twitter Web App,everyone,"{'retweet_count': 6, 'reply_count': 0, 'like_c...",,,
96,en,Latest research has shown that curly hair ladi...,1040990048756396034,,1478410470621159429,2022-01-04T16:58:18.000Z,1478410470621159429,Twitter for Android,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",,,
97,en,Research into how deep the layers of damage ca...,1380427214752190464,,1478410466036785159,2022-01-04T16:58:17.000Z,1478410466036785159,Twitter Web App,everyone,"{'retweet_count': 1, 'reply_count': 1, 'like_c...",,,
98,en,RT @Sara_Edw: We’re hiring! Looking for ~10 su...,1107852445,"[{'type': 'retweeted', 'id': '1478409234761097...",1478410464396726274,2022-01-04T16:58:16.000Z,1478410464396726274,Twitter for iPhone,everyone,"{'retweet_count': 6, 'reply_count': 0, 'like_c...",,,


In [None]:
# Derive the name of the CSV file that will store the gathered data for later usage.
# NOTE(review): the name embeds the topic string as is, including its space and ':' -
# confirm that the target file system accepts these characters.
text_file_name = f"data_{topic!s}.csv"
text_file_name

'data_research lang:en.csv'

In [None]:
# Write the collected tweets to a CSV file on the local file system, where it can be
# processed further or copied elsewhere. With the pandas defaults the row index is
# written as the first (unnamed) column.
df.to_csv(text_file_name)

In [None]:
# Final check: number of tweets (rows in df) collected during this run
len(df)

20082