In [1]:
# code credits: https://towardsdatascience.com/an-extensive-guide-to-collecting-tweets-from-twitter-api-v2-for-academic-research-using-python-3-518fcb71df2a

In [1]:
import requests
import os
import json
import csv
import pandas as pd
import datetime
import dateutil.parser
import unicodedata
import time

# Enter your bearer_token here

In [2]:
# Keep a bearer token that is already exported in the environment; only fall
# back to the empty placeholder when TOKEN is unset. A plain assignment here
# would overwrite a real token supplied outside the notebook with ''.
os.environ.setdefault('TOKEN', '')

In [3]:
def auth():
    """Return the bearer token stored in the ``TOKEN`` environment variable.

    Returns:
        The value of ``TOKEN``, or ``None`` when the variable is not set.
    """
    token = os.environ.get('TOKEN')
    return token

In [4]:
def create_headers(bearer_token):
    """Build the request headers that carry the bearer token.

    Args:
        bearer_token: Token placed after the ``Bearer `` prefix.

    Returns:
        A dict with a single ``Authorization`` entry.
    """
    return {"Authorization": f"Bearer {bearer_token}"}

# Build the search request (the date to collect is set in `main()` below)

In [5]:
def create_url(keyword, start_date, end_date, max_results = 10):
    """Assemble the search endpoint URL and query parameters for a hashtag search.

    Args:
        keyword: Hashtag text without the leading ``#``.
        start_date: Value sent as ``start_time``.
        end_date: Value sent as ``end_time``.
        max_results: Value sent as ``max_results`` (defaults to 10).

    Returns:
        A ``(search_url, query_params)`` tuple. ``query_params['next_token']``
        is only a placeholder (an empty dict).
    """
    endpoint = "https://api.twitter.com/2/tweets/search/recent"

    # English-language tweets carrying the hashtag, retweets excluded.
    hashtag_query = '#' + keyword + ' lang:en -is:retweet'

    # Comma-separated field/expansion selections sent with every request.
    tweet_fields = 'attachments,author_id,context_annotations,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang,public_metrics,possibly_sensitive,referenced_tweets,reply_settings,source,text,withheld'
    expansions = 'attachments.poll_ids,attachments.media_keys,author_id,entities.mentions.username,geo.place_id,in_reply_to_user_id,referenced_tweets.id,referenced_tweets.id.author_id'
    user_fields = 'created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,withheld'
    media_fields = 'duration_ms,height,media_key,preview_image_url,type,url,width,public_metrics,alt_text'
    place_fields = 'contained_within,country,country_code,full_name,geo,id,name,place_type'

    # Keys are inserted in the same order as before so the encoded query
    # string does not change.
    params = {}
    params['query'] = hashtag_query
    params['start_time'] = start_date
    params['end_time'] = end_date
    params['max_results'] = max_results
    params['sort_order'] = 'relevancy'
    params['tweet.fields'] = tweet_fields
    params['expansions'] = expansions
    params['user.fields'] = user_fields
    params['media.fields'] = media_fields
    params['place.fields'] = place_fields
    params['next_token'] = {}

    return (endpoint, params)

In [6]:
# def custom_create_url(tag):
#   url = "https://api.twitter.com/2/tweets/search/recent"
#   params = {
#       'query': '#'+tag+' lang:en -is:retweet',
#       'start_time': '2022-02-11T00:00:00.000Z',
#       'end_time': '2022-02-17T00:00:00.000Z',
#       'max_results':100,
#       'sort_order': 'relevancy',
#       'tweet.fields': 'attachments,author_id,context_annotations,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang,public_metrics,possibly_sensitive,referenced_tweets,reply_settings,source,text,withheld',
#       'expansions': 'attachments.poll_ids,attachments.media_keys,author_id,entities.mentions.username,geo.place_id,in_reply_to_user_id,referenced_tweets.id,referenced_tweets.id.author_id',
#       'user.fields': 'created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,withheld',
#       'media.fields': 'duration_ms,height,media_key,preview_image_url,type,url,width,public_metrics,alt_text',
#       'place.fields': 'contained_within,country,country_code,full_name,geo,id,name,place_type',
#       'poll.fields': 'duration_minutes,end_datetime,id,options,voting_status',
#       'next_token': {}
#   }
#   return url, params

In [7]:
# def bearer_oauth(r):
#     """
#     Method required by bearer token authentication.
#     """

#     r.headers["Authorization"] = f"Bearer {bearer_token}"
#     r.headers["User-Agent"] = "v2TweetLookupPython"
#     return r

In [8]:
def connect_to_endpoint(url, headers, params, next_token = None, timeout = 30):
    """Send one GET request to the search endpoint and return the decoded JSON.

    Args:
        url: Endpoint URL (first element returned by ``create_url``).
        headers: Request headers (see ``create_headers``).
        params: Query parameters (second element returned by ``create_url``).
            Mutated in place: its ``'next_token'`` entry is overwritten.
        next_token: Pagination token for the page to fetch, or ``None`` for
            the first page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the collection loop
            indefinitely.

    Returns:
        The parsed JSON body of the response.

    Raises:
        Exception: With ``(status_code, response_text)`` as arguments when the
            response status is not 200.
    """
    # Replace the placeholder from create_url with the real pagination token.
    params['next_token'] = next_token
    response = requests.request("GET", url, headers = headers, params = params, timeout = timeout)
    print("Endpoint Response Code: " + str(response.status_code))
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

In [9]:
# def connect_to_endpoint(url):
#     response = requests.request("GET", url, auth=bearer_oauth)
#     print(response.status_code)
#     if response.status_code != 200:
#         raise Exception(
#             "Request returned an error: {} {}".format(
#                 response.status_code, response.text
#             )
#         )
#     return response.json()

In [10]:
def append_to_csv(json_response, fileName):
    """Append one CSV row per tweet in ``json_response`` to ``fileName``.

    Row layout: author id, created_at, geo place id, tweet id, lang,
    like_count, quote_count, reply_count, retweet_count, source, text.

    Args:
        json_response: Decoded search response. Tweets are read from its
            ``'data'`` list; a response without that key contributes no rows.
        fileName: Path of the CSV file, opened in append mode (created if it
            does not exist).

    Optional tweet fields (``geo.place_id`` and ``source``) are written as a
    single space when absent, instead of aborting the whole batch.
    """
    # Placeholder written for optional fields a tweet does not carry.
    missing = " "

    # Number of rows written from this response.
    counter = 0

    # The context manager closes the file even if a tweet raises part-way
    # through the batch.
    with open(fileName, "a", newline="", encoding='utf-8') as csvFile:
        csvWriter = csv.writer(csvFile)

        for tweet in json_response.get('data', []):
            # 'geo' may be absent, or present without a 'place_id'.
            geo = (tweet.get('geo') or {}).get('place_id', missing)

            metrics = tweet['public_metrics']

            row = [
                tweet['author_id'],
                dateutil.parser.parse(tweet['created_at']),
                geo,
                tweet['id'],
                tweet['lang'],
                metrics['like_count'],
                metrics['quote_count'],
                metrics['reply_count'],
                metrics['retweet_count'],
                tweet.get('source', missing),
                tweet['text'],
            ]

            csvWriter.writerow(row)
            counter += 1

    # Print the number of tweets for this iteration
    print("# of Tweets added from this response: ", counter)

In [11]:
# def custom_connect_to_endpoint(custom_url, params, next_token=None):
#     params['next_token'] = next_token
#     response = requests.request("GET", custom_url, auth=bearer_oauth, params=params)
#     print(response.status_code)
#     if response.status_code != 200:
#         raise Exception(
#             "Request returned an error: {} {}".format(
#                 response.status_code, response.text
#             )
#         )
#     return response.json()

In [12]:
# def make_csv(json_response, file_path, csvWriter):
#   counter = 0
  
 
#   for tweet in json_response['data']:
      
#       # We will create a variable for each since some of the keys might not exist for some tweets
#       # So we will account for that

#       # 1. Author ID
#       author_id = str(tweet['author_id'])

#       # 2. Time created
#       created_at = dateutil.parser.parse(tweet['created_at'])

#       # example = 
#       # {
#       #           "country": "United States",
#       #           "country_code": "US",
#       #           "full_name": "Chicago, IL",
#       #           "geo": {
#       #               "bbox": [
#       #                   -87.940033,
#       #                   41.644102,
#       #                   -87.523993,
#       #                   42.0230669
#       #               ],
#       #               "properties": {},
#       #               "type": "Feature"
#       #           },
#       #           "id": "1d9a5370a355ab0c",
#       #           "name": "Chicago",
#       #           "place_type": "city"
#       #       }

#       geo = ""
#       country = ""
#       country_code = ""
#       place_full_name = ""
#       place_name = ""
#       place_type = ""
#       # 3. Geolocation
#       if ('geo' in tweet):   
#           geo = tweet['geo']['place_id']
#           for place in json_response['includes']['places']:
#             if place['id'] == geo:
#               country = place['country']
#               country_code = place['country_code']
#               place_full_name = place['full_name']
#               place_name = place['name']
#               place_type = place['place_type']
#       else:
#           geo = ""

#       # 4. Tweet ID
#       tweet_id = str(tweet['id'])

#       # 5. Language
#       lang = tweet['lang']

#       # 6. Tweet metrics
#       retweet_count = tweet['public_metrics']['retweet_count']
#       reply_count = tweet['public_metrics']['reply_count']
#       like_count = tweet['public_metrics']['like_count']
#       quote_count = tweet['public_metrics']['quote_count']

#       # 7. source
#       source = tweet['source']

#       # 8. Tweet text
#       text = tweet['text']

#       # 9 hashtags
#       tags = ''
#       if ('hashtags' in tweet['entities']):  
#         for tag in tweet['entities']['hashtags']:
#           tags += tag['tag'] + str(',')
#         tags = tags[:-1]

#       #sensitive
#       sensitive = tweet['possibly_sensitive']

#       # urls for further analysis
#       urls = ''
#       if 'urls' in tweet['entities']:
#         for url in tweet['entities']['urls']:
#           urls += url['url'] + str(',')
#         urls = urls[:-1]
      
#       #annotations
#       context_text = ''
#       context_probability = 0
#       context_type = ''
#       if 'tweets' in json_response['includes']:
#         for tweets_for_annotation in json_response['includes']['tweets']:
#             if tweets_for_annotation['conversation_id'] == tweet['conversation_id']:
#               if 'entities' in tweets_for_annotation:
#                 if 'annotations' in tweets_for_annotation['entities']:
#                   for annotation in tweets_for_annotation['entities']['annotations']:
#                     context_text = annotation['normalized_text']
#                     context_probability = annotation['probability']
#                     context_type = annotation['type']


      
#       # Assemble all data in a list
#       res = [author_id, created_at, geo, country, country_code, place_full_name, place_name, place_type, tweet_id, lang, like_count, quote_count, reply_count, retweet_count, source, text, tags, sensitive, urls, context_text, context_probability, context_type]
      
#       # Append the result to the CSV file
#       csvWriter.writerow(res)
#       counter += 1




#   # Print the number of tweets for this iteration
#   print("# of Tweets added from this response: ", counter)

# update query list every day

In [13]:
# def main():
#     query_list = ['canada']
#     total_tweets = 0
    
#     csvFile = open(file_path, "a", newline="", encoding='utf-8')
#     csvWriter = csv.writer(csvFile)
#     csvWriter.writerow(['author id', 'created_at', 'geo', 'country', 'country_code', 'place_full_name', 'place_name', 'place_type', 'id','lang', 'like_count', 'quote_count', 'reply_count','retweet_count','source','tweet', 'hashtags', 'sensitive', 'urls', 'context_text', 'context_probability', 'context_type'])
    
#     for tag in query_list:
#         count = 0
#         max_count = 1000
#         flag = True
#         next_token = None
        
#         while flag:
#             if count >= max_count:
#                 break
#             print("-------------------")
#             print("Token: ", next_token)
#             custom_url, params = custom_create_url(tag)
#             custom_json_response = custom_connect_to_endpoint(custom_url, params, next_token)
#             result_count = custom_json_response['meta']['result_count']

#             if 'next_token' in custom_json_response['meta']:
#                 # Save the token to use for next call
#                 next_token = custom_json_response['meta']['next_token']
#                 print("Next Token: ", next_token)
#                 if result_count is not None and result_count > 0 and next_token is not None:
#                     make_csv(custom_json_response, file_path, csvWriter)
#                     count += result_count
#                     total_tweets += result_count
#                     print("Total # of Tweets added: ", total_tweets)
#                     print("-------------------")
#                     time.sleep(5)
#             # If no next token exists
#             else:
#                 print('inside else')
#                 if result_count is not None and result_count > 0:
#                     print("-------------------")
#                     make_csv(custom_json_response, file_path, csvWriter)
#                     count += result_count
#                     total_tweets += result_count
#                     print("Total # of Tweets added: ", total_tweets)
#                     print("-------------------")
#                     time.sleep(5)
#                 #Since this is the final request, turn flag to false to move to the next time period.
#                 flag = False
#                 next_token = None
#             time.sleep(5)
#     csvFile.close()

In [14]:
def main(keyword = "narendramodi", date = '2022-02-16', max_results = 20, max_count = 100, file_name = "data.csv"):
    """Collect tweets for one hashtag over one day, hour by hour, into a CSV file.

    The day is split into 24 hourly windows (the last one ends at 23:59:59).
    For each window, pages are requested until at least ``max_count`` tweets
    have been saved or the response carries no further ``next_token``.

    Args:
        keyword: Hashtag text without the leading ``#``.
        date: Day to collect, as ``YYYY-MM-DD``.
            NOTE(review): the endpoint path is ``search/recent`` - confirm the
            date is inside the range that endpoint serves.
        max_results: Page size passed to ``create_url``.
        max_count: Stop paging a window once this many tweets were saved.
        file_name: CSV file that receives the rows.

    The header row is written only when ``file_name`` is missing or empty, so
    re-running does not scatter extra header lines through the data.
    """
    #Inputs for tweets
    bearer_token = auth()
    headers = create_headers(bearer_token)

    # '00:00:00' ... '23:00:00'; each window ends where the next one starts,
    # except the last, which ends at 23:59:59.
    s_time = ["{:02d}:00:00".format(hour) for hour in range(24)]
    e_time = s_time[1:] + ['23:59:59']

    #Total number of tweets we collected from the loop
    total_tweets = 0

    # Create the file with its header row only if it does not already hold data.
    if not os.path.exists(file_name) or os.path.getsize(file_name) == 0:
        with open(file_name, "a", newline="", encoding='utf-8') as csvFile:
            csv.writer(csvFile).writerow(['author id', 'created_at', 'geo', 'id','lang', 'like_count', 'quote_count', 'reply_count','retweet_count','source','tweet'])

    for window_start, window_end in zip(s_time, e_time):
        start_stamp = date+'T'+window_start+'.000Z'
        end_stamp = date+'T'+window_end+'.000Z'

        count = 0 # Counting tweets per time period
        next_token = None

        while count < max_count:
            print("-------------------")
            print("Token: ", next_token)
            search_url, query_params = create_url(keyword, start_stamp, end_stamp, max_results)
            json_response = connect_to_endpoint(search_url, headers, query_params, next_token)
            meta = json_response['meta']
            result_count = meta['result_count']

            # A missing (or null) token means this was the last page of the window.
            next_token = meta.get('next_token')
            has_more = next_token is not None
            if has_more:
                print("Next Token: ", next_token)

            if result_count is not None and result_count > 0:
                if not has_more:
                    print("-------------------")
                print("Start Date: ", start_stamp)
                append_to_csv(json_response, file_name)
                count += result_count
                total_tweets += result_count
                print("Total # of Tweets added: ", total_tweets)
                print("-------------------")
                time.sleep(1)

            # Pause between requests, including before moving to the next window.
            time.sleep(1)
            if not has_more:
                break
    print("Total number of results: ", total_tweets)

In [15]:
# def main():
#     csvFile = open(file_path, "a", newline="", encoding='utf-8')
#     csvWriter = csv.writer(csvFile)
#     csvWriter.writerow(['author id', 'created_at', 'geo', 'country', 'country_code', 'place_full_name', 'place_name', 'place_type', 'id','lang', 'like_count', 'quote_count', 'reply_count','retweet_count','source','tweet', 'hashtags', 'sensitive', 'urls', 'context_text', 'context_probability', 'context_type'])
  
#     query_list = ['nasa']
#     # url = create_url()
#     # json_response = connect_to_endpoint(url)
#     for tag in query_list: 
#       custom_url, params = custom_create_url(tag)
#       custom_json_response = custom_connect_to_endpoint(custom_url, params)
#       make_csv(custom_json_response, file_path, csvWriter)
#       print(json.dumps(custom_json_response, indent=4, sort_keys=True))
    
#     # When done, close the CSV file
#     csvFile.close()

In [17]:
# Start the collection when this cell (or the exported script) is executed
# directly; nothing runs if the code is imported as a module.
if __name__ == "__main__":
    main()

-------------------
Token:  None
Endpoint Response Code: 200
-------------------
Start Date:  2022-02-16T00:00:00.000Z
# of Tweets added from this response:  2
Total # of Tweets added:  2
-------------------
-------------------
Token:  None
Endpoint Response Code: 200
-------------------
Start Date:  2022-02-16T01:00:00.000Z
# of Tweets added from this response:  4
Total # of Tweets added:  6
-------------------
-------------------
Token:  None
Endpoint Response Code: 200
-------------------
Start Date:  2022-02-16T02:00:00.000Z
# of Tweets added from this response:  5
Total # of Tweets added:  11
-------------------
-------------------
Token:  None
Endpoint Response Code: 200
-------------------
Start Date:  2022-02-16T03:00:00.000Z
# of Tweets added from this response:  18
Total # of Tweets added:  29
-------------------
-------------------
Token:  None
Endpoint Response Code: 200
-------------------
Start Date:  2022-02-16T04:00:00.000Z
# of Tweets added from this response:  20
Tota