In [1]:
import re
import tweepy
from textblob import TextBlob

In [2]:
def authenticate():
  # keys and tokens from the Twitter Dev Console
  consumer_key = '#'
  consumer_secret = '#'
  access_token = '#'
  access_token_secret = '#'

  BEARER_TOKEN = '#'
  # attempt authentication
  try:
    print("Authenticating...")
    # create Client object
    client = tweepy.Client(bearer_token=BEARER_TOKEN)
    # set access token and secret
    # create tweepy API object to fetch tweets
    print("API value:....   ", client)
    return client
  except Exception as e:
    print("Error: Authentication Failed", e)

In [3]:
API = authenticate()

Authenticating...
API value:....    <tweepy.client.Client object at 0x7f9449f1ffa0>


In [126]:
def get_tweets(query, start_time, end_time, next_token, geocode = '53.5500,2.4333,1mi', max_results=500,):
  '''
  Main function to fetch tweets.
  '''
  # empty list to store parsed tweets
  tweets = []

  expansions = ['author_id,in_reply_to_user_id,geo.place_id']
  tweet_fields = ['id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source']
  user_fields =  ['id,name,username,created_at,description,public_metrics,verified']
  place_fields = ['full_name,id,country,country_code,geo,name,place_type']
  try:
    # call twitter api to fetch tweets
    fetched_tweets = API.search_all_tweets(query,
      end_time=end_time,
      start_time=start_time,
      expansions=expansions,
      tweet_fields=tweet_fields,
      place_fields=place_fields,
      user_fields=user_fields,
      max_results=max_results,
      next_token=next_token
    )
    print(len(fetched_tweets.data))
    print(fetched_tweets.meta)
  
    return fetched_tweets

  except Exception as e:
    # print error (if any)
    print("Error getting tweets", e)


In [127]:
query = 'International Students OR Schooling in the UK  -is:retweet place_country:GB'
max_result = 500
end_time='2022-01-30T00:00:01Z'
start_time='2022-01-01T00:00:01Z',

result = get_tweets(query=query, start_time=start_time, end_time=end_time, next_token=None)


476
{'newest_id': '1487575860672491522', 'oldest_id': '1487455796677992451', 'result_count': 476, 'next_token': 'b26v89c19zqg8o3fpe4829txzerpzsg573dqdzz6fjmv1'}


In [130]:
import pandas as pd
import csv
import time

def append_to_csv(result_set, fileName):

    #A counter variable
    counter = 0

    #Open OR create the target CSV file
    csvFile = open(fileName, "a", newline="", encoding='utf-8')
    csvWriter = csv.writer(csvFile)

    #Loop through each tweet
    for tweet in result_set.data:
        
        # We will create a variable for each since some of the keys might not exist for some tweets
        # So we will account for that

        # 1. Author ID
        author_id = tweet['author_id']

        # 2. Time created
        created_at = tweet['created_at']

        # 3. Geolocation
        if ('geo' in tweet):   
            geo = tweet['geo']['place_id']
        else:
            geo = " "

        # 4. Tweet ID
        tweet_id = tweet['id']

        # 5. Language
        lang = tweet['lang']

        # 6. Tweet metrics
        retweet_count = tweet['public_metrics']['retweet_count']
        reply_count = tweet['public_metrics']['reply_count']
        like_count = tweet['public_metrics']['like_count']
        quote_count = tweet['public_metrics']['quote_count']

        # 7. source
        source = tweet['source']

        # 8. Tweet text
        text = tweet['text']
        
        # Assemble all data in a list
        res = [author_id, created_at, geo, tweet_id, lang, like_count, quote_count, reply_count, retweet_count, source, text]        
        
        # Append the result to the CSV file
        csvWriter.writerow(res)
        counter += 1

    # When done, close the CSV file
    csvFile.close()

    # Print the number of tweets for this iteration
    print("# of Tweets added from this response: ", counter) 

In [131]:
start_list = ['2022-01-01T00:00:00.000Z','2022-02-01T00:00:00.000Z','2022-03-01T00:00:00.000Z','2022-04-01T00:00:00.000Z','2022-05-01T00:00:00.000Z','2022-06-01T00:00:00.000Z','2022-07-01T00:00:00.000Z']
end_list = ['2022-01-31T00:00:00.000Z','2021-02-28T00:00:00.000Z','2021-03-31T00:00:00.000Z','2022-04-30T00:00:00.000Z','2021-05-31T00:00:00.000Z','2021-06-30T00:00:00.000Z','2021-07-31T00:00:00.000Z']

#Total number of tweets we collected from the loop
total_tweets = 0

# Create file
csvFile = open("data.csv", "a", newline="", encoding='utf-8')
csvWriter = csv.writer(csvFile)

csvWriter.writerow(['author id', 'created_at', 'geo', 'id','lang', 'like_count', 'quote_count', 'reply_count','retweet_count','source','tweet'])
csvFile.close()


for i in range(0,len(start_list)):
  # Inputs
  count = 0 # Counting tweets per time period
  max_count = 100 # Max tweets per time period
  flag = True
  next_token = None
  # Check if flag is true
  while flag:
      # Check if max_count reached
      if count >= max_count:
        break
      print("-------------------")
      print("Token: ", next_token)
      result = get_tweets(query=query, start_time=start_time, end_time=end_time, max_results=max_count, next_token=next_token)
      result_count = result.meta['result_count']
      if result.meta['next_token']:
        # Save the token to use for next call
        next_token = result.meta['next_token']
        print("Next Token: ", next_token)
        if result_count is not None and result_count > 0 and next_token is not None:
            print("Start Date: ", start_list[i])
            append_to_csv(result, "data.csv")
            count += result_count
            total_tweets += result_count
            print("Total # of Tweets added: ", total_tweets)
            print("-------------------")
            time.sleep(5)
      # If no next token exists
      else:
        if result_count is not None and result_count > 0:
          print("-------------------")
          print("Start Date: ", start_list[i])
          append_to_csv(result, "data.csv")
          count += result_count
          total_tweets += result_count
          print("Total # of Tweets added: ", total_tweets)
          print("-------------------")
          time.sleep(5)
        
        # Since this is the final request, turn flag to false to move to the next time period.
        flag = False
        next_token = None
      time.sleep(5)
print("Total number of results: ", total_tweets)


-------------------
Token:  None
94
{'newest_id': '1487575860672491522', 'oldest_id': '1487548323519152128', 'result_count': 94, 'next_token': 'b26v89c19zqg8o3fpe482bydgud4dzd1iak5qs0qv8iv1'}
Next Token:  b26v89c19zqg8o3fpe482bydgud4dzd1iak5qs0qv8iv1
Start Date:  2022-01-01T00:00:00.000Z
# of Tweets added from this response:  94
Total # of Tweets added:  94
-------------------
-------------------
Token:  b26v89c19zqg8o3fpe482bydgud4dzd1iak5qs0qv8iv1
97
{'newest_id': '1487548004337086466', 'oldest_id': '1487518414650281986', 'result_count': 97, 'next_token': 'b26v89c19zqg8o3fpe482bxh3kgafkgf3j3tdq8a36wsd'}
Next Token:  b26v89c19zqg8o3fpe482bxh3kgafkgf3j3tdq8a36wsd
Start Date:  2022-01-01T00:00:00.000Z
# of Tweets added from this response:  97
Total # of Tweets added:  191
-------------------
-------------------
Token:  None
94
{'newest_id': '1487575860672491522', 'oldest_id': '1487548323519152128', 'result_count': 94, 'next_token': 'b26v89c19zqg8o3fpe482bydgud4dzd1iak5qs0qv8iv1'}
Next T