# In [None]:
import json
import os
import time
import unicodedata
from datetime import datetime, timedelta

import boto3
import pandas as pd
import requests
from dateutil.parser import *

# In [None]:
# Twitter API bearer token. Taken from the TWITTER_BEARER_TOKEN environment
# variable when it is set, so the secret does not have to live in source
# control; otherwise the in-file placeholder is used and must be replaced.
b_token = os.environ.get('TWITTER_BEARER_TOKEN', '<Your_Twitter_Bearer_Token>')
# S3 client, created once when the module is imported.
s3 = boto3.client('s3')
# Name of the S3 bucket that receives the collected Tweets. Taken from the
# TWITTER_S3_BUCKET environment variable when set, else the placeholder.
bucket = os.environ.get('TWITTER_S3_BUCKET', '<Name_Of__Your_S3_Bucket>')

def lambda_handler(event, context): # Given for AWS Lambda
    """Collect recent Tweets for a fixed query and store them in S3.

    Pages through the Twitter API v2 recent-search endpoint for the 24-hour
    window that ends three hours before the current UTC time. Every response
    page that contains Tweets is kept as-is, and once paging stops (last page
    reached or Tweet cap hit) all pages are uploaded as one JSON array to the
    module-level ``bucket`` using the module-level ``s3`` client.

    Args:
        event: Lambda invocation payload (not used).
        context: Lambda runtime context (not used).

    Returns:
        dict: the number of collected Tweets under the key
        ``'Total number of results: '`` and the S3 object key under
        ``'File name'``.

    Raises:
        Exception: when the endpoint answers with a non-200 status code; the
            exception args are ``(status_code, response_text)``.
        requests.exceptions.RequestException: on connection errors or when a
            request exceeds the timeout.
    """
    request_timeout = 30  # seconds; without it a stalled connection blocks forever

    # Build the headers that authenticate each request with the bearer token
    def create_headers(bearer_token):
        return {"Authorization": "Bearer {}".format(bearer_token)}

    # Build the endpoint URL and the query parameters shared by all pages
    def create_url(keyword, start_date, end_date, max_results = 5):
        search_url = 'https://api.twitter.com/2/tweets/search/recent'
        # the query params can be adapted to collect the data needed
        query_params = {'query': keyword,
                        'start_time': start_date,
                        'end_time': end_date,
                        'max_results': max_results,
                        'expansions': 'author_id,in_reply_to_user_id,geo.place_id',
                        'tweet.fields': 'id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source',
                        'user.fields': 'id,name,username,created_at,description,public_metrics,verified',
                        'place.fields': 'full_name,id,country,country_code,geo,name,place_type'}
        return (search_url, query_params)

    # Request one result page; next_token selects the page (None = first page)
    def connect_to_endpoint(url, headers, params, next_token = None):
        if next_token is not None:
            # Copy instead of mutating so the shared params stay page-neutral
            params = {**params, 'next_token': next_token}
        response = requests.request('GET', url, headers = headers,
                                    params = params, timeout = request_timeout)
        print('Endpoint Response Code: ' + str(response.status_code))
        if response.status_code != 200:
            raise Exception(response.status_code, response.text)
        return response.json()

    # Input
    headers = create_headers(b_token)
    keyword = 'lang:en -is:retweet -is:reply -is:quote ("sanctions")' # only english tweets, no retweets,
    # no replies, no quotes and the Tweets have to contain the keyword 'sanctions'
    # can be adapted accordingly

    # The timestamps carry a "Z" (UTC) suffix, so they must be derived from
    # UTC and not from the local clock. timespec='milliseconds' always emits
    # exactly three fractional digits, even when the microseconds are zero.
    now = datetime.utcnow()
    window_end = now - timedelta(hours=3)
    window_start = window_end - timedelta(days=1)
    start_time = window_start.isoformat(timespec='milliseconds') + "Z"
    end_time = window_end.isoformat(timespec='milliseconds') + "Z"

    max_results = 100  # has to be between 10 and 100 -> limitations for number of Tweets per request by Twitter
    max_count = 45000  # Maximum of Tweets to be collected in total

    # UTC time included in file name, to identify the time period of collected Tweets by the file name
    jsonFile = f'twitter_data_{now.strftime("%Y%m%d_%H_%M_%fZ")}.json'

    # URL and base parameters are identical for every page: build them once
    search_url, query_params = create_url(keyword, start_time, end_time, max_results)

    data = []
    count = 0
    next_token = None

    # Stop before a request that could push the total above max_count
    while count + max_results <= max_count:
        print('-------------------')
        print('Token: ', next_token)
        json_response = connect_to_endpoint(search_url, headers, query_params, next_token)
        meta = json_response['meta']
        result_count = meta.get('result_count') or 0

        # Save the token to use for the next call (absent on the last page)
        next_token = meta.get('next_token')
        if next_token is not None:
            print('Next Token: ', next_token)

        if result_count > 0:
            print('Start Date: ', start_time)
            data.append(json_response)
            count += result_count
            print('Total # of Tweets added: ', count) # To keep track of number of Tweets collected during the process
            print('-------------------')

        if next_token is None:
            # Last page reached: looping on would restart at the first page
            # and collect the same Tweets again.
            break

        time.sleep(0.5)  # To slow down requests
        # -> should be adapted according to number of requests and runtime of the code to stay within Twitter request limits

    print('Total number of results: ', count) # To see, how many Tweets were collected in total
    print(jsonFile)      # To see, in which datafile, the Tweets are collected

    # Upload exactly once, regardless of why paging stopped, so that Tweets
    # are also stored when the max_count cap ended the loop.
    if data:
        uploadByteStream = json.dumps(data).encode('UTF-8')
        s3.put_object(Bucket=bucket, Key=jsonFile, Body=uploadByteStream)
        print('Put Complete')
    else:
        print('No Tweets collected, nothing uploaded')
    print('-------------------')

    return {
        'Total number of results: ': count,
        'File name': jsonFile
    }