In [5]:
import requests
import os
import json
import pandas as pd
import csv
import datetime
import dateutil.parser
import unicodedata
import time
import tweepy
from pathlib import Path

In [8]:
# Read in Company Data to Extract Twitter Accounts
companies = pd.read_excel("2022 SP1500 Twitter Handles - Final Deliverable.xlsx")

def _clean_handles(handle_column):
    '''Returns the handles with surrounding whitespace and any leading "@" removed (missing values stay missing).'''
    # Slicing with .str[1:] would also eat the first letter of a handle that was entered
    # without "@", so only the "@" itself is stripped here.
    return handle_column.str.strip().str.lstrip('@')

# Strip @ from Twitter to use with API Call
companies['corp_twitter'] = _clean_handles(companies['Corporate Twitter Handle'])
companies['cust_service_twitter'] = _clean_handles(companies['Customer Service Twitter Handle'])
companies['new_ceo_twitter'] = _clean_handles(companies['New CEO Twitter Handle'])
companies['old_ceo_twitter'] = _clean_handles(companies['Original Data - CEO Twitter Handle'])

# Isolate all Twitter accounts we are interested in (rows without a handle are skipped)
corp_twitter_list = companies['corp_twitter'].dropna().tolist()
cust_service_twitter_list = companies['cust_service_twitter'].dropna().tolist()
old_ceo_twitter_list = companies['old_ceo_twitter'].dropna().tolist()
new_ceo_twitter_list = companies['new_ceo_twitter'].dropna().tolist()

# API Keys and Bearer Key
# The credentials are read from the environment rather than typed into the notebook, so they
# never end up in the saved file. Set these variables before starting Jupyter (or with %env in
# a private cell); the check below fails fast with the names that are still missing.
_REQUIRED_CREDENTIALS = ('API_KEY', 'API_SECRET_KEY', 'BEARER_TOKEN', 'ACCESS_TOKEN', 'ACCESS_TOKEN_SECRET')
_missing_credentials = [name for name in _REQUIRED_CREDENTIALS if not os.environ.get(name)]
if _missing_credentials:
    raise RuntimeError("Missing Twitter API credentials in the environment: " + ", ".join(_missing_credentials))

def auth():
    '''Returns the Twitter bearer token stored in the BEARER_TOKEN environment variable (None if unset).'''
    bearer_token = os.environ.get('BEARER_TOKEN')
    return bearer_token

def create_headers(bearer_token):
    '''Builds the Authorization header dictionary sent with every Twitter API request.'''
    return {"Authorization": f"Bearer {bearer_token}"}

def create_url(query, start_date, end_date, max_results=500):
    '''Returns the (search_url, query_params) pair for a Twitter full-archive search request.

    query, start_date, end_date and max_results are copied into the request parameters;
    the field/expansion lists are fixed. 'next_token' starts out as an empty placeholder
    that connect_to_endpoint overwrites before each request.
    '''
    # Change to the endpoint you want to collect data from
    search_url = "https://api.twitter.com/2/tweets/search/all"

    expansions = ['author_id', 'in_reply_to_user_id', 'geo.place_id']
    tweet_fields = ['id', 'text', 'author_id', 'in_reply_to_user_id', 'geo', 'conversation_id',
                    'created_at', 'lang', 'public_metrics', 'referenced_tweets', 'reply_settings', 'source']
    user_fields = ['id', 'name', 'username', 'created_at', 'description', 'public_metrics', 'verified']
    place_fields = ['full_name', 'id', 'country', 'country_code', 'geo', 'name', 'place_type']

    # Change params based on the endpoint you are using
    query_params = dict(query=query, start_time=start_date, end_time=end_date, max_results=max_results)
    query_params['expansions'] = ','.join(expansions)
    query_params['tweet.fields'] = ','.join(tweet_fields)
    query_params['user.fields'] = ','.join(user_fields)
    query_params['place.fields'] = ','.join(place_fields)
    query_params['next_token'] = {}
    return (search_url, query_params)

def connect_to_endpoint(url, headers, params, next_token = None):
    '''Sends a GET request to the Twitter API endpoint and returns the outcome.

    url, headers: request target and headers (see create_url / create_headers).
    params: query parameter dictionary from create_url; its 'next_token' entry is
        overwritten in place with the next_token argument.
    next_token: pagination token for the page to fetch, or None for the first page.

    Returns the decoded JSON body on a 200 response, the integer 400 or 429 when the
    API answers with that status code, and raises Exception(status_code, text) for any
    other status. A 503 response is retried every 5 seconds until a different status
    comes back.
    '''
    params['next_token'] = next_token   #params object received from create_url function

    # Retry in a loop so the response of the retried request is the one that gets returned
    # (a recursive retry whose result is dropped would hand None back to the caller).
    while True:
        response = requests.request("GET", url, headers = headers, params = params)
        status_code = response.status_code
        print("Endpoint Response Code: " + str(status_code))
        if status_code != 503:
            break
        print('ERROR CODE 503: Server is receiving too many requests. Restart and retry.')
        time.sleep(5)

    if status_code == 200:
        return response.json()
    if status_code == 400:
        print("ERROR CODE 400: This account doesn't appear to exist. Check Twitter Handle and try again in the future.")
        return 400
    if status_code == 429:
        print("CODE 429: Usage cap exceeded. Max tweets of 10,000,000 pulled for the month.")
        return 429
    raise Exception(status_code, response.text)

def create_csv(fileName):
    '''Creates the CSV file for an account's tweets and writes the header row.

    fileName: path of the CSV file. If the file already exists it is left untouched and
        "File Already Exists!" is printed. A missing parent directory is created first.
    '''
    if os.path.exists(fileName):
        print("File Already Exists!")
        return

    # Make sure the "<account type> Account Data" style folder exists before opening the file
    parent_dir = os.path.dirname(fileName)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # The with-block closes the file even if writing the header fails
    with open(fileName, "a", newline="", encoding='utf-8') as csvFile:
        csvWriter = csv.writer(csvFile)
        #Create headers for the data you want to save, in this example, we only want save these columns in our dataset
        csvWriter.writerow(['author id', 'created_at', 'geo', 'id','lang', 'like_count', 'quote_count',
                            'reply_count','retweet_count', 'tweet_type', 'source','tweet'])
    
def append_to_csv(json_response, fileName):
    '''Appends the tweets of one Twitter API response to the CSV file.

    json_response: decoded search response; tweets are read from its 'data' list. A
        response without a 'data' key is treated as containing no tweets.
    fileName: path of the CSV file to append to (opened in append mode, so it is created
        if it does not exist yet).

    Prints the number of rows written for this response.
    '''
    tweets = json_response.get('data', [])

    #A counter variable
    counter = 0

    #Open OR create the target CSV file; the with-block closes it even if a tweet is malformed
    with open(fileName, "a", newline="", encoding='utf-8') as csvFile:
        csvWriter = csv.writer(csvFile)

        #Loop through each tweet
        for tweet in tweets:
            metrics = tweet['public_metrics']

            # Source, if available
            source = tweet['source'] if 'source' in tweet else " "

            # Geolocation is not extracted: the column is always written as a blank
            # placeholder, whether or not the tweet carries a 'geo' entry.
            geo = " "

            # Kind of Tweet: the type of the first referenced tweet, or "original" if none
            if 'referenced_tweets' in tweet:
                tweet_type = tweet['referenced_tweets'][0]['type']
            else:
                tweet_type = "original"

            # Assemble all data in the same column order as the header written by create_csv
            res = [tweet['author_id'], dateutil.parser.parse(tweet['created_at']), geo, tweet['id'],
                   tweet['lang'], metrics['like_count'], metrics['quote_count'],
                   metrics['reply_count'], metrics['retweet_count'], tweet_type, source, tweet['text']]

            # Append the result to the CSV file
            csvWriter.writerow(res)
            counter += 1

    # Print the number of tweets for this iteration
    print("# of Tweets added from this response: ", counter)
    
def pull_account_tweets(twitter_handle, account_type, 
                        start_date='2009-01-01T00:00:00.000Z', end_date='2022-12-31T00:00:00.000Z'):
    '''Pulls an account's English-language tweets between two dates into its CSV file.

    twitter_handle: account name without the leading "@".
    account_type: used to build the output folder name "<account_type> Account Data/".
    start_date, end_date: time window passed to the API as start_time / end_time.
        NOTE(review): the API documents end_time as exclusive, so the default end date
        would leave out tweets posted on 2022-12-31 itself - confirm this is intended.

    The file "<account_type> Account Data/<twitter_handle>.csv" is created if needed and
    every page of results is appended to it. Paging stops when a response has no
    next_token, or when connect_to_endpoint reports a failure instead of a JSON body.
    '''
    headers = create_headers(auth())
    query = "from:" + str(twitter_handle) + " lang:en" # Pulls tweets from that account in the English language
    max_results = 500 # 500 is the max amount of tweets we can pull in one query

    # FileName is always going to be the twitter handle
    fileName = account_type + " Account Data/" + str(twitter_handle) + ".csv" 
    create_csv(fileName)

    # Total number of tweets we collected from the loop
    total_tweets = 0
    next_token = None

    while True:
        search_url, query_params = create_url(query, start_date, end_date, max_results)
        json_response = connect_to_endpoint(search_url, headers, query_params, next_token)
        if not isinstance(json_response, dict):
            # 400 / 429 status codes (or no body at all): nothing to page through
            break

        meta = json_response['meta']
        result_count = meta['result_count']
        # Token for the next page; absent (or null) on the last page
        next_token = meta.get('next_token')

        if result_count is not None and result_count > 0:
            append_to_csv(json_response, fileName)
            total_tweets += result_count
            print("Total Tweets Added: ", total_tweets)
            time.sleep(5)

        # Pause between requests (also after the final page, before the next account starts)
        time.sleep(5)
        if next_token is None:
            break

    print("Total number of results: ", total_tweets)
    
def determine_file_exists(account, account_type):
    '''Reports whether the tweet CSV for this account is already on disk.

    Returns True when "<account_type> Account Data/<account>.csv" exists, False otherwise.
    Used to skip accounts that have already been pulled.
    '''
    csv_name = str(account) + ".csv"
    return os.path.exists(account_type + " Account Data/" + csv_name)
    
def _load_saved_tweets(account, account_type):
    '''Returns the account's saved tweets as a DataFrame, or None when its CSV file does not exist.'''
    csv_path = Path(account_type + " Account Data/" + str(account) + ".csv")
    if not csv_path.is_file():
        return None
    return pd.read_csv(csv_path)

def mass_pull_data(account_list, account_type, update_backwards=False, update_forwards=False, forward_year=2022):
    '''Pulls tweets for every account in account_list, in one of three modes.

    account_list: Twitter handles without "@"; repeated handles are processed once.
    account_type: "Corporate", "Customer Service", "New CEO" or "Old CEO"; selects the
        spreadsheet column and the "<account_type> Account Data/" folder.
    update_backwards: for accounts that already have data, pull the tweets older than
        the last row of their CSV (start of 2009 up to that row's date). Takes
        precedence over update_forwards.
    update_forwards: for accounts that already have data, pull the tweets from the
        newest saved date up to 12-31 of forward_year.
    forward_year: final year for update_forwards.

    In both update modes an account with no file, or with a file holding no rows, gets
    a complete pull. With neither flag set, only accounts that have no CSV yet are
    pulled, and their summary rows are written with create_summary_data.

    Known limitation of update_forwards: tweets arrive newest first, so a pull that is
    interrupted leaves a gap between the previously saved data and the rows already
    written; the next forward update starts from the newest saved date and does not
    refill that gap.
    '''
    if account_type == "Old CEO":
        twitter_handle_column = "Original Data - CEO Twitter Handle"
    else:
        # Ensure account_type is either Corporate, Customer Service, New CEO, or Old CEO
        twitter_handle_column = account_type + " Twitter Handle"
    company_data = (pd.read_excel("2022 SP1500 Twitter Handles - Final Deliverable.xlsx")[['Name', twitter_handle_column]]
                        .rename(columns={'Name':'Company', twitter_handle_column:'Twitter Handle'}))

    # Several companies can share one handle; drop repeats (keeping order) so no account
    # is pulled and appended twice.
    unique_accounts = list(dict.fromkeys(account_list))

    if update_backwards or update_forwards:
        for account in unique_accounts:
            data = _load_saved_tweets(account, account_type)
            if data is None:
                # If the file doesn't exist, it just runs as normal, creating the file and populating with data
                print('Pulling ' + account + ' Tweets')
                pull_account_tweets(account, account_type)
            elif data.empty:
                # Pull data for the empty file
                print('File Empty. Pulling ' + account + ' Tweets from 2009-01-01 to 2022-12-31')
                pull_account_tweets(account, account_type)
            elif update_backwards:
                # The last row is taken as the oldest saved tweet (pages are appended newest
                # first); everything before it is pulled.
                last_date = datetime.datetime.fromisoformat(data.iloc[-1]['created_at'])
                last_date = last_date.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3]+"Z"
                print('Pulling ' + account + ' Tweets from 2009-01-01 to ' + str(last_date[:10]))
                pull_account_tweets(account, account_type, end_date=last_date)
            else:
                # Newest saved date becomes the start of the window. max() is used instead of
                # sorting and indexing with [0], which is a label lookup and would return the
                # row that was first in the file rather than the newest one.
                start_date = pd.to_datetime(data['created_at']).max()
                start_date = start_date.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3]+"Z"
                end_date = str(forward_year) + "-12-31T00:00:00.000Z"
                print('Pulling ' + account + ' Tweets from ' + str(start_date[:10]) + ' to ' + str(end_date[:10]))
                pull_account_tweets(account, account_type, start_date=start_date, end_date=end_date)
            print('Finished Pulling ' + account + ' Tweets')
        if update_backwards:
            print('Completed Running Accounts')
    else:
        # Pulls all data for accounts we pass in a list to it
        # Filter out companies we've already run to speed up process.
        accounts_to_run = [account for account in unique_accounts
                           if not determine_file_exists(account, account_type)]
        for account in accounts_to_run:
            print('Pulling ' + account + ' Tweets')
            pull_account_tweets(account, account_type) # Pulls tweets until there's no more or it errors out
            print('Finished Pulling ' + account + ' Tweets')
            create_summary_data(twitter_handle=account, account_type=account_type, company_data=company_data)
            print('Updated ' + account + ' Summary Data\n')
    print('Completed Updating Account Data')
        
def create_summary_data(twitter_handle, account_type, company_data):
    '''Builds the yearly and overall summaries for one account and appends them to the summary CSV.

    twitter_handle: account name without the leading "@".
    account_type: used for the data folder name and the 'Account Type' column.
    company_data: DataFrame with 'Company' and 'Twitter Handle' columns (handles
        including "@"), used to attach the company name.

    Reads "<account_type> Account Data/<twitter_handle>.csv", counts tweets per type and
    sums the engagement of original tweets (per year and overall), looks up the account
    profile through tweepy, and appends the combined rows to
    "./Summary Data/Tweets Summary.csv" without a header. Returns None; if the account
    lookup raises NotFound or Forbidden a message is printed and nothing is written.
    '''
    handle_with_at = "@" + str(twitter_handle)

    # Creates Yearly Summary
    data = pd.read_csv(account_type + " Account Data/"+str(twitter_handle)+".csv")
    # The file stores full timestamps, so let pandas parse them instead of forcing a
    # date-only format that does not match the stored values.
    data['created_at'] = pd.to_datetime(data['created_at'])
    data['year'] = data['created_at'].dt.year
    tweet_data = (data.groupby(['year', 'tweet_type']).size().reset_index(name='counts')
     .pivot(index='year', columns='tweet_type',values='counts').fillna(0).reset_index())
    types_of_tweets = ['original', 'quoted', 'replied_to', 'retweeted']
    # Make sure every tweet type has a column even if the account never posted that type
    for tweet in types_of_tweets:
        if tweet not in tweet_data.columns:
            tweet_data[tweet] = 0
    tweet_data = tweet_data.rename(columns={'year':'Year', 'original': 'Original Tweets', 'quoted':'Quote Tweets', 
                      'replied_to':'Reply Tweets', 'retweeted':'Retweets'})
    tweet_summary = (data[data['tweet_type'] == "original"].groupby('year')
     .agg({'like_count':'sum', 'retweet_count':'sum', 'reply_count':'sum', 'quote_count': 'sum'}).fillna(0).reset_index()
     .rename(columns={'year':'Year', 'like_count':'Original Tweet Likes', 'retweet_count': 'Original Tweet Retweets', 
                      'reply_count':'Original Tweet Replies', 'quote_count': 'Original Tweet Quotes'})
    )

    # NOTE(review): merge defaults to an inner join, so a year without any original tweet
    # drops out of the yearly summary - confirm that is wanted.
    tweet_data = tweet_data.merge(tweet_summary, on='Year')
    tweet_data['Twitter Handle'] = handle_with_at
    tweet_data['Account Type'] = str(account_type)
    tweet_data = (tweet_data.merge(company_data, on='Twitter Handle', how='left')
                  .drop_duplicates(subset=['Year', 'Twitter Handle'], keep='first'))
    tweet_data = tweet_data[['Company', 'Account Type', 'Twitter Handle', 'Year', 'Original Tweets', 'Retweets', 'Reply Tweets',
                            'Quote Tweets', 'Original Tweet Retweets', 'Original Tweet Likes', 'Original Tweet Replies',
                            'Original Tweet Quotes']]
    #tweet_data.to_csv("./Summary Data/Yearly Summary.csv", mode='a', header=False)
    
    # Creates Overall Summary
    api_key = os.getenv('API_KEY')
    api_secret_key = os.getenv('API_SECRET_KEY')
    access_token = os.getenv('ACCESS_TOKEN')
    access_token_secret = os.getenv('ACCESS_TOKEN_SECRET')
    
    summary = pd.DataFrame(data[data['tweet_type'] == 'original'].agg({'like_count': 'sum', 'quote_count': 'sum', 
                                            'reply_count': 'sum', 'retweet_count': 'sum'})).transpose()
    summary['Twitter Handle'] = handle_with_at
    tweet_summary = pd.DataFrame(data.groupby('tweet_type').size()).transpose().rename_axis(None, axis=1)
    for tweet in types_of_tweets:
        if tweet not in tweet_summary.columns:
            tweet_summary[tweet] = 0
    tweet_summary['Twitter Handle'] = handle_with_at
    summary = summary.merge(tweet_summary, on='Twitter Handle')

    # Named oauth_handler so the module-level auth() helper is not shadowed inside this function
    oauth_handler = tweepy.OAuthHandler(api_key, api_secret_key)
    oauth_handler.set_access_token(access_token, access_token_secret)
    api = tweepy.API(oauth_handler)
    try:
        user = api.get_user(screen_name = twitter_handle)
    except tweepy.errors.NotFound:
        print(f"OOPS! User {twitter_handle} not found. Account no longer exists. Try  different method")
        return
    except tweepy.errors.Forbidden:
        print(f"OOPS! User {twitter_handle} has been suspended.")
        return
    summary['Account Followers'] = user.followers_count
    summary['Account Following'] = user.friends_count
    summary['Account Likes'] = user.favourites_count
    summary['Account Public Lists'] = user.listed_count
    summary['Account Creation Date'] = user.created_at
    summary = summary.rename(columns = {'like_count': 'Original Tweet Likes', 'quote_count': 'Original Tweet Quotes', 
                                        'reply_count': 'Original Tweet Replies', 'retweet_count': 'Original Tweet Retweets',
                                        'original': 'Original Tweets', 'quoted': 'Quote Tweets', 'replied_to': 'Reply Tweets',
                                        'retweeted': 'Retweets'
                                       })
    summary['Account Type'] = account_type
    summary = (summary.merge(company_data, left_on='Twitter Handle', right_on='Twitter Handle', how='left')
     .drop_duplicates(subset='Twitter Handle', keep='first'))
    summary = summary[['Company', 'Account Type', 'Twitter Handle', 'Account Creation Date', 'Account Followers', 'Account Following',
        'Account Likes', 'Account Public Lists', 'Original Tweets', 'Retweets', 'Reply Tweets', 'Quote Tweets', 
        'Original Tweet Retweets', 'Original Tweet Likes', 'Original Tweet Replies', 'Original Tweet Quotes']]
    #summary.to_csv("./Summary Data/Overall Summary.csv", mode='a', header=False)
    
    # One output row per year: the yearly columns get the " Year" suffix, the overall ones " Total"
    total = tweet_data.merge(summary, on=['Company', 'Account Type', 'Twitter Handle'], suffixes=[' Year', ' Total'])
    total.to_csv("./Summary Data/Tweets Summary.csv", mode='a', header=False)

In [9]:
# Backfill run: for every corporate handle, fetch the tweets older than what is already saved
mass_pull_data(
    corp_twitter_list,
    account_type="Corporate",
    update_backwards=True,
)

Pulling Apple Tweets from 2009-01-01 to 2016-09-01
File Already Exists!
Endpoint Response Code: 200
Total number of results:  0
Finished Pulling Apple Tweets
Pulling Microsoft Tweets from 2009-01-01 to 2009-09-21
File Already Exists!
Endpoint Response Code: 200
Total number of results:  0
Finished Pulling Microsoft Tweets
Pulling exxonmobil Tweets from 2009-01-01 to 2009-10-21
File Already Exists!
Endpoint Response Code: 200
Total number of results:  0
Finished Pulling exxonmobil Tweets
Pulling JNJNews Tweets from 2009-01-01 to 2009-02-13
File Already Exists!
Endpoint Response Code: 200
Total number of results:  0
Finished Pulling JNJNews Tweets
Pulling generalelectric Tweets from 2009-01-01 to 2011-04-08
File Already Exists!
Endpoint Response Code: 200
Total number of results:  0
Finished Pulling generalelectric Tweets
Pulling amazon Tweets from 2009-01-01 to 2009-02-14
File Already Exists!
Endpoint Response Code: 200
Total number of results:  0
Finished Pulling amazon Tweets
Pulling 

Pulling Accenture Tweets from 2009-01-01 to 2009-01-12
File Already Exists!
Endpoint Response Code: 200
Total number of results:  0
Finished Pulling Accenture Tweets
Pulling Nike Tweets from 2009-01-01 to 2011-12-29
File Already Exists!
Endpoint Response Code: 200
Total number of results:  0
Finished Pulling Nike Tweets
Pulling UPS Tweets from 2009-01-01 to 2010-07-21
File Already Exists!
Endpoint Response Code: 200
Total number of results:  0
Finished Pulling UPS Tweets
Pulling LillyPad Tweets from 2009-01-01 to 2010-09-14
File Already Exists!
Endpoint Response Code: 200
Total number of results:  0
Finished Pulling LillyPad Tweets
Pulling Lowes Tweets from 2009-01-01 to 2009-01-22
File Already Exists!
Endpoint Response Code: 200
Total number of results:  0
Finished Pulling Lowes Tweets
Pulling MDLZ Tweets from 2009-01-01 to 2014-06-10
File Already Exists!
Endpoint Response Code: 200
Total number of results:  0
Finished Pulling MDLZ Tweets
Pulling usbank Tweets from 2009-01-01 to 2011-

Endpoint Response Code: 200
Total number of results:  0
Finished Pulling RAI_News Tweets
Pulling FedEx Tweets from 2009-01-01 to 2010-06-08
File Already Exists!
Endpoint Response Code: 200
# of Tweets added from this response:  102
Total Tweets Added:  102
Total number of results:  102
Finished Pulling FedEx Tweets
Pulling netflix Tweets from 2009-01-01 to 2009-01-07
File Already Exists!
Endpoint Response Code: 200
Total number of results:  0
Finished Pulling netflix Tweets
Pulling GDMS Tweets from 2009-01-01 to 2012-09-06
File Already Exists!
Endpoint Response Code: 200
Total number of results:  0
Finished Pulling GDMS Tweets
Pulling PayPal Tweets from 2009-01-01 to 2009-05-13
File Already Exists!
Endpoint Response Code: 200
Total number of results:  0
Finished Pulling PayPal Tweets
Pulling GM Tweets from 2009-01-01 to 2009-01-06
File Already Exists!
Endpoint Response Code: 200
Total number of results:  0
Finished Pulling GM Tweets
Pulling RaytheonTech Tweets from 2009-01-01 to 2009-0

Endpoint Response Code: 200
Total number of results:  0
Finished Pulling johnsoncontrols Tweets
Pulling eatoncorp Tweets from 2009-01-01 to 2009-01-23
File Already Exists!
Endpoint Response Code: 200
Total number of results:  0
Finished Pulling eatoncorp Tweets
Pulling airproducts Tweets from 2009-01-01 to 2010-01-29
File Already Exists!
Endpoint Response Code: 200
Total number of results:  0
Finished Pulling airproducts Tweets
Pulling SouthwestAir Tweets from 2009-01-01 to 2009-01-01
File Already Exists!
Endpoint Response Code: 200
Total number of results:  0
Finished Pulling SouthwestAir Tweets
Pulling Humana Tweets from 2009-01-01 to 2011-10-20
File Already Exists!
Endpoint Response Code: 200
Total number of results:  0
Finished Pulling Humana Tweets
Pulling WeAreOxy Tweets from 2009-01-01 to 2013-10-29
File Already Exists!
Endpoint Response Code: 200
Total number of results:  0
Finished Pulling WeAreOxy Tweets
Pulling LyondellBasell Tweets from 2009-01-01 to 2010-04-23
File Already

Endpoint Response Code: 200
Total number of results:  0
Finished Pulling PSEGNews Tweets
Pulling ConEdison Tweets from 2009-01-01 to 2012-06-29
File Already Exists!
Endpoint Response Code: 200
Total number of results:  0
Finished Pulling ConEdison Tweets
Pulling newell_brands Tweets from 2009-01-01 to 2012-04-23
File Already Exists!
Endpoint Response Code: 200
Total number of results:  0
Finished Pulling newell_brands Tweets
Pulling CBSTweet Tweets from 2009-01-01 to 2009-03-30
File Already Exists!
Endpoint Response Code: 200
Total number of results:  0
Finished Pulling CBSTweet Tweets
Pulling SJM_Media Tweets from 2009-01-01 to 2013-10-07
File Already Exists!
Endpoint Response Code: 200
Total number of results:  0
Finished Pulling SJM_Media Tweets
Pulling Zoetis Tweets from 2009-01-01 to 2013-02-01
File Already Exists!
Endpoint Response Code: 200
Total number of results:  0
Finished Pulling Zoetis Tweets
Pulling EdwardsLifesci Tweets from 2009-01-01 to 2015-03-12
File Already Exists!


Endpoint Response Code: 200
Total number of results:  0
Finished Pulling StanleyBlkDeckr Tweets
Pulling KeurigPepper Tweets from 2009-01-01 to 2019-05-14
File Already Exists!
Endpoint Response Code: 200
Total number of results:  0
Finished Pulling KeurigPepper Tweets
Pulling Cerner Tweets from 2009-01-01 to 2009-06-29
File Already Exists!
Endpoint Response Code: 200
Total number of results:  0
Finished Pulling Cerner Tweets
Pulling smuckers Tweets from 2009-01-01 to 2013-08-26
File Already Exists!
Endpoint Response Code: 200
Total number of results:  0
Finished Pulling smuckers Tweets
Pulling moodyscorp Tweets from 2009-01-01 to 2016-04-25
File Already Exists!
Endpoint Response Code: 200
Total number of results:  0
Finished Pulling moodyscorp Tweets
Pulling ameriprise Tweets from 2009-01-01 to 2011-05-10
File Already Exists!
Endpoint Response Code: 200
Total number of results:  0
Finished Pulling ameriprise Tweets
Pulling Clorox Tweets from 2009-01-01 to 2010-02-24
File Already Exists!

Endpoint Response Code: 200
Total number of results:  0
Finished Pulling LamResearch Tweets
Pulling EQTCorp Tweets from 2009-01-01 to 2017-11-13
File Already Exists!
Endpoint Response Code: 200
Total number of results:  0
Finished Pulling EQTCorp Tweets
File Empty. Pulling AllianceData Tweets from 2009-01-01 to 2022-12-31
File Already Exists!
Endpoint Response Code: 200
Total number of results:  0
Finished Pulling AllianceData Tweets
Pulling FastenalCompany Tweets from 2009-01-01 to 2010-03-12
File Already Exists!
Endpoint Response Code: 200
Total number of results:  0
Finished Pulling FastenalCompany Tweets
Pulling skyworksinc Tweets from 2009-01-01 to 2009-06-23
File Already Exists!
Endpoint Response Code: 200
Total number of results:  0
Finished Pulling skyworksinc Tweets
Pulling autodesk Tweets from 2009-01-01 to 2009-05-25
File Already Exists!
Endpoint Response Code: 200
Total number of results:  0
Finished Pulling autodesk Tweets
Pulling MicronTech Tweets from 2009-01-01 to 2009-

Endpoint Response Code: 200
# of Tweets added from this response:  500
Total Tweets Added:  26000
Endpoint Response Code: 200
# of Tweets added from this response:  500
Total Tweets Added:  26500
Endpoint Response Code: 200
# of Tweets added from this response:  500
Total Tweets Added:  27000
Endpoint Response Code: 200
# of Tweets added from this response:  500
Total Tweets Added:  27500
Endpoint Response Code: 200
# of Tweets added from this response:  500
Total Tweets Added:  28000
Endpoint Response Code: 200
# of Tweets added from this response:  500
Total Tweets Added:  28500
Endpoint Response Code: 200
# of Tweets added from this response:  500
Total Tweets Added:  29000
Endpoint Response Code: 200
# of Tweets added from this response:  500
Total Tweets Added:  29500
Endpoint Response Code: 200
# of Tweets added from this response:  500
Total Tweets Added:  30000
Endpoint Response Code: 200
# of Tweets added from this response:  500
Total Tweets Added:  30500
Endpoint Response Co

Endpoint Response Code: 200
# of Tweets added from this response:  500
Total Tweets Added:  68000
Endpoint Response Code: 200
# of Tweets added from this response:  500
Total Tweets Added:  68500
Endpoint Response Code: 200
# of Tweets added from this response:  500
Total Tweets Added:  69000
Endpoint Response Code: 200
# of Tweets added from this response:  500
Total Tweets Added:  69500
Endpoint Response Code: 200
# of Tweets added from this response:  500
Total Tweets Added:  70000
Endpoint Response Code: 200
# of Tweets added from this response:  500
Total Tweets Added:  70500
Endpoint Response Code: 200
# of Tweets added from this response:  500
Total Tweets Added:  71000
Endpoint Response Code: 200
# of Tweets added from this response:  500
Total Tweets Added:  71500
Endpoint Response Code: 200
# of Tweets added from this response:  500
Total Tweets Added:  72000
Endpoint Response Code: 200
# of Tweets added from this response:  500
Total Tweets Added:  72500
Endpoint Response Co

Endpoint Response Code: 200
# of Tweets added from this response:  500
Total Tweets Added:  110000
Endpoint Response Code: 200
# of Tweets added from this response:  499
Total Tweets Added:  110499
Endpoint Response Code: 200
# of Tweets added from this response:  500
Total Tweets Added:  110999
Endpoint Response Code: 200
# of Tweets added from this response:  500
Total Tweets Added:  111499
Endpoint Response Code: 200
# of Tweets added from this response:  500
Total Tweets Added:  111999
Endpoint Response Code: 200
# of Tweets added from this response:  500
Total Tweets Added:  112499
Endpoint Response Code: 429


Exception: (429, '{"account_id":1598398401024065554,"product_name":"academic","title":"UsageCapExceeded","period":"Monthly","scope":"Product","detail":"Usage cap exceeded: Monthly product cap","type":"https://api.twitter.com/2/problems/usage-capped"}')

In [39]:
# Want to create the update_forward function, which should read in the data file, coerce the date column to datetime,
# sort by date with the newest date at the top, and pass that newest date to pull_account_tweets as the start date, with
# forward_year providing the last date (ex: forward_year=2022 means that the last date should be 12-31-22)
# Check which dates are already saved for one account within the target year
account_type, account, forward_year = 'Corporate', 'ATT', 2022

path = Path(f"{account_type} Account Data/{account}.csv")
data = pd.read_csv(path)
data['created_at'] = pd.to_datetime(data['created_at'])
data = data.sort_values('created_at', ascending=False)

# Rows are sorted newest first, so the first/last entries of the year are its newest/oldest tweets
data_new_year = data[data['created_at'].dt.year == forward_year]
dates_in_year = data_new_year['created_at']
newest_date, oldest_date = dates_in_year.iloc[0], dates_in_year.iloc[-1]
for label, value in (('Newest', newest_date), ('Oldest', oldest_date)):
    print(f'{label} Date: {value}')
# if path.is_file():
#     # If the path to that file already exists, check if it's empty
#     data = pd.read_csv(account_type + " Account Data/"+str(account)+".csv")
#     if data.empty:
#         # Pull data for the empty file
#         print('File Empty. Pulling ' + account + ' Tweets from 2009-01-01 to 2022-12-31')
#         pull_account_tweets(account, account_type)
#         print('Finished Pulling ' + account + ' Tweets')
#     else:
#         data['created_at'] = pd.to_datetime(data['created_at'])
#         data = data.sort_values('created_at', ascending=False)
#         start_date = data['created_at'][0]
#         start_date = start_date.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3]+"Z"
#         end_date = str(forward_year) + "-12-31T00:00:00.000Z"
#         print('Pulling ' + account + ' Tweets from ' + str(start_date[:10]) + ' to ' + str(end_date[:10]))
#         #pull_account_tweets(account, account_type, start_date=start_date, end_date=end_date)
#         print('Finished Pulling ' + account + ' Tweets')

Newest Date: 2022-11-28 23:00:00+00:00
Oldest Date: 2022-01-01 00:48:52+00:00


In [9]:
# Display the corporate handles extracted from the spreadsheet.
# Some handles appear more than once in this list (e.g. 'google', 'abbvie').
corp_twitter_list

['Apple',
 'Microsoft',
 'exxonmobil',
 'JNJNews',
 'generalelectric',
 'amazon',
 'facebook',
 'ATT',
 'jpmorgan',
 'WellsFargo',
 'ProcterGamble',
 'google',
 'google',
 'pfizer',
 'verizon',
 'Chevron',
 'CocaColaCo',
 'HomeDepot',
 'Merck',
 'InsidePMI',
 'Visa',
 'comcast',
 'intel',
 'PepsiCo',
 'Disney',
 'Cisco',
 'BankofAmerica',
 'IBM',
 'UnitedHealthGrp',
 'Citi',
 'AltriaNews',
 'Oracle',
 'bmsnews',
 'Medtronic',
 'Amgen',
 'GileadSciences',
 'Walmart',
 'McDonaldsCorp',
 'CVSHealth',
 '3M',
 'abbvie',
 'abbvie',
 'MasterCard',
 'Honeywell',
 'bmsnews',
 'Boeing',
 'Qualcomm',
 'Starbucks',
 'UTC',
 'UnionPacific',
 'Accenture',
 'Nike',
 'UPS',
 'LillyPad',
 'Lowes',
 'MDLZ',
 'usbank',
 'WBA_Global',
 'priceline',
 'CP_News',
 'AIGinsurance',
 'LockheedMartin',
 'TXInstruments',
 'SimonPropertyGp',
 'thermofisher',
 'Broadcom',
 'GoldmanSachs',
 'DanaherCorp',
 'DowNewsroom',
 'wbd',
 'DuPont_News',
 'Chubb',
 'WeAreOxy',
 'conocophillips',
 'AbbottNews',
 'nexteraenergy

**Data collection is finished and we have all of the companies.** Here's what I need to do next:
1) Update data for companies that had errors while their data was being pulled. This should just need the update parameter I've already built into the function
2) Update data for companies for which I didn't collect all of their 2022 data (so those who started in Nov and Dec didn't get pulled)
3) Add the summary data I originally pulled into the master file
4) Update 2022 data for companies that didn't get fully pulled
5) Remove all duplicate years from the summary and ensure those numbers are correct
6) Create an additional parameter for running new years (i.e. 2023, 2024) for future data pulls

In [10]:
# Counts number of data files we have (number of companies we've pulled)
import os.path
account_type = "Corporate"
path = f"./{account_type} Account Data/"
# Only regular files are counted; sub-folders inside the data folder are ignored
num_files = sum(1 for entry in os.listdir(path) if os.path.isfile(os.path.join(path, entry)))
print(num_files)

1293


In [None]:
# Reload the spreadsheet and rebuild the list of corporate handles
companies = pd.read_excel("2022 SP1500 Twitter Handles - Final Deliverable.xlsx")
# Remove surrounding whitespace and the leading "@" only; slicing with .str[1:] would also
# cut the first letter off a handle that was entered without "@".
companies['corp_twitter'] = companies['Corporate Twitter Handle'].str.strip().str.lstrip('@')
corp_twitter_list = companies['corp_twitter'].dropna().tolist()