__Initialization__

Assumes Python 3

---------------- Only run once on the machine to get things configured -------------------


In [None]:
# One-time setup: clone the snscrape repository from GitHub into the current working directory.
# NOTE(review): snscrape is also installed with pip in the next cell - confirm the clone is still needed.
!echo 'This could take awhile. Status at bottom will show idle when done.'
!git clone https://github.com/JustAnotherArchivist/snscrape
!echo done

In [None]:
# One-time setup: install the Python packages used by the cells below.
!echo 'This could take awhile. Status at bottom will show idle when done.'
# snscrape provides the command line scraper invoked by scrape_twitter
!pip install snscrape
# pandas holds the parsed tweets as a DataFrame
!pip install pandas
# seaborn draws the scatter plots
!pip install seaborn
# presumably the engine pandas uses to write the .xlsx export - TODO confirm
!pip install openpyxl
!echo done

---------------------------- Below here can be run multiple times --------------------------------

__Parse tweets into a Pandas data set__

Run this cell before preparing the dataset for plotting or creating any plots.  Set the parameters below first.

In [None]:
import json
import pandas
import seaborn
import os
import glob
import matplotlib.pyplot as plt

# Maximum number of results to request from snscrape for each user: a number, or None to get all results.
# WARNING: Getting all data can take a while for busy accounts.
maxResults = None

# 'YYYY-MM-DD' to get only Tweets since that date, or None to get Tweets from all dates
since = '2020-08-01'

# A list of Twitter handles separated by commas.  Can be broken across lines to make it readable.
twitterUsers = ['atrupar',
                'forbes']

# Diagnostic parameters
# showWorkingData: when True, build_data_frame displays the DataFrame it has built.
# workingDataMaxRows: value applied to pandas' 'display.max_rows' option for that display.
workingDataMaxRows = 10
showWorkingData = False

#functions

def scrape_twitter(twitterUser):
    if maxResults is None:
        maxResultsParam = ''
    else:
        maxResultsParam = f'--max-results {maxResults}'
    
    if since is None:
        sinceParam = ''
    else:
        sinceParam = f'--since {since}'

    print(f'Running: snscrape {maxResultsParam} --jsonl {sinceParam} twitter-user {twitterUser}')
    print('This can take awhile. The status bar at the bottom of the screen will say Busy until this is done.')

    results  = !snscrape {maxResultsParam} --jsonl {sinceParam} twitter-user {twitterUser}

    print('Done scraping Twitter')
    
    return results

# Column order of the DataFrame returned by build_data_frame.  Passing it to the
# DataFrame constructor guarantees the columns exist even when there are no tweets.
_TWEET_COLUMNS = [
    'TweetId',
    'TwitterUser',
    'TweetDate',
    'Replies',
    'Retweets',
    'Likes',
    'Quotes',
    'Source',
    'IsVideo',
    'IsImage',
    'VideoViews',
    'MediaType',
    'TotalEngagement',
    'TotalEngagementWithVideoViews',
]


def _classify_media(media):
    """Return ``(mediaType, views)`` based on the first media item of a tweet.

    ``mediaType`` is 'Image', 'Video' or 'None'.  ``views`` is the video's view
    count, or 0 when the tweet has no video or the count is missing/None.
    """
    if not media:
        return 'None', 0

    firstItem = media[0]
    itemType = firstItem.get('_type')
    if itemType == 'snscrape.modules.twitter.Photo':
        return 'Image', 0
    if itemType == 'snscrape.modules.twitter.Video':
        # 'views' may be absent or None; both count as zero views.
        return 'Video', firstItem.get('views') or 0
    return 'None', 0


def build_data_frame(twitterUser, results):
    """Parse scraped tweets into a pandas DataFrame with engagement totals.

    Args:
        twitterUser: Handle stored in the 'TwitterUser' column of every row.
        results: Iterable of strings, each holding one tweet as a JSON
            document.  Blank lines are skipped.  An empty iterable is fine.

    Returns:
        A DataFrame with the columns listed in ``_TWEET_COLUMNS``, one row per
        tweet.  'TweetDate' is a timezone-naive datetime column.  When there
        are no tweets the frame is empty but still has every column.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a tweet lacks one of the required count/id/date fields.
    """
    records = []

    for json_str in results:
        # Captured command output can contain empty lines; they carry no tweet.
        if not json_str.strip():
            continue

        result = json.loads(json_str)
        mediaType, views = _classify_media(result.get('media'))

        totalEngagement = (result['replyCount'] + result['retweetCount']
                           + result['likeCount'] + result['quoteCount'])

        records.append({
            'TweetId': result['id'],
            'TwitterUser': twitterUser,
            'TweetDate': result['date'],
            'Replies': result['replyCount'],
            'Retweets': result['retweetCount'],
            'Likes': result['likeCount'],
            'Quotes': result['quoteCount'],
            'Source': result['sourceLabel'],
            'IsVideo': mediaType == 'Video',
            'IsImage': mediaType == 'Image',
            'VideoViews': views,
            'MediaType': mediaType,
            'TotalEngagement': totalEngagement,
            'TotalEngagementWithVideoViews': totalEngagement + views,
        })

    output = pandas.DataFrame(records, columns=_TWEET_COLUMNS)

    # Parse TweetDate and drop the timezone (the wall-clock time as written in the
    # data is kept unchanged) because the Excel export can't handle timezones.
    output['TweetDate'] = pandas.to_datetime(output['TweetDate']).dt.tz_localize(None)

    if showWorkingData:
        with pandas.option_context('display.max_rows', workingDataMaxRows):
            display(output)

    return output

def output_raw(twitterUser, df):
    """Export df to an Excel workbook named '<twitterUser>_RawData.xlsx'.

    The file is written relative to the current working directory and a short
    confirmation is printed afterwards.
    """
    excelFileName = f'{twitterUser}_RawData.xlsx'
    df.to_excel(excelFileName)
    print('File created')

def _save_scatter_plot(df, y, hue, title, fileName):
    """Draw one log-scaled scatter plot of column y over TweetDate, save it as fileName and show it."""
    plt.figure(figsize=(30, 10))
    seaborn.scatterplot(x='TweetDate', y=y, hue=hue, data=df).set(title=title, yscale='log')
    # The legend sits outside the axes, so it has to be passed to savefig
    # explicitly or the 'tight' bounding box would cut it off.
    lgd = plt.legend(loc='upper right', bbox_to_anchor=(1.15, 1))
    plt.savefig(fileName, bbox_extra_artists=(lgd,), bbox_inches='tight')
    plt.show()


def create_scatter_plots(twitterUser, df):
    """Create, save and show the four engagement scatter plots for one user.

    Each chart plots an engagement column against 'TweetDate' on a log scale
    and is saved as a PNG in the current working directory:

    * '<user>_TotalEnagementByMediaType.png'
    * '<user>_TotalEnagementByMediaTypeIncludingVideoViews.png'
    * '<user>_TotalEnagementBySource.png'
    * '<user>_TotalEnagementBySourceIncludingVideoViews.png'

    The 'Enagement' spelling in titles and file names is kept as-is so that
    existing outputs keep their names.

    Args:
        twitterUser: Handle used in the chart titles and file name prefix.
        df: DataFrame with 'TweetDate', 'TotalEngagement',
            'TotalEngagementWithVideoViews', 'MediaType' and 'Source' columns.
    """
    seaborn.set(style='whitegrid')

    _save_scatter_plot(
        df, 'TotalEngagement', 'MediaType',
        f'Total Enagement by Media Type for {twitterUser}',
        f'{twitterUser}_TotalEnagementByMediaType.png')

    # This chart previously re-plotted 'TotalEngagement' and overwrote the
    # file of the chart above; it now uses the video-view column and its own file.
    _save_scatter_plot(
        df, 'TotalEngagementWithVideoViews', 'MediaType',
        f'Total Enagement by Media Type with Video Views for {twitterUser}',
        f'{twitterUser}_TotalEnagementByMediaTypeIncludingVideoViews.png')

    _save_scatter_plot(
        df, 'TotalEngagement', 'Source',
        f'Total Enagement by Source for {twitterUser}',
        f'{twitterUser}_TotalEnagementBySource.png')

    _save_scatter_plot(
        df, 'TotalEngagementWithVideoViews', 'Source',
        f'Total Enagement by Source with Video Views for {twitterUser}',
        f'{twitterUser}_TotalEnagementBySourceIncludingVideoViews.png')

    print(f"Files starting with '{twitterUser}_' are the ones that contain Excel data and charts for {twitterUser}")
    
def clean_output_files(twitterUser):
    """Delete the output files of a previous run for one Twitter user.

    Only the files this notebook writes are removed: the
    '<user>_RawData.xlsx' export and the '<user>_TotalEnagement*.png' charts
    in the current working directory.  A plain '<user>_*' pattern would also
    match the files of any other handle that starts with '<user>_' (handles
    may contain underscores), deleting that user's results.

    Args:
        twitterUser: Handle whose output files should be removed.
    """
    # Escape the handle so characters special to glob are matched literally.
    prefix = glob.escape(twitterUser)
    patterns = (
        f'{prefix}_RawData.xlsx',
        f'{prefix}_TotalEnagement*.png',
    )
    for pattern in patterns:
        for filename in glob.glob(pattern):
            os.remove(filename)
    
#main
# Run the full pipeline once per configured handle: remove the user's earlier output
# files, scrape, parse into a DataFrame, export to Excel and draw the charts.
# twitterUser, results and df are notebook-level names, so after the loop they hold
# the values for the last handle in twitterUsers.
for twitterUser in twitterUsers:
    print('')
    print(f'Processing {twitterUser}')
    # Clean first so files from an earlier run never mix with this run's output.
    clean_output_files(twitterUser)
    results = scrape_twitter(twitterUser)
    df = build_data_frame(twitterUser, results)
    output_raw(twitterUser, df)
    create_scatter_plots(twitterUser, df)
    print(f'Completed {twitterUser}')
