__Initialization__

Assumes Python 3

---------------- Only run once on the machine to get things configured -------------------


In [None]:
!echo 'This could take awhile. Status at bottom will show idle when done.'
!git clone https://github.com/JustAnotherArchivist/snscrape
!echo done

In [None]:
!echo 'This could take awhile. Status at bottom will show idle when done.'
!pip install snscrape
!pip install pandas
!pip install seaborn
!echo done

---------------------------- Below here can be run multiple times --------------------------------

__Parse tweets into a Pandas data set__

Run this cell before any of the plotting cells below; they all read the `output` data set that it creates.

Parameters

* Set displayMaxRows to None (displayMaxRows = None) to show all rows or use a number, like 10, to limit output (displayMaxRows = 10)
* Set maxResults to None if you want all tweets or limit by setting a number
* Set since to None for all dates or use YYYY-MM-DD format to get all tweets since that date
* Set twitterUser to the user name of the account whose tweets should be scraped (for example, atrupar)

In [None]:
import json
import pandas

# Parameters
displayMaxRows = 10
maxResults = None
since = '2020-01-01'
twitterUser = 'atrupar'

temp = []

if maxResults is None:
    maxResultsParam = ''
else:
    maxResultsParam = f'--max-results {maxResults}'
    
if since is None:
    sinceParam = ''
else:
    sinceParam = f'--since {since}'
    
print(f'Running: snscrape {maxResultsParam} --jsonl {sinceParam} twitter-user {twitterUser}')
print('This can take awhile. The status bar at the bottom of the screen will say Busy until this is done.')

results  = !snscrape {maxResultsParam} --jsonl {sinceParam} twitter-user {twitterUser}

print('Done scraping Twitter')

def _tweetToRecord(result):
    """Flatten one parsed tweet (a dict decoded from a JSON line) into a table row.

    Only the first attached medium is inspected to classify the tweet as
    'Image', 'Video' or 'None'. A video whose view count is missing or null
    contributes 0 views, so the engagement totals always stay numeric.

    TotalEngagement is replies + retweets + likes + quotes;
    TotalEngagementWithVideoViews additionally adds the video views.
    """
    isVideo = False
    isImage = False
    mediaType = 'None'
    views = 0

    # `media` may be absent, null or an empty list for tweets without attachments.
    media = result.get('media')
    if media:
        firstMedium = media[0]
        firstType = firstMedium.get('_type')
        if firstType == 'snscrape.modules.twitter.Photo':
            isImage = True
            mediaType = 'Image'
        elif firstType == 'snscrape.modules.twitter.Video':
            isVideo = True
            mediaType = 'Video'
            # A null/missing view count would break the addition below.
            views = firstMedium.get('views') or 0

    totalEngagement = (
        result['replyCount']
        + result['retweetCount']
        + result['likeCount']
        + result['quoteCount']
    )
    totalEngagementWithVideoViews = totalEngagement + views

    return {
        'TweetId': result['id'],
        'TweetDate': result['date'],
        'Replies': result['replyCount'],
        'Retweets': result['retweetCount'],
        'Likes': result['likeCount'],
        'Quotes': result['quoteCount'],
        'Source': result['sourceLabel'],
        'IsVideo': isVideo,
        'IsImage': isImage,
        'VideoViews': views,
        'MediaType': mediaType,
        'TotalEngagement': totalEngagement,
        'TotalEngagementWithVideoViews': totalEngagementWithVideoViews,
    }


# Column layout of the data set. Passing it explicitly keeps the columns
# present even when the scrape returned nothing.
recordColumns = [
    'TweetId', 'TweetDate', 'Replies', 'Retweets', 'Likes', 'Quotes', 'Source',
    'IsVideo', 'IsImage', 'VideoViews', 'MediaType',
    'TotalEngagement', 'TotalEngagementWithVideoViews',
]

for json_str in results:
    json_str = json_str.strip()
    if not json_str:
        continue

    # Anything that is not a JSON object (for example a warning printed by the
    # command) is reported and skipped instead of aborting the whole run.
    try:
        result = json.loads(json_str)
    except json.JSONDecodeError:
        result = None
    if not isinstance(result, dict):
        print(f'Skipping non-JSON output line: {json_str}')
        continue

    temp.append(_tweetToRecord(result))

output = pandas.DataFrame(temp, columns=recordColumns)

# Put TweetDate into proper date format
output['TweetDate'] = pandas.to_datetime(output['TweetDate'])

with pandas.option_context('display.max_rows', displayMaxRows):
    display(output)

Output the tweet data to a CSV

In [None]:
# Write the parsed tweets to a CSV file. The path is relative, so the file
# lands in the kernel's current working directory. The DataFrame's row index
# is written as the first column (pandas default).
fileName = 'output.csv'
output.to_csv(path_or_buf=fileName)
print('File created')

PLOT: Total engagement scatter with colors by media type

In [None]:
import seaborn
import matplotlib.pyplot as plt
    
seaborn.set(style='whitegrid')
fig, ax = plt.subplots(figsize=(20, 5))

# Weekly totals per media type: tweets are grouped into weekly bins anchored
# on Mondays ('W-MON') and both engagement columns are summed per bin.
outputGrouped = (
    output
    .groupby(['MediaType', pandas.Grouper(key='TweetDate', freq='W-MON')])
    .agg({'TotalEngagement': 'sum', 'TotalEngagementWithVideoViews': 'sum'})
    .reset_index()
    .sort_values('TweetDate')
)

# Show the aggregated frame that is plotted below.
with pandas.option_context('display.max_rows', displayMaxRows):
    display(outputGrouped)

scatter = seaborn.scatterplot(
    x='TweetDate', y='TotalEngagement', hue='MediaType',
    data=outputGrouped, ax=ax,
)
scatter.set(title='Total Engagement')

# The legend's upper-right corner is anchored at x=1.2 in axes coordinates,
# i.e. to the right of the plotting area.
ax.legend(loc='upper right', bbox_to_anchor=(1.2, 1));

PLOT: Total engagement scatter with colors by media type on a log scale

In [None]:
import seaborn
import matplotlib.pyplot as plt
    
seaborn.set(style='whitegrid')
fig, ax = plt.subplots(figsize=(20, 5))

# Weekly totals per media type: tweets are grouped into weekly bins anchored
# on Mondays ('W-MON') and both engagement columns are summed per bin.
outputGrouped = (
    output
    .groupby(['MediaType', pandas.Grouper(key='TweetDate', freq='W-MON')])
    .agg({'TotalEngagement': 'sum', 'TotalEngagementWithVideoViews': 'sum'})
    .reset_index()
    .sort_values('TweetDate')
)

# Show the aggregated frame that is plotted below.
with pandas.option_context('display.max_rows', displayMaxRows):
    display(outputGrouped)

scatter = seaborn.scatterplot(
    x='TweetDate', y='TotalEngagement', hue='MediaType',
    data=outputGrouped, ax=ax,
)
# NOTE: a weekly total of 0 cannot be drawn on a logarithmic axis.
scatter.set(title='Total Engagement (Log scale)', yscale='log')

# The legend's upper-right corner is anchored at x=1.2 in axes coordinates,
# i.e. to the right of the plotting area.
ax.legend(loc='upper right', bbox_to_anchor=(1.2, 1));

PLOT: Total engagement including video views scatter with colors by media type on a log scale

In [None]:
import seaborn
import matplotlib.pyplot as plt
    
seaborn.set(style='whitegrid')
fig, ax = plt.subplots(figsize=(20, 5))

# Weekly totals per media type: tweets are grouped into weekly bins anchored
# on Mondays ('W-MON') and both engagement columns are summed per bin.
outputGrouped = (
    output
    .groupby(['MediaType', pandas.Grouper(key='TweetDate', freq='W-MON')])
    .agg({'TotalEngagement': 'sum', 'TotalEngagementWithVideoViews': 'sum'})
    .reset_index()
    .sort_values('TweetDate')
)

# Show the aggregated frame that is plotted below.
with pandas.option_context('display.max_rows', displayMaxRows):
    display(outputGrouped)

scatter = seaborn.scatterplot(
    x='TweetDate', y='TotalEngagementWithVideoViews', hue='MediaType',
    data=outputGrouped, ax=ax,
)
# NOTE: a weekly total of 0 cannot be drawn on a logarithmic axis.
scatter.set(title='Total Engagement with Video Views (Log scale)', yscale='log')

# The legend's upper-right corner is anchored at x=1.2 in axes coordinates,
# i.e. to the right of the plotting area.
ax.legend(loc='upper right', bbox_to_anchor=(1.2, 1));

PLOT: Total engagement scatter with colors by source

In [None]:
import seaborn
import matplotlib.pyplot as plt
    
seaborn.set(style='whitegrid')
fig, ax = plt.subplots(figsize=(20, 5))

# Weekly totals per tweet source: tweets are grouped into weekly bins anchored
# on Mondays ('W-MON') and both engagement columns are summed per bin.
outputGrouped = (
    output
    .groupby(['Source', pandas.Grouper(key='TweetDate', freq='W-MON')])
    .agg({'TotalEngagement': 'sum', 'TotalEngagementWithVideoViews': 'sum'})
    .reset_index()
    .sort_values('TweetDate')
)

# Show the aggregated frame that is plotted below.
with pandas.option_context('display.max_rows', displayMaxRows):
    display(outputGrouped)

scatter = seaborn.scatterplot(
    x='TweetDate', y='TotalEngagement', hue='Source',
    data=outputGrouped, ax=ax,
)
scatter.set(title='Total Engagement by Source')

# The legend's upper-right corner is anchored at x=1.2 in axes coordinates,
# i.e. to the right of the plotting area.
ax.legend(loc='upper right', bbox_to_anchor=(1.2, 1));

PLOT: Total engagement scatter with colors by source on a log scale

In [None]:
import seaborn
import matplotlib.pyplot as plt
    
seaborn.set(style='whitegrid')
fig, ax = plt.subplots(figsize=(20, 5))

# Weekly totals per tweet source: tweets are grouped into weekly bins anchored
# on Mondays ('W-MON') and both engagement columns are summed per bin.
outputGrouped = (
    output
    .groupby(['Source', pandas.Grouper(key='TweetDate', freq='W-MON')])
    .agg({'TotalEngagement': 'sum', 'TotalEngagementWithVideoViews': 'sum'})
    .reset_index()
    .sort_values('TweetDate')
)

# Show the aggregated frame that is plotted below.
with pandas.option_context('display.max_rows', displayMaxRows):
    display(outputGrouped)

scatter = seaborn.scatterplot(
    x='TweetDate', y='TotalEngagement', hue='Source',
    data=outputGrouped, ax=ax,
)
# NOTE: a weekly total of 0 cannot be drawn on a logarithmic axis.
scatter.set(title='Total Engagement by Source (Log scale)', yscale='log')

# The legend's upper-right corner is anchored at x=1.2 in axes coordinates,
# i.e. to the right of the plotting area.
ax.legend(loc='upper right', bbox_to_anchor=(1.2, 1));

PLOT: Total engagement including video views scatter with colors by source on a log scale

In [None]:
import seaborn
import matplotlib.pyplot as plt
    
seaborn.set(style='whitegrid')
fig, ax = plt.subplots(figsize=(20, 5))

# Weekly totals per tweet source: tweets are grouped into weekly bins anchored
# on Mondays ('W-MON') and both engagement columns are summed per bin.
outputGrouped = (
    output
    .groupby(['Source', pandas.Grouper(key='TweetDate', freq='W-MON')])
    .agg({'TotalEngagement': 'sum', 'TotalEngagementWithVideoViews': 'sum'})
    .reset_index()
    .sort_values('TweetDate')
)

# Show the aggregated frame that is plotted below.
with pandas.option_context('display.max_rows', displayMaxRows):
    display(outputGrouped)

scatter = seaborn.scatterplot(
    x='TweetDate', y='TotalEngagementWithVideoViews', hue='Source',
    data=outputGrouped, ax=ax,
)
# NOTE: a weekly total of 0 cannot be drawn on a logarithmic axis.
scatter.set(title='Total Engagement by Source with Video Views (Log scale)', yscale='log')

# The legend's upper-right corner is anchored at x=1.2 in axes coordinates,
# i.e. to the right of the plotting area.
ax.legend(loc='upper right', bbox_to_anchor=(1.2, 1));