__Initialization__

Run the cells in this section only once per machine.

In [None]:
# Clone the snscrape source repository into the current working directory.
# NOTE(review): no other cell shown here reads from this checkout (the next
# cell installs snscrape with pip) — confirm the clone is still needed.
!git clone https://github.com/JustAnotherArchivist/snscrape

In [None]:
!pip install snscrape
!pip install pandas
!pip install seaborn

Let's get some data for the Twitter handle @SeeFunnyVideo

In [25]:
# Scrape up to 10000 tweets from the SeeFunnyVideo account as JSON Lines and
# redirect them into tweets.jsonl (any existing file is overwritten).
!snscrape --max-results 10000 --jsonl twitter-user SeeFunnyVideo > tweets.jsonl

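If we want images, scrape a different account instead. Note that this overwrites the tweets.jsonl produced by the previous step.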

In [None]:
# Scrape up to 100 tweets from the tcabanski account as JSON Lines. The output
# is redirected to the same tweets.jsonl as the SeeFunnyVideo scrape above,
# replacing its contents.
!snscrape --max-results 100 --jsonl twitter-user tcabanski > tweets.jsonl

Run this command to dump the raw contents of tweets.jsonl. Not recommended: the file can contain thousands of lines.

In [None]:
# Show the raw scraped file.
# NOTE(review): there is no `!` or `%` prefix on this line, so it presumably
# relies on IPython automagic resolving `more` — confirm.
more tweets.jsonl

Set `displayMaxRows` to `None` (`displayMaxRows = None`) to show all rows, or to a number such as 10 (`displayMaxRows = 10`) to limit the output.

In [21]:
# Row limit handed to pandas' 'display.max_rows' option by the cells that
# display DataFrames below.
displayMaxRows = 10

Parse the tweets into a pandas DataFrame

In [26]:
import json
import pandas

def _summarize_tweet(tweet):
    """Flatten one parsed tweet dict into the columns used by this notebook.

    Only the first media attachment is inspected: a Photo yields mediaType
    "Image", a Video yields "Video" (and supplies the view count), and
    anything else, including no media at all, yields the string "None".

    Args:
        tweet: dict decoded from one line of the scraped JSONL file.

    Returns:
        dict with one entry per output column, in display order.

    Raises:
        KeyError: if an expected field is missing from the tweet.
    """
    media_type = "None"
    views = 0
    media = tweet["media"]
    if media:  # falsy for both None and an empty list
        first_type = media[0]["_type"]
        if first_type == "snscrape.modules.twitter.Photo":
            media_type = "Image"
        elif first_type == "snscrape.modules.twitter.Video":
            media_type = "Video"
            views = media[0]["views"]

    return {
        "tweetId": tweet["id"],
        "tweetDate": tweet["date"],
        "replies": tweet["replyCount"],
        "retweets": tweet["retweetCount"],
        "likes": tweet["likeCount"],
        "quotes": tweet["quoteCount"],
        "source": tweet["sourceLabel"],
        # The boolean flags are derived from media_type so they cannot disagree with it.
        "isVideo": media_type == "Video",
        "isImage": media_type == "Image",
        "videoViews": views,
        "mediaType": media_type,
    }


# Stream the file line by line instead of loading every line up front, and
# skip blank lines (e.g. a trailing newline), which json.loads would reject.
records = []
with open('./tweets.jsonl', 'r', encoding='utf-8') as json_file:
    for json_str in json_file:
        if json_str.strip():
            records.append(_summarize_tweet(json.loads(json_str)))

# Build the frame once from the list of records; later cells read `output`.
output = pandas.DataFrame(records)

with pandas.option_context('display.max_rows', displayMaxRows):
    display(output)

Unnamed: 0,tweetId,tweetDate,replies,retweets,likes,quotes,source,isVideo,isImage,videoViews,mediaType
0,1495901540074553344,2022-02-21T23:21:34+00:00,2,6,38,2,Twitter Web App,True,False,1285327,Video
1,1495898357189218306,2022-02-21T23:08:55+00:00,2,7,46,1,Twitter Web App,True,False,15827,Video
2,1495897638168064000,2022-02-21T23:06:04+00:00,1,6,37,0,Twitter Web App,True,False,21866,Video
3,1495896941888417792,2022-02-21T23:03:18+00:00,0,6,20,0,Twitter Web App,False,False,0,
4,1495338791455768576,2022-02-20T10:05:24+00:00,3,10,51,3,Twitter Web App,True,False,195064,Video
...,...,...,...,...,...,...,...,...,...,...,...
6734,243402654573813760,2012-09-05T17:38:09+00:00,0,0,0,0,Twitter Web Client,False,False,0,
6735,242995079395020801,2012-09-04T14:38:36+00:00,0,0,0,0,Twitter Web Client,False,False,0,
6736,242939182488645633,2012-09-04T10:56:29+00:00,0,0,0,0,Twitter Web Client,False,False,0,
6737,242875399019835392,2012-09-04T06:43:02+00:00,0,0,0,0,Twitter Web Client,False,False,0,


In [27]:
# Convert the timestamp strings to datetimes, then total the retweets for each
# media type within weekly bins anchored on Monday ('W-MON'), ordered by date.
# `output` is rebound from per-tweet rows to this aggregate, which is what the
# scatter plot cell reads.
weekly_bins = pandas.Grouper(key='tweetDate', freq='W-MON')
output = (
    output
    .assign(tweetDate=pandas.to_datetime(output['tweetDate']))
    .groupby(['mediaType', weekly_bins])['retweets']
    .sum()
    .reset_index()
    .sort_values('tweetDate')
)

with pandas.option_context('display.max_rows', displayMaxRows):
    display(output)

Unnamed: 0,mediaType,tweetDate,retweets
39,,2012-05-28 00:00:00+00:00,0
40,,2012-09-10 00:00:00+00:00,0
41,,2013-03-25 00:00:00+00:00,0
42,,2013-04-01 00:00:00+00:00,0
43,,2013-04-29 00:00:00+00:00,0
...,...,...,...
242,Video,2022-02-07 00:00:00+00:00,54
169,,2022-02-07 00:00:00+00:00,11
243,Video,2022-02-14 00:00:00+00:00,21
170,,2022-02-21 00:00:00+00:00,6


Basic scatter plot

In [None]:
import seaborn
import matplotlib.pyplot as plt
 
seaborn.set(style='whitegrid')

# Create the figure and axes explicitly rather than relying on pyplot's
# implicit "current figure", and hand the axes to seaborn.
fig, ax = plt.subplots(figsize=(20, 5))

seaborn.scatterplot(x="tweetDate", y="retweets", hue="mediaType", data=output, ax=ax)

# Label the figure so it can be read on its own. The wording assumes `output`
# holds the weekly per-media-type totals produced by the aggregation cell.
ax.set(title="Weekly retweets by media type", xlabel="Week", ylabel="Retweets")

# Anchor the legend outside the plotting area; the trailing ';' hides the repr.
ax.legend(loc='upper right', bbox_to_anchor=(1.2, 1));