In [1]:
import os
import re
import time
from getpass import getpass

import pandas as pd

from twitter.scraper import Scraper
from twitter.util import find_key

In [62]:
# Credentials are read from the environment so they are never saved inside the
# notebook; anything that is not set is asked for interactively instead
# (the password prompt does not echo what is typed).
email = os.environ.get('TWITTER_EMAIL') or input('Twitter email: ')
username = os.environ.get('TWITTER_USERNAME') or input('Twitter username: ')
password = os.environ.get('TWITTER_PASSWORD') or getpass('Twitter password: ')

# Client used by the cells below to fetch timelines.
scraper = Scraper(email, username, password)

In [None]:
# Numeric id of the account whose timeline is analysed below.
user_id = 33836629

# Each call is given a one-element list of ids; `.pop()` keeps the last item of
# what comes back (presumably one result per requested id -- TODO confirm).
tweets = scraper.tweets([user_id]).pop()
tweets_and_replies = scraper.tweets_and_replies([user_id]).pop()

### Find all unique URLs in the user's tweets

In [63]:
# De-duplicate every value that `find_key` returns for 'expanded_url' in the
# fetched tweets, then display the resulting set.
unique_urls = {url for url in find_key(tweets, 'expanded_url')}
unique_urls

### Get summary of user tweet data

In [57]:
# Gather every 'legacy' payload found in the entries of both fetched timelines.
# `find_key` is used as "give me the list of values stored under this key";
# `.pop()` keeps the last match for 'instructions' and for 'entries'.
tweet_data = []
for d in tweets + tweets_and_replies:
    instructions = find_key(d, 'instructions').pop()
    entries = find_key(instructions, 'entries').pop()
    for entry in entries:
        legacy = find_key(entry, 'legacy')
        tweet_data.extend(legacy)

user_key = 'can_dm'  # filter using arbitrary key that only users have

# Keep only the payloads that do NOT carry the user-only key. The membership
# test is made once per payload, so each tweet appears exactly once and user
# objects are really excluded. A list (not a generator) is used so the cells
# below can be re-run without re-running this one.
expr = [x for x in tweet_data if user_key not in x]

In [60]:
## Tweet fields kept in the summary table (a small subset of what is available).
## Other fields that could be added: retweeted, conversation_id_str, favorited,
## is_quote_status, lang, quoted_status_id_str.
cols = [
    'user_id_str',
    'id_str',
    'created_at',
    'favorite_count',
    'full_text',
    'quote_count',
    'reply_count',
    'retweet_count',
]

## Engagement counters that are converted to numbers (unparseable values -> NaN).
numeric = [
    'favorite_count',
    'quote_count',
    'reply_count',
    'retweet_count',
]

## Build the table in a single chain: select columns, parse types,
## drop rows without an id, de-duplicate on id, newest first.
df = (
    pd.DataFrame(expr)
    .loc[:, cols]
    .assign(
        created_at=lambda t: pd.to_datetime(t['created_at'], format="%a %b %d %H:%M:%S %z %Y"),
        **{c: (lambda t, c=c: pd.to_numeric(t[c], errors='coerce')) for c in numeric},
    )
    .dropna(subset='id_str')
    .drop_duplicates(subset='id_str')
    .sort_values('created_at', ascending=False)
    .reset_index(drop=True)
)

## Persist a snapshot named after the current time in nanoseconds.
## Alternative formats:
# df.to_feather(f'{time.time_ns()}.feather')
# df.to_parquet(f'{time.time_ns()}.parquet')
df.to_csv(f'{time.time_ns()}.csv', index=False)

df

Unnamed: 0,user_id_str,id_str,created_at,favorite_count,full_text,quote_count,reply_count,retweet_count
0,33836629,1637213069301649408,2023-03-18 22:03:08+00:00,69.0,@theamazingdrj Yes the integration right into ...,1.0,6.0,4.0
1,1181493805356158978,1637212448674684928,2023-03-18 22:00:40+00:00,9.0,@karpathy How does it compare to using chatGPT...,0.0,2.0,1.0
2,33836629,1637188599967027200,2023-03-18 20:25:54+00:00,13.0,@ErikSchluntz Very likely,0.0,1.0,1.0
3,1374841081293021188,1637183652458283008,2023-03-18 20:06:14+00:00,6.0,@karpathy Do you think this will work well for...,0.0,1.0,0.0
4,33836629,1637154111333494784,2023-03-18 18:08:51+00:00,5.0,@aliapanahi logprobs kwarg https://t.co/4Uuh4V...,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...
219,33836629,1600031572442218497,2022-12-06 07:37:08+00:00,248.0,😂 stop Riley probably up there as someone who ...,2.0,8.0,12.0
220,16535432,1600012570949058560,2022-12-06 06:21:38+00:00,1698.0,To get a sense of how hyped LLMs are right now...,18.0,47.0,96.0
221,33836629,1593417987687473152,2022-11-18 01:37:07+00:00,206.0,If previous neural nets are special-purpose co...,5.0,2.0,16.0
222,33836629,1528792715810394112,2022-05-23 17:39:21+00:00,3044.0,Something I've been doing for a few years that...,42.0,184.0,115.0


### Search tweet text

In [56]:
# Case-insensitive regex search over the tweet text. `na=False` makes rows with
# missing text count as "no match", so the boolean mask never contains NaN
# (a NaN in the mask would make the row selection raise).
df[df.full_text.str.contains('repos?i?|github', regex=True, flags=re.I, na=False)]

Unnamed: 0,user_id_str,id_str,created_at,favorite_count,full_text,quote_count,reply_count,retweet_count
9,52667700,1637152715716583424,2023-03-18 18:03:18+00:00,99.0,@karpathy Sometimes I wish people could unders...,2.0,1.0,5.0
14,33836629,1637147822482165760,2023-03-18 17:43:52+00:00,325.0,"If not careful, fine-tuning collapses entropy ...",5.0,9.0,21.0
17,788533935886077952,1636786608916819968,2023-03-17 17:48:32+00:00,411.0,I finally installed github copilot (better lat...,5.0,15.0,14.0
18,33836629,1636765735627395073,2023-03-17 16:25:35+00:00,22.0,@BlancheMinerva @JosephJacks_ I didn’t work on...,0.0,4.0,1.0
20,33836629,1636459245184106497,2023-03-16 20:07:42+00:00,1254.0,Less publicized but highly awesome aspect of G...,10.0,38.0,132.0
144,33836629,1620875263700799488,2023-02-01 20:02:31+00:00,10.0,@portisto @trending_repos sad. The way they co...,0.0,1.0,2.0
145,65629552,1620850430254223360,2023-02-01 18:23:51+00:00,7.0,@trending_repos @karpathy How can a main langu...,0.0,4.0,0.0
146,33836629,1620811724952866816,2023-02-01 15:50:03+00:00,245.0,@trending_repos wow,0.0,6.0,4.0
147,1162359127294861314,1620749130556669952,2023-02-01 11:41:19+00:00,2541.0,Trending repository of the month 🏆\n \nnanoGP...,9.0,19.0,320.0
150,33836629,1620187595979513857,2023-01-30 22:29:59+00:00,15.0,@hi_tysam It was very nice to read through top...,0.0,1.0,2.0


In [55]:
# Regex flags referenced from inside the query string through `@flags`.
flags = re.I

# Same kind of text search written with `DataFrame.query`; the python engine is
# requested because the expression calls a `.str` method. `na=False` makes rows
# with missing text count as "no match" instead of producing NaN in the mask.
# Further `.query(...)` calls can be chained to narrow the result.
(df
 .query('full_text.str.contains("nanogpt", regex=True, flags=@flags, na=False)', engine='python')
 # .query(...)
 # .query(...)
 )

Unnamed: 0,user_id_str,id_str,created_at,favorite_count,full_text,quote_count,reply_count,retweet_count
58,1615441883672502291,1632577588529954819,2023-03-06 03:03:23+00:00,91.0,Speed up your LLM research exploration with a ...,2.0,3.0,14.0
143,33836629,1621578354024677377,2023-02-03 18:36:21+00:00,5276.0,The most dramatic optimization to nanoGPT so f...,57.0,89.0,353.0
147,1162359127294861314,1620749130556669952,2023-02-01 11:41:19+00:00,2541.0,Trending repository of the month 🏆\n \nnanoGP...,9.0,19.0,320.0
172,33836629,1615398117683388417,2023-01-17 17:18:18+00:00,21166.0,"🔥 New (1h56m) video lecture: ""Let's build GPT:...",331.0,546.0,3321.0
178,33836629,1613250487838707712,2023-01-11 19:04:23+00:00,2257.0,Didn't tweet nanoGPT yet (quietly getting it t...,24.0,39.0,303.0
