In [1]:
import tweepy
import yaml
import pandas as pd
from pandas import json_normalize

In [2]:
# Read the API credentials from config.yml into the dict `cf`.
with open('config.yml', "r") as stream:
    try:
        cf = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        # Show the parse error, then re-raise: without a parsed config `cf`
        # is undefined and every later cell would fail with a NameError
        # that hides the real cause.
        print(exc)
        raise


# OAuth 1.0a user handler built from the four credentials in the config.
auth = tweepy.OAuth1UserHandler(
    cf['twitter_key'],
    cf['twitter_secret'],
    cf['twitter_access_key'],
    cf['twitter_access_secret']
)

In [70]:
def get_tweets(res, fields=None):
    """Flatten one response page of tweets into a DataFrame indexed by tweet id.

    Parameters
    ----------
    res : object with a ``data`` attribute
        One page of results. ``data`` is an iterable of tweet objects whose
        fields are readable as attributes; a ``None`` or empty ``data``
        (page without tweets) yields an empty frame instead of raising.
    fields : list of str, optional
        Tweet attributes to read in addition to ``id`` and ``text``.
        Defaults to the notebook-level ``tweet_fields`` list.

    Returns
    -------
    pandas.DataFrame
        One row per tweet, indexed by ``id``. When ``public_metrics`` is among
        the fields, its dict is expanded into one column per metric and the
        original column is dropped.
    """
    # https://stackoverflow.com/questions/70371657/problem-with-getting-tweet-fields-from-twitter-api-2-0-using-tweepy
    # Inspired by https://stackoverflow.com/a/34065538/1391964
    if fields is None:
        fields = tweet_fields
    # dict.fromkeys removes duplicates while keeping first-seen order.
    columns = list(dict.fromkeys(['id', 'text', *fields]))
    records = [{fn: getattr(tweet, fn) for fn in columns} for tweet in (res.data or [])]
    df = pd.DataFrame(records, columns=columns)
    if records and 'public_metrics' in df.columns:
        # Both frames still share the default positional index here, so the
        # column-wise concat lines the metrics up with their tweets.
        metrics = json_normalize(df['public_metrics'].tolist())
        df = pd.concat([df.drop(columns='public_metrics'), metrics], axis=1)
    else:
        df = df.drop(columns='public_metrics', errors='ignore')
    return df.set_index('id')

def get_context(df):
    """Flatten the ``context_annotations`` column into one row per annotation.

    Rows whose annotation list is empty are skipped. Nested keys of each
    annotation are joined with ``_`` to form the column names, and every
    output row keeps the index label of the row it came from.
    """
    # Keep only rows that carry at least one annotation.
    has_annotations = df['context_annotations'].str.len() > 0
    annotated = df.loc[has_annotations, ['context_annotations']]

    # One row per annotation; the source index label is repeated as needed.
    exploded = annotated.explode('context_annotations')

    flat = json_normalize(exploded['context_annotations'], sep="_")
    return flat.set_index(exploded.index)

def get_entities(df):
    """Split the ``entities`` column of ``df`` into one table per entity type.

    Returns a dict that maps each entity type (a key found in the ``entities``
    dicts) to a DataFrame with one row per entity occurrence. Rows keep the
    index label of the row they came from, and every column name is prefixed
    with ``<type>_``.
    """
    # One column per entity type; each cell holds that row's list of entities.
    per_type = json_normalize(df['entities']).set_index(df.index)

    def _flatten(kind):
        # One row per entity, repeating the source row's index label.
        exploded = per_type[kind].explode()
        table = json_normalize(exploded).set_index(exploded.index)
        # Rows that had no entity of this kind come out entirely empty.
        return table.dropna(how='all').add_prefix(kind + '_')

    return {kind: _flatten(kind) for kind in per_type.columns}

In [63]:
# API client authenticated with the bearer token from the config file.
client = tweepy.Client(bearer_token=cf['twitter_bearer'])

# Tweet attributes requested from the API; get_tweets reads these in
# addition to `id` and `text`.
tweet_fields = ['author_id',
                'attachments',
                'conversation_id',
                'created_at',
                'geo',
                'in_reply_to_user_id',
                'lang',
                'referenced_tweets', # nested array
                'context_annotations', # nested
                'entities', # nested
                'public_metrics'] # expanded with json_normalize in get_tweets

# Collect every page first and concatenate once at the end: calling
# pd.concat inside the loop re-copies all previous rows on each iteration.
pages = []

# @bennstancil
for tweets in tweepy.Paginator(client.get_users_tweets,
                               id=1643966166,
                               tweet_fields=tweet_fields,
                               max_results=100,
                               limit=20):
    pages.append(get_tweets(tweets))
    #df_dict['context'] = get_context(df_dict['tweets'])
    #df_dict = df_dict.update(get_entities(df_dict['tweets']))

# Fall back to an empty frame when the paginator produced no pages.
df = pd.concat(pages, axis=0) if pages else pd.DataFrame()

#res = client.get_users_tweets(id=1643966166,
#                               tweet_fields=tweet_fields,
#                               max_results=100)



BadRequest: 400 Bad Request
The `tweet.fields` query parameter value [referenced_tweets.id] is not one of [attachments,author_id,context_annotations,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang,non_public_metrics,organic_metrics,possibly_sensitive,promoted_metrics,public_metrics,referenced_tweets,reply_settings,source,text,withheld]

In [80]:
# Move the nested columns into their own tables, then remove them from `df`.
# Re-running this cell without rebuilding `df` fails, because the two
# columns it reads are gone after the drop.
df_context = get_context(df)
dict_entities = get_entities(df)
df = df.drop(columns=['context_annotations', 'entities'])

In [81]:
# Show the tweet table (indexed by tweet id) after the nested columns were split out.
df

Unnamed: 0_level_0,text,author_id,attachments,conversation_id,created_at,geo,in_reply_to_user_id,lang,referenced_tweets,retweet_count,reply_count,like_count,quote_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1505641332232007680,@dada_dad_ Speaking as someone who's only work...,1643966166,,1505428143363878912,2022-03-20 20:24:01+00:00,,4.523085e+08,en,"[(type, id)]",0,0,2,0
1505640467018371073,"@leoebfolsom On that, I have no idea, though t...",1643966166,,1502338198416072707,2022-03-20 20:20:35+00:00,,2.198416e+08,en,"[(type, id)]",0,1,1,0
1505639860786262016,"@ryanjanssen @g_xing ah, yeah. I'm skeptical o...",1643966166,,1504863910314315789,2022-03-20 20:18:10+00:00,,1.308685e+08,en,"[(type, id)]",0,1,1,0
1505217827921350659,@jillzzy @g_xing https://t.co/u6wkR0dNNM,1643966166,,1504863910314315789,2022-03-19 16:21:10+00:00,,1.470270e+07,und,"[(type, id), (type, id)]",0,1,2,0
1505029658085150723,@drewwww @g_xing And contrary to Silicon Valle...,1643966166,,1504863910314315789,2022-03-19 03:53:27+00:00,,1.643966e+09,en,"[(type, id)]",0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
374969773286621184,What happens if the US intervenes in Syria? 60...,1643966166,{'media_keys': ['3_374969773295009792']},374969773286621184,2013-09-03 18:58:55+00:00,,,en,,0,0,0,0
373667865259290624,"@scottorn thanks, I'm glad you enjoyed it. I'l...",1643966166,,373611107853672448,2013-08-31 04:45:36+00:00,,8.545872e+06,en,"[(type, id)]",0,0,1,0
373240281916665856,RT @ttunguz: Great new data science blog on @s...,1643966166,,373240281916665856,2013-08-30 00:26:32+00:00,,,en,"[(type, id)]",1,0,0,0
373240214006673408,"@arjunblj thanks! Glad you enjoyed it, and we'...",1643966166,,373161369433952256,2013-08-30 00:26:16+00:00,,2.555251e+07,en,"[(type, id)]",0,0,0,0


In [None]:
# Show the flattened context annotations: one row per annotation, indexed by tweet id.
df_context

In [253]:
# URL entities: one row per URL, indexed by the id of the tweet that contains it.
dict_entities['urls']

Unnamed: 0_level_0,urls_start,urls_end,urls_url,urls_expanded_url,urls_display_url
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1505217827921350659,17.0,40.0,https://t.co/u6wkR0dNNM,https://twitter.com/benedictevans/status/39781...,twitter.com/benedictevans/…
1504904989151571970,235.0,258.0,https://t.co/2j3fmK1NGh,https://twitter.com/pedram_navid/status/150489...,twitter.com/pedram_navid/s…
1504864349206286376,42.0,65.0,https://t.co/vGra6fbHyW,https://docs.google.com/forms/d/e/1FAIpQLSeDj7...,docs.google.com/forms/d/e/1FAI…
1504863910314315789,62.0,85.0,https://t.co/6dg7FcUyT9,https://benn.substack.com/p/startups-shouldnt-...,benn.substack.com/p/startups-sho…
1504158201368961024,136.0,159.0,https://t.co/ZKBVqIcS7e,https://compilerqueen.substack.com/,compilerqueen.substack.com
1504158201368961024,160.0,183.0,https://t.co/S6295yVOWE,https://stkbailey.substack.com/,stkbailey.substack.com
1503204739256827905,70.0,93.0,https://t.co/MwKv8utMUp,https://twitter.com/bennstancil/status/1503204...,pic.twitter.com/MwKv8utMUp
1502339571715133448,14.0,37.0,https://t.co/56fHdxpzHu,https://twitter.com/bennstancil/status/1502339...,pic.twitter.com/56fHdxpzHu
1502338198416072707,55.0,78.0,https://t.co/9r3wFJF0af,https://benn.substack.com/p/the-data-app-store,benn.substack.com/p/the-data-app…
1501772060317605893,224.0,247.0,https://t.co/jrYkZsXVWI,https://twitter.com/spbail/status/150160714761...,twitter.com/spbail/status/…


In [254]:
# Mention entities: one row per mentioned user, indexed by tweet id.
dict_entities['mentions']

Unnamed: 0_level_0,mentions_start,mentions_end,mentions_username,mentions_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1505217827921350659,0.0,8.0,jillzzy,14702696
1505217827921350659,9.0,16.0,g_xing,521285413
1505029658085150723,0.0,8.0,drewwww,15169938
1505029658085150723,9.0,16.0,g_xing,521285413
1505027932779073537,0.0,8.0,drewwww,15169938
...,...,...,...,...
1492279002861604867,0.0,14.0,SriniKadamati,300955028
1492187658528960515,0.0,9.0,gwenshap,9462812
1491966564324761600,0.0,11.0,sarahcat21,2539212208
1491965819349356546,0.0,11.0,sarahcat21,2539212208


In [255]:
# Annotation entities: one row per annotated text span, indexed by tweet id.
dict_entities['annotations']

Unnamed: 0_level_0,annotations_start,annotations_end,annotations_probability,annotations_type,annotations_normalized_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1505029658085150723,33.0,46.0,0.5102,Organization,Silicon Valley
1505027932779073537,17.0,21.0,0.7667,Organization,Tesla
1502345129260195846,157.0,166.0,0.7645,Organization,Salesforce
1501773933321891840,63.0,69.0,0.4578,Product,twitter
1501773933321891840,139.0,143.0,0.7163,Person,steve
1501309804777717761,162.0,168.0,0.5952,Product,twitter
1499839416143167490,20.0,28.0,0.5089,Person,elon musk
1499839416143167490,44.0,48.0,0.9687,Place,texas
1499035976156467201,4.0,5.0,0.8849,Place,US
1499035976156467201,40.0,50.0,0.9848,Place,Afghanistan


In [256]:
# Hashtag entities: one row per hashtag, indexed by tweet id.
dict_entities['hashtags']

Unnamed: 0_level_0,hashtags_start,hashtags_end,hashtags_tag
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1502341825528541194,12.0,25.0,deletewordle
