### polars/pandas examples

In [1]:
# !pip uninstall twitter-api-client -y
# !pip install twitter-api-client --no-cache-dir

In [2]:
import re
from pathlib import Path

import orjson
import pandas as pd
import polars as pl

from twitter.util import find_key

In [3]:
def to_int(tdf: pl.LazyFrame, *args) -> pl.LazyFrame:
    """Cast the named columns of ``tdf`` to ``pl.Int64``.

    The cast is requested with ``strict=False`` and every result is aliased
    back to its source column name, so each column is replaced rather than
    added alongside the original.

    Args:
        tdf: Frame whose columns are cast.
        *args: Names of the columns to cast.

    Returns:
        The result of ``tdf.with_columns`` applied to the cast expressions.
    """
    int_casts = [
        pl.col(name).cast(pl.Int64, strict=False).alias(name)
        for name in args
    ]
    return tdf.with_columns(int_casts)


def to_dt(tdf: pl.LazyFrame, fmt: str, *args) -> pl.LazyFrame:
    """Parse the named string columns of ``tdf`` into ``pl.Datetime``.

    Every parsed column is aliased back to its source column name, so each
    column is replaced rather than added alongside the original.

    Args:
        tdf: Frame whose columns are parsed.
        fmt: Format string handed to ``str.strptime``.
        *args: Names of the columns to parse.

    Returns:
        The result of ``tdf.with_columns`` applied to the parse expressions.
    """
    parsed = [
        pl.col(name).str.strptime(pl.Datetime, fmt).alias(name)
        for name in args
    ]
    return tdf.with_columns(parsed)


def get_data(path: Path, expr: str = '', **kwargs) -> dict:
    """Load JSON files found recursively under ``path``, grouped by name suffix.

    Args:
        path: Root directory; walked recursively with ``rglob('*')``.
        expr: Regular expression searched (``re.search``) in each file name.
            The default ``''`` matches every name.
        **kwargs: Forwarded to ``re.search`` (e.g. ``flags=re.I``).

    Returns:
        Dict mapping the last ``_``-separated token of each matching file's
        stem to a list holding the parsed content of every file that shares
        that token.
    """
    D = {}
    for p in path.rglob('*'):
        # rglob('*') yields directories as well as files. Reading a directory
        # raises, and with the default expr every directory name matches, so
        # only regular files are considered.
        if p.is_file() and re.search(expr, p.name, **kwargs):
            D.setdefault(p.stem.split('_')[-1], []).append(orjson.loads(p.read_bytes()))
    return D

In [4]:
# Root directory that get_data() walks recursively for JSON files.
PATH = Path('data/raw')

# Keep only files whose name matches Favoriters or Retweeters,
# i.e. users who favorited or retweeted a tweet.
data = get_data(path=PATH, expr='Favoriters|Retweeters')

### polars

In [5]:
def get_user_details(data: dict, cols: list = None, sort: str = 'created_at') -> pl.LazyFrame:
    """Build a lazy frame of unique users found under ``user_results`` in ``data``.

    Every ``user_results`` entry contributes one row made of its ``rest_id``
    plus the fields of its ``legacy`` dict; entries whose ``legacy`` dict is
    missing or empty are skipped.

    Args:
        data: Nested structure searched with ``find_key`` for ``user_results``.
        cols: Columns to keep, in the given order. When ``None`` or empty, all
            columns are kept.
        sort: Column to sort by. NOTE: a ``-`` in the value selects *ascending*
            order; without it the frame is sorted in descending order.

    Returns:
        A ``pl.LazyFrame`` deduplicated on ``rest_id``, with ``created_at``
        parsed as a datetime and the count columns cast to ``Int64``.
    """
    numeric = [
        'fast_followers_count',
        'favourites_count',
        'followers_count',
        'friends_count',
        'listed_count',
        'media_count',
        'normal_followers_count',
        'statuses_count',
    ]

    D = []
    for u in find_key(data, 'user_results'):
        x = u.get('result', {})
        y = x.get('rest_id')
        if z := x.get('legacy', {}):
            D.append({'rest_id': y} | z)

    lf = (
        pl.LazyFrame(D)
        .unique(subset='rest_id')
        .pipe(to_dt, '%a %b %d %H:%M:%S %z %Y', 'created_at')
        .pipe(to_int, *numeric)
        .sort(sort.strip("-"), descending="-" not in sort)
    )
    # ``select(None)`` does not mean "all columns", so only project when a
    # column list was actually supplied (same rule as get_user_details2).
    return lf.select(cols) if cols else lf

In [6]:
# get_user_details returns a LazyFrame; with this helper a '-' in `sort`
# selects ascending order.
lf = get_user_details(data, cols=['created_at', 'screen_name', 'followers_count'], sort='-created_at')

# Materialize the lazy query so the cell displays the resulting frame.
lf.collect()

created_at,screen_name,followers_count
"datetime[μs, +00:00]",str,i64
2007-03-31 01:16:45 +00:00,"""TheLos""",1601
2008-03-18 19:04:59 +00:00,"""wickedjava""",2986
2008-04-17 17:30:21 +00:00,"""needless_input...",218
2008-06-27 08:58:13 +00:00,"""DebrisStorm""",178
2008-07-26 21:58:07 +00:00,"""daka17""",66
2008-09-03 23:27:25 +00:00,"""heyitsaaron""",1230
2008-09-11 23:37:14 +00:00,"""marinamiss""",771
2008-09-18 13:59:25 +00:00,"""shangrila79""",229
2008-10-11 07:18:09 +00:00,"""fridayschild71...",183
2008-10-27 19:40:43 +00:00,"""Jacelendrahz""",188


### pandas

In [7]:
def get_user_details2(data: dict, cols: list = None, sort: str = 'created_at') -> pd.DataFrame:
    """Build a DataFrame of unique users found under ``user_results`` in ``data``.

    Every ``user_results`` entry contributes one row made of its ``rest_id``
    plus the fields of its ``legacy`` dict; entries whose ``legacy`` dict is
    missing or empty are skipped.

    Args:
        data: Nested structure searched with ``find_key`` for ``user_results``.
        cols: Columns to keep, in the given order. When ``None`` or empty, all
            columns are kept.
        sort: Column to sort by. NOTE: a ``-`` in the value selects *ascending*
            order; without it the frame is sorted in descending order.

    Returns:
        A DataFrame deduplicated on ``rest_id`` with a fresh 0..n-1 index,
        ``created_at`` parsed as a datetime, and every column whose name
        contains ``count`` coerced to numeric (unparseable values become NaN).
    """
    D = []
    for u in find_key(data, 'user_results'):
        x = u.get('result', {})
        y = x.get('rest_id')
        if z := x.get('legacy', {}):
            D.append({'rest_id': y} | z)
    df = (
        pd.DataFrame(D)
        .drop_duplicates('rest_id')
        # Explicit format (the same one the polars variant uses) so pandas
        # does not have to infer it for every value.
        .assign(created_at=lambda x: pd.to_datetime(x['created_at'], format='%a %b %d %H:%M:%S %z %Y'))
    )
    # Coerce the count columns before sorting so that sorting by one of them
    # compares numbers rather than whatever raw values were loaded.
    n = [x for x in df.columns if 'count' in x]
    df[n] = df[n].apply(pd.to_numeric, errors='coerce')
    df = (
        df
        .sort_values(sort.strip('-'), ascending='-' in sort)
        .reset_index(drop=True)
    )
    return df[cols] if cols else df

In [8]:
# Root directory that get_data() walks recursively for JSON files.
PATH = Path('data/raw')

# filter for users who favorited or retweeted a tweet
data = get_data(path=PATH, expr='Favoriters|Retweeters')

# With this helper a '-' in `sort` selects ascending order.
df = get_user_details2(
    data,
    cols=['created_at', 'screen_name', 'followers_count'],
    sort='-created_at',
)

df

Unnamed: 0,created_at,screen_name,followers_count
0,2007-03-31 01:16:45+00:00,TheLos,1601
1,2008-03-18 19:04:59+00:00,wickedjava,2986
2,2008-04-17 17:30:21+00:00,needless_input,218
3,2008-06-27 08:58:13+00:00,DebrisStorm,178
4,2008-07-26 21:58:07+00:00,daka17,66
...,...,...,...
1850,2023-02-19 07:06:15+00:00,Later_Hayter,54
1851,2023-02-21 06:47:49+00:00,hart_kanya,2
1852,2023-02-26 09:43:04+00:00,_Val_Nichole,62
1853,2023-03-04 23:50:32+00:00,Chublosophy,346
