In [23]:
import pandas as pd
import json

In [22]:
# Load the raw JSONL dump: every non-empty line is kept as one string and is
# later decoded with json.loads into a SearchResult.
# The encoding is pinned to UTF-8 because the tweet text contains emoji and
# open() otherwise falls back to the platform's locale encoding.
# Whitespace-only lines (e.g. a trailing blank line) are dropped so they cannot
# make json.loads fail further down.
with open("../data/raw/tweets.jsonl", encoding="utf-8") as f:
    tweet_searches = [line for line in f if line.strip()]

In [45]:
from pydantic import BaseModel
from typing import Optional, List
from datetime import datetime

class PublicMetricsModel(BaseModel):
    """Four required integer counters, nested under a tweet as ``public_metrics``."""
    # All four fields are mandatory: validation fails if any is missing.
    retweet_count: int
    reply_count: int
    like_count: int
    quote_count: int
        
class TweetModel(BaseModel):
    """One tweet entry from a search result's ``data`` list.

    Field order matters: it is the key order produced by ``.dict()``.
    """
    # Nested counters; excluded and flattened separately when building rows.
    public_metrics: PublicMetricsModel
    text: str
    # Parsed by pydantic into a ``datetime`` object.
    created_at: datetime
    # Ids are declared (and therefore stored) as integers.
    author_id: int
    id: int

class UserModel(BaseModel):
    """One user entry, as listed under ``IncludesResult.users``."""
    # Declared as an integer, like the ids on TweetModel.
    id: int
    name: str
    username: str

class IncludesResult(BaseModel):
    """The ``includes`` section of a search result; holds the list of users."""
    # Required: an ``includes`` object without a ``users`` key fails validation.
    users: List[UserModel]

class MetaModel(BaseModel):
    """The ``meta`` section of a search result.

    ``result_count`` is required. ``newest_id`` and ``oldest_id`` may be absent
    or null, in which case they are ``None``.
    """
    result_count: int
    # The explicit ``= None`` keeps these fields truly optional on every
    # pydantic version: v1 infers the default from ``Optional[...]``, while v2
    # treats a bare ``Optional[...]`` annotation as required-but-nullable.
    newest_id: Optional[int] = None
    oldest_id: Optional[int] = None
        
class SearchResult(BaseModel):
    """One complete search response, i.e. one decoded line of the JSONL dump.

    ``meta`` is always required. ``includes`` and ``data`` may be absent or
    null, in which case they are ``None``; callers must check before iterating.
    """
    meta: MetaModel
    # The explicit ``= None`` keeps these fields truly optional on every
    # pydantic version: v1 infers the default from ``Optional[...]``, while v2
    # treats a bare ``Optional[...]`` annotation as required-but-nullable.
    includes: Optional[IncludesResult] = None
    data: Optional[List[TweetModel]] = None

In [67]:
# Sanity check: validate the second stored search response and display its
# first tweet as a plain dict, leaving out the nested metrics.
second_search = SearchResult(**json.loads(tweet_searches[1]))
first_tweet = second_search.data[0]
first_tweet.dict(exclude={'public_metrics'})

{'text': '😜 https://t.co/xYL0cheL6r',
 'created_at': datetime.datetime(2021, 4, 26, 21, 12, 15, tzinfo=datetime.timezone.utc),
 'author_id': 2214159667,
 'id': 1386790251134431241}

In [53]:
# Column layouts for the two flat tables filled from the search results.
user_columns = ['id', 'name', 'username']
tweet_columns = [
    'id', 'author_id', 'created_at', 'text',
    'retweet_count', 'reply_count', 'like_count', 'quote_count',
]

# Start from empty frames that already carry the final column order.
user_df = pd.DataFrame(columns=user_columns)
tweet_df = pd.DataFrame(columns=tweet_columns)

In [78]:
# Flatten every stored search response into one row per included user and one
# row per tweet.
#
# Rows are collected in plain lists and each DataFrame is built once at the
# end. Per-row DataFrame.append copies the whole frame on every call and no
# longer exists in pandas >= 2.0.
# Rebuilding the frames from scratch also makes this cell safe to re-run
# without doubling the rows.
#
# Users are intentionally NOT de-duplicated here: the same user can be included
# by several responses, so user_df may contain repeated ids.
user_records = []
tweet_records = []

for search_result in tweet_searches:
    # A blank line cannot be decoded by json.loads; skip it.
    if not search_result.strip():
        continue

    result = SearchResult(**json.loads(search_result))

    # Responses without tweets contribute nothing.
    if result.data is None:
        continue

    # `includes` is Optional on the model, so guard before reading `.users`.
    if result.includes is not None:
        for user in result.includes.users:
            user_records.append(user.dict())

    for tweet in result.data:
        # Lift the nested metric counters to top-level columns.
        info = tweet.dict(exclude={'public_metrics'})
        info.update(tweet.public_metrics.dict())
        tweet_records.append(info)

# `columns=` fixes the column order and still yields correctly-labelled empty
# frames when no records were collected.
user_df = pd.DataFrame(user_records, columns=['id', 'name', 'username'])
tweet_df = pd.DataFrame(
    tweet_records,
    columns=['id', 'author_id', 'created_at', 'text',
             'retweet_count', 'reply_count', 'like_count', 'quote_count'],
)

In [85]:
# Number of distinct user ids collected.
user_df["id"].drop_duplicates().shape[0]

350

In [86]:
# Total number of user rows, repeated ids included.
user_df.shape[0]

533