In [1]:
import sys

sys.path.append('..')
import pandas as pd
import re

import pipeline.config as config

In [2]:
import json
from google.cloud import storage
from contextlib import closing


def load_subsample_index(client, bucket=config.ARTICLE_CONVERT_META_BUCKET,
                         conversion_index=config.ARTICLE_CONVERT_SUBSAMPLE_IDX):
    """Load the subsample conversion index stored as a JSON object in a bucket.

    Args:
        client: Storage client exposing ``bucket(name)``; the notebook passes
            a ``google.cloud.storage.Client``.
        bucket: Name of the bucket holding the index. An already-built bucket
            handle (anything that is not a ``str``) is used as-is.
        conversion_index: Object name of the JSON index inside the bucket.

    Returns:
        The parsed JSON content of the index object, or the empty index
        ``{"subsampled": 0, "files": []}`` when the object does not exist.
    """
    # Resolve the handle under its own name instead of rebinding the `bucket`
    # argument from "name" to "handle" halfway through the function.
    bucket_handle = client.bucket(bucket) if isinstance(bucket, str) else bucket
    index_blob = bucket_handle.blob(conversion_index)

    # One existence check on the target object replaces listing every object
    # in the bucket just to test membership of a single name.
    if not index_blob.exists():
        return {
            "subsampled": 0,
            "files": []
        }

    with index_blob.open("r") as fp:
        return json.load(fp)

# Fetch the subsample index with a short-lived client; the client is closed
# whether or not the load succeeds.
client = storage.Client(project=config.GCP_PROJECT)
try:
    idx = load_subsample_index(client=client)
finally:
    client.close()

In [3]:
# load_subsample_index returns an empty "files" list when the index object is
# missing; pd.concat would then fail with an opaque "No objects to concatenate",
# so fail early with a message that names the actual cause.
if not idx["files"]:
    raise ValueError("Subsample index lists no files; nothing to load.")

# One DataFrame per parquet file listed in the index.
dataframes = [pd.read_parquet(fname) for fname in idx["files"]]

# Stack all files into one frame with a fresh 0..n-1 index.
total_df = pd.concat(dataframes, ignore_index=True)
total_df.head()

Unnamed: 0,source,id,url,category,title,published,body,summary,summary_type
0,reuters,41678,https://www.reuters.com/world/asia-pacific/un-...,Asia Pacific,Security Council condemns Taliban ban on Afgha...,2023-04-27T22:17:00,"UNITED NATIONS, April 27 (Reuters) - The U.N. ...",,
1,reuters,41679,https://www.reuters.com/world/europe/lavrov-un...,Europe,"In letter to Putin, UN chief proposes way forw...",2023-04-24T20:55:00,"UNITED NATIONS, April 24 (Reuters) - U.N. Secr...",,
2,reuters,41682,https://www.reuters.com/world/middle-east/tali...,Middle East,Taliban not invited to UN Doha meeting on Afgh...,2023-04-28T18:07:00,"UNITED NATIONS, April 28 (Reuters) - U.N. Secr...",,
3,reuters,41683,https://www.reuters.com/world/asia-pacific/un-...,Asia Pacific,UN chief to convene meeting to work on way for...,2023-04-20T00:18:00,April 19 (Reuters) - U.N. Secretary-General An...,,
4,reuters,41684,https://www.reuters.com/markets/commodities/ru...,Commodities,Russia's Lavrov to talk Ukraine grain deal wit...,2023-04-18T19:30:00,"UNITED NATIONS, April 18 (Reuters) - Russian F...",,


In [6]:
from datetime import datetime

# Parse the ISO-8601 strings only while the column is not yet datetime-typed.
# The column is overwritten in place, so without this guard a second run of the
# cell would call fromisoformat on already-parsed values and raise TypeError.
if not pd.api.types.is_datetime64_any_dtype(total_df["published"]):
    total_df["published"] = total_df["published"].apply(datetime.fromisoformat)

# Persist the combined frame (without the positional index) to the shared bucket.
total_df.to_parquet("gs://scraped-news-article-data-null/all-data.parquet", index=False)