In [1]:
import pandas as pd
import numpy as np
from google.cloud import storage
from datetime import datetime
from tqdm import tqdm
import re
from pandarallel import pandarallel
from contextlib import closing
import json


# Set up pandarallel with progress bars; get_work() below calls
# Series.parallel_apply, which needs this to have run first.
pandarallel.initialize(progress_bar=True)

# Display limits for rendered DataFrames: up to 100 rows, every column,
# and up to 500 characters per cell.
for option_name, option_value in (
    ("display.max_rows", 100),
    ("display.max_columns", None),
    ("display.max_colwidth", 500),
):
    pd.set_option(option_name, option_value)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
def get_work(source="markdown-corref", filter_f=lambda article: True,
             project="msca310019-capstone-f945"):
    """Load the JSON articles stored in a GCS bucket and keep those accepted by ``filter_f``.

    Every blob in ``source`` is opened in text mode and parsed with
    ``json.load``; parsing and filtering run through pandarallel's
    ``parallel_apply``, so ``pandarallel.initialize`` must have been called.

    Args:
        source: Name of the bucket to scan.
        filter_f: Predicate called with each parsed article; the article is
            kept when it returns a truthy value. Defaults to keeping all.
        project: Google Cloud project used to create the storage clients.

    Returns:
        A list of the parsed articles accepted by ``filter_f``, ordered by
        blob name. An empty bucket yields ``[]``.
    """
    # List the blob names once up front. Sorting makes the output order
    # reproducible between runs (a bare set has no stable order).
    with closing(storage.Client(project=project)) as client:
        blob_names = sorted(
            {blob.name for blob in client.list_blobs(bucket_or_name=source)}
        )

    if not blob_names:
        # Nothing to download; skip the parallel machinery entirely.
        return []

    def filter_article(f_name):
        """Download and parse one blob; return it if accepted, else None."""
        # A client is created per call because this function runs inside
        # pandarallel workers. NOTE(review): presumably the client cannot be
        # shared across processes -- confirm before hoisting it out.
        with closing(storage.Client(project=project)) as worker_client:
            bucket = worker_client.bucket(source)
            with bucket.blob(f_name).open("r") as fp:
                article = json.load(fp)
        # None (rather than "") marks a rejected article, so the final filter
        # does not depend on the length of the article itself.
        return article if filter_f(article) else None

    tbd_df = pd.DataFrame({"tbd": blob_names})
    tbd_df["result"] = tbd_df.tbd.parallel_apply(filter_article)

    # notna() drops exactly the rejected rows; an accepted but empty article
    # (e.g. ``{}``) is preserved.
    return tbd_df.loc[tbd_df["result"].notna(), "result"].to_list()

In [3]:
def year_month_filter(article, year=2023, month=4):
    """Return True when the article's ``published`` timestamp is in the given year and month.

    Args:
        article: Mapping with an ISO-8601 ``published`` string.
        year: Calendar year to match.
        month: Calendar month (1-12) to match.

    Returns:
        ``True`` if the timestamp's year and month both match, else ``False``.
        An article whose ``published`` value is missing, ``None`` or empty
        cannot match and yields ``False`` instead of aborting the scan.

    Raises:
        ValueError: If ``published`` is a non-empty string that is not a
            valid ISO-8601 timestamp.
    """
    published = article.get("published")
    if not published:
        return False

    # datetime.fromisoformat only accepts a trailing "Z" from Python 3.11 on;
    # rewrite it as an explicit UTC offset so older interpreters parse it too.
    if isinstance(published, str) and published.endswith("Z"):
        published = published[:-1] + "+00:00"

    timestamp = datetime.fromisoformat(published)
    # The comparison uses the timestamp's own wall-clock fields; no timezone
    # conversion is applied.
    return timestamp.year == year and timestamp.month == month

In [6]:
# Scan the default bucket, keeping articles accepted by year_month_filter
# (its defaults select April 2023).
sample = get_work(
    filter_f=year_month_filter,
)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4124), Label(value='0 / 4124'))), …

In [7]:
# Number of articles returned by get_work for the chosen filter.
len(sample)

5526

In [10]:
# For Reuters articles whose title is shorter than their category, swap the
# two fields in place (presumably they were mixed up upstream; the shorter
# string is taken to be the category).
for article in sample:
    if article["source"] != "reuters":
        continue
    if len(article["title"]) < len(article["category"]):
        article["title"], article["category"] = article["category"], article["title"]

# One row per article; keys missing from a record become NaN in its row.
final_df = pd.DataFrame(sample)
final_df.head()

Unnamed: 0,source,id,category,title,published,body,summary,summary_type
0,cnbc,3717,Politics,DeSantis and allies ramp up Disney fight as more Republicans criticize his tactics,2023-04-19T20:00:49+00:00,"## In this article\nFollow your favorite stocksCREATE FREE ACCOUNT\nFlorida Gov. Ron DeSantis responds to a question during a press conference at the headquarters of the former Reedy Creek Improvement District that a newly appointed board now calls the Central Florida Tourism Oversight District, in Lake Buena Vista, Florida, Monday, April 17, 2023.\nFlorida Gov. Ron DeSantis and Florida Gov. Ron DeSantis's allies are ramping up Florida Gov. Ron DeSantis and his allies's fight against Walt Di...","* Florida Gov. Ron DeSantis ripped Disney over its recent maneuvers to thwart the governor's efforts to seize some control of the company's Orlando parks and properties.\n* The DeSantis-picked board overseeing Disney World's special tax district moved to regain authority that they say Disney wrongly took away just before they took charge.\n* The public feud stems from Disney's vocal opposition to the Florida legislation dubbed ""Don't Say Gay"" by critics.\n* Former President Donald Trump, now...",BULLETS
1,reuters,1996,LitigationCorporate StructureLawsuits,J&J talc unit faces skepticism on bankruptcy settlement support,2023-04-11T22:19:00,"April 11 (Reuters) - A Johnson & Johnson (JNJ.N) company on Tuesday defended A Johnson & Johnson (JNJ.N) company's second attempt to resolve talc lawsuits in bankruptcy, telling a U.S. judge A Johnson & Johnson (JNJ.N) company can quickly build consensus around A Johnson & Johnson (JNJ.N) company's $8.9 billion settlement offer despite doubts about A Johnson & Johnson (JNJ.N) company's assertion that 60,000 plaintiffs would support the deal.\nthe deal has divided lawyers representing cancer ...",,
2,reuters,10431,Aerospace & Defense,US GAO denies Lockheed protest of Textron $7 bln Army helicopter deal,2023-04-06T20:41:00,"WASHINGTON, April 6 (Reuters) - The U.S. Government Accountability Office on Thursday denied Lockheed Martin Corp's (LMT.N) protest of the Army contract for the Future Long Range-Assault Aircraft worth as much as a $7.1 billion, awarded to Textron Inc's (TXT.N) Bell helicopter unit over Lockheed Martin Corp's's Sikorsky unit.\nArmy ""reasonably evaluated Sikorsky's proposal as technically unacceptable because Sikorsky failed to provide the level of architectural detail required by the"" reques...",,
3,reuters,14305,United States,"'Donald Trump raped me,' writer says at civil trial",2023-04-26T23:22:00,"NEW YORK, April 26 (Reuters) - A writer explained in graphic detail on Wednesday how Donald Trump allegedly the alleged rape A writer nearly 30 years ago, at a civil trial to determine whether Donald Trump the alleged rape A writer and then lied about the alleged rape.\n""A writer'm here because Donald Trump the alleged rape A writer, and when A writer wrote about the alleged rape, Donald Trump lied and said the alleged rape didn't happen,"" A writer told jurors in federal court in Manhattan. ...",,
4,reuters,10282,Charged,Walmart plans own EV charger network at U.S. stores by 2030,2023-04-06T16:52:00,"NEW YORK, April 6 (Reuters) - Walmart Inc (WMT.N) Walmart's plan to have Walmart Inc (WMT.N)'s own network of electric vehicle charging stations by 2030 to tap into the growing adoption of EVs in the United States.\nelectric vehicle charging stations will be placed at thousands of Walmart Inc (WMT.N) and Sam's Club stores, alongside nearly 1,300 Walmart Inc (WMT.N) already offers as part of a deal with Volkswagen (VOWG_p.DE) unit Electrify America, one of the United States's largest open pub...",,


In [11]:
# Write the article table to Cloud Storage as Parquet, without the index.
final_df.to_parquet(
    "gs://scraped-news-article-data-null/april-2023.parquet",
    index=False,
)