In [1]:
import boto3
import os
import pandas as pd
import time
import spacy
from quotes import quote_extractor
from tqdm import tqdm

In [2]:
import pandas as pd
from io import StringIO 

In [3]:
# Module-level S3 client; read_df_s3 below issues its get_object call through it.
s3 = boto3.client("s3")
def read_df_s3(object_key, bucket="newyorktime"):
    """Fetch a CSV object from S3 and parse it into a pandas DataFrame.

    The object is retrieved through the module-level ``s3`` client and parsed
    entirely in memory, so large files never have to be stored locally.

    Parameters
    ----------
    object_key : key of the CSV object inside ``bucket``.
    bucket : name of the S3 bucket to read from.

    Returns
    -------
    pandas.DataFrame parsed from the object's UTF-8 decoded contents.
    """
    response = s3.get_object(Bucket=bucket, Key=object_key)
    raw_bytes = response['Body'].read()
    text_buffer = StringIO(raw_bytes.decode('utf-8'))
    return pd.read_csv(text_buffer)

In [4]:
# Run configuration: the S3 bucket plus the paper/year whose CSV is processed.
bucket = "newyorktime"
paper = "scmp"
year = 2011
# Source objects are addressed as "<paper>/<year>.csv" inside the bucket.
key = "{}/{}.csv".format(paper, year)

In [6]:
# S3 client used by the download cells and by upload_s3 further down.
s3_client = boto3.client("s3")
# Load the `en_core_web_lg` spaCy pipeline once; extract_quotes calls it for every row.
nlp = spacy.load("en_core_web_lg")

In [16]:
# Make sure the local "data" directory exists (no error if it already does).
os.makedirs("data", exist_ok=True) 

In [11]:
# Local path for the downloaded object: the S3 key placed under "data".
filepath = os.path.join('data', key)

In [18]:
# Create the parent directory of `filepath` (when it has one) so the download
# below has somewhere to write. An explicit `if` replaces the original
# `a and b` expression, which was evaluated only for its side effect.
parent_dir = os.path.dirname(filepath)
if parent_dir:
    os.makedirs(parent_dir, exist_ok=True)


In [34]:
# Download the S3 object to the local path, then load it from disk into a DataFrame.
s3_client.download_file(bucket, key, filepath)

df = pd.read_csv(filepath)

In [12]:
def timeit(fn, *args, **kwargs):
    """Call ``fn(*args, **kwargs)``, print the elapsed wall time, and return the result.

    Parameters
    ----------
    fn : any callable.
    *args, **kwargs : forwarded to ``fn`` unchanged.

    Returns
    -------
    Whatever ``fn`` returns. Exceptions raised by ``fn`` propagate and no
    timing line is printed in that case.
    """
    # Not every callable carries __name__ (functools.partial objects and
    # instances defining __call__ do not); fall back to repr() so timing
    # such callables does not fail with AttributeError.
    label = getattr(fn, "__name__", None)
    if label is None:
        label = repr(fn)
    start = time.perf_counter()
    result = fn(*args, **kwargs)
    elapsed = time.perf_counter() - start
    print(f"{label} took {elapsed} secs")
    return result

In [13]:
def extract_quotes(row, text_col, uid_col, publication="scmp", year="2012"):
    """Extract the quotes from one article row and tag each with its provenance.

    ``row`` is a row of a dataframe holding the article text under
    ``text_col`` and a unique identifier under ``uid_col``.

    Returns the list of quote dictionaries produced by
    ``quote_extractor.extract_quotes``, each updated in place with
    ``source`` (the row's identifier), ``publication`` and ``year``.
    """
    # Pipeline components switched off for this call.
    skipped_pipes = ["tagger", "attribute_ruler", "lemmatizer"]
    parsed = nlp(row[text_col], disable=skipped_pipes)
    article_id = row[uid_col]
    quotes = quote_extractor.extract_quotes(article_id, parsed)
    for quote in quotes:
        quote.update(source=article_id, publication=publication, year=year)
    return quotes

In [14]:
def run(input_df, output_name, publication="scmp", year="2012"):
    """Extract quotes for every row of ``input_df`` and write them to a CSV.

    Each row is passed to ``extract_quotes`` using the ``"Body"`` column as
    the text and the ``"Index"`` column as the identifier. The per-row lists
    of quote dictionaries are exploded to one quote per row, flattened into
    a dataframe, stripped of rows containing missing values, converted to
    the best available dtypes, and saved to ``output_name``.

    Returns the resulting quotes dataframe.
    """
    def quotes_for(row):
        return extract_quotes(row, "Body", "Index", publication, year)

    quotes_per_article = input_df.apply(quotes_for, axis=1)
    one_quote_per_row = quotes_per_article.explode()
    flattened = pd.json_normalize(one_quote_per_row)
    quotedf = flattened.dropna().convert_dtypes()
    quotedf.to_csv(output_name)
    return quotedf

In [15]:
def upload_s3(filepath, bucket, key=None):
    """Upload a local file to S3, logging (rather than raising) upload failures.

    Parameters
    ----------
    filepath : path of the local file to upload.
    bucket : name of the destination S3 bucket.
    key : destination object key; defaults to ``filepath`` when not given.

    Returns
    -------
    Whatever ``s3_client.upload_file`` returns on success; ``None`` when the
    upload failed and the error was logged.
    """
    # Imported locally because the notebook's import cell provides neither
    # `logging` nor the exception classes; without them the except clause
    # itself raised NameError whenever an upload failed.
    import logging
    from boto3.exceptions import S3UploadFailedError
    from botocore.exceptions import ClientError

    if not key:
        key = filepath
    try:
        return s3_client.upload_file(filepath, bucket, key)
    except (ClientError, S3UploadFailedError) as err:
        # upload_file reports failed transfers as S3UploadFailedError, so both
        # types are handled to keep the original log-and-continue intent.
        logging.exception(err)
        return None


In [16]:
# quotedf = timeit(run, df, path)
# upload_s3(path, bucket, key=f"{paper}/quotes/q_{year}.csv")    

In [None]:
# Process each year in turn: download that year's CSV from S3, extract the
# quotes, and upload the resulting quotes CSV back to the same bucket.
for year in tqdm(range(2019, 2021)):
    key = f"{paper}/{year}.csv"
    filepath = os.path.join('data', key)
    path = f"quotes_{paper}_{year}.csv"
    print("working on ", key)
    # The download needs its local target directory to exist; create it here
    # so the loop does not depend on an earlier cell having been run.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    s3_client.download_file(bucket, key, filepath)
    df = pd.read_csv(filepath)
    # Coerce Body to str so every row handed to run() is text
    # (any missing values become the string "nan").
    df["Body"] = df.Body.astype(str)
    quotedf = timeit(run, df, path, paper, year)
    upload_s3(path, bucket, key=f"{paper}/quotes/q_{year}.csv")

  0%|          | 0/2 [00:00<?, ?it/s]

working on  scmp/2019.csv
run took 3885.209267205 secs


 50%|█████     | 1/2 [1:04:50<1:04:50, 3890.75s/it]

working on  scmp/2020.csv
