In [2]:
import boto3
import os
import pandas as pd
import time
import spacy
from quotes import quote_extractor
from tqdm import tqdm

In [3]:
import pandas as pd
from io import StringIO 

In [1]:
from thesisutils import utils

In [4]:
from tqdm import tqdm

In [5]:
# Display the installed tqdm version (the value is shown in the cell output below).
import pkg_resources
pkg_resources.get_distribution('tqdm').version

'4.64.0'

In [6]:
# Register tqdm with pandas; `run` relies on this for `DataFrame.progress_apply`.
tqdm.pandas()

In [7]:
# Configure the root logger: records at INFO and above go to app.log (opened
# with filemode='w'), formatted as "<time> <logger name> - [<level>] - <message>".
import logging
logging.basicConfig(filename='app.log', filemode='w',
                    level=logging.INFO,
                    format='%(asctime)s %(name)s - [%(levelname)s] - %(message)s')

In [8]:
# s3 = boto3.client("s3")
# def read_df_s3(object_key, bucket="newyorktime"):
#     """Reads a csv from s3 and loads into pandas;
#     This means large files no longer have to be stored locally.
#     """
#     csv_obj = s3.get_object(Bucket=bucket, Key=object_key)
#     body = csv_obj['Body']
#     csv_string = body.read().decode('utf-8')
#     df = pd.read_csv(StringIO(csv_string))
#     return df

In [9]:
# Load the spaCy "en_core_web_lg" pipeline once; `extract_quotes` reuses this
# module-level `nlp` object for every article.
nlp = spacy.load("en_core_web_lg")

In [10]:
def df_to_s3(df, key, bucket="newyorktime"):
    """Upload a DataFrame to S3 as a CSV object, serialising it in memory only.

    Args:
        df: DataFrame to serialise; ``DataFrame.to_csv`` is called with its
            default options.
        key: S3 object key to write to.
        bucket: Name of the destination bucket.
    """
    logging.info("uploading %s to s3", key)
    # Render the CSV into an in-memory text buffer instead of a local file.
    with StringIO() as buffer:
        df.to_csv(buffer)
        payload = buffer.getvalue()
    boto3.resource('s3').Object(bucket, key).put(Body=payload)

In [11]:
def extract_quotes(row, publication):
    """Extract the quotes of one article and tag each with its origin.

    Args:
        row: DataFrame row holding the article text and its unique identifier,
            under the column names given by ``publication``.
        publication: object providing ``textcol``, ``uidcol`` and ``name``.

    Returns:
        The list of quote dictionaries returned by
        ``quote_extractor.extract_quotes``, each updated in place with the
        article identifier (under ``uidcol``) and a ``"publication"`` entry.
    """
    text_col, uid_col = publication.textcol, publication.uidcol
    article_id = row[uid_col]
    logging.info("working on %s", article_id)

    # Pipeline components switched off for this call.
    skipped_components = ["tagger", "attribute_ruler", "lemmatizer"]
    parsed = nlp(row[text_col], disable=skipped_components)

    quotes = quote_extractor.extract_quotes(article_id, parsed)
    for quote in quotes:
        quote[uid_col] = article_id
        quote["publication"] = publication.name
    return quotes

In [12]:
def run(publication):
    """Extract quotes from every article of one publication and upload them to S3.

    Reads ``{name}/{name}_full.csv`` through ``utils.read_df_s3``, applies
    ``extract_quotes`` to each row (with a tqdm progress bar), flattens the
    per-article lists of quote dictionaries into a single DataFrame and uploads
    it as ``{name}/quotes/quotes_full.csv`` through ``df_to_s3``.

    Args:
        publication: object providing ``name`` and ``textcol`` here, plus the
            attributes ``extract_quotes`` reads from it.

    Returns:
        The DataFrame of extracted quotes that was uploaded.
    """
    logging.info("STARTING RUN FOR %s", publication.name)
    input_df = utils.read_df_s3(f"{publication.name}/{publication.name}_full.csv")
    # Force the text column to str before it is handed to spaCy
    # (presumably to cope with missing or non-string bodies; confirm).
    input_df[publication.textcol] = input_df[publication.textcol].astype(str)
    output_name = f"{publication.name}/quotes/quotes_full.csv"
    quote_dcts = input_df.progress_apply(
        lambda row: extract_quotes(row, publication), axis=1
    )
    # explode() turns each per-article list into one entry per quote dict;
    # dropna() then discards every row that has a missing value.
    quotedf = pd.json_normalize(quote_dcts.explode()).dropna().convert_dtypes()
    df_to_s3(quotedf, output_name)
    logging.info("FINISHED RUN FOR %s", publication.name)
    return quotedf

In [65]:
# Select the "nyt" entry of utils.publications for the run in the next cell.
publication = utils.publications["nyt"]

In [66]:
# Run the pipeline for the selected publication via utils.timeit.
# NOTE(review): utils.timeit presumably returns run()'s result and prints the
# elapsed time (see "run took ..." in the output below); confirm in thesisutils.
quotedf = utils.timeit(run, publication)

100%|██████████| 10632/10632 [09:48<00:00, 18.05it/s]


run took 591.1505917969998 secs


In [None]:
# Same pipeline for the "hkfp" publication.
# NOTE(review): the saved progress bar below stops at 98% and the cell has no
# execution count, so it is unclear whether this run completed.
publication = utils.publications["hkfp"]
quotedf = utils.timeit(run, publication)

 98%|█████████▊| 20013/20396 [35:17<02:21,  2.71it/s]  

In [None]:
# Same pipeline for the "globaltimes" publication.
# NOTE(review): the saved progress bar below stops at 27% and the cell has no
# execution count, so it is unclear whether this run completed.
publication = utils.publications["globaltimes"]
quotedf = utils.timeit(run, publication)

 27%|██▋       | 7204/26293 [16:21<1:28:49,  3.58it/s]

In [29]:
import gc

# Force a garbage-collection pass between runs; the number shown below is
# gc.collect()'s return value.
gc.collect()

592

In [19]:
import gc

In [None]:
# Same pipeline for the "chinadaily" publication; unlike the earlier cells the
# result is not assigned to `quotedf`.
# NOTE(review): the saved progress bar below stops at 95% and the cell has no
# execution count, so it is unclear whether this run completed.
publication = utils.publications["chinadaily"]
utils.timeit(run, publication)

 95%|█████████▍| 44963/47420 [3:15:19<21:20,  1.92it/s]   

In [4]:
# Parameters for fetching a single year of one paper: `key` is the S3 object
# key "<paper>/<year>.csv" inside `bucket`.
year = 2011
paper = "scmp"
bucket = "newyorktime"
key = f"{paper}/{year}.csv"

In [6]:
# S3 client used by the download cell further down.
s3_client = boto3.client("s3")


In [16]:
# Create the local "data" directory if it does not exist yet.
os.makedirs("data", exist_ok=True) 

In [11]:
# Local path for the downloaded CSV: the S3 key placed under the "data" directory.
filepath = os.path.join('data', key)

In [18]:
# Make sure the directory that will contain `filepath` exists before the
# download; skipped when `filepath` has no directory component.
if os.path.dirname(filepath):
    os.makedirs(os.path.dirname(filepath), exist_ok=True)


In [34]:
# Download the S3 object to the local path, then load that CSV into a DataFrame.
s3_client.download_file(bucket, key, filepath)

df = pd.read_csv(filepath)

In [12]:
# def timeit(fn, *args, **kwargs):
#     s = time.perf_counter()
#     ret = fn(*args, **kwargs)
#     e = time.perf_counter()
#     print(f"{fn.__name__} took {e-s} secs")
#     return ret

In [15]:
# def upload_s3(filepath, bucket, key=None):
#     if not key:
#         key = filepath
#     try:
#         response = s3_client.upload_file(filepath, bucket, key)
#         return response
#     except ClientError as e:
#         logging.exception(e)


In [16]:
# quotedf = timeit(run, df, path)
# upload_s3(path, bucket, key=f"{paper}/quotes/q_{year}.csv")    

In [None]:
# for year in tqdm(range(2019, 2021)):
#     key = f"{paper}/{year}.csv"
#     filepath = os.path.join('data', key)    
#     path = f"quotes_{paper}_{year}.csv"
#     print("working on ", key)
#     s3_client.download_file(bucket, key, filepath)
#     df = pd.read_csv(filepath)
#     df["Body"] = df.Body.astype(str)
#     quotedf = utils.timeit(run, df, path, paper, year)
#     upload_s3(path, bucket, key=f"{paper}/quotes/q_{year}.csv")    

  0%|          | 0/2 [00:00<?, ?it/s]

working on  scmp/2019.csv
run took 3885.209267205 secs


 50%|█████     | 1/2 [1:04:50<1:04:50, 3890.75s/it]

working on  scmp/2020.csv
