Run via sagemaker to get the non-scmp long speakers. Long speakers look to match single work speakers with longer entities. 

In [2]:
import re
from thesisutils import utils
import pandas as pd
from tqdm import tqdm

tqdm.pandas()

In [46]:
def removepunct(s):
    return re.sub(r"[^\w\s]", "", str(s))


def lookupname(quoterow):
    """Looks up a single word speaker and tries to find a full name entity.

    :param quoterow: row in dataframe with quotes in it.
    """

    index = quoterow[publication.uidcol]  # .squeeze()
    s = quoterow["prepro_speaker"]
    y = ner[ner[publication.uidcol].eq(index)]
    # NOTE: could do just PERSON entities?
    candidates = y[y.prepro_entity.str.contains(s)]
    if len(candidates) == 0:
        return s
    longest = candidates.prepro_entity.str.len().idxmax()
    return y.loc[longest].prepro_entity


# %%
def longspeakerpipeline(df, ner):
    """df contains quotes/speakers, ner contains entities. 
    Preprocesses speakers & entities; matches single word speakers to longer
     speakers, stores in "longer speakers" column Takes 2.5 minutes to run
    """
    df["prepro_speaker"] = (
        df.speaker.str.lower().
        str.replace("’s|'s", "").
        progress_apply(removepunct)
    )
    ner["prepro_entity"] = (
        ner.entity.str.lower()
        .str.replace("’s|'s", "")
        .progress_apply(removepunct)
    )
    drops = [
        "he",
        "she",
        "it",
        "they",
        "you",
    ]  # "a source", "the who", "the post"],
    # dropmask just filters for speakers who are NOT pronouns
    # dropmask = ~df.prepro_speaker.isin(drops)
    df["single_speaker"] = (
        df.prepro_speaker
        .str.split()
        .str.len()
        .eq(1)
    )
    # run on non-direct quotes and filter later. 
    mask2 = ~df.prepro_speaker.isin(drops) & df.single_speaker# & ~df.direct
    # match single word, non-pronoun quotes
    df.loc[mask2, "long_speaker"] = df[mask2].progress_apply(lookupname, axis=1)
    # add multi word speakers to long speaker column ()
    # takes ~ 2.5 minutes
    df.loc[~df.single_speaker, "long_speaker"] = df.loc[
        ~df.single_speaker
    ].prepro_speaker
    return df

In [None]:
for publication in utils.publications.values():
    if publication.name not in ("scmp"):
        print(publication.name)
        df = utils.read_df_s3(f"{publication.name}/quotes/quotes_full.csv")    
        ner = utils.read_df_s3(f"{publication.name}/ner/ner_full.csv")
        df2 = longspeakerpipeline(df, ner)
        display(df2.long_speaker.value_counts().head(20))
        utils.df_to_s3(df2, key=f"{publication.name}/quotes/quotes_full_edits.csv")

globaltimes


  str.replace("’s|'s", "").
100%|██████████| 156037/156037 [00:00<00:00, 296957.32it/s]
  .str.replace("’s|'s", "")
100%|██████████| 910403/910403 [00:01<00:00, 591900.04it/s]
100%|██████████| 56513/56513 [01:54<00:00, 491.47it/s]


\n                                              14318
nan                                             11808
the report                                       1341
analysts                                         1040
experts                                          1025
\n\n                                              837
the company                                       796
the statement                                     698
media reports                                     647
the xinhua news agency                            613
carrie lam                                        504
the spokesperson                                  481
who                                               400
reuters                                           381
tian feilong                                      378
observers                                         340
the chinese gold and silver exchange society      294
which                                             279
\nexperts                   

chinadaily


  df = pd.read_csv(StringIO(csv_string))
  df = pd.read_csv(StringIO(csv_string))
  str.replace("’s|'s", "").
100%|██████████| 346619/346619 [00:00<00:00, 540285.44it/s]
  .str.replace("’s|'s", "")
100%|██████████| 1964116/1964116 [00:02<00:00, 657008.82it/s]
 42%|████▏     | 57740/137931 [1:48:58<2:27:15,  9.08it/s]