In [2]:
"""
Uses NewsMTSC to get pos, neg, neutral sentiment towards any entity. 
Current pipeline is to read from NER, link to full body
and targetting entities containing certain substrings
Right now runs on "alibaba" containing entities but nothing else 
and no coref resolution. 

runs on newsentiment environ NOT thesis.
Also see pocs/newsentiment_poc.py
"""

'\nUses NewsMTSC to get pos, neg, neutral sentiment towards any entity. \nCurrent pipeline is to read from NER, link to full body\nand targetting entities containing certain substrings\nRight now runs on "alibaba" containing entities but nothing else \nand no coref resolution. \n\nruns on newsentiment environ NOT thesis.\nAlso see pocs/newsentiment_poc.py\n'

In [3]:
import logging
import os
import pandas as pd
import spacy
from NewsSentiment import TargetSentimentClassifier
from NewsSentiment.customexceptions import TooLongTextException, TargetNotFoundException
from thesisutils import utils
from tqdm import tqdm

import logging, logging.config
from pathlib import Path
import logconfig

lgconf = logconfig.logconfig("newssent")
logging.config.dictConfig(lgconf.config_dct)
logger = logging.getLogger(__name__)
# %%
tsc = TargetSentimentClassifier()
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
nlp = spacy.load("en_core_web_md")
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x7f0390880c80>

In [5]:


# %%
# FUNCTIONS ###########################################
def span_clean(span):
    """
    span is a spacy span e.g. doc[1:14];
    default is to return span.text,
    but if we see a "Post" reference,
    remove that clause before text conversion.
    """
    delimiters = [",", "(", "-", ")"]
    delim_idx = []
    for tok in span:
        if tok.text in delimiters:
            delim_idx.append(tok.i - span.start)
    if delim_idx:
        ls = []
        subspan = span[: delim_idx[0]]
        if not "Post" in subspan.text:
            ls.append(subspan)
        for i, el in enumerate(delim_idx):
            if i + 1 == len(delim_idx):
                subspan = span[el:]
                if not "Post" in subspan.text:
                    ls.append(subspan)
            else:
                subspan = span[el : delim_idx[i + 1]]
                if "Post" in subspan.text:
                    continue
                ls.append(span[el : delim_idx[i + 1]])
        return "".join(e.text for e in ls)
    else:
        if "Post" in span.text:
            return ""
        else:
            return span.text


# %%
# NOTE: these functions take a standardized df;
# STANDARDIZE IN FIRST LOADING STEP BEFORE passing to fns.
def get_sentiment(row, doc, publication):
    """
    returns dictionary with positive, negative, and neutral
        probabilities for a given NER object.
    Note: result has nan values when input is too long.
    Be sure to do exploration / reporting later.
    :param row: row of NER datafarme
    :param doc: spacy nlped (w/ sentencizer) doc
    """
    start = row.start
    end = row.end
    sent = doc[start].sent
    targ = doc[start:end].text
    if start == sent.start:
        # empty left, target, full right pattern
        tup = ("", targ, span_clean(doc[end : sent.end]))
    elif end == sent.end:
        tup = (span_clean(doc[sent.start : start]), targ, "")
        # full left target empty right
    else:
        tup = (
            span_clean(doc[sent.start : start]),
            targ,
            span_clean(doc[end : sent.end]),
        )
    result = {
        "ner_index": row._name,
        "publication": publication.name,
        "sentence": sent,
        "Art_id": row["Art_id"],
    }
    # return result
    result["Art_id"] = row["Art_id"]
    result["debug"] = " ".join(tup)
    labels = ["negative", "neutral", "positive"]
    try:
        sentiment = tsc.infer_from_text(*tup)
        # might need to add could not find target too.
    except TooLongTextException as e:
        logging.warning("TOO LONG?", e)
        result.update({label: None for label in labels})
        return result
    except TargetNotFoundException as e2:
        logging.warning("TOO TARGET NOT FOUND?", e2)
        result.update({label: None for label in labels})
        return result
    result.update(
        {
            label: dct["class_prob"]
            for label in labels
            for dct in sentiment
            if dct["class_label"] == label
        }
    )

    return result


def getsent2(row, df_target, publication):
    """Basically wrapper around get_sentiment to set doc variable.
    :param row: entity row.
    """
    try:
        doc = nlp(
            df_target.loc[row.Art_id]["Body"],
            disable=["tagger", "parser", "attribute_ruler", "lemmatizer", "ner"],
        )
        return get_sentiment(row, doc, publication)
    except Exception as e:
        logger.warning(e)
        logging.warning(
            "exception encountered on article %s index %s", row.Art_id, row._name
        )
        result = {
            "ner_index": row._name,
            "publication": publication.name,
            "sentence": "",
            "Art_id": row["Art_id"],
        }
        labels = ["negative", "neutral", "positive"]
        result.update({label: None for label in labels})
        return result


def save(key, maindf, bucket="newyorktime"):
    # key = f"{publication.name}/sentiment/{target}_test.csv"
    path = os.path.join( "baba", key)  # NOTE: baba in path. removed first arg utils.ROOTPATH,
    logger.info("saving %s", key)
    if not os.path.exists(os.path.dirname(path)):
        os.makedirs(os.path.dirname(path))
    maindf.to_csv(path)
    logger.info("uploading to %s/%s", bucket, key)
    utils.upload_s3(path, bucket, key)


# %%
def run(pub, target, tts="full", bucket="newyorktime"):
    """Performs sentiment analysis on publications ner
    :param sample: subsets data to just 10% train split.
    :param tts: train test split: 'full', 'train', or 'test'.
        train or test will filter for just that mask.
    """
    # todo: get df from s3
    nerdf = utils.standardize(
        utils.read_df_s3(f"{pub.name}/ner/ner_full.csv", bucket),
        # utils.get_df(pub, "ner", "ner_full.csv"),
        pub,
        drop_dups=False,
    )
    df = utils.standardize(
        utils.read_df_s3(f"{pub.name}/{pub.name}_full.csv", bucket), pub
    )
    df = df.set_index("Art_id")
    if tts != "full":
        split = utils.standardize(
            utils.read_df_s3(f"{pub.name}/tts_mask/{tts}_main1.csv", bucket), pub
        )
        df = df[df.index.isin(split.Art_id)]
        nerdf = nerdf[nerdf.Art_id.isin(split.Art_id)]

    # filter maindf for only cases where baba shows up;
    # takes 1.5 minutes to filter
    mask = nerdf.entity.astype(str).str.lower().str.contains(target)
    ner_target = nerdf[mask]
    # scmp has 12627 entities to look up;
    # ~1000 training examples
    logger.info(f"working on {len(ner_target)} entities")
    # masking main df takes 23 s
    mask2 = df["Body"].astype(str).str.lower().str.contains(target)
    df_target = df[mask2]
    logger.info(f"working on {len(df_target)} documents")
    # 5007 documents with alibaba total; 432 in training set.
    # 1.5-2s per iteration = 5 hours to run on full;30m-1 hr on subset
    rows = ner_target.progress_apply(lambda row: getsent2(row, df_target, pub), axis=1)
    maindf = pd.json_normalize(rows)
    key = f"{pub.name}/sentiment/{target}_{tts}.csv"
    save(key, maindf, bucket)
    return maindf



In [6]:

# %%
target = "alibaba"
bucket = "aliba"

In [7]:
# %%
pub = utils.publications["hkfp"]
maindf = utils.timeit(run, pub, target, "train", bucket)
print(maindf.head())

2022-07-08 16:14:53,892 [INFO] __main__: working on 166 entities
2022-07-08 16:14:53,904 [INFO] __main__: working on 87 documents


 46%|████▌     | 76/166 [01:26<01:44,  1.16s/it]--- Logging error ---
Traceback (most recent call last):
  File "/tmp/ipykernel_22887/2493925981.py", line 77, in get_sentiment
    sentiment = tsc.infer_from_text(*tup)
  File "/home/ec2-user/SageMaker/.persisted_conda/newssent/lib/python3.8/site-packages/NewsSentiment/infer.py", line 84, in infer_from_text
    return self.infer(text_left=left, target_mention=target, text_right=right)
  File "/home/ec2-user/SageMaker/.persisted_conda/newssent/lib/python3.8/site-packages/NewsSentiment/infer.py", line 131, in infer
    indexed_example = self.tokenizer.create_model_input_seqs(
  File "/home/ec2-user/SageMaker/.persisted_conda/newssent/lib/python3.8/site-packages/NewsSentiment/dataset.py", line 753, in create_model_input_seqs
    target_mask_seq_for_text_with_special_tokens = self._create_target_mask(
  File "/home/ec2-user/SageMaker/.persisted_conda/newssent/lib/python3.8/site-packages/NewsSentiment/dataset.py", line 442, in _create_target_

2022-07-08 16:18:00,311 [INFO] __main__: saving hkfp/sentiment/alibaba_train.csv
2022-07-08 16:18:00,332 [INFO] __main__: uploading to aliba/hkfp/sentiment/alibaba_train.csv
run took 186.85149712199996 secs
   ner_index publication                                           sentence  \
0         29        hkfp  (\n, The, 51, -, year, -, old, founder, of, in...   
1         49        hkfp  (His, Alibaba, went, public, at, the, New, Yor...   
2        118        hkfp  (\n, Taobao, ’s, parent, company, Alibaba, Gro...   
3        120        hkfp  (\n, In, a, statement, to, Hong, Kong, Free, P...   
4        220        hkfp  (\n, However, ,, unlike, Black, Friday, ,, Chi...   

        Art_id                                              debug  negative  \
0   post-18508  \nThe 51-year-old founder of internet giant Al...  0.083025   
1   post-18508  His Alibaba went public at the New York Stock ...  0.025798   
2   post-28348  \nTaobao’s parent company Alibaba Group said l...  0.205926   
3 




In [10]:
pub = utils.publications["hkfp"]
maindf = utils.timeit(run, pub, target, "test", bucket)
print(maindf.head())

2022-07-08 16:51:42,065 [INFO] __main__: working on 193 entities
2022-07-08 16:51:42,086 [INFO] __main__: working on 87 documents


 77%|███████▋  | 149/193 [02:33<00:46,  1.06s/it]--- Logging error ---
Traceback (most recent call last):
  File "/tmp/ipykernel_22887/2493925981.py", line 77, in get_sentiment
    sentiment = tsc.infer_from_text(*tup)
  File "/home/ec2-user/SageMaker/.persisted_conda/newssent/lib/python3.8/site-packages/NewsSentiment/infer.py", line 84, in infer_from_text
    return self.infer(text_left=left, target_mention=target, text_right=right)
  File "/home/ec2-user/SageMaker/.persisted_conda/newssent/lib/python3.8/site-packages/NewsSentiment/infer.py", line 131, in infer
    indexed_example = self.tokenizer.create_model_input_seqs(
  File "/home/ec2-user/SageMaker/.persisted_conda/newssent/lib/python3.8/site-packages/NewsSentiment/dataset.py", line 815, in create_model_input_seqs
    text_dependency_tree_hop_distances = self._create_dependency_tree_hop_distances_of_tokens_to_target(
  File "/home/ec2-user/SageMaker/.persisted_conda/newssent/lib/python3.8/site-packages/NewsSentiment/dataset.py",

2022-07-08 16:55:02,351 [INFO] __main__: saving hkfp/sentiment/alibaba_test.csv
2022-07-08 16:55:02,371 [INFO] __main__: uploading to aliba/hkfp/sentiment/alibaba_test.csv
run took 200.75716947699993 secs
   ner_index publication                                           sentence  \
0         14        hkfp  (\n, According, to, Sina, News, ,, internet, c...   
1         72        hkfp  (\n, Other, large, retailers, including, eBay,...   
2         86        hkfp  (\n, In, China, ,, where, Alibaba, lends, to, ...   
3        128        hkfp  (\n, Ma, ,, Alibaba, ’s, founder, ,, bought, t...   
4        158        hkfp  (\n, Update, (, 14:00, August, 17, ):, A, spok...   

       Art_id                                              debug  negative  \
0  post-16926  \nAccording to Sina News, internet companies i...  0.041235   
1  post-17316  \nOther large retailers including eBay Inc’s P...  0.035805   
2  post-17316  \nIn China, where Alibaba lends to small busin...  0.067813   
3  post-




In [9]:
pub = utils.publications["nyt"]
maindf = utils.timeit(run, pub, target, "train", bucket)
print(maindf.head())

2022-07-08 16:46:36,230 [INFO] __main__: working on 295 entities
2022-07-08 16:46:36,240 [INFO] __main__: working on 166 documents


100%|██████████| 295/295 [04:52<00:00,  1.01it/s]

2022-07-08 16:51:28,836 [INFO] __main__: saving nyt/sentiment/alibaba_train.csv
2022-07-08 16:51:28,860 [INFO] __main__: uploading to aliba/nyt/sentiment/alibaba_train.csv
run took 293.08249660799993 secs
   ner_index publication                                           sentence  \
0         14         nyt  (Alibaba.com, ,, a, fast, -, growing, Chinese,...   
1         17         nyt  (\n\n, In, a, statement, released, late, Monda...   
2         22         nyt  (But, the, announcement, stunned, the, technol...   
3         56         nyt  (The, company, has, been, in, negotiations, wi...   
4         84         nyt  (She, recently, closed, a, $, 7.6, billion, de...   

                                              Art_id  \
0  https://www.nytimes.com/2011/02/22/business/gl...   
1  https://www.nytimes.com/2011/02/22/business/gl...   
2  https://www.nytimes.com/2011/02/22/business/gl...   
3  https://www.nytimes.com/2012/01/18/technology/...   
4  https://www.nytimes.com/2012/09/26/te




In [11]:
pub = utils.publications["nyt"]
maindf = utils.timeit(run, pub, target, "test", bucket)
print(maindf.head())

2022-07-08 16:55:02,873 [INFO] __main__: working on 313 entities
2022-07-08 16:55:02,881 [INFO] __main__: working on 171 documents


100%|██████████| 313/313 [05:14<00:00,  1.00s/it]

2022-07-08 17:00:16,972 [INFO] __main__: saving nyt/sentiment/alibaba_test.csv
2022-07-08 17:00:16,994 [INFO] __main__: uploading to aliba/nyt/sentiment/alibaba_test.csv
run took 314.56090909700015 secs
   ner_index publication                                           sentence  \
0          2         nyt  (\n\n, But, just, as, quickly, as, it, began, ...   
1          6         nyt  (\n\n, The, announcement, is, a, bid, to, end,...   
2          9         nyt  (\n\n, The, announcement, is, a, bid, to, end,...   
3         13         nyt  (\n\n, Shares, of, Yahoo, plunged, after, the,...   
4         76         nyt  (\n\n, Largely, because, of, a, long, -, await...   

                                              Art_id  \
0  https://www.nytimes.com/2011/05/16/technology/...   
1  https://www.nytimes.com/2011/05/16/technology/...   
2  https://www.nytimes.com/2011/05/16/technology/...   
3  https://www.nytimes.com/2011/05/16/technology/...   
4  https://www.nytimes.com/2012/10/23/tech




In [None]:
# pub = utils.publications["globaltimes"]
# maindf = utils.timeit(run, pub, target, "train", bucket)
# print(maindf.head())
# %%

In [None]:
pub = utils.publications["globaltimes"]
maindf = utils.timeit(run, pub, target, "test", bucket)
print(maindf.head())

2022-07-08 18:52:51,309 [INFO] __main__: working on 5903 entities
2022-07-08 18:52:51,332 [INFO] __main__: working on 2344 documents


 69%|██████▉   | 4077/5903 [1:06:42<29:28,  1.03it/s]--- Logging error ---
Traceback (most recent call last):
  File "/tmp/ipykernel_2295/2493925981.py", line 77, in get_sentiment
    sentiment = tsc.infer_from_text(*tup)
  File "/home/ec2-user/SageMaker/.persisted_conda/newssent/lib/python3.8/site-packages/NewsSentiment/infer.py", line 84, in infer_from_text
    return self.infer(text_left=left, target_mention=target, text_right=right)
  File "/home/ec2-user/SageMaker/.persisted_conda/newssent/lib/python3.8/site-packages/NewsSentiment/infer.py", line 131, in infer
    indexed_example = self.tokenizer.create_model_input_seqs(
  File "/home/ec2-user/SageMaker/.persisted_conda/newssent/lib/python3.8/site-packages/NewsSentiment/dataset.py", line 753, in create_model_input_seqs
    target_mask_seq_for_text_with_special_tokens = self._create_target_mask(
  File "/home/ec2-user/SageMaker/.persisted_conda/newssent/lib/python3.8/site-packages/NewsSentiment/dataset.py", line 442, in _create_tar

In [None]:
pub = utils.publications["chinadaily"]
maindf = utils.timeit(run, pub, target, "train", bucket)
print(maindf.head())

2022-07-08 20:46:06,196 [INFO] __main__: working on 9608 entities
2022-07-08 20:46:06,240 [INFO] __main__: working on 3609 documents


 38%|███▊      | 3699/9608 [1:00:46<1:35:40,  1.03it/s]

In [None]:
pub = utils.publications["scmp"]
maindf = utils.timeit(run, pub, target, "test", bucket)
print(maindf.head())

In [None]:



# %%

# %%
pub = utils.publications["globaltimes"]
maindf = utils.timeit(run, pub, target, "train", bucket)
print(maindf.head())
# %%
pub = utils.publications["scmp"]
maindf = utils.timeit(run, pub, target, "train", bucket)
print(maindf.head())
# %%

# sanity check the post removal works.
# y = maindf.sentence.str.contains("Post")
# y.apply(lambda d: print(d.debug, "\n", d.sentence, "\n new \n"), axis=1)

# %%
# GROUP APPROACH SLIGHTLY SLOWER? ########################################
# groups = ner_target.groupby("Art_id")
# dfls = []
# groups = ner_target.groupby("Art_id")
# for name, group in tqdm(list(groups)):
#     # if name == 876073:
#         print(name)
#         doc = nlp(
#             df_target.loc[name]["Body"],
#             disable=["tagger", "parser", "attribute_ruler", "lemmatizer", "ner"],
#         )
# mask = group.entity.str.lower().str.contains("alibaba")
# filtered = group[mask]
# if len(filtered) > 0:
# res = group.progress_apply(lambda row: get_sentiment(row, doc, pub), axis=1)
# resdf = pd.json_normalize(res)
# dfls.append(resdf)
# maindf = pd.concat(dfls)
# len(list(groups))
# %%
# name
# group
# dfls = []

# ner_baba
# ner_target.Art_id.eq(876073).value_counts()
# for name, group in list(groups)[:5]:
#     doc = nlp(
#         df_target.loc[name]["Body"],
#         disable=["tagger", "parser", "attribute_ruler", "lemmatizer", "ner"],
#     )
#     # mask = group.entity.str.lower().str.contains("alibaba")
#     # filtered = group[mask]
#     # if len(filtered) > 0:
#     res = group.apply(lambda row: get_sentiment(row, doc), axis=1)
#     resdf = pd.json_normalize(res)
#     dfls.append(resdf)
# # takes 30 seconds to run on 5 groups which is 18 entities

# very slow for now...
# but does get sentiment for each relevant entity;
# seems better to filter for baba before grouping bc group operations suck.
# maindf = pd.concat(dfls)
# maindf
# upload to s3 here.

# %%

# import logging

# import pandas as pd
# import neuralcoref
# %%
# COREFF APPORACH OLD######################################
# import spacy

# logging.basicConfig(level=logging.INFO)
# nlp = spacy.load("en_core_web_md")
# neuralcoref.add_to_pipe(nlp)
# #%%
# doc = nlp("My sister has a dog. She loves him.")
# nlp.remove_pipe(name="neuralcoref")
# coref = neuralcoref.NeuralCoref(
#     nlp.vocab, conv_dict={"Lam": ["woman", "Carrie", "executive"]}
# )
# nlp.add_pipe(coref, name="neuralcoref")
# doc = nlp(
#     "Carrie Lam passed the extradition bill, which Ted Hui said will ruin Hong Kong. Lam disagrees with him."
# )
# doc._.has_coref
# doc._.coref_clusters[1].main.text  # .mentions[0].text
# doc._.coref_scores
# doc._.coref_resolved

# #%%
# df = pd.read_csv(r"C:\Users\tlebr\OneDrive - pku.edu.cn\Thesis\data\scmp\2021.csv")
# row = df.iloc[3]
# r
# doc = nlp(row.Body)

# ppl = [ent for ent in doc.ents if ent.label_ == "PERSON"]
# ppl[-1].start
# ppl[-1].end
# sent_start = ppl[-1].sent.start
# off_start = ppl[-1].start - sent_start
# off_end = ppl[-1].end
# ppl[-1].sent
# [p.sent for p in ppl][-1].start
# [0]
# # .start
# # [1]

# doc.ents[0].label_ == "PERSON"
# print(x.Body)
maindf.tail()[["negative", "positive", "neutral"]]
maindf.tail().debug.apply(print)
