In [1]:
import re
import json
from collections import defaultdict
from difflib import SequenceMatcher
import pandas as pd
from tqdm.auto import tqdm

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [2]:
sentences = pd.read_csv("data_pp/sentences.csv")
sentences_annotated = pd.read_csv("data_pp/sentences_annotated.csv")
collections = pd.read_csv("data_pp/collections.csv")
entries_raw = pd.read_csv("data/entries_raw.csv")

In [3]:
# 1. entries_raw: remove "$/[.+/] " from the begining of entries
def entry_pp(s):
    if not isinstance(s, str):
        return s
    s = re.sub(r"^\[.+\] ", "", s).strip()
    s = re.sub(r"^\(.+\) ", "", s).strip()
    return s
entries_raw["modified_excerpt_text_clean"] = entries_raw["modified_excerpt_text"].apply(entry_pp)
entries_raw = entries_raw[~entries_raw["modified_excerpt_text_clean"].isna()]

In [4]:
# 2. sentences: remove sentences with tagged=True
# deactivated
unannotated_sentences = sentences_annotated[~sentences_annotated["is_selected"]][["lead_id", "excerpt_id"]]
unannotated_sentences["unique_id"] = unannotated_sentences['lead_id'].astype(str) + "_" + unannotated_sentences['excerpt_id'].astype(str)
sentences["unique_id"] = sentences['lead_id'].astype(str) + "_" + sentences['excerpt_id'].astype(str)
#sentences_to_be_matched = sentences[sentences["unique_id"].isin(unannotated_sentences["unique_id"])]
sentences_to_be_matched = sentences

In [5]:
# 3. unannotated_sentences: remove sentences with
#    word count < 4
#    meaningless content, e.g. Low Int, random strings, etc.

def pp(s):
    if not isinstance(s, str) or not s.strip():
        return False
    split = s.split()
    if len(split) < 3:
        return False
    if re.match(r"((Low )|(High )|(Low High )|(High Low ))+Int\.", s):
        return False
    return True

sentences_to_be_matched["keep"] = sentences_to_be_matched["excerpt_text"].map(pp)
sentences_to_be_matched = sentences_to_be_matched[sentences_to_be_matched["keep"]]
sentences_to_be_matched.shape

(354334, 10)

In [6]:
# 4. sentences: remove sentences with doc_ids that do not exist in entries_raw["lead_id"]
sentences_to_be_matched = sentences_to_be_matched[sentences_to_be_matched["lead_id"].isin(entries_raw["lead_id"])]
sentences_to_be_matched.shape

(339549, 10)

In [7]:
# 5. sentences: for each sentence: fuzzymatch it entries that have the same doc_id
# take a sentence
threshold = 70
matches = []
for i, sentence_row in tqdm(enumerate(sentences_to_be_matched.itertuples()), total=len(sentences_to_be_matched)):
    sentence = sentence_row.excerpt_text # sentence_row[1]["excerpt_text"]
    doc_id = sentence_row.lead_id # sentence_row[1]["lead_id"]
    # extract entries with same doc_id
    candidates = entries_raw[entries_raw["lead_id"]==doc_id]
    candidates = candidates["modified_excerpt_text_clean"].to_dict()
    matching_entry_text, ratio, matching_entry_id = process.extractOne(sentence, candidates, scorer=fuzz.token_set_ratio)
    # matching_entry_text, ratio, matching_entry_id = process.extractOne(sentence, candidates, scorer=fuzz.partial_ratio)
    # matching_entry_text, ratio, matching_entry_id = process.extractOne(sentence, candidates, scorer=fuzz.)
    if ratio >= threshold:
        matches.append((sentence_row[0], matching_entry_id, ratio))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=339549.0), HTML(value='')))




In [8]:
long = 0
short = 0
long_matches = []
short_matches = []
shortest_contiguous_sequence_length = 4
for orig_i, mod_i, _ in tqdm(matches):
    s1 = sentences_to_be_matched.loc[orig_i, "excerpt_text"]
    s2 = entries_raw.loc[mod_i, "modified_excerpt_text_clean"]
    if len(s2) > len(s1):
        s1, s2 = s2, s1
    match = SequenceMatcher(None, s1, s2, False).find_longest_match(0, len(s1), 0, len(s2))
    match_ratio = match.size/len(s2)
    match_len = len(s1[match.a: match.a + match.size].split())
    if match_len > shortest_contiguous_sequence_length:
        long += 1
        long_matches.append((orig_i, mod_i, match.size, match_ratio))
    else:
        short += 1
        short_matches.append((orig_i, mod_i, match.size, match_ratio))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=75792.0), HTML(value='')))




In [9]:
long, short #

(66919, 8873)

In [10]:
matches_dict = dict() # orig_sen --> mod_sen
for orig_sen_i, mod_sen_i, _, _ in long_matches:
    matches_dict[sentences.loc[orig_sen_i, "unique_id"]] = (orig_sen_i, mod_sen_i)

In [11]:
# unique id for sentences
sentences["unique_id"] = sentences['lead_id'].astype(str) + "_" + sentences['excerpt_id'].astype(str)
# unique id for sentences_annotated
sentences_annotated["unique_id"] = sentences_annotated['lead_id'].astype(str) + "_" + sentences_annotated['excerpt_id'].astype(str)

In [12]:
already_labeled = 0
sentences_annotated_new = sentences_annotated.copy()
for i, sentence_row in tqdm(enumerate(sentences_annotated.itertuples()), total=len(sentences_annotated)):
    unique_id = sentence_row.unique_id # sentence_row[1]["unique_id"]
    if unique_id in matches_dict:
        # fetch the annotation from entries_raw
        orig_sen_i, mod_sen_i = matches_dict[unique_id]
        assert sentences_annotated_new.loc[orig_sen_i, "unique_id"] == unique_id
        assert entries_raw.loc[mod_sen_i, "lead_id"] == sentences_annotated_new.loc[orig_sen_i, "lead_id"]
        if (entries_raw.loc[mod_sen_i, "label_sectors"] is not None) or (entries_raw.loc[mod_sen_i, "label_dimensions"] is not None):
            if sentences_annotated.loc[orig_sen_i, "is_selected"]:
                already_labeled += 1
            sentences_annotated_new.loc[orig_sen_i, "is_selected"] = True
            sentences_annotated_new.loc[orig_sen_i, "label_sectors"] = entries_raw.loc[mod_sen_i, "label_sectors"]
            sentences_annotated_new.loc[orig_sen_i, "label_dimensions"] = entries_raw.loc[mod_sen_i, "label_dimensions"]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=355715.0), HTML(value='')))




In [13]:
sentences_annotated_new.to_csv("data_pp/sentences_annotated_new.csv", index=None)

In [14]:
already_labeled

41620

In [15]:
len(long_matches)

66919

In [16]:
len(long_matches) - already_labeled

25299

In [17]:
new_labels_count = (~entries_raw["label_sectors"].isna() | ~entries_raw["label_dimensions"].isna()).sum()
new_labels_count

26830

In [18]:
sentences_annotated["is_selected"].sum() + new_labels_count

69476

In [19]:
sentences_annotated_new["is_selected"].sum() - sentences_annotated["is_selected"].sum()

25299

In [20]:
sentences_annotated_new["is_selected"].sum(), sentences_annotated["is_selected"].sum()

(67945, 42646)