### Offline Testing


1. Query the leads
2. Extract their text
3. Split into sentences with preprocessing
4. Classify the extracted sentences with the model
5. Match the sentences with entries created by human taggers
6. Group the output: Entry, Sentence, Original Tag, Predicted Tag 
- Post-processing: merge contiguous sentences with the same predicted tags.
- Add tagger names
7. Present the results with streamlit

In [1]:
import re
from nostril import nonsense
from urllib.parse import urlparse

import nltk
from nltk.tokenize import sent_tokenize

nltk.download("punkt")

# Thresholds read by preprocess_sentence: minimum number of tokens per sentence
# and the token length a token must exceed to count as "sensible".
MIN_NUM_TOKENS, MIN_WORD_LEN = 5, 4

# Case-insensitive URL pattern (scheme://, www., or domain.tld/ prefixes, with
# balanced-parenthesis handling); compiled once and reused for every sentence.
url_regex = re.compile(
    r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
)


def preprocess_sentence(sentence):
    """Clean one sentence, or return "" when it is too short or mostly noise.

    A sentence is rejected (empty string returned) when it has fewer than
    MIN_NUM_TOKENS space-separated tokens, or fewer than MIN_NUM_TOKENS
    "sensible" tokens. A token is sensible when it is longer than
    MIN_WORD_LEN characters and, if longer than 7 characters, is not flagged
    as gibberish by nostril's ``nonsense``.

    Accepted sentences have URLs removed, every character that is neither a
    word character nor in the punctuation allow-list dropped, whitespace
    runs collapsed to one space, and surrounding whitespace stripped.
    """

    def _is_sensible(token):
        # The previous condition was
        #   len(token) > MIN_WORD_LEN or (len(token) > 7 and not nonsense(token))
        # whose second clause could never change the result, so the gibberish
        # check was dead code. The length test now gates the nonsense test.
        if len(token) <= MIN_WORD_LEN:
            return False
        if len(token) <= 7:
            return True
        try:
            return not nonsense(token)
        except ValueError:
            # NOTE(review): nostril is understood to raise ValueError when a
            # string has too few letters to judge (e.g. long numbers) --
            # confirm against the installed version. Such tokens are kept,
            # which matches how they were counted before.
            return True

    tokens = sentence.split(" ")
    if len(tokens) < MIN_NUM_TOKENS:
        return ""
    sensible_token_count = sum(1 for token in tokens if _is_sensible(token))
    if sensible_token_count < MIN_NUM_TOKENS:
        return ""
    sentence = url_regex.sub("", sentence)
    # Punctuation that survives cleaning; escaped so it is safe inside [...].
    keep = re.escape("/\\$.:,;-_()[]{}!'\"% ")
    sentence = re.sub(r"[^\w" + keep + "]", "", sentence)
    sentence = re.sub(r"\s+", " ", sentence)
    return sentence.strip()


def page_to_sentences(page):
    """Turn a page of raw text into a list of preprocessed sentences.

    Whitespace runs (including newlines) are collapsed to single spaces
    before sentence tokenisation. Every tokenised sentence is passed through
    preprocess_sentence, so rejected sentences show up as "" in the result.
    """
    flattened = re.sub(r"\s+", " ", page)
    return [preprocess_sentence(raw) for raw in sent_tokenize(flattened)]

[nltk_data] Downloading package punkt to /home/abdullah/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
import requests
from bs4 import BeautifulSoup

# Parent tag names whose text nodes are skipped by extract_html_body.
# there may be more elements you don't want, such as "style", etc.
blacklist = [
    "[document]", "noscript", "header", "html",
    "meta", "head", "input", "script",
]


def extract_html_body(url, timeout=30):
    """Download a web page and return its visible text as one string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get``. Without a timeout a
            stalled server would block the notebook indefinitely.

    Returns:
        Every text node whose parent tag is not in ``blacklist``, each
        followed by a single space, concatenated in document order.

    Raises:
        requests.exceptions.RequestException: on connection errors or when
            the timeout elapses.
    """
    res = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(res.content, 'html.parser')
    # join() builds the result in one pass instead of repeated string +=.
    return ''.join(
        '{} '.format(node)
        for node in soup.find_all(text=True)
        if node.parent.name not in blacklist
    )


def html_to_sentences(url):
    """Fetch a page and return its preprocessed sentences without repeats.

    The first occurrence of each sentence is kept and the original order is
    preserved.
    """
    page_sentences = page_to_sentences(extract_html_body(url))
    # dict keys keep insertion order, so this is an order-preserving de-dup.
    return list(dict.fromkeys(page_sentences))

In [3]:
from io import StringIO
from pdfminer.layout import LAParams
from pdfminer.converter import TextConverter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

def pdf_to_sentences(file_path):
    """Extract the preprocessed, de-duplicated sentences of a PDF file.

    Each page is rendered to text with pdfminer, split with
    page_to_sentences, and the per-page results are concatenated. Repeated
    sentences are dropped, keeping the first occurrence and its position.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        List of sentence strings in first-seen order.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    sentences = []
    try:
        with open(file_path, "rb") as fp:
            # Process each page contained in the document.
            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)
                sentences.extend(page_to_sentences(retstr.getvalue()))
                # Empty the buffer so the next iteration sees only its own
                # page. Previously the buffer kept growing, so every page
                # re-tokenised the text of all earlier pages.
                retstr.seek(0)
                retstr.truncate(0)
    finally:
        device.close()
        retstr.close()

    # dict keys keep insertion order, so this is an order-preserving de-dup.
    return list(dict.fromkeys(sentences))

In [5]:
import pandas as pd
# Raw lead and entry exports (the file names suggest a 04-06-2021 to
# 14-06-2021 window -- TODO confirm), loaded in the order leads, entries.
leads, entries = (
    pd.read_csv(csv_name)
    for csv_name in (
        "leads_from_04062021_to_14062021.csv",
        "entries_from_04062021_to_14062021.csv",
    )
)

In [6]:
# Keep only the lead columns used further down; all rows are retained.
leads = leads.loc[:, [
    'id',
    'created_at',
    'title',
    'status',
    'url',
    'number_of_entries',
    'tagger_name',
    'analysis_framework_title',
    'project_title',
]]

In [7]:
import os
# Local directory for PDF leads (presumably the download target -- verify
# against the cells that call download_file).
PDF_DIR = "./pdf_leads"
# exist_ok=True keeps the cell re-runnable and removes the gap between the
# existence check and the creation.
os.makedirs(PDF_DIR, exist_ok=True)

In [8]:
import urllib.request


def download_file(download_url, file_path):
    """Download ``download_url`` and write the raw bytes to ``file_path``.

    The response is closed when the transfer finishes (or fails), and the
    payload is streamed to disk in chunks rather than read fully into memory.

    Args:
        download_url: URL to fetch; anything ``urllib.request.urlopen`` accepts.
        file_path: Destination path; an existing file is overwritten.

    Raises:
        urllib.error.URLError: if the URL cannot be opened.
        OSError: if the destination cannot be written.
    """
    import shutil  # local import: only this function needs it

    # The URL is opened first, so a failed request leaves no empty file behind.
    with urllib.request.urlopen(download_url) as response:
        with open(file_path, 'wb') as f:
            shutil.copyfileobj(response, f)


### Match

In [9]:
import re
import json
from collections import defaultdict
from difflib import SequenceMatcher
import pandas as pd
from tqdm.notebook import tqdm

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [10]:
# Model predictions (read below via the sentence, lead_id, sectors and pillars
# columns) and the human-created entries they are matched against.
sentences, entries = (
    pd.read_csv(csv_name)
    for csv_name in (
        "paraphrase-multilingual-mpnet-base-v2_secotrs-and-pillars_preds.csv",
        "entries_from_04062021_to_14062021.csv",
    )
)

In [11]:
# Inspect the columns available on the entries export before cleaning/matching.
entries.columns

Index(['id', 'created_at', 'modified_at', 'excerpt', 'image_raw',
       'analysis_framework_id', 'created_by_id', 'lead_id', 'modified_by_id',
       'entry_type', 'information_date', 'order', 'client_id', 'project_id',
       'tabular_field_id', 'dropped_excerpt', 'highlight_hidden', 'verified',
       'verification_last_changed_by_id', 'image_id', 'tagger_name',
       'project_title', 'analysis_framework_title'],
      dtype='object')

In [12]:
def entry_pp(s):
    """Remove leading "[...] " prefixes from an excerpt and trim whitespace.

    One or more consecutive bracketed groups at the very start of the text
    are removed. Brackets that appear later in the text are left alone; the
    previous greedy pattern ran to the last "] " in the string and deleted
    the real text in between.

    Args:
        s: Excerpt text. Non-string values (e.g. NaN) are returned unchanged.

    Returns:
        The cleaned string, or ``s`` itself when it is not a string.
    """
    if not isinstance(s, str):
        return s
    # ".+?" stops at the first "] "; the outer "+" handles stacked prefixes.
    return re.sub(r"^(?:\[.+?\] )+", "", s).strip()


# Add the cleaned excerpt, then keep only rows whose raw excerpt is present
# and non-empty (both conditions applied in a single mask).
entries["excerpt_clean"] = entries["excerpt"].apply(entry_pp)
entries = entries[entries["excerpt"].notna() & entries["excerpt"].ne("")]
##

In [13]:
# Drop rows whose sentence is missing or empty (single combined mask).
sentences = sentences[sentences["sentence"].notna() & sentences["sentence"].ne("")]

In [14]:
# For each predicted sentence, fuzzy-match it against the entries that share
# its lead_id and keep the best candidate when it reaches the threshold.
threshold = 70
matches = []

# Group candidate excerpts by lead once, instead of re-filtering the whole
# entries frame for every sentence. Each inner dict maps an entries index
# label to its cleaned excerpt, in the frame's original row order, so
# extractOne hands that index label back as the third item of its result.
candidates_by_lead = {
    group_lead_id: group["excerpt_clean"].to_dict()
    for group_lead_id, group in entries.groupby("lead_id")
}

for sentence_row in tqdm(sentences.itertuples(), total=len(sentences)):
    sentence = sentence_row.sentence
    lead_id = sentence_row.lead_id
    # Leads without any entry get an empty candidate set, as before.
    candidates = candidates_by_lead.get(lead_id, {})
    matching_output = process.extractOne(sentence,
                                         candidates,
                                         scorer=fuzz.token_set_ratio)
    if matching_output is None:
        continue
    matching_entry_text, ratio, matching_entry_idx = matching_output
    if ratio >= threshold:
        # sentence_row[0] is the sentences index label of this row.
        matches.append((sentence_row[0], matching_entry_idx, ratio))

HBox(children=(FloatProgress(value=0.0, max=13850.0), HTML(value='')))




In [15]:
# Split the fuzzy matches by the length of their longest common substring:
# more than 5 whitespace-separated words is "long", anything else "short".
# Each record is (sentence index, entry index, common length in characters,
# common length / length of the shorter text).
long_matches, short_matches = [], []
for sentence_idx, entry_idx, _score in tqdm(matches):
    longer = sentences.loc[sentence_idx, "sentence"]
    shorter = entries.loc[entry_idx, "excerpt_clean"]
    if len(shorter) > len(longer):
        longer, shorter = shorter, longer
    matcher = SequenceMatcher(None, longer, shorter, autojunk=False)
    common = matcher.find_longest_match(0, len(longer), 0, len(shorter))
    shared_text = longer[common.a:common.a + common.size]
    record = (sentence_idx, entry_idx, common.size, common.size / len(shorter))
    if len(shared_text.split()) > 5:
        long_matches.append(record)
    else:
        short_matches.append(record)
long, short = len(long_matches), len(short_matches)

HBox(children=(FloatProgress(value=0.0, max=1572.0), HTML(value='')))




In [17]:
# Labeled export; the sectors, pillars, tagger_name, project_title and
# analysis_framework_title columns are looked up per entry_id further down.
entries_labeled = pd.read_csv("data_exported.csv")
# Display the column names to confirm what is available for that lookup.
entries_labeled.columns

Index(['entry_id', 'lead_id', 'project_id', 'project_title',
       'analysis_framework_id', 'analysis_framework_title', 'excerpt',
       'dropped_excerpt', 'created_by_id', 'tagger_name', 'modified_by_id',
       'verified', 'verification_last_changed_by_id', 'sectors', 'pillars',
       'subpillars'],
      dtype='object')

In [18]:
# Build one output row per (sentence, matched entry) pair, attaching the human
# tags from entries_labeled and the lead URL from leads.

# First labeled row per entry_id, indexed for direct lookup. This selects the
# same row as the previous `.loc[mask, col].iloc[0]` without rescanning the
# whole column five times per match.
labeled_by_entry_id = entries_labeled.drop_duplicates("entry_id").set_index(
    "entry_id")

ready_matches = []
for sen_idx, ent_idx, score in matches:

    entry = entries.at[ent_idx, "excerpt_clean"]
    ent_id = entries.at[ent_idx, "id"]
    if ent_id in labeled_by_entry_id.index:
        labeled_row = labeled_by_entry_id.loc[ent_id]
        orig_sectors = labeled_row["sectors"]
        orig_pillars = labeled_row["pillars"]
        tagger_name = labeled_row["tagger_name"]
        project_title = labeled_row["project_title"]
        analysis_framework_title = labeled_row["analysis_framework_title"]
    else:
        # Reset every per-entry field. Previously only sectors/pillars were
        # reset, so the other three kept the previous iteration's values
        # (or raised NameError on the first iteration).
        orig_sectors = None
        orig_pillars = None
        tagger_name = None
        project_title = None
        analysis_framework_title = None

    sentence = sentences.at[sen_idx, "sentence"]
    p_sectors = sentences.at[sen_idx, "sectors"]
    p_pillars = sentences.at[sen_idx, "pillars"]

    lead_id = sentences.at[sen_idx, "lead_id"]
    lead_url = leads.loc[leads["id"].eq(lead_id), "url"].iloc[0]
    # NOTE(review): matches is already filtered at threshold (70) where it is
    # built, so this branch only fires if that threshold is lowered below 70.
    if score < 70:
        # lead_url, entry, original_sector, original_pillar, matched_sentence, predicted_sector, predicted_pillar
        ready_matches.append(
            (lead_url, analysis_framework_title, project_title, None, None,
             None, None, sentence, p_sectors, p_pillars))
    else:
        ready_matches.append((lead_url, analysis_framework_title,
                              project_title, entry, tagger_name, orig_sectors,
                              orig_pillars, sentence, p_sectors, p_pillars))
ready_matches = pd.DataFrame(ready_matches,
                             columns=[
                                 "Lead URL", "Analysis Framework Title",
                                 "Project Title", "Entry", "Tagger Name",
                                 "Sectors", "Pillars", "Matched Sentence",
                                 "Predicted Sectors", "Predicted Pillars"
                             ])

In [20]:
from ast import literal_eval

In [21]:
def _parse_tag_cell(value):
    """Convert one tag cell from its CSV text form into a Python object.

    Strings are parsed with ``literal_eval``; missing values (None / NaN)
    become None; anything else (for example a list produced by an earlier run
    of this cell) is returned unchanged so the cell can be re-run safely.
    """
    if isinstance(value, str):
        return literal_eval(value)
    if isinstance(value, float) and value != value:  # NaN is the only float != itself
        return None
    return value


# Same four columns, in the same order, as the previous copy-pasted statements.
for tag_column in ("Pillars", "Sectors", "Predicted Pillars",
                   "Predicted Sectors"):
    ready_matches[tag_column] = ready_matches[tag_column].apply(_parse_tag_cell)

In [23]:
def merge_matched_sentences(group):
    """Collapse all rows matched to one entry into a single record.

    Args:
        group: DataFrame holding the ready_matches rows of one entry.

    Returns:
        dict with the entry metadata taken from the first row, the first
        non-missing human "Pillars" / "Sectors" value (None if all are
        missing), the matched sentences joined with ". ", and the predicted
        pillars / sectors of all rows merged without repeats.
    """

    def _first_present(column):
        # First non-missing value of the column, or None when all are missing.
        present = group[column].dropna()
        return present.iloc[0] if len(present) else None

    def _merged_tags(column):
        # Union of the per-row tag collections in first-seen order, so the
        # exported CSV is identical from run to run (a set's order is not).
        # Missing cells (None / NaN) are skipped instead of raising TypeError.
        merged = {}
        for tags in group[column]:
            if tags is None or (isinstance(tags, float) and tags != tags):
                continue
            merged.update(dict.fromkeys(tags))
        return list(merged)

    return {
        "Lead URL": group["Lead URL"].iloc[0],
        "Analysis Framework Title": group["Analysis Framework Title"].iloc[0],
        "Project Title": group["Project Title"].iloc[0],
        "Entry": group["Entry"].iloc[0],
        "Tagger Name": group["Tagger Name"].iloc[0],
        "Pillars": _first_present("Pillars"),
        "Sectors": _first_present("Sectors"),
        "Matched Sentences": ". ".join(group["Matched Sentence"]),
        "Predicted Pillars": _merged_tags("Predicted Pillars"),
        "Predicted Sectors": _merged_tags("Predicted Sectors")
    }


# One merged record per distinct "Entry" value, assembled into a frame whose
# columns are the keys returned by merge_matched_sentences.
grouped_matches = pd.DataFrame(
    ready_matches.groupby("Entry").apply(merge_matched_sentences).tolist()
)

In [24]:
# Write the merged comparison table without the row index column.
grouped_matches.to_csv("entries_vs_sentences_tags.csv", index=False)