In [1]:
import json
import re
import spacy

import pandas as pd
import numpy as np

from spacy.pipeline import EntityRuler
from nlp.company_7.task_solution.scripts.keywords import keywords

In [8]:
# Load the raw rows; the "id" column of the CSV becomes the DataFrame index.
df = pd.read_csv("../data/data.csv.gz",
                 index_col="id")

In [9]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0_level_0,url,email,json,title,first_name,last_name,academic_title,department,school,processed,created_at,updated_at
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,https://www.abac.edu/,vfenn@abac.edu,"{""left"": "" the winner. The number on each ball...",,,,,,,,2019-09-16 11:37:24,2020-02-06 03:33:24
2,https://www.abac.edu/,bray@abac.edu,"{""left"": ""er person and can be purchased onlin...",,,,,,,,2019-09-16 11:37:24,2020-02-06 03:33:24
3,https://www.abac.edu/,admissions@abac.edu,"{""left"": ""ty, Prince Automotive Group, Rotary ...",,,,,,,,2019-09-16 11:37:24,2020-02-06 03:33:24
4,https://www.abac.edu/,webmaster@abac.edu,"{""left"": ""mics\nRegistrar\nTranscript Request\...",,,,,,,,2019-09-16 11:37:24,2020-02-06 03:33:24
5,https://www.alu.edu/,admissions@alu.edu,"{""left"": ""Abraham Lincoln University & Online ...",,,,,,,,2019-09-16 11:37:24,2020-02-06 03:33:24


In [10]:
# Main pipeline: the large English model with "tagger" and "parser" disabled;
# the code below only reads doc.ents from it.
nlp_main = spacy.load("en_core_web_lg", disable=["tagger", "parser"])
# Helper pipeline: the small English model with "parser" and "ner" disabled;
# is_all_propn reads token.pos_ from the docs it produces.
nlp_helper = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [11]:
# Flatten every row's JSON payload into one single-line string of text.
strings = []

for jsn in df["json"]:
    d = json.loads(jsn)
    # Join the fields with a space. Every value is stripped first, so gluing
    # the pieces together with "" would fuse the last word of one field with
    # the first word of the next (e.g. "...Smith" + "Professor..." ->
    # "SmithProfessor"), hiding both from the tokenizer, the PERSON regex and
    # the keyword patterns.
    s = " ".join(v.strip()
                 .replace("\n", " ")
                 .replace("\t", " ")
                 for v in d.values())
    strings.append(s)

# Keep the flattened texts aligned with the original row ids.
s_strings = pd.Series(strings, index=df.index)

In [12]:
def is_all_propn(st):
    """Return True when every token of every word in ``st`` is tagged PROPN.

    The string is split on single spaces and each word is tagged on its own
    with ``nlp_helper`` (i.e. without sentence context).

    Parameters
    ----------
    st : str
        Candidate text, typically the surface form of a PERSON entity.

    Returns
    -------
    bool
        ``True`` only if there is at least one word and all resulting tokens
        carry the ``"PROPN"`` part-of-speech tag; ``False`` otherwise,
        including for empty or space-only input.
    """
    # Empty fragments (from leading/trailing/double spaces) produce no tokens.
    words = [w for w in st.split(" ") if w]
    if not words:
        # all() over nothing is vacuously True; an empty string must not be
        # reported as "all proper nouns".
        return False
    # Generator form stops tagging as soon as one non-PROPN token is seen.
    return all(tok.pos_ == "PROPN"
               for w in words
               for tok in nlp_helper(w))


def correct_person_entities(nlp_doc):
    """Pipeline component that prunes implausible PERSON entities.

    A PERSON entity survives only if its text is shaped like
    ``First [M[.]] Last`` (capitalised words, optional middle initial) and
    ``is_all_propn`` accepts it. Entities with any other label are kept
    untouched. The document's ``ents`` are replaced and the same document
    object is returned.
    """
    def _keep(entity):
        if entity.label_ != "PERSON":
            return True
        # Cheap shape check first; the POS check only runs for shape matches.
        shape_ok = re.search(r"^([A-Z][\w]+\s[A-Z]?\.?\s?[A-Z][\w]+)$",
                             entity.text)
        return bool(shape_ok and is_all_propn(entity.text))

    nlp_doc.ents = [entity for entity in nlp_doc.ents if _keep(entity)]
    return nlp_doc

In [13]:
# One token-level pattern per keyword phrase: the mapping key is used as the
# entity label and every space-separated word of the phrase is matched
# case-insensitively through the LOWER token attribute.
patterns = [
    {
        "label": label,
        "pattern": [{"LOWER": word.lower()} for word in phrase.split(" ")],
    }
    for label, phrases in keywords().items()
    for phrase in phrases
]

In [14]:
# Build an EntityRuler bound to nlp_main and load the keyword patterns into it.
ruler = EntityRuler(nlp_main)
ruler.add_patterns(patterns)

In [15]:
# Register the keyword ruler ahead of the "ner" component and the PERSON
# filter right after it.
# NOTE(review): passing component objects directly to add_pipe looks like the
# spaCy v2 API — confirm the pinned spaCy version before upgrading.
nlp_main.add_pipe(ruler, before="ner")
nlp_main.add_pipe(correct_person_entities, after="ner")

In [None]:
# For every row that mentions both a PERSON and an academic_title, keep the
# (person, title) pair whose start offsets lie closest together.
data_ls = []

# nlp_main.pipe streams the texts through the pipeline in batches, which is
# much cheaper than calling nlp_main(...) once per row; zip keeps each
# resulting doc paired with its row id.
for i, doc in zip(s_strings.index, nlp_main.pipe(s_strings.tolist())):
    # Partition the entities once instead of re-scanning doc.ents per list.
    persons = [e for e in doc.ents if e.label_ == "PERSON"]
    titles = [e for e in doc.ents if e.label_ == "academic_title"]
    if not persons or not titles:
        continue

    nx = np.asarray([e.start_char for e in persons])
    ty = np.asarray([e.start_char for e in titles])

    # diff_arr[p, t] = |start of title t - start of person p|
    diff_arr = np.abs(ty - nx[:, np.newaxis])
    # argmin returns the first minimum in row-major order, i.e. the earliest
    # person and then the earliest title on ties.
    p_idx, t_idx = np.unravel_index(np.argmin(diff_arr), diff_arr.shape)
    data_ls.append((i, persons[p_idx].text, titles[t_idx].text))

In [None]:
# Collect the (id, name, title) tuples into a frame keyed by the row id.
data_df = pd.DataFrame(
    data_ls, columns=["id", "name", "academic_title"]
).set_index("id")


def remove_middle_name(nn):
    """Reduce a full name to ``"First Last"``.

    The name is split on any run of whitespace (so tabs or non-breaking
    spaces separate tokens just like plain spaces) and everything between the
    first and the last token is dropped.

    Parameters
    ----------
    nn : str
        Full name, e.g. ``"John A. Smith"``.

    Returns
    -------
    str
        First and last token joined by a single space. A name with fewer
        than two tokens is returned as-is (whitespace-trimmed) rather than
        being duplicated into ``"Name Name"``.
    """
    parts = nn.split()
    if len(parts) < 2:
        # Nothing to drop; avoid emitting the single token twice.
        return " ".join(parts)
    return " ".join((parts[0], parts[-1]))


# Collapse every matched name to its first and last token.
data_df.loc[:, "name"] = data_df["name"].map(remove_middle_name)

# Take the first and last whitespace-separated token explicitly. Assigning
# str.split(expand=True) to exactly two columns raises as soon as any name
# yields a different number of tokens, or when data_df is empty.
name_tokens = data_df["name"].str.split()
data_df["first_name"] = name_tokens.str[0]
data_df["last_name"] = name_tokens.str[-1]

# Normalise the matched title text to Title Case.
data_df.loc[:, "academic_title"] = data_df["academic_title"].str.title()

In [None]:
# Copy the extracted values into df in place; both frames are indexed by "id".
# NOTE(review): presumably only columns that already exist in df are updated,
# so data_df's "name" column would be ignored — confirm against the
# DataFrame.update documentation.
df.update(data_df)
df.head()

In [None]:
# Fraction of rows that have a non-missing first_name after the update.
print(1 - df["first_name"].isna().mean())

In [None]:
# Turn the "id" index back into a regular column before exporting.
df.reset_index(inplace=True)
# index=False: after reset_index the index is only a positional counter, and
# writing it would add a spurious unnamed first column next to "id".
df.to_csv("../results/data_new.csv.gz", index=False)

In [16]:
def get_data(line):
    """Extract the closest (person, academic title) pair from one text.

    The text is run through ``nlp_main``. If the result contains at least one
    PERSON entity and at least one academic_title entity, the pair whose
    start character offsets are nearest to each other is returned as
    ``(person_text, title_text)``; on ties the earliest person, then the
    earliest title, wins. Otherwise the function returns ``None``.
    """
    l_doc = nlp_main(line)
    persons = [ent for ent in l_doc.ents if ent.label_ == "PERSON"]
    ranks = [ent for ent in l_doc.ents if ent.label_ == "academic_title"]
    if not (persons and ranks):
        return None

    person_pos = np.asarray([ent.start_char for ent in persons])
    rank_pos = np.asarray([ent.start_char for ent in ranks])

    # gaps[p, r] = distance between the starts of person p and title r
    gaps = np.abs(rank_pos - person_pos[:, np.newaxis])
    # Flat argmin picks the first minimum in row-major order.
    p_idx, r_idx = np.unravel_index(np.argmin(gaps), gaps.shape)
    return (persons[p_idx].text, ranks[r_idx].text)


# Time one call of get_data on the text stored under index label 1.
%timeit get_data(s_strings[1])

123 ms ± 13.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
