In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re, spacy

import string
from pandarallel import pandarallel

2023-07-03 14:45:42.451462: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-03 14:45:42.561925: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-07-03 14:45:43.043541: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.8/lib64${LD_LIBRARY_PATH:+:/usr/lib/mesa-diverted/x86_64-linux-gnu:/

In [8]:
train_df = pd.read_csv('../Data/train.csv')
test_df = pd.read_csv('../Data/test.csv')

In [9]:
column_rename = {'first_party': 'fp',
                 'second_party': 'sp',
                 'first_party_winner': 'label'}

train_df.rename(columns=column_rename, inplace=True)
test_df.rename(columns=column_rename, inplace=True)

In [10]:
train_df

Unnamed: 0,ID,fp,sp,facts,label
0,TRAIN_0000,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",1
1,TRAIN_0001,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,0
2,TRAIN_0002,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,1
3,TRAIN_0003,Linkletter,Walker,Victor Linkletter was convicted in state court...,0
4,TRAIN_0004,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",1
...,...,...,...,...,...
2473,TRAIN_2473,"HollyFrontier Cheyenne Refining, LLC, et al.","Renewable Fuels Association, et al.",Congress amended the Clean Air Act through the...,1
2474,TRAIN_2474,"Grupo Mexicano de Desarrollo, S. A.","Alliance Bond Fund, Inc.","Alliance Bond Fund, Inc., an investment fund, ...",1
2475,TRAIN_2475,Peguero,United States,"In 1992, the District Court sentenced Manuel D...",0
2476,TRAIN_2476,Immigration and Naturalization Service,St. Cyr,"On March 8, 1996, Enrico St. Cyr, a lawful per...",0


In [11]:
legal_verbs = {'argue': 740,
               'violate': 651,
               'sue': 526,
               'allege': 468,
               'convict': 436,
               'claim': 393,
               'accuse': 391,
               'implicate': 387,}

In [104]:
fp_name = 'The First Party'
sp_name = 'The Second Party'


def regex_cleaning(fact_subj):
    semi_punctuation = re.sub(rf'["\']', '', string.punctuation)
    fact_subj = fact_subj.replace('’', "'")
    fact_subj = fact_subj.replace('‘', "'")
    fact_subj = fact_subj.replace('“', '"')
    fact_subj = fact_subj.replace('”', '"')
    fact_subj = fact_subj.replace('–', '-')
    fact_subj = re.sub(rf"[\d\s]+", r' ', fact_subj)
    fact_subj = re.sub(rf"({fp_name} ?)+", f'{fp_name} ', fact_subj)
    fact_subj = re.sub(rf"({sp_name} ?)+", f'{sp_name} ', fact_subj)
    # fact_subj = re.sub(rf"([\"'])\1+ ", r'\1', fact_subj)
    fact_subj = re.sub(rf"([\"'])\1? +([\w{semi_punctuation}]+)\2? +([\"'])\3?", r'\1\2\3', fact_subj)
    fact_subj = re.sub(rf" +([{semi_punctuation}])", r'\1', fact_subj)
    fact_subj = re.sub(rf" 's ", "'s ", fact_subj)
    fact_subj = re.sub(r'([^\w\s])\1+', r'\1', fact_subj)
    
    return fact_subj.strip()

def inspect_token(fact_tokens):
    fp_name_list = [n.lower() for n in fp.replace(',', '').split() if len(n) > 1]
    sp_name_list = [n.lower() for n in sp.replace(',', '').split() if len(n) > 1]

    fp_name_list_added = fp_name_list.copy()
    sp_name_list_added = sp_name_list.copy()

    for n in fp_name_list:
        changed_name = re.findall(rf"{n} ?\([a-z]+\)", fact_tokens.text.lower())
        if changed_name:
            fp_name_list_added.extend([re.sub(rf'({n}|[ {string.punctuation}])', '', cn) for cn in changed_name])

    for n in sp_name_list:
        changed_name = re.findall(rf"{n} ?\([a-z]+\)", fact_tokens.text.lower())
        if changed_name:
            sp_name_list_added.extend([re.sub(rf'({n}|[ {string.punctuation}])', '', cn) for cn in changed_name])

    fp_name_list = fp_name_list_added
    sp_name_list = sp_name_list_added

    def is_in_namelist(token):
        if 'NN' in token.tag_ or 'PROPN' in token.pos_:
            if token.text.lower() in fp_name_list:
                return fp_name
            elif token.text.lower() in sp_name_list:
                return sp_name
        return token.text

    def is_has_legal_verb(token, strong=False):
        if 'NN' in token.tag_ or 'PROPN' in token.pos_:
            head = token.head
            if head.text.lower() in fp_name_list:
                if token.pos_ == 'PROPN':
                    fp_name_list.append(token.text.lower())
                if strong:
                    return fp_name
            elif head.text.lower() in sp_name_list:
                if token.pos_ == 'PROPN':
                    sp_name_list.append(token.text.lower())
                if strong:
                    return sp_name
            elif 'VBD' in head.tag_ and head.lemma_ in legal_verbs:
                return is_in_namelist(token)
        return token.text

    before_len_fp = 0
    before_len_sp = 0

    fact_subj = [token.text for token in fact_tokens]

    while before_len_fp != len(fp_name_list) or before_len_sp != len(sp_name_list):
        for i, token in enumerate(reversed(fact_tokens)):
            party = is_in_namelist(token)
            if not party:
                party = is_has_legal_verb(token)
            fact_subj[len(fact_tokens) - i - 1] = party

        before_len_fp = len(fp_name_list)
        before_len_sp = len(sp_name_list)


    fact_subj = ' '.join(fact_subj)
    # fact_subj = re.sub(rf'[{string.punctuation}]+', r' ', fact_subj)
    # """ (.,?"!@#)"""
    return regex_cleaning(fact_subj)

In [105]:
nlp = spacy.load('en_core_web_sm')

In [106]:
idx = 1234
fp = train_df['fp'][idx]
sp = train_df['sp'][idx]
facts = train_df['facts'][idx]
fact_tokens = nlp(facts)

print('first_party: ', fp)
print('second_party: ', sp)
print('facts: ', facts)

first_party:  Carol Anne Bond
second_party:  United States
facts:  Carol Anne Bond was found guilty of trying to poison her husband's mistress, Myrlinda Haynes, with toxic chemicals at least 24 times over the course of several months. A grand jury in the Eastern District of Pennsylvania charged Bond with two counts of possessing and using a chemical weapon, in violation of a criminal statute implementing the treaty obligations of the United States under the 1993 Chemical Weapons Convention. The grand jury also charged Bond with two counts of mail theft. Bond's attorneys argue that the statute was intended to deal with rogue states and terrorists and that their client should have been prosecuted under state law instead. Bond, a laboratory technician, stole the chemical potassium dichromate from the company where she worked. Haynes was not injured. Bond's husband had a child with Haynes while married to Bond. Haynes had contacted police and postal authorities after finding the chemicals 

In [107]:
inspect_token(fact_tokens)

"The First Party was found guilty of trying to poison her husband's mistress, Myrlinda Haynes, with toxic chemicals at least times over the course of several months. A grand jury in the Eastern District of Pennsylvania charged The First Party with two counts of possessing and using a chemical weapon, in violation of a criminal statute implementing the treaty obligations of the The Second Party under the Chemical Weapons Convention. The grand jury also charged The First Party with two counts of mail theft. The First Party's attorneys argue that the statute was intended to deal with rogue The Second Party and terrorists and that their client should have been prosecuted under state law instead. The First Party, a laboratory technician, stole the chemical potassium dichromate from the company where she worked. Haynes was not injured. The First Party's husband had a child with Haynes while married to The First Party. Haynes had contacted police and postal authorities after finding the chemi

In [83]:
pandarallel.initialize(progress_bar=True, nb_workers=8)
train_df['fact_tokens'] = train_df['facts'].parallel_apply(nlp)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=310), Label(value='0 / 310'))), HB…

KeyboardInterrupt: 

In [27]:
test_df['fact_tokens'] = test_df['facts'].parallel_apply(nlp)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=155), Label(value='0 / 155'))), HB…

In [31]:
def change_fp_sp(text):
    removed_fp = re.sub(fp_name, 'TMP_SP', text)
    removed_sp = re.sub(sp_name, 'TMP_FP', removed_fp)
    changed_fp = re.sub('TMP_SP', sp_name, removed_sp)
    changed_sp = re.sub('TMP_FP', fp_name, changed_fp)
    return changed_sp

In [32]:
inspect_token(train_df['fact_tokens'][0])

'On June The First Party a candidate for public office made a television speech in Baton Rouge Louisiana During this speech The First Party accused his political opponent of being a Communist and of being involved in criminal activities with the head of the local Teamsters Union Finally The First Party implicated The Second Party an East Baton Rouge deputy sheriff in a scheme to move money between the Teamsters Union and The First Party ’s political opponent The Second Party successfully sued The First Party for defamation Louisiana ’s First Circuit Court of Appeals reversed holding that The Second Party did not show The First Party acted with “ malice ” The Second Party then appealed to the Supreme Court of Louisiana That court held that although public figures forfeit some of their First Amendment protection from defamation The First Party accused The Second Party of a crime with utter disregard of whether the remarks were true Finally that court held that the First Amendment protect

In [33]:
change_fp_sp(inspect_token(train_df['fact_tokens'][0]))

'On June The Second Party a candidate for public office made a television speech in Baton Rouge Louisiana During this speech The Second Party accused his political opponent of being a Communist and of being involved in criminal activities with the head of the local Teamsters Union Finally The Second Party implicated The First Party an East Baton Rouge deputy sheriff in a scheme to move money between the Teamsters Union and The Second Party ’s political opponent The First Party successfully sued The Second Party for defamation Louisiana ’s First Circuit Court of Appeals reversed holding that The First Party did not show The Second Party acted with “ malice ” The First Party then appealed to the Supreme Court of Louisiana That court held that although public figures forfeit some of their First Amendment protection from defamation The Second Party accused The First Party of a crime with utter disregard of whether the remarks were true Finally that court held that the First Amendment prote

In [34]:
train_df['facts'][0]

'On June 27, 1962, Phil St. Amant, a candidate for public office, made a television speech in Baton Rouge, Louisiana.  During this speech, St. Amant accused his political opponent of being a Communist and of being involved in criminal activities with the head of the local Teamsters Union.  Finally, St. Amant implicated Herman Thompson, an East Baton Rouge deputy sheriff, in a scheme to move money between the Teamsters Union and St. Amant’s political opponent. \nThompson successfully sued St. Amant for defamation.  Louisiana’s First Circuit Court of Appeals reversed, holding that Thompson did not show St. Amant acted with “malice.”  Thompson then appealed to the Supreme Court of Louisiana.  That court held that, although public figures forfeit some of their First Amendment protection from defamation, St. Amant accused Thompson of a crime with utter disregard of whether the remarks were true.  Finally, that court held that the First Amendment protects uninhibited, robust debate, rather t