# Using regexes to extract petitioner and respondent information from cases

In [1]:
import argparse
import json
import logging
import os
import regex as re
from difflib import get_close_matches

from bs4 import BeautifulSoup

In [113]:
def clean_names(adv_list, check=None):
    """Takes a list of advocate names, a dictionary of advocates with their
    assigned cases and a case text and adds the case to the appropriate
    advocate with the right prefix for petitioner or respondent
    """

    salutations = ['Mr', 'Ms', 'Mrs', 'Dr', 'Mr.', 'Mrs.', 'Ms.', 'Dr.']
    except_tokens = ["For", "CORAM", "Hon'ble", "Advocate", "Advocates"]
    cleaned_advs = []

    for adv in adv_list:
        adv = re.split(r',|\.|\s+', adv)

        adv = list(filter(None, adv))

        # Using replace instead of strip due to abbreviated names with
        adv = [token for token in adv
               if token[0].isupper() and token not in salutations and token not in except_tokens]

        if(len(adv) <= 1):
            continue
        cleaned_advs.append("".join(adv))
        
    if check is not None:
        return list(set([adv for adv in cleaned_advs if adv not in check]))
    
    return list(set(cleaned_advs))


In [64]:
def test(data_path, output_path, num=100):
    # Regex to extract names
    regex = r'(?:D|M)(?:r|s|rs)\.?\s*[A-Za-z.]+\s*[A-Za-z.]+(?:\s*[A-Za-z]+)?,?'
    # r'(?:D|M)(?:r|s|rs)\.?\s*[A-Za-z.]+\s*[A-Za-z.]+(?:\s+[A-Za-z]+)?,?'
    
    # Terms to be used to split the text into sections for petitioner and
    # plaintiff extraction
    petitioner_terms = ['Petitioner', 'Apellant', 'Appellant', 'Plaintiff']
    respondent_terms = ['Respondent']
    petitioner_term_regex = "|".join(petitioner_terms)
    respondent_term_regex = "|".join(respondent_terms)

    adv_cases = {}
    pet_cases = {}
    res_cases = {}

    # Iterating through the files to find advocate names of each case
    for fl in os.listdir(data_path)[:num]:

        filepath = os.path.join(data_path, fl)
        flname = os.path.splitext(fl)[0]

        print(f"Processing {flname}")
        print("="*40)
        with open(filepath, 'r') as f:
            raw = f.read()

        soup = BeautifulSoup(raw, 'html.parser')

        # For some documents, the name of the advocates(especially for the
        # respondents, is given in the first few paragraphs as opposed to the
        # pre div|

        try:
            filtered = soup.pre.text
        except Exception:
            print(
                (f"Case {flname} does not have a pre section."
                 "May not have advocate information"))
        for i in range(1, 4):
            try:
                para = soup.find("p", {"id": f"p_{i}"})
                filtered = filtered + para.text
            except Exception:
                # In the event the document is short and a certain paragraph
                # does not exist
                continue

        # Splitting the text to search through for Petitioners
        unique_combinations = [(pet, res)
                               for res in respondent_terms
                               for pet in petitioner_terms]

        # For getting the text containing petitioners' advocates
        petitioner_text = set()
        respondent_text = set()

        # Works with petitioners. Best to extract respondents as set difference
        petitioner_regex = (rf"(?<={petitioner_term_regex}s?)(?P<pet>.*?)versus.*?"
                            rf"(?={respondent_term_regex}s?)(?P<res>.*?)")
        print(petitioner_regex)
        r = re.compile(petitioner_regex, flags=re.I | re.DOTALL)
        # match = re.findall(
        #     petitioner_regex, filtered, flags=re.I | re.DOTALL)
        # petitioner_text.update(list(match))
        segs = [m.groupdict() for m in r.finditer(filtered)]
        for el in segs:
            for k, v in el.items():
                if k == 'res':
                    respondent_text.update([v])
                else:
                    petitioner_text.update([v])
                    
        # Alternate pattern
        petitioner_regex_alt = rf"(?<=For the {petitioner_term_regex}s?)(?P<pet>.*?)(?=For the {respondent_term_regex}s?)(?P<res>.*?)"
        print(petitioner_regex_alt)
        r_alt = re.compile(petitioner_regex_alt, flags=re.I | re.DOTALL)
        # match = re.findall(
        #     petitioner_regex, filtered, flags=re.I | re.DOTALL)
        # petitioner_text.update(list(match))
        segs = [m.groupdict() for m in r_alt.finditer(filtered)]
        for el in segs:
            for k, v in el.items():
                if k == 'res':
                    respondent_text.update([v])
                else:
                    petitioner_text.update([v])
        # print(petitioner_regex)
            
        print(petitioner_text, respondent_text)
        # Pruning
        petitioner_text = " ".join(petitioner_text)

        # Using regexes to find all names for a particular case
        all_names = re.findall(regex, filtered)
        advocates = set(all_names) # Eliminating any repetitions

        # Getting text that holds names of petitioners' advocates
        petitioner_names = re.findall(regex, petitioner_text)
        petitioners = set(petitioner_names)

        # Getting the respondents' advocate names as the set difference between
        # the total names extracted and those found as petitioners
        respondents = advocates.difference(petitioners)

        petitioners = list(petitioners)
        respondents = list(respondents)

        # Running the clean_names(...) method first for petitioners and then
        # for respondents
        petitioners = clean_names(petitioners)
        respondents = clean_names(respondents, check=petitioners)

        print(f"Found advocates: {clean_names(all_names)}.")
        print(f"Found petitioners: {petitioners}.")
        print(f"Found respondents: {respondents}.")

        adv_cases = update_dict(adv_cases,
                                [*petitioners, *respondents], flname)
        pet_cases = update_dict(pet_cases, petitioners, flname)
        res_cases = update_dict(res_cases, respondents, flname)

    i = 0
    advs = list(adv_cases.keys())
    print(f"A total of {len(advs)} advocates were found.")
    print("Removing and merging duplicates.")

    # Iterates through the dictionary to find duplicate advocate names
    # Forced to use a while loop with a manual iterating variable as variable
    # deletion takes place
    while(i < len(advs)):
        # Cleaning up the names of the advocates by removing punctutations and
        # spaces for easier similarity checking to remove redundancies by
        # gesalt pattern matching
        adv = advs[i]

        # get_close_matches uses gesalt pattern matching
        similar_advs = get_close_matches(
            adv, advs, n=10, cutoff=0.92)[1:]

        print(f"Advocates {similar_advs} found similar to {adv}.")

        # Checking that similar matches are found
        if(len(similar_advs) == 0):
            i += 1
            continue

        shorter = adv
        for s_adv in similar_advs:
            # If one of the matches have already been removed from the
            # dictionary, skip it
            if(adv_cases.get(s_adv, -1) == -1):
                continue

            # Merge two advocates if they have overlaps
            if (set(adv_cases[shorter]).intersection(
                    set(adv_cases[s_adv])) == set()):
                continue

            # Retaining the shorter name
            if(len(shorter) <= len(s_adv)):
                adv_cases[shorter].extend(adv_cases[s_adv])

                if(pet_cases.get(shorter, -1) != -1 and
                        pet_cases.get(s_adv, -1) != -1):
                    pet_cases[shorter].extend(pet_cases[s_adv])
                    del pet_cases[s_adv]

                if(res_cases.get(shorter, -1) != -1 and
                        res_cases.get(s_adv, -1) != -1):
                    res_cases[shorter].extend(res_cases[s_adv])
                    del res_cases[s_adv]

                print(f"Merged {s_adv} with {shorter}.")
                del adv_cases[s_adv]

            else:
                adv_cases[s_adv].extend(adv_cases[shorter])

                if(pet_cases.get(shorter, -1) != -1 and
                        pet_cases.get(s_adv, -1) != -1):
                    pet_cases[s_adv].extend(pet_cases[shorter])
                    del pet_cases[shorter]

                if(res_cases.get(shorter, -1) != -1 and
                        res_cases.get(s_adv, -1) != -1):
                    res_cases[s_adv].extend(res_cases[shorter])
                    del res_cases[shorter]

                print(f"Merged {shorter} with {s_adv}.")
                del adv_cases[shorter]

                shorter = s_adv

        # If the present index dictionary value is removed, do not update
        # the iterating variable
        if(adv_cases.get(adv, -1) == -1):
            i -= 1
        i += 1

        # Reconfiguring the list of keys after deletion
        advs = list(adv_cases.keys())

    adv_cases = {k: list(set(v)) for k, v in adv_cases.items()}
    adv_cases_len = {k: len(v) for k, v in sorted(adv_cases.items(),
                                                  key=lambda x: len(x[1]),
                                                  reverse=True)}

    print(f"{len(adv_cases.keys())} cleaned advocate names retained.")
    print(f"{sum(list(adv_cases_len.values()))} cases were found.")

In [65]:
def update_dict(d, names_list, fl):
    for name in names_list:
        if(d.get(name, -1) == -1):
            d[name] = [fl, ]
        else:
            d[name].append(fl)
    return d


def order(dict_obj: dict) -> dict:
    """Order dictionary in decreasing order of values

    Parameters
    ----------
    dict_obj : dict
        Dictionary to be sorted.

    Returns
    -------
    dict

    """

    ordered_dict = {
            k: v for k, v in sorted(dict_obj.items(),
                                    key=lambda x: x[1],
                                    reverse=True)}
    return ordered_dict


In [66]:
test("/home/workboots/Datasets/DHC/raw/html_doc", ".")

Processing 137654793
(?<=Petitioner|Apellant|Appellant|Plaintiffs?)(?P<pet>.*?)versus.*?(?=Respondents?)(?P<res>.*?)
(?<=For the Petitioner|Apellant|Appellant|Plaintiffs?)(?P<pet>.*?)(?=For the Respondents?)(?P<res>.*?)
{'\n                      Through: Mr. Pankaj Seth, Advocate.\n                 '} {''}
Found advocates: ['PankajSeth'].
Found petitioners: ['PankajSeth'].
Found respondents: [].
Processing 55467641
(?<=Petitioner|Apellant|Appellant|Plaintiffs?)(?P<pet>.*?)versus.*?(?=Respondents?)(?P<res>.*?)
(?<=For the Petitioner|Apellant|Appellant|Plaintiffs?)(?P<pet>.*?)(?=For the Respondents?)(?P<res>.*?)
{'\n                      Through:    Mr. Vijay Kumar Raina, Advocate.\n\n                      ', '\n                Through:          Mr. Vijay Kumar Raina, Advocate.\n\n                      '} {''}
Found advocates: ['VijayKumarRaina', 'PawanSharma'].
Found petitioners: ['VijayKumarRaina'].
Found respondents: ['PawanSharma'].
Processing 121401495
(?<=Petitioner|Apellant|Appell

# Verify new extracted data

In [124]:
import json
from difflib import get_close_matches

Getting both sets of data

In [114]:
# Old
with open("/home/workboots/Datasets/DHC/common/adv_cases.json", 'r') as f:
    old = json.load(f)
# New
with open("/home/workboots/Datasets/DHC/common/adv_cases_new.json", 'r') as f:
    new = json.load(f)

Verifying every key in old is present in new and cases of old are in new

In [131]:
for k, v in old.items():
    idx = -1
    cases = new.get(k, -1)
    if cases == -1:
        if any([exp in k for exp in ['For', 'Advocate', 'CORAM']]):
            idx = min([k.index(exp) for exp in ['For', 'Advocate', 'CORAM'] if exp in k])
            cases_alt = new.get(k[:idx], -1)
            if cases_alt == -1:
                similar_advs = get_close_matches(k, new.keys(), n=10, cutoff=0.90)[1:]
                print(k, len(v), similar_advs)
                for sim in similar_advs:
                    print(set(v).issubset(set(new[sim])))
        else:
            similar_advs = get_close_matches(k, new.keys(), n=10, cutoff=0.90)[1:]
            print(k, len(v), similar_advs)
            for sim in similar_advs:
                print(set(v).issubset(set(new[sim])))

RavinderSingh 26 ['RajinderSingh', 'DavinderSingh']
True
False
SaahilaLambaCORAM 1 []
JitendraSethi 32 []
PriyadarshiManish 9 []
MohitaCORAM 1 []
NSJohn 1 []
MunenderFor 1 []
PriyadarshiManishAdvocate 2 []
AvnishAhlawatCORAM 1 []
ColinGansalves 19 []
NaveenAdvocates 1 []
SachinCORAM 1 []
FrancisAmbooken 1 []
AmalMerinKurain 1 []
PrasannaAdvocatesVersus 1 []
MatruguptaMishra 2 []
AshaAdvocate 1 []
InderpalAdvocate 1 []
RajenderSingh 2 ['RajendraSingh']
False
AnujAggarwalAdvocate 1 []
AtiyaAdvocates 1 []
KavitaCORAM 1 []
SubhavnaForRespondents 1 []
SahilAdvocates 1 []
YunushAdvocatesVersus 1 []
RichaAdvocates 1 []
ManuAdvocatesCORAM 1 []
RameshDassKhanna 1 []
AdvocatesR 1 []
SharddhaBhargavaAdvocate 1 []
PrabhakarCORAM 1 []
SaahilaLambaAdvocates 1 []
RNKaranjawalaCS 1 []
RohitAdvocatesCORAM 1 []


Verifying consistency of the top selected advocates

In [132]:
with open("/home/workboots/Datasets/DHC/common/selected_advs.json", 'r') as f:
    advs = json.load(f)

In [133]:
advs = list(advs.values())

In [135]:
for adv in advs:
    old_cases = old.get(adv, -1)
    new_cases = new.get(adv, -1)
    if old_cases == -1 or new_cases == -1:
        print(adv)
    if not set(old_cases).issubset(set(new_cases)):
        print(adv)
    

RajatArora
PrathibaSingh


In [136]:
len(old['RajatArora'])

146

In [137]:
len(new['RajatArora'])

145

In [139]:
len(old['PrathibaSingh'])

112

In [148]:
len(new['PratibhaSingh'])

4

In [151]:
for adv in get_close_matches('PrathibaSingh', new.keys(), n=10, cutoff=0.80):
    print(adv, len(new[adv]))

PrathibaSingh 2
PrathibhaMSingh 2
PratibhaSingh 4
PratibhaMSingh 111
PratibhanuSingh 1
PratibhaSinha 7
PrathanaSingh 1
SahibaSingh 1
PratapSingh 16
VPratapSingh 1


In [153]:
set(new['PratibhaMSingh']).issubset(set(old['PrathibaSingh']))

True