# Check Statute spans

In [75]:
import os
import re
from typing import Union
import json
from copy import deepcopy

In [41]:
def mask_statutes(data: str, sections: dict) -> Union[str, dict]:
    """Mask cited statute sections in a document's text.

    The text found at every span listed in ``sections`` is replaced,
    wherever it occurs in ``data``, with ``[SECTION]``. After that, mentions
    of the form ``Section <n> of the Code`` (for each cited section number
    ``<n>``) and any remaining ``Section <digits>`` are replaced with
    ``[SECTION_UNK]``.

    Parameters
    ----------
    data: str
        Text
    sections: dict
        Dictionary of all sections cited in document along with their spans.
        The part of each key after the last ``_`` is used as the section
        number; each value is an iterable of ``(start, end)`` offsets into
        ``data``.

    Returns
    -------
    str
        The masked text. ``data`` is returned unchanged when ``sections``
        is empty.
    """
    if not sections:
        return data

    # Literal text of every cited span. A set removes duplicates; empty
    # slices are dropped because an empty alternative matches at every
    # position and would scatter "[SECTION]" through the whole text.
    cited_texts = {
        data[start:end]
        for spans in sections.values()
        for (start, end) in spans
    }
    cited_texts.discard("")

    masked = data
    if cited_texts:
        # The span texts are literals, not patterns, so they must be escaped
        # (e.g. "Section 2(1)"). Longest alternatives go first so a citation
        # that extends another one ("Section 14A" vs "Section 14") is masked
        # as a whole instead of leaving its tail behind.
        ordered_texts = sorted(cited_texts, key=lambda text: (-len(text), text))
        cited_regex = re.compile(
            "|".join(re.escape(text) for text in ordered_texts))
        masked = cited_regex.sub("[SECTION]", masked)

    # "Section <n> of the Code" for every cited section number <n>.
    alt_case_section_regex = re.compile(
        "|".join(r"Section\s+{}\s+of\s+the\s+Code".format(
            re.escape(sec.split("_")[-1]))
            for sec in sections))
    masked = alt_case_section_regex.sub("[SECTION_UNK]", masked)

    # Any other numbered section reference that is still unmasked.
    masked = re.sub(r"Section\s+[0-9]+", "[SECTION_UNK]", masked)

    return masked

In [42]:
# Root of the preprocessed DHC dataset. Set the DHC_PREPROCESS_DIR environment
# variable to run the notebook on another machine; the default is the
# original author's location.
preprocess_root = os.environ.get(
    "DHC_PREPROCESS_DIR",
    "/home/workboots/Datasets/DHC/common_new/preprocess")
# The trailing "" keeps the trailing path separator the paths always had.
base_clean_path = os.path.join(preprocess_root, "clean", "")
base_statute_path = os.path.join(preprocess_root, "statutes", "")

In [51]:
# os.listdir() returns names in arbitrary order, so sort before slicing to
# make the ten-document sample (and the `sections` / `cleaned` values left
# behind by the last iteration) the same on every run.
for doc in sorted(os.listdir(base_clean_path))[:10]:
    with open(os.path.join(base_clean_path, doc), 'r') as f:
        text = f.read()
    flname = os.path.splitext(doc)[0]
    # The statute spans are read from a JSON file with the same stem.
    with open(os.path.join(base_statute_path, f"{flname}.json"), 'r') as f:
        sections = json.load(f)
    cleaned = mask_statutes(text, sections)
    if sections:
        print(sections)
        print(cleaned)

{'Constitution_1': [[4150, 4184]], 'Constitution_14': [[3881, 3911]], 'Constitution_19': [[4150, 4184]], 'Constitution_226': [[30442, 30473]], 'Copyright Act, 1957_33': [[2058, 2095]]}
25.05.2011 Introduction 1. The prayer in this writ petition is for a direction to the Union of India, Ministry of Human Resource Development HRD to frame appropriate and objective standards towards the determination and levying of royalties of various copyrighted works administered by Phonographic Performance Limited PPL, Respondent no. 3, and the Indian Performing Right Society Limited IPRSL, Respondent no. 4, or any other society that may be so created by it, and the mode of enforcing and administering such royalties when so fixed. A further direction is for investigation of the books of account of PPL and IPRSL to ascertain whether they have paid their dues to the owners authors of copyrights. Case of the Petitioner 2. The Petitioner, Event and Entertainment Management Association EEMA, is a society r

In [53]:
# Inspect the statute -> spans mapping left in `sections` by the loading loop,
# i.e. the one read for the last document it processed.
sections

{'Code of Criminal Procedure, 1973_313': [[27339, 27383], [24935, 24979]],
 'Code of Criminal Procedure, 1973_91': [[8869, 8912], [50253, 50296]],
 'Indian Penal Code, 1860_302': [[33551, 33586]]}

In [60]:
# Record the (start, end) offsets of every "[SECTION]" placeholder in the
# masked text, echoing each match object as it is found.
section_placeholder = re.compile(r"\[SECTION\]")
spans = []
for match in section_placeholder.finditer(cleaned):
    print(match)
    spans.append(match.span())

<re.Match object; span=(8869, 8878), match='[SECTION]'>
<re.Match object; span=(24901, 24910), match='[SECTION]'>
<re.Match object; span=(27270, 27279), match='[SECTION]'>
<re.Match object; span=(33447, 33456), match='[SECTION]'>
<re.Match object; span=(50123, 50132), match='[SECTION]'>


In [79]:
# Swap every span stored in `sections` for the placeholder span that
# get_closest() picks from `spans`. `sections` is updated in place.
# NOTE: get_closest is defined in a later cell of this notebook; that cell
# has to be run before this one.
for section in list(sections):
    sections[section] = [
        get_closest(old_span, spans) for old_span in sections[section]
    ]

In [72]:
def get_closest(tuple_a, tuple_list):
    """Return the span in ``tuple_list`` whose start is closest to ``tuple_a``'s.

    Spans that start at or before ``tuple_a[0]`` are preferred; among those,
    the one with the greatest start wins (the later entry on a tie). If no
    span starts at or before ``tuple_a[0]``, the span with the smallest
    start is returned instead.

    Parameters
    ----------
    tuple_a: sequence
        Reference span; only its first element (the start) is used.
    tuple_list: sequence of sequences
        Candidate spans; only the first element of each is compared.

    Returns
    -------
    The chosen element of ``tuple_list``.

    Raises
    ------
    IndexError
        If ``tuple_list`` is empty.
    """
    if not tuple_list:
        raise IndexError("get_closest() needs at least one candidate span")

    target = tuple_a[0]
    best_before = None  # closest candidate starting at or before target
    best_after = None   # closest candidate starting after target (fallback)
    for candidate in tuple_list:
        start = candidate[0]
        if start <= target:
            # ">=" keeps the later entry on ties.
            if best_before is None or start >= best_before[0]:
                best_before = candidate
        elif best_after is None or start < best_after[0]:
            best_after = candidate

    return best_before if best_before is not None else best_after

In [74]:
# Print the current statute -> spans mapping.
print(sections)

{'Code of Criminal Procedure, 1973_313': [[27339, 27383], [24935, 24979]], 'Code of Criminal Procedure, 1973_91': [[8869, 8912], [50253, 50296]], 'Indian Penal Code, 1860_302': [[33551, 33586]]}


In [80]:
# Flatten the per-statute span lists into one list, keeping the dict's order.
test_spans = []
for section_spans in sections.values():
    test_spans.extend(section_spans)

In [81]:
# Show the flattened list of spans.
test_spans

[(27270, 27279), (24901, 24910), (8869, 8878), (50123, 50132), (33447, 33456)]

In [82]:
# Print the slice of the masked text that each collected span points at.
for span in test_spans:
    begin, stop = span[0], span[1]
    print(cleaned[begin:stop])

[SECTION]
[SECTION]
[SECTION]
[SECTION]
[SECTION]
