# Creation of a targets creation method that balances imbalance ratio by using different granularities of targets

In [1]:
import os
import json
from collections import defaultdict, Counter
from statistics import mean, stdev
from typing import Tuple

In [2]:
aux_path = "/home/workboots/Datasets/DHC/variations/new/var_4/area_act_chapter_section_info/with_areas/overall/case_area_act_chapter_section_info.json"
aux_path_sc = "/home/workboots/Datasets/SC_50k/variations/var_2/area_act_chapter_section_info/with_selected_areas/case_area_act_chapter_section_info.json"
act_areas_path = "/home/workboots/Datasets/DHC/variations/var_1/surveys/areas/Mitali/act_to_area_mapping.json"
section_chapters_path = "/home/workboots/Datasets/IndiaCode/new/CentralActs/section_chapters_alt.json"

In [3]:
with open(aux_path, 'r') as f:
    case_info = json.load(f)
with open(aux_path_sc, 'r') as f:
    case_info_sc = json.load(f)

In [4]:
case_info.update(case_info_sc)

In [24]:
with open("/home/workboots/Datasets/DHC/variations/new/var_4/area_act_chapter_section_info/with_areas/overall/test/case_area_act_chapter_section_info.json"

SyntaxError: unterminated string literal (detected at line 1) (800890545.py, line 1)

In [5]:
with open(act_areas_path, 'r') as f:
    act_areas = json.load(f)

In [6]:
with open(section_chapters_path, 'r') as f:
    section_chapters = json.load(f)

In [7]:
area_acts = defaultdict(lambda: list())
chapter_sections = defaultdict(lambda: list())

In [8]:
for act, areas in act_areas.items():
    for area in areas:
        area_acts[area].append(act)

In [9]:
for section, chapter in section_chapters.items():
    chapter_sections[chapter].append(section)

Starting by defining the targets at the roughest granularity, i.e. areas

In [10]:
target_cases = defaultdict(lambda: list())
present_areas = []
present_acts = []
present_chapters = []
present_sections = []
present_area_acts = defaultdict(lambda: set())
present_act_chapters = defaultdict(lambda: set())
present_chapter_sections = defaultdict(lambda: set())
area_cases = defaultdict(lambda: list())
act_cases = defaultdict(lambda: list())
chapter_cases = defaultdict(lambda: list())
section_cases = defaultdict(lambda: list())

In [11]:
for case, info in case_info.items():
    for area in info["areas"]:
        target_cases[area].append(case)
        present_areas.append(area)
        area_cases[area].append(case)
    for act in info["acts"]:
        present_acts.append(act)
        act_cases[act].append(case)
        for area in act_areas.get(act, []):
            present_area_acts[area].update([act])
    for chapter in info["chapters"]:
        present_chapters.append(chapter)
        chapter_cases[chapter].append(case)
        act = chapter.split("_")[0]
        present_act_chapters[act].update([chapter])
    for section in info["sections"]:
        present_sections.append(section)
        chapter = section_chapters.get(section, "")
        # HACKY
        # TODO: Fix at the area act chapter section annotation level
        if "Constitution" in section:
            present_chapters.append(chapter)
            present_act_chapters["Constitution"].update([chapter])
            chapter_cases[chapter].append(case)

        present_chapter_sections[chapter].update([section])
        section_cases[section].append(case)

In [12]:
present_areas = dict(Counter(present_areas))
present_acts = dict(Counter(present_acts))
present_chapters = dict(Counter(present_chapters))
present_sections = dict(Counter(present_sections))

In [13]:
def imbalance_ratios(target_cases: dict) -> Tuple[float, float, float, float]:
    """Compute imbalance ratios for given targets"""
    max_class = max(map(lambda x: len(x), target_cases.values()))
    imbl = [max_class * 1./len(v) for v in target_cases.values()]
    max_imbl = max(imbl)
    min_imbl = min(imbl)
    stdev_imbl = stdev(imbl)
    mean_imbl = mean(imbl)
    return max_imbl, min_imbl, stdev_imbl, mean_imbl

In [14]:
def split_area(
        target_cases: dict,
        largest_class: str,
        label_cases: dict,
        present_lb1_lb2: dict,
        min_cases: int,
        big_atomic_classes: list) -> Tuple[dict, float]:
    new_areas = {
        k: label_cases[k] for k in present_lb1_lb2[largest_class]
        if len(label_cases[k]) >= min_cases}
    if len(new_areas) == 0:
        big_atomic_classes.append(largest_class)

        n_max_imbl, n_min_imbl, n_stdev_imbl, n_mean_imbl = imbalance_ratios(
                                                                target_cases)
        return big_atomic_classes, target_cases, n_mean_imbl

    composite_area = list(set(target_cases[largest_class]) -
                          set([v for val in new_areas.values() for v in val]))
    if len(composite_area) != 0 and len(composite_area) < min_cases:
        least_new = min([len(v) for v in new_areas.values()])
        least_new_class = [
                k for k, v in new_areas.items()
                if len(v) == least_new]
        least_new_class = least_new_class[0]
        composite_area.extend(new_areas[least_new_class])
        del new_areas[least_new_class]

    del target_cases[largest_class]
    target_cases.update(new_areas)
    if len(composite_area) != 0:
        target_cases.update({f"{largest_class}_COMPOSITE": composite_area})
        big_atomic_classes.append(f"{largest_class}_COMPOSITE")
    n_max_imbl, n_min_imbl, n_stdev_imbl, n_mean_imbl = imbalance_ratios(
                                                                target_cases)
    return big_atomic_classes, target_cases, n_mean_imbl

In [15]:
max_imbl, min_imbl, stdev_imbl, mean_imbl = imbalance_ratios(target_cases)

In [16]:
stdev_imbl

138.4361350608662

In [17]:
mean_imbl

110.67541198045105

In [18]:
max_imbl

621.8846153846154

In [19]:
min_imbl

1.0

Finding the largest class and breaking it down

In [20]:
min_cases = 150
imbl = 25

In [21]:
big_atomic_classes = []
non_atomic_usable_classes = [
        k for k, v in target_cases.items()
        if len(v) >= min_cases and
        k not in big_atomic_classes]

# Define initial imbalance ratio
max_imbl, min_imbl, stdev_imbl, mean_imbl = imbalance_ratios(target_cases)

while len(non_atomic_usable_classes) > 0:
    # Get largest class
    largest = max([len(v) for k, v in target_cases.items()
                   if k not in big_atomic_classes])
    largest_class = [
            k for k, v in target_cases.items()
            if len(v) == largest and k not in big_atomic_classes]
    largest_class = largest_class[0]
    # Check what hierarchy largest class belongs to
    # and split accordingly
    if present_areas.get(largest_class, -1) != -1:
        big_atomic_classes, target_cases, mean_imbl = split_area(
                target_cases, largest_class,
                act_cases, present_area_acts,
                min_cases, big_atomic_classes)
    elif present_acts.get(largest_class, -1) != -1:
        big_atomic_classes, target_cases, mean_imbl = split_area(
                target_cases, largest_class,
                chapter_cases, present_act_chapters,
                min_cases, big_atomic_classes)
    elif present_chapters.get(largest_class, -1) != -1:
        big_atomic_classes, target_cases, mean_imbl = split_area(
                target_cases, largest_class,
                section_cases, present_chapter_sections,
                min_cases, big_atomic_classes)
    else:
        big_atomic_classes.append(largest_class)

    non_atomic_usable_classes = [
            k for k, v in target_cases.items()
            if len(v) >= min_cases and
            k not in big_atomic_classes]


In [22]:
list(target_cases.keys())

['HUMAN RIGHTS LAW',
 'EDUCATION LAW',
 'IMMIGRATION LAW',
 'TRADE LAW',
 'TELECOMMUNICATION LAW',
 'INTERNATIONAL LAW',
 'ESTATE PLANNING (WILLS AND TRUSTS) LAW',
 'CYBER LAW',
 'MEDIA LAW',
 'INSURANCE LAW',
 'ENVIRONMENT LAW',
 'AGRICULTURE LAW',
 'COMPETITION LAW',
 'CONSUMER LAW',
 'MEDICAL AND HEALTHCARE LAW',
 'AVIATION LAW',
 'RTI LAW',
 'Constitution',
 'CRIMINAL LAW_COMPOSITE',
 'Indian Penal Code, 1860_OF OFFENCES RELATING TO THE ARMY',
 'Indian Penal Code, 1860_OF PUNISHMENTS',
 'Indian Penal Code, 1860_GENERAL EXCEPTIONS',
 'Indian Penal Code, 1860_COMPOSITE',
 'Indian Penal Code, 1860_341',
 'Indian Penal Code, 1860_304',
 'Indian Penal Code, 1860_353',
 'Indian Penal Code, 1860_302',
 'Indian Penal Code, 1860_325',
 'Indian Penal Code, 1860_323',
 'Indian Penal Code, 1860_354',
 'Indian Penal Code, 1860_377',
 'Indian Penal Code, 1860_326',
 'Indian Penal Code, 1860_342',
 'Indian Penal Code, 1860_364',
 'Indian Penal Code, 1860_299',
 'Indian Penal Code, 1860_376',
 'In

In [24]:
mean_imbl

30.473590668066148

In [543]:
target_cases

defaultdict(<function __main__.<lambda>()>,
            {'JUDICIARY AND COURTS LAW': ['169303756',
              '1197726',
              '148971770',
              '107647019',
              '174309355',
              '109410701',
              '4259489',
              '8870064',
              '132664379',
              '139419968',
              '16872759',
              '92774315',
              '99552111',
              '49924187',
              '57696159',
              '55878138',
              '130829582',
              '154398064',
              '80656985',
              '104002724',
              '139390691',
              '38717945',
              '132001300',
              '26560868',
              '9584575',
              '6007633',
              '73028989',
              '137143719',
              '3687588',
              '71989492',
              '5842793',
              '136687668',
              '6399643',
              '173661670',
              '107806938',
          

In [22]:
max([len(v) for v in target_cases.values()])

5770

In [26]:
min([len(v) for v in target_cases.values()])

26

In [23]:
len(target_cases)

219

In [28]:
for k, v in sorted(target_cases.items(), key=lambda x: len(x[1]), reverse=True):
    print(k, len(v))

Constitution_226 5770
Indian Penal Code, 1860_302 4742
Indian Penal Code, 1860_34 4048
Constitution_14 3698
Code of Civil Procedure, 1882 3454
Constitution_136 2658
Constitution_32 2250
Code of Criminal Procedure, 1973_313 2184
Indian Penal Code, 1860_120 1976
Code of Criminal Procedure, 1973_MISCELLANEOUS_COMPOSITE 1976
Indian Penal Code, 1860_304 1761
Constitution_227 1743
Indian Penal Code, 1860_307 1628
Constitution_21 1481
Indian Penal Code, 1860_149 1424
Indian Penal Code, 1860_OF OFFENCES RELATINGTO MARRIAGE_COMPOSITE 1279
Land Acquisition Act, 1894 1252
Indian Penal Code, 1860_420 1184
Code of Criminal Procedure, 1973_161 1179
Indian Penal Code, 1860_323 1153
Constitution_16 1106
Indian Penal Code, 1860_148 1098
Constitution_191 1043
Constitution_1 996
Constitution_The Union_COMPOSITE 933
Constitution_19 891
Indian Penal Code, 1860_376 886
Indian Penal Code, 1860_406 876
Indian Penal Code, 1860_506 875
Indian Penal Code, 1860_201 865
Indian Penal Code, 1860_324 847
Indian Penal

In [24]:
dhc_cases_path = "/home/workboots/Datasets/DHC/variations/new/var_4/area_act_chapter_section_info/with_areas/retained_cases.txt"
with open(dhc_cases_path, 'r') as f:
    dhc_cases = f.readlines()

In [25]:
dhc_cases = list(filter(None, map(lambda x: x.strip("\n"), dhc_cases)))

In [26]:
len(dhc_cases)

11640

In [27]:
dhc_target_cases = {k: list(set(v).intersection(set(dhc_cases))) for k,v in target_cases.items()}

In [28]:
dhc_case_targets = defaultdict(lambda: list())
for label, cases in dhc_target_cases.items():
    for case in cases:
        dhc_case_targets[case].append(label)

dhc_label_case_num = {
        k: len(v)
        for k, v in sorted(dhc_target_cases.items(),
                           key=lambda x: len(x[1]),
                           reverse=True)}

In [29]:
with open("/home/workboots/Datasets/DHC/variations/new/var_4/area_act_chapter_section_info/with_areas/overall/label_case_info.json", 'w') as f:
    json.dump(dhc_target_cases, f, indent=4)
with open("/home/workboots/Datasets/DHC/variations/new/var_4/area_act_chapter_section_info/with_areas/overall/label_case_num.json", 'w') as f:
    json.dump(dhc_label_case_num, f, indent=4)
with open("/home/workboots/Datasets/DHC/variations/new/var_4/targets/with_areas/balanced_label_targets.json", 'w') as f:
    json.dump(dhc_case_targets, f, indent=4)

In [30]:
sc_cases_path = "/home/workboots/Datasets/SC_50k/variations/var_2/area_act_chapter_section_info/with_selected_areas/selected_cases.txt"
with open(sc_cases_path, 'r') as f:
    sc_cases = f.readlines()

In [31]:
sc_cases = list(filter(None, map(lambda x: x.strip("\n"), sc_cases)))

In [32]:
len(sc_cases)

25237

In [33]:
sc_target_cases = {k: list(set(v).intersection(set(sc_cases))) for k,v in target_cases.items()}

In [34]:
sc_case_targets = defaultdict(lambda: list())
for label, cases in sc_target_cases.items():
    for case in cases:
        sc_case_targets[case].append(label)

sc_label_case_num = {
        k: len(v)
        for k, v in sorted(sc_target_cases.items(),
                           key=lambda x: len(x[1]),
                           reverse=True)}

In [35]:
with open("/home/workboots/Datasets/SC_50k/variations/var_2/area_act_chapter_section_info/with_selected_areas/label_case_info.json", 'w') as f:
    json.dump(sc_target_cases, f, indent=4)
with open("/home/workboots/Datasets/SC_50k/variations/var_2/area_act_chapter_section_info/with_selected_areas/label_case_num.json", 'w') as f:
    json.dump(sc_label_case_num, f, indent=4)
with open("/home/workboots/Datasets/SC_50k/variations/var_2/targets/with_selected_areas/balanced_label_targets.json", 'w') as f:
    json.dump(sc_case_targets, f, indent=4)