# Notebook for analyzing the Delhi High Court dataset and the results of the models

In [128]:
import os
import json
import plotly.express as px
from collections import Counter, defaultdict
import statistics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
from itertools import chain

## For the entire dataset

Getting advocate cases

In [120]:
# Input files for the full dataset (absolute paths, machine-specific).
# adv_cases.json is consumed below as a mapping advocate -> cases;
# case_chapters.json as a mapping case -> iterable of chapters.
adv_cases_path = "/home/workboots/Datasets/DHC/common_new/adv_info/adv_cases.json"
case_chapters_path = "/home/workboots/Datasets/DHC/common_new/targets/case_chapters.json"

In [121]:
# Load the advocate -> cases mapping. The encoding is pinned to UTF-8 so that
# decoding the JSON file does not depend on the machine's locale settings.
with open(adv_cases_path, 'r', encoding='utf-8') as f:
    adv_cases = json.load(f)

In [122]:
# Load the case -> chapters mapping. The encoding is pinned to UTF-8 so that
# decoding the JSON file does not depend on the machine's locale settings.
with open(case_chapters_path, 'r', encoding='utf-8') as f:
    case_chapters = json.load(f)

Getting the charge modality of advocates, i.e. the number of charges that tie for most frequent across an advocate's cases

In [123]:
# Advocate -> flat list of the case_chapters entries of that advocate's cases.
# A defaultdict so that unseen advocates start from an empty list.
adv_charges = defaultdict(list)

In [130]:
# For every advocate, flatten the chapters of all of their cases into one list.
# Cases that are missing from case_chapters contribute nothing.
# The list is assigned (not extended) so that re-running this cell does not
# append a second copy of every entry to the existing lists.
for adv, cases in adv_cases.items():
    adv_charges[adv] = list(chain.from_iterable(case_chapters.get(case, []) for case in cases))

In [132]:
# Number of cases recorded for each advocate.
adv_case_counts = {
    advocate: len(advocate_cases)
    for advocate, advocate_cases in adv_cases.items()
}

In [136]:
# Modality of an advocate: how many distinct values tie for most frequent in
# their charge list (statistics.multimode returns [] for empty data, giving 0).
adv_modality = {
    advocate: len(statistics.multimode(charges))
    for advocate, charges in adv_charges.items()
}

In [138]:
# Advocate -> (modality, number of cases). The stray trailing character that
# made this line a SyntaxError has been removed.
adv_count_modality = {k: (adv_modality[k], adv_case_counts[k]) for k in adv_modality}

In [156]:
# (modality, number of cases) -> how many advocates have exactly that pair.
modality_cases_counts = Counter(adv_count_modality.values())

In [157]:
modality_cases_counts

Counter({(1, 404): 1,
         (2, 1): 1028,
         (1, 227): 1,
         (1, 3): 413,
         (0, 1): 5149,
         (1, 7): 156,
         (2, 17): 13,
         (2, 155): 1,
         (2, 269): 1,
         (1, 5): 241,
         (0, 3): 277,
         (1, 24): 23,
         (1, 22): 21,
         (0, 2): 861,
         (7, 5): 33,
         (2, 3): 232,
         (1, 201): 1,
         (1, 122): 1,
         (4, 26): 2,
         (1, 2): 654,
         (1, 149): 1,
         (2, 59): 5,
         (1, 19): 28,
         (1, 10): 108,
         (1, 37): 11,
         (2, 6): 86,
         (1, 54): 6,
         (2, 76): 1,
         (1, 125): 1,
         (12, 24): 2,
         (3, 10): 27,
         (1, 35): 10,
         (4, 45): 2,
         (2, 9): 47,
         (2, 11): 28,
         (1, 29): 15,
         (1, 44): 11,
         (1, 91): 3,
         (2, 88): 2,
         (5, 8): 18,
         (2, 87): 1,
         (1, 48): 3,
         (1, 40): 12,
         (1, 255): 2,
         (11, 5): 14,
         (5, 3): 102

In [142]:
# Sorted distinct modality values (first element of each (modality, cases) pair).
unique_modalities = np.unique(
    [modality for modality, _n_cases in adv_count_modality.values()]
)

In [182]:
len(unique_modalities)

36

In [145]:
# Sorted distinct case counts (second element of each (modality, cases) pair).
unique_case_counts = np.unique(
    [n_cases for _modality, n_cases in adv_count_modality.values()]
)

In [146]:
len(unique_case_counts)

192

In [165]:
max(unique_case_counts)

501

In [189]:
# Dense contingency table indexed directly by [modality, number of cases];
# the +1 makes the largest observed value a valid index.
counts = np.zeros(
    (
        max(unique_modalities) + 1,
        max(unique_case_counts) + 1,
    )
)

In [190]:
counts.shape

(42, 502)

In [191]:
# Write the advocate count of every observed (modality, cases) pair into the table.
for (modality, n_cases), n_advocates in modality_cases_counts.items():
    counts[modality, n_cases] = n_advocates

In [192]:
# Indices of columns (case counts) that no advocate has, i.e. all-zero columns.
idx_col = np.argwhere((counts == 0).all(axis=0))

In [193]:
# Drop the all-zero columns located by idx_col.
# Not idempotent: re-running this cell alone would apply the stale indices to
# the already-shrunk array.
counts = np.delete(counts, idx_col, axis=1)

In [194]:
# Indices of rows (modalities) that no advocate has, i.e. all-zero rows.
idx_row = np.argwhere((counts == 0).all(axis=1))

In [195]:
# Drop the all-zero rows located by idx_row.
# Not idempotent: re-running this cell alone would apply the stale indices to
# the already-shrunk array.
counts = np.delete(counts, idx_row, axis=0)

In [196]:
counts.shape

(36, 192)

In [209]:
# Cramér's V between modality and number of cases, computed from the
# contingency table `counts` (zero rows/columns already removed).
chi2 = stats.chi2_contingency(counts, correction=False)[0]
sample_size = np.sum(counts)
# Smaller table dimension minus one. This previously read `cont_arr.shape`,
# a name that is never defined in this notebook (NameError on a fresh kernel).
min_dim = min(counts.shape) - 1
# Adjust the denominator as d - d^2 / (n - 1).
# NOTE(review): chi2 / sample_size itself is used without a matching
# correction — confirm this partial adjustment is intended.
min_dim = min_dim - min_dim**2/(sample_size - 1)
cramer_v = np.sqrt((chi2 / sample_size) / min_dim)

cramer_v

0.575540768431439

In [202]:
# One row per advocate: their modality and their number of cases.
df = pd.DataFrame(
    list(adv_count_modality.values()),
    columns=["Modality", "Number of Cases"],
)

In [203]:
len(df)

21531

In [218]:
# Spearman rank correlation between modality and number of cases, read from
# the off-diagonal entry of the 2x2 correlation matrix.
spearman_matrix = df.corr(method="spearman")
spearman = spearman_matrix.loc["Number of Cases", "Modality"]

In [219]:
spearman

0.27297649794767903

## For the used dataset (the `var_1` variation)

In [220]:
# Input files for the var_1 variation (absolute paths, machine-specific).
# These rebind the path variables used for the full dataset above.
adv_cases_path = "/home/workboots/Datasets/DHC/variations/new/var_1/adv_info/adv_cases.json"
case_chapters_path = "/home/workboots/Datasets/DHC/variations/new/var_1/targets/case_chapters.json"

In [221]:
# Load the advocate -> cases mapping. The encoding is pinned to UTF-8 so that
# decoding the JSON file does not depend on the machine's locale settings.
with open(adv_cases_path, 'r', encoding='utf-8') as f:
    adv_cases = json.load(f)

In [222]:
# Load the case -> chapters mapping. The encoding is pinned to UTF-8 so that
# decoding the JSON file does not depend on the machine's locale settings.
with open(case_chapters_path, 'r', encoding='utf-8') as f:
    case_chapters = json.load(f)

In [223]:
# Fresh advocate -> flat list of case_chapters entries, replacing the mapping
# built for the full dataset.
adv_charges = defaultdict(list)

In [224]:
# For every advocate, flatten the chapters of all of their cases into one list.
# Cases that are missing from case_chapters contribute nothing.
# The list is assigned (not extended) so that re-running this cell does not
# append a second copy of every entry to the existing lists.
for adv, cases in adv_cases.items():
    adv_charges[adv] = list(chain.from_iterable(case_chapters.get(case, []) for case in cases))

In [225]:
# Number of cases recorded for each advocate.
adv_case_counts = {
    advocate: len(advocate_cases)
    for advocate, advocate_cases in adv_cases.items()
}

In [226]:
# Modality of an advocate: how many distinct values tie for most frequent in
# their charge list (statistics.multimode returns [] for empty data, giving 0).
adv_modality = {
    advocate: len(statistics.multimode(charges))
    for advocate, charges in adv_charges.items()
}

In [228]:
# Advocate -> (modality, number of cases).
adv_count_modality = {
    advocate: (modality, adv_case_counts[advocate])
    for advocate, modality in adv_modality.items()
}

In [229]:
# (modality, number of cases) -> how many advocates have exactly that pair.
modality_cases_counts = Counter(adv_count_modality.values())

In [230]:
# Sorted distinct modality values (first element of each (modality, cases) pair).
unique_modalities = np.unique(
    [modality for modality, _n_cases in adv_count_modality.values()]
)

In [231]:
# Sorted distinct case counts (second element of each (modality, cases) pair).
unique_case_counts = np.unique(
    [n_cases for _modality, n_cases in adv_count_modality.values()]
)

In [232]:
# Dense contingency table indexed directly by [modality, number of cases];
# the +1 makes the largest observed value a valid index.
counts = np.zeros(
    (
        max(unique_modalities) + 1,
        max(unique_case_counts) + 1,
    )
)

In [233]:
# Write the advocate count of every observed (modality, cases) pair into the table.
for (modality, n_cases), n_advocates in modality_cases_counts.items():
    counts[modality, n_cases] = n_advocates

In [234]:
# Indices of columns (case counts) that no advocate has, i.e. all-zero columns.
idx_col = np.argwhere((counts == 0).all(axis=0))

In [235]:
# Drop the all-zero columns located by idx_col.
# Not idempotent: re-running this cell alone would apply the stale indices to
# the already-shrunk array.
counts = np.delete(counts, idx_col, axis=1)

In [236]:
# Indices of rows (modalities) that no advocate has, i.e. all-zero rows.
idx_row = np.argwhere((counts == 0).all(axis=1))

In [237]:
# Drop the all-zero rows located by idx_row.
# Not idempotent: re-running this cell alone would apply the stale indices to
# the already-shrunk array.
counts = np.delete(counts, idx_row, axis=0)

In [238]:
# Cramér's V between modality and number of cases, computed from the
# contingency table `counts` (zero rows/columns already removed).
chi2 = stats.chi2_contingency(counts, correction=False)[0]
sample_size = np.sum(counts)
# Smaller table dimension minus one. This previously read `cont_arr.shape`,
# a name that is never defined in this notebook (NameError on a fresh kernel).
min_dim = min(counts.shape) - 1
# Adjust the denominator as d - d^2 / (n - 1).
# NOTE(review): chi2 / sample_size itself is used without a matching
# correction — confirm this partial adjustment is intended.
min_dim = min_dim - min_dim**2/(sample_size - 1)
cramer_v = np.sqrt((chi2 / sample_size) / min_dim)

cramer_v

0.6829077318351967

In [239]:
# One row per advocate: their modality and their number of cases.
df = pd.DataFrame(
    list(adv_count_modality.values()),
    columns=["Modality", "Number of Cases"],
)

In [240]:
# Spearman rank correlation between modality and number of cases, read from
# the off-diagonal entry of the 2x2 correlation matrix.
spearman_matrix = df.corr(method="spearman")
spearman = spearman_matrix.loc["Number of Cases", "Modality"]

In [241]:
spearman

-0.0454414055987521