# NDCG Calculation

In [1]:
from collections import defaultdict
import json
import math

To calculate the nDCG values, the following need to be implemented:
- Finding number of relevant targets (copy from map scripts)
- Implement DCG
- Implement IDCG
- Combine to get nDCG

In [2]:
def create_adv_scores(charges: list, charge_adv_win_ratios: dict,
                      weights: dict = None, strategy: str = 'equal',):
    """Generate scores of advocates for a given combination of charges
    depending on the weightage strategy.

    An advocate's score is the weighted sum of their win ratios over the
    given charges. Advocates that have no win ratio for a charge simply
    contribute nothing for that charge.

    Parameters
    ----------
    charges: list
        Charges to combine. Any sized iterable works; iteration order
        determines the order of tied advocates in the result.
    charge_adv_win_ratios: dict
        Dictionary of win ratios of each advocate for each charge
        (charge -> {advocate -> win ratio}).
    weights: dict, default None
        Weight of each charge, used for combining win-ratios into the
        resultant score. Required when 'strategy' is 'case_fraction';
        ignored when 'strategy' is 'equal'.
    strategy: str
        Weightage strategy to use.
        'equal': Equal weightage to each charge.
        'case_fraction': Weigh charges based on fraction of cases.

    Returns
    -------
    scores: dict
        Scores of advocates, ordered by descending score. Advocates with
        equal scores keep the order in which they were first encountered.

    Raises
    ------
    AssertionError
        If 'strategy' is unknown, or 'case_fraction' is requested
        without 'weights'.
    KeyError
        If a charge is missing from 'charge_adv_win_ratios' (or from
        'weights' when it is needed).
    """
    assert strategy in ['equal', 'case_fraction'], f"{strategy} is not a valid weightage strategy."
    if strategy == 'case_fraction':
        assert weights is not None, "Weights need to be specified when using 'case_fraction' weightage."
    else:
        # Equal weightage: every charge gets the same share, overriding
        # whatever weights the caller may have passed.
        weights = {charge: 1./len(charges) for charge in charges}

    # Both strategies share the same accumulation; only the weights differ.
    scores = defaultdict(float)
    for charge in charges:
        for adv, win_ratio in charge_adv_win_ratios[charge].items():
            scores[adv] += weights[charge] * win_ratio

    # sorted() is stable (also with reverse=True), so ties keep insertion order.
    return dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))

## Testing the above method

### Equal weightage

**Loading win-ratios**

In [9]:
# Load the win ratios consumed by create_adv_scores(), which indexes this
# structure first by charge and then by advocate.
with open("/home/workboots/Datasets/DHC/variations/var_3/targets/charge_adv_win_ratios_new.json", 'r') as f:
    charge_adv_win_ratios = json.load(f)

**Creating a dummy set of charges**

In [10]:
charges = ["Of Hurt", "Introduction"]

**Testing**

In [11]:
create_adv_scores(charges, charge_adv_win_ratios)

{'ZubedaBegum': 1.0,
 'SanjeevSabharwal': 0.8181818181818181,
 'RaviGupta': 0.75,
 'GautamNarayan': 0.75,
 'RajatArora': 0.75,
 'SanjayJain': 0.7333333333333334,
 'HarshPrabhakar': 0.7103174603174603,
 'AnuragJain': 0.7102272727272727,
 'ArvindNigam': 0.7,
 'SandeepSethi': 0.675,
 'VijayAggarwal': 0.6666666666666666,
 'AnkitJain': 0.6666666666666666,
 'AmitGupta': 0.6611111111111111,
 'RichaKapoor': 0.65,
 'RajdipaBehura': 0.65,
 'MeenakshiChauhan': 0.6469298245614035,
 'ManjeetArya': 0.625,
 'ManojOhri': 0.5850202429149798,
 'MPSingh': 0.5833333333333333,
 'PuneetMittal': 0.5833333333333333,
 'AjayVerma': 0.5668449197860963,
 'RahulMehra': 0.5666666666666667,
 'SahilaLamba': 0.5656565656565657,
 'NanditaRao': 0.5641025641025641,
 'SMuralidhar': 0.5563725490196079,
 'SidharthLuthra': 0.5307692307692308,
 'MLYadav': 0.5297619047619048,
 'AashaTiwari': 0.5287878787878788,
 'VarunGoswami': 0.5182186234817814,
 'RakhiDubey': 0.5097402597402597,
 'TSinghdev': 0.5,
 'HemantSingh': 0.5,
 'Har

**Testing with real cases**

In [12]:
# Offences recorded for each case (case id -> list of charges).
with open("/home/workboots/Datasets/DHC/variations/var_3/targets/ipc_case_offences.json", 'r') as f:
    case_offences = json.load(f)

# Target charge labels, one per line; newlines are stripped and blank
# lines are dropped.
with open("/home/workboots/Datasets/DHC/variations/var_3/targets/unique_labels.txt", 'r') as f:
    targets = [label
               for label in (line.strip("\n") for line in f.readlines())
               if label]

In [13]:
charges = case_offences[list(case_offences.keys())[0]]

In [14]:
# Keep only the charges that are target labels. An ordered, de-duplicated
# list is used instead of a set: a set of strings iterates in an order that
# changes between kernel sessions (hash randomisation), which would make the
# order of tied advocates in create_adv_scores() irreproducible.
target_set = set(targets)
charges = list(dict.fromkeys(charge for charge in charges if charge in target_set))

In [15]:
create_adv_scores(charges, charge_adv_win_ratios)

{'SanjayJain': 1.0,
 'GauravGupta': 1.0,
 'SandeepSethi': 1.0,
 'SanjeevNarul': 1.0,
 'SumeetVerma': 0.85,
 'AmitKumar': 0.8333333333333333,
 'SanjeevSabharwal': 0.8333333333333333,
 'AkhilSibal': 0.8333333333333333,
 'SMuralidhar': 0.7857142857142857,
 'DayanKrishnan': 0.7638888888888888,
 'HemantSingh': 0.75,
 'RaviGupta': 0.75,
 'ChetanSharma': 0.75,
 'ManojOhri': 0.7083333333333333,
 'RahulMehra': 0.7083333333333333,
 'MukeshGupta': 0.7083333333333333,
 'RajdipaBehura': 0.7023809523809523,
 'SunilSharma': 0.7,
 'MeenakshiChauhan': 0.6958041958041958,
 'RameshGupta': 0.6190476190476191,
 'AnuragJain': 0.6190476190476191,
 'SahilaLamba': 0.6190476190476191,
 'ManjeetArya': 0.5944055944055944,
 'PramodKumarDubey': 0.5857142857142856,
 'RichaKapoor': 0.5833333333333333,
 'MLYadav': 0.5833333333333333,
 'SanjayLao': 0.5714285714285714,
 'HarshPrabhakar': 0.5508021390374331,
 'AashaTiwari': 0.5454545454545454,
 'KusumDhalla': 0.5414740626605034,
 'VarunGoswami': 0.5375,
 'MukeshKumar': 0

**Checking to see if it works on all cases in the dataset**

In [16]:
# Score the advocates of every case with equal charge weightage.
# The target-label set is built once instead of on every iteration.
target_set = set(targets)
results = {}
for case, charges in case_offences.items():
    # Ordered, de-duplicated list rather than a set: set iteration order of
    # strings varies between kernel sessions, which would change the order
    # of tied advocates in the ranking.
    rel_charges = list(dict.fromkeys(charge for charge in charges if charge in target_set))
    results[case] = create_adv_scores(rel_charges, charge_adv_win_ratios)

In [17]:
results

{'72678796': {'SanjayJain': 1.0,
  'GauravGupta': 1.0,
  'SandeepSethi': 1.0,
  'SanjeevNarul': 1.0,
  'SumeetVerma': 0.85,
  'AmitKumar': 0.8333333333333333,
  'SanjeevSabharwal': 0.8333333333333333,
  'AkhilSibal': 0.8333333333333333,
  'SMuralidhar': 0.7857142857142857,
  'DayanKrishnan': 0.7638888888888888,
  'HemantSingh': 0.75,
  'RaviGupta': 0.75,
  'ChetanSharma': 0.75,
  'ManojOhri': 0.7083333333333333,
  'RahulMehra': 0.7083333333333333,
  'MukeshGupta': 0.7083333333333333,
  'RajdipaBehura': 0.7023809523809523,
  'SunilSharma': 0.7,
  'MeenakshiChauhan': 0.6958041958041958,
  'RameshGupta': 0.6190476190476191,
  'AnuragJain': 0.6190476190476191,
  'SahilaLamba': 0.6190476190476191,
  'ManjeetArya': 0.5944055944055944,
  'PramodKumarDubey': 0.5857142857142856,
  'RichaKapoor': 0.5833333333333333,
  'MLYadav': 0.5833333333333333,
  'SanjayLao': 0.5714285714285714,
  'HarshPrabhakar': 0.5508021390374331,
  'AashaTiwari': 0.5454545454545454,
  'KusumDhalla': 0.5414740626605034,


### Case fraction weightage

**Getting the charge case fractions**

In [20]:
# Load the cases recorded for each charge; the cells below iterate it as
# charge -> collection of cases.
with open("/home/workboots/Datasets/DHC/variations/var_3/targets/ipc_charge_cases.json", 'r') as f:
    charges_cases = json.load(f)

In [21]:
# Distinct cases across all charges (a case may appear under several charges).
total_cases = {case_id
               for case_ids in charges_cases.values()
               for case_id in case_ids}
print(len(total_cases))

6012


In [22]:
# Weight of a charge = fraction of all distinct cases that involve it.
weights = {
    charge: len(cases) / len(total_cases)
    for charge, cases in charges_cases.items()
}

In [23]:
# Score the advocates of every case, weighting each charge by its case fraction.
# The target-label set is built once instead of on every iteration.
target_set = set(targets)
results_weighted = {}
for case, charges in case_offences.items():
    # Ordered, de-duplicated list rather than a set: set iteration order of
    # strings varies between kernel sessions, which would change the order
    # of tied advocates in the ranking.
    rel_charges = list(dict.fromkeys(charge for charge in charges if charge in target_set))
    results_weighted[case] = create_adv_scores(rel_charges, charge_adv_win_ratios, strategy="case_fraction", weights=weights)

In [25]:
results_weighted

{'72678796': {'SanjayJain': 0.18280106453759148,
  'GauravGupta': 0.18280106453759148,
  'SandeepSethi': 0.18280106453759148,
  'SanjeevNarul': 0.18280106453759148,
  'AkhilSibal': 0.15823907740075405,
  'SumeetVerma': 0.15006653359946773,
  'AmitKumar': 0.14642936349523175,
  'SanjeevSabharwal': 0.14642936349523175,
  'DayanKrishnan': 0.13914763066459673,
  'SMuralidhar': 0.13603744891170041,
  'RajdipaBehura': 0.13303551626904922,
  'MeenakshiChauhan': 0.13252050677200378,
  'MukeshGupta': 0.13096030161898425,
  'HemantSingh': 0.1282435129740519,
  'RaviGupta': 0.1282435129740519,
  'ChetanSharma': 0.1282435129740519,
  'ManojOhri': 0.12800787314260367,
  'RahulMehra': 0.12800787314260367,
  'SunilSharma': 0.11733200266134398,
  'RameshGupta': 0.11147546177486298,
  'AnuragJain': 0.11147546177486298,
  'SahilaLamba': 0.11147546177486298,
  'VarunGoswami': 0.11021290751829674,
  'MLYadav': 0.1095863827899756,
  'PuneetMittal': 0.10911510312707917,
  'AnujAgarwal': 0.10911510312707917,

## Getting target advocates in lenient setup

**Re-writing `create_targets()` to give the list of targets for a particular case**

In [28]:
def create_targets(targets_dict, adv_index, case,
                   case_charges=None, adv_charges=None, threshold=None):
    """Get the actual and lenient target advocates of a single case.

    Parameters
    ----------
    targets_dict : dict
        Dictionary with the target advocates of each case.
    adv_index : dict
        Advocates to consider; only the keys are used, and their order
        determines the order of both returned lists.
    case : hashable
        Key of the case to create targets for.
    case_charges : dict, default None
        Charges of each case.
    adv_charges : dict, default None
        Charges handled by each advocate.
    threshold : float, default None
        Minimum fraction of the case's charges that a non-target advocate
        must share with the case to count as a lenient target.

    Returns
    -------
    actual : list
        Advocates from `adv_index` that are targets of `case`.
    lenient : list
        Advocates from `adv_index` that are not targets of `case` but whose
        charge overlap fraction is at least `threshold`. Empty unless
        `case_charges`, `adv_charges` and `threshold` are all given, and
        also empty when the case has no charges (no overlap is possible).

    Raises
    ------
    KeyError
        If `case` is missing from `targets_dict` (or from `case_charges`
        in the lenient setup), or a candidate advocate is missing from
        `adv_charges`.
    """
    case_targets = targets_dict[case]

    actual = [adv for adv in adv_index if adv in case_targets]

    lenient = []
    if all(ele is not None
           for ele in [case_charges, adv_charges, threshold]):
        charges_of_case = case_charges[case]
        # Guard against dividing by zero for a case without any charges.
        if charges_of_case:
            # Invariant for the whole loop, so compute once up front.
            charge_set = set(charges_of_case)
            n_charges = len(charges_of_case)
            lenient = [adv
                       for adv in adv_index
                       if adv not in case_targets and
                       len(charge_set.intersection(adv_charges[adv]))
                       / n_charges >= threshold]

    return actual, lenient

**Getting advocate charges**

In [29]:
# Load the charges associated with each advocate; create_targets() looks
# this up as adv_charges[advocate].
with open("/home/workboots/Datasets/DHC/variations/var_3/targets/adv_ipc_charges_new.json", 'r') as f:
    adv_charges = json.load(f)


**Getting targets**

In [30]:
# Load the advocates of each case; passed to create_targets() as
# targets_dict and looked up as targets_dict[case].
with open("/home/workboots/Datasets/DHC/common/case_advs_new.json", 'r') as f:
    case_advs = json.load(f)

**Getting selected advocates**

In [31]:
# Load the selected advocates and flip the mapping so that the values stored
# in the file become the keys (create_targets() iterates these keys as the
# advocates to consider).
with open("/home/workboots/Datasets/DHC/common/selected_advs.json", 'r') as f:
    advs = json.load(f)
advs = dict(zip(advs.values(), advs.keys()))

**Testing create targets**

In [32]:
# Targets of the sixth case in case_offences. With threshold=1.0 an advocate
# is a lenient target only when the number of charges they share with the
# case is at least the number of charges listed for the case.
actual, lenient = create_targets(targets_dict=case_advs,
                   adv_index=advs,
                   case_charges=case_offences,
                   adv_charges=adv_charges,
                   threshold=1.0,
                   case=list(case_offences.keys())[5])
               

In [33]:
len(lenient)

46

## Testing overlap of ranked advocate list and targets

In [34]:
# 0-based position of 'ManjeetArya' in the case-fraction-weighted ranking of the sixth case.
list(results_weighted[list(case_offences.keys())[5]].keys()).index('ManjeetArya')

21

In [35]:
# Number of lenient targets found among the top (len(lenient) + len(actual))
# advocates of the case-fraction-weighted ranking of the sixth case.
len(set(list(results_weighted[list(case_offences.keys())[5]].keys())[:(len(lenient) + len(actual))]).intersection(set(lenient)))

36

In [36]:
case_offences[list(case_offences.keys())[5]]

['Of Criminal intimidation, Insult and Annoyance',
 'Introduction',
 'Of Kidnapping, Abduction, Slavery and Forced Labour',
 'Sexual Offences including rape and Sodomy',
 'Of Hurt']

## Discounted Cumulative Gain

In [37]:
def dcg(relevance, predicted):
    """Discounted cumulative gain of a predicted ranking.

    Parameters
    ----------
    relevance : dict
        Graded relevance of each item.
    predicted : list
        Items in predicted rank order (best first). Every item must be a
        key of `relevance`.

    Returns
    -------
    score : float
        Sum of relevance / log2(rank + 1) over the 1-based ranks
        (0 for an empty prediction).
    """
    total = 0
    # Position 0 is discounted by log2(2) = 1, position 1 by log2(3), ...
    for log_arg, name in enumerate(predicted, start=2):
        total += relevance[name] * 1./math.log(log_arg, 2)
    return total

**Ideal DCG**

In [38]:
def idcg(relevance, predicted):
    """Ideal DCG: the best DCG achievable with len(predicted) items.

    Parameters
    ----------
    relevance : dict
        Graded relevance of each item.
    predicted : list
        Predicted ranking; only its length is used, as the cut-off.

    Returns
    -------
    score : float
        DCG of the highest relevance values placed in descending order,
        truncated to len(predicted) positions.
    """
    cutoff = len(predicted)
    ideal_gains = sorted(relevance.values(), reverse=True)[:cutoff]
    return sum(gain * 1./math.log(position + 2, 2)
               for position, gain in enumerate(ideal_gains))

**Getting the relevance ranking**

In [39]:
def relevance_rank(actual, lenient, full):
    """Assign a graded relevance to every advocate.

    Parameters
    ----------
    actual : list
        Actual target advocates; relevance 3.
    lenient : list
        Lenient target advocates. The split is positional: the first
        len(lenient) // 2 + 1 entries get relevance 2, the rest relevance 1.
    full : iterable
        All advocates to grade; any advocate that is in neither list gets
        relevance 0.

    Returns
    -------
    relevance : dict
        Advocate -> relevance, ordered by descending relevance. Advocates
        with equal relevance keep their order from `full`.
    """
    # Build the lookup sets once instead of slicing and scanning the lists
    # for every advocate.
    split = len(lenient) // 2 + 1
    actual_set = set(actual)
    upper_lenient = set(lenient[:split])
    lower_lenient = set(lenient[split:])

    relevance = {}
    for adv in full:
        if adv in actual_set:
            relevance[adv] = 3
        elif adv in upper_lenient:
            relevance[adv] = 2
        elif adv in lower_lenient:
            relevance[adv] = 1
        else:
            relevance[adv] = 0

    # sorted() is stable, so ties keep the order in which `full` listed them.
    return dict(sorted(relevance.items(), key=lambda item: item[1], reverse=True))

In [40]:
relevance = relevance_rank(actual, lenient, advs)

In [49]:
predicted = ["KewalSinghAhuja", "AjayVerma", "PramodKumarDubey"]

In [50]:
dcg(relevance, predicted)

4.761859507142915

In [51]:
idcg(relevance, predicted)

5.261859507142915

In [52]:
relevance

{'KewalSinghAhuja': 3,
 'AnuragAhluwalia': 2,
 'SanjayJain': 2,
 'PawanSharma': 2,
 'VijayAggarwal': 2,
 'RajdipaBehura': 2,
 'ManojOhri': 2,
 'AjayVerma': 2,
 'PuneetMittal': 2,
 'VikasPahwa': 2,
 'DayanKrishnan': 2,
 'NanditaRao': 2,
 'RichaKapoor': 2,
 'SandeepSethi': 2,
 'KusumDhalla': 2,
 'RaviGupta': 2,
 'MeenakshiChauhan': 2,
 'RameshGupta': 2,
 'AmitKumar': 2,
 'JyotiSingh': 2,
 'MohitMathur': 2,
 'MLYadav': 2,
 'LovkeshSawhney': 2,
 'ChetanLokur': 2,
 'MukeshKumar': 2,
 'PramodKumarDubey': 1,
 'ManjeetArya': 1,
 'HarshPrabhakar': 1,
 'VarunGoswami': 1,
 'AkhilSibal': 1,
 'SanjayLao': 1,
 'SudhirNandrajog': 1,
 'AmitGupta': 1,
 'RajatKatyal': 1,
 'SumeetVerma': 1,
 'SanjeevSabharwal': 1,
 'RakhiDubey': 1,
 'RahulMehra': 1,
 'SMuralidhar': 1,
 'AnuragJain': 1,
 'RajeshMahajan': 1,
 'SidharthLuthra': 1,
 'SunilSharma': 1,
 'AashaTiwari': 1,
 'MPSingh': 1,
 'SahilaLamba': 1,
 'SanjeevNarul': 1,
 'KavitaJha': 0,
 'AjayVohra': 0,
 'SudhirChandra': 0,
 'AtulKumar': 0,
 'VivekSingh': 

**Testing against the Wikipedia example: https://en.wikipedia.org/wiki/Discounted_cumulative_gain#Example**

In [162]:
predicted = ["d1", "d2", "d3", "d4", "d5", "d6"]

In [164]:
full = ["d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8"]

In [165]:
actual = ["d1", "d3"]
lenient = ["d2", "d6", "d5"]

In [182]:
relevance = relevance_rank(actual, lenient, full)

In [186]:
# Build the relevance used for the ideal ranking as a separate dict: a plain
# assignment would alias `relevance`, so later edits would silently change
# both. The Wikipedia example also has two relevant documents that were not
# retrieved (d7 with relevance 3, d8 with relevance 2); they are included
# here so that the IDCG cell below sees them on a top-to-bottom run.
relevance_full = {**relevance, "d7": 3, "d8": 2}

In [184]:
dcg(relevance, predicted)

6.861126688593501

In [195]:
idcg(relevance_full, predicted)

8.740262365546284

In [187]:
relevance_full["d7"] = 3
relevance_full["d8"] = 2

In [188]:
relevance_full

{'d1': 3, 'd3': 3, 'd2': 2, 'd6': 2, 'd5': 1, 'd4': 0, 'd7': 3, 'd8': 2}