In [1]:
import json
from ast import literal_eval
from collections import Counter

import pandas as pd
from IPython.display import display

from constants import (SECTORS, SUBPILLARS_2D, SUBPILLARS_1D,
                       DEMOGRAPHIC_GROUPS, SPECIFIC_NEEDS_GROUPS,
                       AFFECTED_GROUPS, SEVERITY, RELIABILITY)

In [2]:
# Reverse lookup: label -> name of the tag group the label belongs to.
# Groups are applied in the order listed, so a label that occurs in more than
# one group ends up mapped to the group listed last.
label_to_tagname = {}
for _tagname, _labels in (
    ("SECTORS", SECTORS),
    ("SUBPILLARS_2D", SUBPILLARS_2D),
    ("SUBPILLARS_1D", SUBPILLARS_1D),
    ("SPECIFIC_NEEDS_GROUPS", SPECIFIC_NEEDS_GROUPS),
    ("AFFECTED_GROUPS", AFFECTED_GROUPS),
    ("DEMOGRAPHIC_GROUPS", DEMOGRAPHIC_GROUPS),
    ("SEVERITY", SEVERITY),
    ("RELIABILITY", RELIABILITY),
):
    for _label in _labels:
        label_to_tagname[_label] = _tagname

In [3]:
# Load the exported feedback CSV; the path is relative to the working directory.
df = pd.read_csv("feedback_output_14.16.28.09.2021.csv")
def parse_fb(fb):
    """Parse one raw ``Feedback`` cell into a Python object.

    Args:
        fb: The cell value. Expected to be a string holding a Python literal
            (the cells shown in this notebook are dict literals); anything
            else, such as the float NaN pandas uses for empty cells, is
            treated as unparseable.

    Returns:
        The evaluated literal, or ``None`` when the cell cannot be parsed.
    """
    try:
        return literal_eval(fb)
    # Only the failures literal_eval is documented to raise on malformed input
    # are treated as "no feedback". A bare ``except`` would also swallow
    # KeyboardInterrupt / SystemExit and hide unrelated bugs.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return None
# Replace each raw Feedback cell by its parsed value, then keep only the rows
# whose feedback parsed successfully (parse_fb yields None otherwise).
df["Feedback"] = df["Feedback"].apply(parse_fb)
df = df.loc[df["Feedback"].notna()]

In [4]:
# Column order matters: process_row reads these columns by integer position.
df.columns

Index(['ID', 'Version', 'Entry', 'Feedback', 'Feedback Type', 'File Name',
       'Extracted Content Link', 'Review Date', 'Reviewer', 'Sectors',
       '2D Sub Pillars', '1D Sub Pillars', 'Demographic Groups',
       'Specific Needs Groups', 'Severity', 'Geolocation', 'Sector All',
       '2D Sub Pillars All', '1D Sub Pillars All', 'Demographic Groups All',
       'Specific Needs Groups All', 'Severity All'],
      dtype='object')

In [5]:
# Integer positions of the input columns read by process_row, following the
# column order shown by ``df.columns``.
_COL_ID = 0
_COL_ENTRY = 2
_COL_FEEDBACK = 3
_COL_GEO = 15

# Output field prefix -> position of the column holding its "label(...)" list.
_SCORED_COLUMNS = {
    "Sectors": 9,
    "Subpillars_2D": 10,
    "Subpillars_1D": 11,
    "Demographic_Groups": 12,
    "Specific_Needs_Groups": 13,
    "Severity": 14,
}

# Output field prefixes, in the order their columns appear in every section
# of the returned record.
_OUTPUT_FIELDS = (
    "Sectors",
    "Subpillars_2D",
    "Subpillars_1D",
    "Demographic_Groups",
    "Specific_Needs_Groups",
    "Severity",
    "Geo",
    "Reliability",
    "Affected_Groups",
)

# Tag-group name (values of the label lookup) -> output field prefix.
_TAG_TO_FIELD = {
    "SECTORS": "Sectors",
    "SUBPILLARS_2D": "Subpillars_2D",
    "SUBPILLARS_1D": "Subpillars_1D",
    "DEMOGRAPHIC_GROUPS": "Demographic_Groups",
    "SPECIFIC_NEEDS_GROUPS": "Specific_Needs_Groups",
    "SEVERITY": "Severity",
    "RELIABILITY": "Reliability",
    "AFFECTED_GROUPS": "Affected_Groups",
}

# Feedback labels starting with this prefix are geolocations, not tags.
_GEO_PREFIX = "Geo:"


def _cell(row, pos):
    """Return the value at integer position ``pos`` of ``row``.

    Uses ``.iloc`` when the row offers it (a pandas Series), because integer
    keys on a label-indexed Series are a deprecated positional fallback; plain
    sequences are indexed directly.
    """
    positional = getattr(row, "iloc", None)
    return row[pos] if positional is None else positional[pos]


def _parse_scored_labels(cell):
    """Split a ``"label(...), label(...)"`` cell into its bare labels.

    Each item is cut at its LAST ``(`` so that parentheses belonging to the
    label itself are kept, and a comma only separates items when the text
    before it ends with ``)``, so commas inside a label are kept as well.
    The parenthesised suffix is presumably a confidence score - TODO confirm.

    Returns an empty list for non-string cells (e.g. NaN) and for blank cells.
    Raises ValueError when an item has no parenthesised suffix.
    """
    if not isinstance(cell, str):
        return []
    labels = []
    pending = []
    for fragment in cell.split(","):
        pending.append(fragment)
        if not fragment.rstrip().endswith(")"):
            # This comma sits inside a label; keep accumulating fragments.
            continue
        item = ",".join(pending).strip()
        pending = []
        labels.append(item[:item.rindex("(")])
    leftover = ",".join(pending).strip()
    if leftover:
        raise ValueError(
            "prediction item without a parenthesised suffix: %r" % leftover)
    return labels


def _bucket_feedback_labels(labels, tag_lookup):
    """Group reviewer feedback labels by output field.

    Labels starting with ``Geo:`` go to the ``Geo`` bucket with the prefix and
    surrounding whitespace removed. Every other label is looked up in
    ``tag_lookup`` (KeyError if absent) and filed under the matching field;
    labels whose tag group has no output field are ignored.
    """
    buckets = {field: [] for field in _OUTPUT_FIELDS}
    for label in labels:
        if label.startswith(_GEO_PREFIX):
            buckets["Geo"].append(label[len(_GEO_PREFIX):].strip())
            continue
        field = _TAG_TO_FIELD.get(tag_lookup[label])
        if field is not None:
            buckets[field].append(label)
    return buckets


def process_row(row, tag_lookup=None):
    """Turn one feedback row into a flat record of predictions and corrections.

    For each output field the record holds four sorted lists:

    * ``<field>_Pred_0.2``: labels parsed from the row's prediction column,
    * ``<field>_Wrong_Pred_0.2``: labels the feedback lists under ``"wrong"``,
    * ``<field>_Mis_Pred_0.2``: labels the feedback lists under ``"missing"``,
    * ``<field>_GT``: ``(predicted | missing) - wrong``.

    ``Reliability`` and ``Affected_Groups`` have no prediction column, so
    their predicted lists are always empty.

    Args:
        row: One row of the feedback table, read by integer position (a
            pandas Series as passed by ``DataFrame.apply(axis=1)``, or any
            sequence with the same layout). Position 3 must hold the parsed
            feedback mapping with ``"missing"`` and ``"wrong"`` lists.
        tag_lookup: Optional mapping from label to tag-group name. Defaults to
            the notebook-level ``label_to_tagname``.

    Returns:
        dict with ``ID``, ``Entry`` and the 36 list-valued keys described
        above, in Pred / Wrong / Mis / GT order.

    Raises:
        KeyError: if the feedback lacks ``"missing"``/``"wrong"`` or contains
            a non-geo label that is absent from ``tag_lookup``.
        ValueError: if a prediction item has no parenthesised suffix.
    """
    if tag_lookup is None:
        tag_lookup = label_to_tagname

    feedback = _cell(row, _COL_FEEDBACK)
    missing = _bucket_feedback_labels(feedback["missing"], tag_lookup)
    wrong = _bucket_feedback_labels(feedback["wrong"], tag_lookup)

    predicted = {field: [] for field in _OUTPUT_FIELDS}
    for field, pos in _SCORED_COLUMNS.items():
        predicted[field] = _parse_scored_labels(_cell(row, pos))
    # Geolocations are a plain comma-separated list with no "(...)" suffix.
    geo_cell = _cell(row, _COL_GEO)
    if isinstance(geo_cell, str):
        predicted["Geo"] = [part.strip() for part in geo_cell.split(",")]

    record = {
        "ID": _cell(row, _COL_ID),
        "Entry": _cell(row, _COL_ENTRY),
    }
    # Four passes rather than one so the keys (and hence the DataFrame
    # columns) stay grouped as Pred, Wrong, Mis, GT.
    for field in _OUTPUT_FIELDS:
        record[f"{field}_Pred_0.2"] = sorted(predicted[field])
    for field in _OUTPUT_FIELDS:
        record[f"{field}_Wrong_Pred_0.2"] = sorted(wrong[field])
    for field in _OUTPUT_FIELDS:
        record[f"{field}_Mis_Pred_0.2"] = sorted(missing[field])
    for field in _OUTPUT_FIELDS:
        corrected = (set(predicted[field]) | set(missing[field])) - set(wrong[field])
        record[f"{field}_GT"] = sorted(corrected)
    return record

In [6]:
# Build one output record per feedback row. `apply` returns a Series of dicts
# that still carries df's index, which has gaps after unparsed rows were
# dropped; converting to a plain list keeps record construction independent
# of those index labels.
df_processed = pd.DataFrame.from_records(
    df.apply(process_row, axis=1).tolist())

In [7]:
# Persist the processed table without the row index.
df_processed.to_csv("fb_processed_0.2.csv", index=False)

In [8]:
# (rows, columns) of the processed table.
df_processed.shape

(125, 38)

### Sanity Checks

In [9]:
# Spot-check the row at position 4: raw feedback followed by the predicted,
# wrong, missing and corrected 2D sub-pillar lists derived from it.
i = 4
(df["Feedback"].iloc[i],
 *[df_processed[column].iloc[i] for column in (
     "Subpillars_2D_Pred_0.2",
     "Subpillars_2D_Wrong_Pred_0.2",
     "Subpillars_2D_Mis_Pred_0.2",
     "Subpillars_2D_GT",
 )])

({'missing': ['Capacities & Response->National Response',
   'Host',
   'Infants/Toddlers (<5 years old)',
   'Critical',
   'Casualties->Dead'],
  'text': '[Sector]:\n[2D]:\n[1D]:\n[Demographic]:\n[Specific]:\n[Affected]:\n[Severity]:\n[Geo]:\n',
  'wrong': ['Displaced',
   'Affected',
   'Children/Youth Unspecified gender (5 to 17 years old)',
   'Chronically Ill',
   'Geo: 250 Bed District Sadar Hospital',
   'Geo: Cholera',
   'Geo: Diphtheria',
   'Geo: RDT',
   'Geo: RDT/',
   'Major']},
 ['Humanitarian Conditions->Physical And Mental Well Being'],
 [],
 ['Capacities & Response->National Response'],
 ['Capacities & Response->National Response',
  'Humanitarian Conditions->Physical And Mental Well Being'])

In [10]:
# Spot-check the row at position 9: raw feedback followed by the predicted,
# wrong, missing and corrected 1D sub-pillar lists derived from it.
i = 9
(df["Feedback"].iloc[i],
 *[df_processed[column].iloc[i] for column in (
     "Subpillars_1D_Pred_0.2",
     "Subpillars_1D_Wrong_Pred_0.2",
     "Subpillars_1D_Mis_Pred_0.2",
     "Subpillars_1D_GT",
 )])

({'missing': [],
  'text': '[Sector]:\n[2D]:\n[1D]:\n[Demographic]:\n[Specific]:\n[Affected]:\n[Severity]:\n[Geo]:\n',
  'wrong': ['Context->Demography',
   'Affected',
   'Migrants',
   'Geo: 250 Bed District Sadar Hospital']},
 ['Context->Demography'],
 ['Context->Demography'],
 [],
 [])

In [11]:
# Spot-check the row at position 4: raw feedback followed by the predicted,
# wrong, missing and corrected geolocation lists derived from it.
i = 4
(df["Feedback"].iloc[i],
 *[df_processed[column].iloc[i] for column in (
     "Geo_Pred_0.2",
     "Geo_Wrong_Pred_0.2",
     "Geo_Mis_Pred_0.2",
     "Geo_GT",
 )])

({'missing': ['Capacities & Response->National Response',
   'Host',
   'Infants/Toddlers (<5 years old)',
   'Critical',
   'Casualties->Dead'],
  'text': '[Sector]:\n[2D]:\n[1D]:\n[Demographic]:\n[Specific]:\n[Affected]:\n[Severity]:\n[Geo]:\n',
  'wrong': ['Displaced',
   'Affected',
   'Children/Youth Unspecified gender (5 to 17 years old)',
   'Chronically Ill',
   'Geo: 250 Bed District Sadar Hospital',
   'Geo: Cholera',
   'Geo: Diphtheria',
   'Geo: RDT',
   'Geo: RDT/',
   'Major']},
 ['250 Bed District Sadar Hospital',
  'Cholera',
  'Diphtheria',
  'RDT',
  'RDT/',
  'Teknaf',
  'Ukhiya'],
 ['250 Bed District Sadar Hospital', 'Cholera', 'Diphtheria', 'RDT', 'RDT/'],
 [],
 ['Teknaf', 'Ukhiya'])