In [1]:
import json
from itertools import islice
from ast import literal_eval

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

from skmultilearn.model_selection import iterative_train_test_split
from skmultilearn.model_selection.measures import get_combination_wise_output_matrix

In [2]:
tqdm.pandas()

In [3]:
df = pd.read_csv('generated_dataset/full_dataset.csv')
##
print(df.columns)
##
for col in [
        'sectors', 'subpillars_2d', 'subpillars_1d', 'geo_location',
        'specific_needs_groups', 'demographic_groups',
        'reliability', 'affected_groups'
]:
    df[col] = df[col].apply(lambda x: [] if x != x else literal_eval(x))
##
def count_vals(df, col):
    vals = Counter()
    for e in df[col]:
        vals.update(e)
    return vals.most_common()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Index(['entry_id', 'excerpt', 'analysis_framework_id', 'lead_id', 'project_id',
       'verified', 'sectors', 'subpillars_2d', 'subpillars_1d', 'geo_location',
       'specific_needs_groups', 'severity', 'info_date', 'demographic_groups',
       'reliability', 'affected_groups', 'source_type', 'url', 'website'],
      dtype='object')


In [4]:
df["analysis_framework_id"] = df["analysis_framework_id"].apply(lambda x: str(int(x)))

In [5]:
tagname_to_tagvalues = {
    'sectors': set(),
    'subpillars_2d': set(),
    'subpillars_1d': set(),
    'specific_needs_groups': set(),
    'severity': set(),
    'demographic_groups': set(),
    'reliability': set(),
    'affected_groups': set(),
    'analysis_framework_id': set(),
}

In [6]:
def unique_tags(col):
    unique = set()
    for label in col:
        unique.update(label)
    return unique

In [7]:
for tagname in tagname_to_tagvalues:
    if tagname in ['analysis_framework_id', 'severity']:
        tagname_to_tagvalues[tagname] = set(df[tagname].unique().tolist())
    else:
        tagname_to_tagvalues[tagname] = unique_tags(df[tagname])

In [8]:
tagname_to_tagvalues

{'sectors': {'Agriculture',
  'Cross',
  'Education',
  'Food Security',
  'Health',
  'Livelihoods',
  'Logistics',
  'NOT_MAPPED',
  'Nutrition',
  'Protection',
  'Shelter',
  'WASH'},
 'subpillars_2d': {'At Risk->Number Of People At Risk',
  'At Risk->Risk And Vulnerabilities',
  'Capacities & Response->International Response',
  'Capacities & Response->Local Response',
  'Capacities & Response->National Response',
  'Capacities & Response->Number Of People Reached/Response Gaps',
  'Context->Economy',
  'Covid-19->Restriction Measures',
  'Humanitarian Conditions->Coping Mechanisms',
  'Humanitarian Conditions->Living Standards',
  'Humanitarian Conditions->Number Of People In Need',
  'Humanitarian Conditions->Physical And Mental Well Being',
  'Impact->Driver/Aggravating Factors',
  'Impact->Impact On People',
  'Impact->Impact On Systems, Services And Networks',
  'Impact->Number Of People Affected',
  'NOT_MAPPED',
  'Priority Interventions->Expressed By Humanitarian Staff',
 

In [9]:
classes = set()
for tagvalues in tagname_to_tagvalues.values():
    classes |= tagvalues 
len(classes) # total number of tags

139

In [10]:
classes = classes - {np.nan}
classes

{'1004',
 '1306',
 '136',
 '137',
 '1465',
 '273',
 '495',
 '537',
 '552',
 '699',
 '726',
 '799',
 '829',
 'Adult Female (18 to 59 years old)',
 'Adult Male (18 to 59 years old)',
 'Affected',
 'Agriculture',
 'All',
 'Asylum Seekers',
 'At Risk->Number Of People At Risk',
 'At Risk->Risk And Vulnerabilities',
 'Back to Venezuela',
 'Camp',
 'Capacities & Response->International Response',
 'Capacities & Response->Local Response',
 'Capacities & Response->National Response',
 'Capacities & Response->Number Of People Reached/Response Gaps',
 'Casualties->Dead',
 'Casualties->Injured',
 'Casualties->Missing',
 'Child Head of Household',
 'Children/Youth Female (5 to 17 years old)',
 'Children/Youth Male (5 to 17 years old)',
 'Chronically Ill',
 'Completely Reliable',
 'Context->Demography',
 'Context->Economy',
 'Context->Environment',
 'Context->Legal & Policy',
 'Context->Politics',
 'Context->Security & Stability',
 'Context->Socio Cultural',
 'Covid-19->Cases',
 'Covid-19->Contact 

In [11]:
class_to_id = {cls:i for i, cls in enumerate(classes)}

In [27]:
class_to_id

{'Displaced': 0,
 'Elderly Head of Household': 1,
 'Covid-19->Testing': 2,
 'Covid-19->Cases': 3,
 'Food Security': 4,
 '495': 5,
 'Cross': 6,
 'Humanitarian Access->Number Of People Facing Humanitarian Access Constraints/Humanitarian Access Gaps': 7,
 'Covid-19->Vaccination': 8,
 '136': 9,
 'Shock/Event->Underlying/Aggravating Factors': 10,
 'Displacement->Push Factors': 11,
 'All': 12,
 'In transit': 13,
 'Impact->Driver/Aggravating Factors': 14,
 'Context->Legal & Policy': 15,
 '699': 16,
 '1306': 17,
 'Completely Reliable': 18,
 'Casualties->Injured': 19,
 'Infants/Toddlers (<5 years old)': 20,
 'Not affected': 21,
 'Protection': 22,
 'At Risk->Number Of People At Risk': 23,
 '829': 24,
 'Migrants': 25,
 'Priority Interventions->Expressed By Population': 26,
 'Major': 27,
 'Child Head of Household': 28,
 'Impact->Impact On Systems, Services And Networks': 29,
 'Context->Politics': 30,
 'Nutrition': 31,
 'Usually reliable': 32,
 'Humanitarian Conditions->Coping Mechanisms': 33,
 'St

In [12]:
for tagvalues in tagname_to_tagvalues.values():
    for tagvalue in tagvalues:
        df[tagvalue] = 0

In [13]:
labels_2d = np.zeros((df.shape[0], len(classes)))
for row in tqdm(df.iterrows(), total=df.shape[0]):
    for tagname in tagname_to_tagvalues.keys():
        idx = row[0]
        yi = row[1][tagname]
        if isinstance(yi, str):
            df.at[idx, yi] = 1
            labels_2d[idx, class_to_id[yi]] = 1
        elif isinstance(yi, list):
            for lbl in yi:
                df.at[idx, lbl] = 1
                labels_2d[idx, class_to_id[lbl]] = 1

  0%|          | 0/157920 [00:00<?, ?it/s]

In [14]:
labels_2d.sum(axis=0, dtype=int)

array([   627,   2142,   1789,   1852,    722,   1287,  12538,     32,
         3095,  38148,  15649,   1326,  16421,   1751,   6202,   2256,
        17244,     88,   2798,   1486,    228,     25,   1841,   4587,
          501,   2080,   2364,   1622,   5673,    290,    199,     45,
         3098,   1587,    192,     83,     11,    396,   7905,  20792,
          277,  73750,  34290,    221,   4082,  12737,    202, 152833,
        16516,   5171,    948,   6946,  11243,   3346,   1743,   1659,
          726,   8189,  49874,    572,   3819,    855,   3472,  13741,
         5078,   1914,   2051,    304,    347,   5507,   3496,   5332,
         2804,   5299,  26668,    126,   2596,    116,  42059,  15405,
         8721,     10,   2095,   1339,   5072,     46,   4575,  27948,
          814,    770,   1205,   1207,   2332,    512,   2071,  16634,
          130,    272,   2355,  10000,    155,   3114,   2975,   4859,
        16382,  15129,    671,    224,   1336,   1686,    275,  16304,
      

In [15]:
df[classes].sum().astype(int).sort_values().to_dict()

{'Not Affected': 10,
 'Other': 11,
 'Venezuelans engaged in pendular/circular movement ': 16,
 'Venezuelans in transit (caminantes)': 25,
 '726': 32,
 '799': 45,
 'Unreliable': 46,
 'Venezuelans with intention to remain in Colombia': 83,
 'Stateless': 88,
 'Back to Venezuela': 108,
 'Elderly Head of Household': 116,
 'Not Usually Reliable': 121,
 'Non Host': 126,
 'Out of Venezuela': 130,
 'Information And Communication->Information Challenges And Barriers': 155,
 'Child Head of Household': 192,
 'Minor Problem': 199,
 'Single Women (including Widows)': 202,
 'Unregistered Refugee': 221,
 'Information And Communication->Communication Means And Preferences': 222,
 'No legal documentation': 224,
 'Humanitarian Access->Population To Relief': 228,
 'Capacities & Response->Local Response': 259,
 'Casualties->Missing': 272,
 'Pendular': 275,
 'At Risk->Number Of People At Risk': 277,
 'Priority Interventions->Expressed By Population': 290,
 'Information And Communication->Knowledge And Info 

In [16]:
X_train, y_train, X_test, y_test = iterative_train_test_split(
    df.index.to_numpy().reshape(-1, 1), labels_2d, test_size=0.1)

In [17]:
X_train, y_train, X_val, y_val = iterative_train_test_split(X_train,
                                                            y_train,
                                                            test_size=0.1)

In [18]:
df_train = df[df.index.isin(X_train.reshape(-1))]
df_val = df[df.index.isin(X_val.reshape(-1))]
df_test = df[df.index.isin(X_test.reshape(-1))]

In [19]:
(df_train.index & df_val.index, df_train.index & df_test.index,
 df_test.index & df_val.index)

  (df_train.index & df_val.index, df_train.index & df_test.index,
  df_test.index & df_val.index)


(Int64Index([], dtype='int64'),
 Int64Index([], dtype='int64'),
 Int64Index([], dtype='int64'))

In [20]:
cols = [
    'entry_id', 'excerpt', 'analysis_framework_id', 'lead_id', 'project_id',
    'verified', 'sectors', 'subpillars_2d', 'subpillars_1d', 'geo_location',
    'specific_needs_groups', 'severity', 'info_date', 'demographic_groups',
    'reliability', 'affected_groups', 'source_type', 'url', 'website'
]
df_train = df_train[cols]
df_val = df_val[cols]
df_test = df_test[cols]

In [21]:
df_train.to_csv("generated_dataset/train_v0.7.csv", index=None)
df_val.to_csv("generated_dataset/val_v0.7.csv", index=None)
df_test.to_csv("generated_dataset/test_v0.7.csv", index=None)

In [22]:
for tagname in tagname_to_tagvalues.keys():
    perc = df_train[tagname].apply(lambda x: x!= []).sum()
    perc = perc/df_train.shape[0]
    print(f"{tagname}: {1-perc:.2f}")

sectors: 0.22
subpillars_2d: 0.24
subpillars_1d: 0.48
specific_needs_groups: 0.95
severity: 0.00
demographic_groups: 0.84
reliability: 0.01
affected_groups: 0.46
analysis_framework_id: 0.00


In [23]:
for tagname in tagname_to_tagvalues.keys():
    perc = df_val[tagname].apply(lambda x: x!= []).sum()
    perc = perc/df_val.shape[0]
    print(f"{tagname}: {1-perc:.2f}")

sectors: 0.21
subpillars_2d: 0.23
subpillars_1d: 0.55
specific_needs_groups: 0.94
severity: 0.00
demographic_groups: 0.84
reliability: 0.01
affected_groups: 0.45
analysis_framework_id: 0.00


In [24]:
for tagname in tagname_to_tagvalues.keys():
    perc = df_test[tagname].apply(lambda x: x!= []).sum()
    perc = perc/df_test.shape[0]
    print(f"{tagname}: {1-perc:.2f}")

sectors: 0.23
subpillars_2d: 0.25
subpillars_1d: 0.52
specific_needs_groups: 0.95
severity: 0.00
demographic_groups: 0.85
reliability: 0.01
affected_groups: 0.47
analysis_framework_id: 0.00


In [25]:
for tagname in tagname_to_tagvalues.keys():
    perc = df[tagname].apply(lambda x: x!= []).sum()
    perc = perc/df.shape[0]
    print(f"{tagname}: {1-perc:.2f}")

sectors: 0.22
subpillars_2d: 0.24
subpillars_1d: 0.49
specific_needs_groups: 0.95
severity: 0.00
demographic_groups: 0.85
reliability: 0.01
affected_groups: 0.46
analysis_framework_id: 0.00


In [29]:
tagnames = list(tagname_to_tagvalues.keys())
def merge_all_labels(x):
    res = []
    for t in x[tagnames[:-1]]:
        if isinstance(t, list):
            res.extend(t)
        elif isinstance(t, str):
            res.append(t)
    return res

df['target'] = df.progress_apply(merge_all_labels, axis=1)

  0%|          | 0/157920 [00:00<?, ?it/s]

In [31]:
df['target'].apply(lambda x: x==[]).sum()

38

In [32]:
df_bis = df[['entry_id', 'target', 'lead_id']].copy()

df_bis['count'] = df_bis['target'].apply(lambda x: len(x))

max_counts = df_bis[['lead_id', 'count']].groupby('lead_id',
                                                  as_index=False).max()
tagged_leads = max_counts[max_counts['count'] > 0]['lead_id'].tolist()

all_negative_ids = df_bis[
    df_bis.lead_id.isin(tagged_leads)
    & df_bis.target.apply(lambda x: len(x) == 0)].entry_id.unique().tolist()

In [33]:
df_tagged = df[~df.lead_id.isin(all_negative_ids)]
df_tagged.shape, df.shape

((157896, 159), (157920, 159))

In [34]:
df_tagged = df[~df.lead_id.isin(all_negative_ids)]
for tagname in tagname_to_tagvalues.keys():
    perc = df_tagged[tagname].apply(lambda x: x!= []).sum()
    perc = perc/df_tagged.shape[0]
    print(f"{tagname}: {1-perc:.2f}")

sectors: 0.22
subpillars_2d: 0.24
subpillars_1d: 0.49
specific_needs_groups: 0.95
severity: 0.00
demographic_groups: 0.85
reliability: 0.01
affected_groups: 0.46
analysis_framework_id: 0.00


In [35]:
df_train_tagged = df_train[df_train.index.isin(tagged_leads)]
for tagname in tagname_to_tagvalues.keys():
    perc = df_train_tagged[tagname].apply(lambda x: x!= []).sum()
    perc = perc/df_train_tagged.shape[0]
    print(f"{tagname}: {1-perc:.2f}")

sectors: 0.28
subpillars_2d: 0.28
subpillars_1d: 0.43
specific_needs_groups: 0.94
severity: 0.00
demographic_groups: 0.88
reliability: 0.03
affected_groups: 0.47
analysis_framework_id: 0.00


In [36]:
df_val_tagged = df_val[df_val.index.isin(tagged_leads)]
for tagname in tagname_to_tagvalues.keys():
    perc = df_val_tagged[tagname].apply(lambda x: x!= []).sum()
    perc = perc/df_val_tagged.shape[0]
    print(f"{tagname}: {1-perc:.2f}")

sectors: 0.25
subpillars_2d: 0.25
subpillars_1d: 0.47
specific_needs_groups: 0.96
severity: 0.00
demographic_groups: 0.89
reliability: 0.02
affected_groups: 0.45
analysis_framework_id: 0.00


In [37]:
df_test_tagged = df_test[df_test.index.isin(tagged_leads)]
for tagname in tagname_to_tagvalues.keys():
    perc = df_test_tagged[tagname].apply(lambda x: x!= []).sum()
    perc = perc/df_test_tagged.shape[0]
    print(f"{tagname}: {1-perc:.2f}")

sectors: 0.26
subpillars_2d: 0.26
subpillars_1d: 0.46
specific_needs_groups: 0.96
severity: 0.00
demographic_groups: 0.89
reliability: 0.03
affected_groups: 0.47
analysis_framework_id: 0.00
