In [1]:
import os
from ast import literal_eval
from collections import Counter

import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
def flatten(t):
    """Concatenate the items of every sub-iterable of ``t`` into one list (one level deep)."""
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat

In [3]:
# Folder holding the v0.7.1 CSV files, three levels above the working directory.
DATA_PATH = os.path.join(*(['..'] * 3), "data", "frameworks_data", 'data_v0.7.1')

In [4]:
# Input tables, all read from DATA_PATH:
#   full_data    - the full dataset file (with translations)
#   test_data    - the test split file
#   modified_age - the train and val "gender_snorkel" files stacked on top of each
#                  other (despite its name, it is loaded from the gender files)
full_data = pd.read_csv(os.path.join(DATA_PATH, 'full_dataset_with_translations.csv'))
test_data = pd.read_csv(os.path.join(DATA_PATH, 'test_v0.7.1.csv'))
modified_age = pd.concat([
    pd.read_csv(os.path.join(DATA_PATH, snorkel_file))
    for snorkel_file in (
        'train_v0.7.1_gender_snorkel.csv',
        'val_v0.7.1_gender_snorkel.csv',
    )
])

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [5]:
# Attach the columns of `modified_age` to every entry. `full_data` is the
# right-hand table of a right join, so all of its rows are kept; entries with no
# match in `modified_age` get NaN in the columns that table contributes.
# NOTE: this rebinds `full_data`, so re-running the cell merges a second time.
full_data = modified_age.merge(full_data, on='entry_id', how='right')

In [6]:
# Column inventory of `full_data` after the merge.
full_data.columns

Index(['entry_id', 'gender_snorkel', 'excerpt', 'analysis_framework_id',
       'lead_id', 'project_id', 'verified', 'sectors', 'subpillars_2d',
       'subpillars_1d', 'geo_location', 'specific_needs_groups', 'severity',
       'info_date', 'reliability', 'affected_groups_level_0',
       'affected_groups_level_1', 'affected_groups_level_2',
       'affected_groups_level_3', 'age', 'gender', 'source_type', 'url',
       'website', 'lang', 'translation_en', 'translation_fr',
       'translation_es'],
      dtype='object')

In [7]:
# For each entry, list which primary-tag columns are non-empty.
# The length test (> 2) presumably relies on the columns still holding their raw
# CSV text at this point, where an empty list is the two characters "[]" — the
# columns are only parsed with literal_eval in the following cell.
full_data['present_prim_tags'] = full_data.apply(
    lambda row: [
        name
        for name in ('sectors', 'subpillars_2d', 'subpillars_1d')
        if len(row[name]) > 2
    ],
    axis=1,
)

In [8]:
def _parse_mapped_tags(raw):
    """Return the tag list held in ``raw`` without its 'NOT_MAPPED' placeholders.

    ``raw`` is either the text form of a list (as read from the CSV) or a list
    that has already been parsed; accepting the latter makes it safe to run
    this cell more than once.
    """
    tags = raw if isinstance(raw, list) else literal_eval(raw)
    return [tag for tag in tags if tag != 'NOT_MAPPED']


def _parent_pillars(subpillars):
    """Return the sorted, de-duplicated prefixes (text before '->') of ``subpillars``.

    Plain ``str`` values are returned rather than NumPy scalars, so the text
    form of the list (which is what ``to_csv`` writes) remains a list literal
    that ``literal_eval`` can read back regardless of the NumPy version.
    """
    return sorted({subpillar.split('->')[0] for subpillar in subpillars})


# Parse the three primary-tag columns and drop the 'NOT_MAPPED' placeholder.
for _tag_column in ('sectors', 'subpillars_2d', 'subpillars_1d'):
    full_data[_tag_column] = full_data[_tag_column].apply(_parse_mapped_tags)

# Pillar = the part of a sub-pillar tag that precedes '->'.
full_data['pillars_1d'] = full_data.subpillars_1d.apply(_parent_pillars)
full_data['pillars_2d'] = full_data.subpillars_2d.apply(_parent_pillars)

# One flat list per entry, each tag prefixed with the column it came from
# (sectors first, then 2D pillars, then 1D pillars).
full_data['prim_tags_level1'] = full_data.apply(
    lambda row: [
        f"{column_name}->{tag}"
        for column_name in ('sectors', 'pillars_2d', 'pillars_1d')
        for tag in row[column_name]
    ],
    axis=1,
)

In [9]:
# Share of entries that have at least one primary-tag column present.
full_data['present_prim_tags'].apply(lambda present: len(present) > 0).sum() / len(full_data)

0.9523520101297879

In [10]:
# Preview of the flattened, column-prefixed primary tags built above.
full_data['prim_tags_level1']

0         [sectors->Health, pillars_2d->Capacities & Res...
1         [sectors->Education, pillars_2d->Humanitarian ...
2         [sectors->Nutrition, sectors->Food Security, p...
3         [pillars_1d->Displacement, pillars_1d->Humanit...
4                                     [pillars_1d->Context]
                                ...                        
157945                                                   []
157946                                                   []
157947                                                   []
157948                                                   []
157949                                                   []
Name: prim_tags_level1, Length: 157950, dtype: object

In [11]:
# First bucket of 2D sub-pillars.
SUBPILLARS_2D_PART1 = (
    'Humanitarian Conditions->Living Standards',
    'Humanitarian Conditions->Physical And Mental Well Being',
    'Impact->Impact On Systems, Services And Networks',
    'Capacities & Response->International Response',
    'Impact->Driver/Aggravating Factors',
    'Impact->Impact On People',
    'At Risk->Risk And Vulnerabilities',
)

# Keep, per entry and in their original order, only the tags of this bucket.
full_data['subpillars_2d_part1'] = full_data.subpillars_2d.apply(
    lambda tags: [tag for tag in tags if tag in SUBPILLARS_2D_PART1]
)

In [12]:
# Second bucket of 2D sub-pillars.
SUBPILLARS_2D_PART2 = (
    'Capacities & Response->National Response',
    'Priority Interventions->Expressed By Humanitarian Staff',
    'Humanitarian Conditions->Coping Mechanisms',
    'Capacities & Response->Number Of People Reached/Response Gaps',
    'Priority Needs->Expressed By Population',
    'Impact->Number Of People Affected',
    'Priority Needs->Expressed By Humanitarian Staff',
    'Humanitarian Conditions->Number Of People In Need',
    'Priority Interventions->Expressed By Population',
    'Capacities & Response->Local Response',
    'At Risk->Number Of People At Risk',
)

# Keep, per entry and in their original order, only the tags of this bucket.
full_data['subpillars_2d_part2'] = full_data.subpillars_2d.apply(
    lambda tags: [tag for tag in tags if tag in SUBPILLARS_2D_PART2]
)

In [13]:
# First bucket of 1D sub-pillars.
SUBPILLARS_1D_PART1 = (
    'Displacement->Type/Numbers/Movements',
    'Context->Security & Stability',
    'Covid-19->Restriction Measures',
    'Covid-19->Cases',
    'Context->Economy',
    'Shock/Event->Hazard & Threats',
    'Casualties->Dead',
    'Covid-19->Deaths',
    'Context->Demography',
    'Displacement->Local Integration',
)

# Keep, per entry and in their original order, only the tags of this bucket.
full_data['subpillars_1d_part1'] = full_data.subpillars_1d.apply(
    lambda tags: [tag for tag in tags if tag in SUBPILLARS_1D_PART1]
)

In [14]:
# Second bucket of 1D sub-pillars.
SUBPILLARS_1D_PART2 = (
    'Context->Legal & Policy',
    'Context->Politics',
    'Displacement->Push Factors',
    'Covid-19->Vaccination',
    'Shock/Event->Underlying/Aggravating Factors',
    'Context->Socio Cultural',
    'Humanitarian Access->Physical Constraints',
    'Covid-19->Testing',
    'Shock/Event->Type And Characteristics',
    'Context->Environment',
    'Humanitarian Access->Relief To Population',
)

# Keep, per entry and in their original order, only the tags of this bucket.
full_data['subpillars_1d_part2'] = full_data.subpillars_1d.apply(
    lambda tags: [tag for tag in tags if tag in SUBPILLARS_1D_PART2]
)

In [15]:
# Third bucket of 1D sub-pillars.
SUBPILLARS_1D_PART3 = (
    'Information And Communication->Information Challenges And Barriers',
    'Information And Communication->Communication Means And Preferences',
    'Information And Communication->Knowledge And Info Gaps (Pop)',
    'Casualties->Missing',
    'Humanitarian Access->Population To Relief',
    'Displacement->Pull Factors',
    'Covid-19->Hospitalization & Care',
    'Displacement->Intentions',
    'Covid-19->Contact Tracing',
    'Casualties->Injured',
    'Information And Communication->Knowledge And Info Gaps (Hum)',
    'Humanitarian Access->Number Of People Facing Humanitarian Access Constraints/Humanitarian Access Gaps',
)

# Keep, per entry and in their original order, only the tags of this bucket.
full_data['subpillars_1d_part3'] = full_data.subpillars_1d.apply(
    lambda tags: [tag for tag in tags if tag in SUBPILLARS_1D_PART3]
)

In [16]:
def custom_eval(x):
    """Turn a cell value into a list.

    A value whose text form is 'nan' becomes an empty list, a list is handed
    back unchanged, and anything else is parsed with ``literal_eval``.
    """
    if str(x) == 'nan':
        return []
    return x if type(x) is list else literal_eval(x)
    
# Expose the level-0 affected groups under the shorter name 'affected_groups'.
# The values are copied as they are; no parsing is applied on this line.
full_data['affected_groups'] = full_data['affected_groups_level_0']

In [17]:
# Run custom_eval over each of these columns, replacing the column with the result.
for list_column in ('age', 'gender', 'gender_snorkel', 'specific_needs_groups'):
    full_data[list_column] = full_data[list_column].apply(custom_eval)

In [18]:
# For each entry, list which secondary-tag columns hold a non-empty list.
# custom_eval is applied to every value before its length is taken.
full_data['present_sec_tags'] = full_data.apply(
    lambda row: [
        name
        for name in (
            'specific_needs_groups',
            'affected_groups',
            'age',
            'gender_snorkel',
            'gender',
        )
        if len(custom_eval(row[name])) > 0
    ],
    axis=1,
)


In [19]:
# Maps a bare group label to its "<parent>->label" form.
# 'NOT_MAPPED' maps to itself.
affected_groups_level_2_3_merger = {
    "Asylum Seekers": "Displaced->Asylum Seekers",
    "Host": "Non Displaced->Host",
    "Non Host": "Non Displaced->Non Host",
    "IDP": "Displaced->IDP",
    "In Transit": "Displaced->In Transit",
    "Migrants": "Displaced->Migrants",
    "NOT_MAPPED": "NOT_MAPPED",
    "Others of Concern": "Displaced->Others of Concern",
    "Pendular": "Displaced->Pendular",
    "Permanent": "Displaced->Permanent",
    "Refugees": "Displaced->Refugees",
    "Returnees": "Displaced->Returnees",
    "Stateless": "Displaced->Stateless",
}

In [20]:
def custom_eval(x):
    """Turn a cell value into a list.

    This definition replaces any earlier ``custom_eval`` in the notebook, so it
    keeps the missing-value guard: without it, every later call on a NaN cell
    would fail inside ``literal_eval``.

    Args:
        x: a list, the text form of a list, or a missing value (NaN).

    Returns:
        ``[]`` when the text form of ``x`` is 'nan', ``x`` itself when it is
        already a list, otherwise the result of ``literal_eval(x)``.

    Raises:
        ValueError / SyntaxError: propagated from ``literal_eval`` when ``x``
        is not a valid Python literal.
    """
    if str(x) == 'nan':
        return []
    if isinstance(x, list):
        return x
    return literal_eval(x)
    
# Copy of the level-3 column in which missing values are replaced by an empty list.
full_data['affected_groups_level_3_nona'] = full_data['affected_groups_level_3'].apply(
    lambda raw: raw if str(raw) != 'nan' else []
)

# Translate each label through affected_groups_level_2_3_merger, skipping the
# literal string 'None'. A label absent from the mapping raises KeyError.
full_data['affected_groups_levels_2_3'] = full_data['affected_groups_level_3_nona'].apply(
    lambda groups: [
        affected_groups_level_2_3_merger[group]
        for group in custom_eval(groups)
        if group != 'None'
    ]
)

In [22]:
# Exploration snippets wrapped in a bare string literal: nothing inside is
# executed, and running the cell only echoes the text back as its output.
"""np.unique(flatten(full_data.affected_groups_level_0.dropna().apply(literal_eval).tolist()))
np.unique(flatten(full_data.affected_groups_level_1.dropna().apply(literal_eval).tolist()))
np.unique(flatten(full_data.affected_groups_level_2.dropna().apply(literal_eval).tolist()))
np.unique(flatten(full_data['affected_groups_levels_2_3'].apply(custom_eval).tolist()))"""

"np.unique(flatten(full_data.affected_groups_level_0.dropna().apply(literal_eval).tolist()))\nnp.unique(flatten(full_data.affected_groups_level_1.dropna().apply(literal_eval).tolist()))\nnp.unique(flatten(full_data.affected_groups_level_2.dropna().apply(literal_eval).tolist()))\nnp.unique(flatten(full_data['affected_groups_levels_2_3'].apply(custom_eval).tolist()))"

In [23]:
# Everything whose entry_id is not listed in the test split.
train_val_full = full_data.loc[~full_data['entry_id'].isin(test_data['entry_id'])]


In [24]:
# One frame per translation column: (entry_id, excerpt) pairs where the
# translation text takes the place of the excerpt; rows with a missing value
# are dropped.
en_df, fr_df, es_df = (
    train_val_full[['entry_id', translation_column]]
    .rename(columns={translation_column: 'excerpt'})
    .dropna()
    for translation_column in ('translation_en', 'translation_fr', 'translation_es')
)

# Stacked in the order English, French, Spanish.
augmented_data = pd.concat([en_df, fr_df, es_df])

In [25]:
# Give every translated excerpt the remaining columns of its entry (the
# original excerpt and the translation columns are left out of the right table).
# NOTE(review): with how='right', an entry that has no translated excerpt at all
# still yields one row, with a missing 'excerpt' — confirm that this is wanted.
augmented_data = augmented_data[['entry_id', 'excerpt']].merge(
    train_val_full.drop(
        columns=['excerpt', 'translation_en', 'translation_fr', 'translation_es']
    ),
    on='entry_id',
    how='right',
)

In [26]:
# Original rows followed by their translated copies; the translation columns and
# 'lang' are removed from the combined table.
train_val_df = pd.concat([train_val_full, augmented_data]).drop(
    ['translation_en', 'translation_fr', 'translation_es', 'lang'],
    axis='columns',
)

In [27]:
# Write the combined train/val table into DATA_PATH.
# `index=False` is the documented way to leave the row index out of the file
# (the previous `index=None` only worked because None is falsy).
train_val_df.to_csv(os.path.join(DATA_PATH, 'new_columns_train_val.csv'), index=False)

In [28]:
# Test split: the entries whose id is listed in `test_data`, without the
# translation columns; `gender_snorkel` is overwritten with the `gender` column.
# NOTE(review): 'lang' is kept here but dropped from the train/val export, so the
# two CSV files do not share the same columns — confirm that this is intended.
test_df = (
    full_data[full_data.entry_id.isin(test_data.entry_id)]
    .drop(columns=['translation_en', 'translation_fr', 'translation_es'])
    .assign(gender_snorkel=lambda frame: frame['gender'])
)
# `index=False` is the documented way to leave the row index out of the file
# (the previous `index=None` only worked because None is falsy).
test_df.to_csv(os.path.join(DATA_PATH, 'new_columns_test_v0.7.1.csv'), index=False)

In [33]:
# (rows, columns) of the exported train/val table.
train_val_df.shape

(422353, 36)

In [34]:
# (rows, columns) of the exported test table.
test_df.shape

(17202, 37)

In [36]:
# Column inventory of the exported train/val table.
train_val_df.columns

Index(['entry_id', 'gender_snorkel', 'excerpt', 'analysis_framework_id',
       'lead_id', 'project_id', 'verified', 'sectors', 'subpillars_2d',
       'subpillars_1d', 'geo_location', 'specific_needs_groups', 'severity',
       'info_date', 'reliability', 'affected_groups_level_0',
       'affected_groups_level_1', 'affected_groups_level_2',
       'affected_groups_level_3', 'age', 'gender', 'source_type', 'url',
       'website', 'present_prim_tags', 'pillars_1d', 'pillars_2d',
       'subpillars_2d_part1', 'subpillars_2d_part2', 'subpillars_1d_part1',
       'subpillars_1d_part2', 'subpillars_1d_part3', 'affected_groups',
       'present_sec_tags', 'affected_groups_level_3_nona',
       'affected_groups_levels_2_3'],
      dtype='object')

0

In [18]:
# Number of entries carrying each 2D pillar (an entry counts once per pillar in its list).
# `Counter` comes from `collections`, imported in the notebook's import cell.
dict(Counter(flatten(full_data['pillars_2d'])))

{'Capacities & Response': 21492,
 'Humanitarian Conditions': 46467,
 'At Risk': 11245,
 'Impact': 35473,
 'Priority Needs': 3538,
 'Priority Interventions': 5946,
 'NOT_MAPPED': 20044}