In [7]:
import pandas as pd
import os
from ast import literal_eval
from tqdm import tqdm
import numpy as np

In [8]:
# Directory holding the v0.7.1 framework data, addressed relative to the
# current working directory (three levels up, then data/frameworks_data/).
_UP_THREE_LEVELS = (os.pardir,) * 3
DATA_PATH = os.path.join(
    *_UP_THREE_LEVELS, 'data', 'frameworks_data', 'data_v0.7.1'
)

In [9]:
# Load the full dataset (with its translation columns) and the test split,
# both stored as CSV files under DATA_PATH.
full_data, test_data = (
    pd.read_csv(os.path.join(DATA_PATH, csv_name))
    for csv_name in ('full_dataset_with_translations.csv', 'test_v0.7.1.csv')
)

In [10]:
# For every row, record which primary tag columns hold at least one tag.
# A cell counts as non-empty when it is longer than 2 characters, i.e. longer
# than a bare "[]" (assumes every cell is a stringified list - TODO confirm
# for 'sectors', which is never parsed in this notebook).
_PRIM_TAG_COLUMNS = ('sectors', 'subpillars_2d', 'subpillars_1d')


def _non_empty_tag_columns(row):
    """Return the names of the primary tag columns whose cell length exceeds 2."""
    return [name for name in _PRIM_TAG_COLUMNS if len(row[name]) > 2]


full_data['present_prim_tags'] = full_data.apply(_non_empty_tag_columns, axis=1)

In [11]:
# Derive the distinct 1D pillars of each row: every tag in the stringified
# `subpillars_1d` list is cut at its first '->' and the leading part is kept.
# np.unique de-duplicates and sorts the names; .tolist() then converts them to
# built-in str objects. Wrapping the array in list() instead would keep
# np.str_ scalars, whose repr under NumPy >= 2 is "np.str_('...')" and would
# end up inside the stringified list once this column is written to CSV.
full_data['pillars_1d'] = full_data.subpillars_1d.apply(
    lambda x: np.unique(
        [item.split('->')[0] for item in literal_eval(x)]
    ).tolist()
)

In [12]:
# Derive the distinct 2D pillars of each row: every tag in the stringified
# `subpillars_2d` list is cut at its first '->' and the leading part is kept.
# np.unique de-duplicates and sorts the names; .tolist() then converts them to
# built-in str objects. Wrapping the array in list() instead would keep
# np.str_ scalars, whose repr under NumPy >= 2 is "np.str_('...')" and would
# end up inside the stringified list once this column is written to CSV.
full_data['pillars_2d'] = full_data.subpillars_2d.apply(
    lambda x: np.unique(
        [item.split('->')[0] for item in literal_eval(x)]
    ).tolist()
)

In [13]:
# Keep only the 2D subpillar tags whose pillar (the text before the first
# '->') is Capacities & Response, Humanitarian Conditions or Impact.
_impact_capresp_humcond_pillars = {
    'Capacities & Response', 'Humanitarian Conditions', 'Impact'
}
full_data['impact_capresp_humcond'] = full_data['subpillars_2d'].apply(
    lambda raw_tags: [
        tag
        for tag in literal_eval(raw_tags)
        if tag.partition('->')[0] in _impact_capresp_humcond_pillars
    ]
)

In [14]:
# Keep only the 2D subpillar tags whose pillar (the text before the first
# '->') is At Risk, Priority Needs or Priority Interventions.
_need_intervention_risk_pillars = {
    'At Risk', 'Priority Needs', 'Priority Interventions'
}
full_data['need_intervention_risk'] = full_data['subpillars_2d'].apply(
    lambda raw_tags: [
        tag
        for tag in literal_eval(raw_tags)
        if tag.partition('->')[0] in _need_intervention_risk_pillars
    ]
)

In [15]:
# Keep only the 1D subpillar tags whose pillar (the text before the first
# '->') is Context or Covid-19.
_context_covid_pillars = {'Context', 'Covid-19'}
full_data['context_covid'] = full_data['subpillars_1d'].apply(
    lambda raw_tags: [
        tag
        for tag in literal_eval(raw_tags)
        if tag.partition('->')[0] in _context_covid_pillars
    ]
)

In [16]:
# Keep only the 1D subpillar tags whose pillar (the text before the first
# '->') is Displacement or Shock/Event.
_displacement_shockevent_pillars = {'Displacement', 'Shock/Event'}
full_data['displacement_shockevent'] = full_data['subpillars_1d'].apply(
    lambda raw_tags: [
        tag
        for tag in literal_eval(raw_tags)
        if tag.partition('->')[0] in _displacement_shockevent_pillars
    ]
)

In [17]:
# Keep only the 1D subpillar tags whose pillar (the text before the first
# '->') is Humanitarian Access, Information And Communication or Casualties.
_access_infcom_casualties_pillars = {
    'Humanitarian Access', 'Information And Communication', 'Casualties'
}
full_data['access_infcom_casualities'] = full_data['subpillars_1d'].apply(
    lambda raw_tags: [
        tag
        for tag in literal_eval(raw_tags)
        if tag.partition('->')[0] in _access_infcom_casualties_pillars
    ]
)

In [18]:
# Train/val pool: every row whose entry_id does not occur in the test split.
_in_test_split = full_data['entry_id'].isin(test_data['entry_id'])
train_val_full = full_data.loc[~_in_test_split]

In [19]:
# Build one (entry_id, excerpt) frame per translation column: the translated
# text is renamed to 'excerpt' and rows with a missing value are dropped.
en_df, fr_df, es_df = (
    train_val_full[['entry_id', translation_col]]
    .rename(columns={translation_col: 'excerpt'})
    .dropna()
    for translation_col in ('translation_en', 'translation_fr', 'translation_es')
)

# Stack the three language frames (English, French, Spanish order).
augmented_data = pd.concat([en_df, fr_df, es_df])

In [20]:
# Re-attach every non-text column of the train/val rows to the translated
# excerpts, matching on entry_id.
# NOTE(review): how='right' keeps every train/val entry, so an entry without
# any translation still yields a row with a missing excerpt - confirm this is
# intended rather than an inner join.
_entry_metadata = train_val_full.drop(
    columns=['excerpt', 'translation_en', 'translation_fr', 'translation_es']
)
augmented_data = augmented_data[['entry_id', 'excerpt']].merge(
    _entry_metadata,
    on='entry_id',
    how='right',
)

In [21]:
# Stack the original train/val rows on top of their translated copies, then
# drop the translation and language columns from the combined frame.
_columns_to_discard = ['translation_en', 'translation_fr', 'translation_es', 'lang']
train_val_df = pd.concat([train_val_full, augmented_data])
train_val_df = train_val_df.drop(columns=_columns_to_discard)

In [22]:
# Persist the combined train/val frame as 'new_columns_train_val.csv' under DATA_PATH.
# NOTE(review): to_csv is called without index=False, so the frame's index is
# written as an extra unnamed first column; train_val_df comes from a concat
# without ignore_index, so those index labels may repeat - confirm that
# downstream readers expect this.
train_val_df.to_csv(os.path.join(DATA_PATH, 'new_columns_train_val.csv'))

In [23]:
# Test split with the derived columns: rows of full_data whose entry_id occurs
# in the test file, minus the three translation columns.
_is_test_entry = full_data['entry_id'].isin(test_data['entry_id'])
_translation_columns = ['translation_en', 'translation_fr', 'translation_es']
test_df = full_data.loc[_is_test_entry].drop(columns=_translation_columns)

# Written with the default to_csv settings, so the index is saved as well.
test_df.to_csv(os.path.join(DATA_PATH, 'new_columns_test_v0.7.1.csv'))

In [18]:
# Display (n_rows, n_columns) of the combined train/val frame.
train_val_df.shape

(422327, 31)

In [19]:
# Display (n_rows, n_columns) of the test frame.
test_df.shape

(17202, 31)

In [9]:
# Load 'prim_tags.csv' from DATA_PATH.
# NOTE(review): no cell visible in this notebook writes this file, so it must
# already exist on disk - confirm where it is produced.
train_val_data = pd.read_csv(os.path.join(DATA_PATH, 'prim_tags.csv'))


In [16]:
# Count the rows whose 'sectors' cell is longer than 2 characters, i.e. longer
# than a bare "[]" (assumes the cells are stringified lists - TODO confirm).
(train_val_data['sectors'].map(len) > 2).sum()

326079

In [14]:
# Read back 'new_columns_train_val.csv', the file name that train_val_df is
# written to earlier in this notebook.
# NOTE(review): that file was saved with its index, so an extra unnamed index
# column is expected here - confirm.
tot_df = pd.read_csv(os.path.join(DATA_PATH, 'new_columns_train_val.csv'))


In [15]:
# After removing fully identical rows, count the rows whose 'sectors' cell is
# longer than 2 characters, i.e. longer than a bare "[]".
_unique_rows = tot_df.drop_duplicates()
(_unique_rows['sectors'].map(len) > 2).sum()

327001