In [75]:
import os
import pandas as pd
from ast import literal_eval
import random

In [87]:
def flatten(t):
    return [item for sublist in t for item in sublist]

DATA_PATH = os.path.join(
   '..', '..', '..', '..', "data", "frameworks_data", 'development_cca'
)

In [127]:
cca_data = (
    pd.read_csv(
        os.path.join(DATA_PATH, "cca_deep_projects_with_translations_v0.2.csv"),
        usecols=[
            "id",
            "lang",
            "en",
            "fr",
            "es",
            "pt",
            "title",
            "sectors",
            "2dpillars",
            "2dsubpillars",
            "1dpillars",
            "1dsubpillars",
        ],
    )
    .rename(
        columns={
            "1dpillars": "pillars_1d",
            "2dpillars": "pillars_2d",
            "1dsubpillars": "subpillars_1d",
            "2dsubpillars": "subpillars_2d",
            "id": "entry_id",
            "created_at": "creation_year",
            "lang": "original_language",
        }
    )
    .drop_duplicates()
)

classification_cols = [
    "sectors",
    "pillars_2d",
    "subpillars_2d",
    "pillars_1d",
    "subpillars_1d",
]

for col in classification_cols:
    if "pillar" not in col:
        cca_data[col] = cca_data[col].apply(lambda x: list(set(literal_eval(x))))
    else:
        cca_data[col] = cca_data[col].apply(literal_eval)

cca_data["subpillars_2d"] = cca_data.apply(
    lambda x: [
        f"{x['pillars_2d'][i]}->{x['subpillars_2d'][i]}"
        for i in range(len(x["subpillars_2d"]))
    ],
    axis=1,
)
cca_data["subpillars_1d"] = cca_data.apply(
    lambda x: [
        f"{x['pillars_1d'][i]}->{x['subpillars_1d'][i]}"
        for i in range(len(x["subpillars_1d"]))
    ],
    axis=1,
)

cca_data["pillars"] = cca_data.apply(
    lambda x: x["pillars_1d"] + x["pillars_2d"],
    axis=1,
)
cca_data["subpillars"] = cca_data.apply(
    lambda x: x["subpillars_1d"] + x["subpillars_2d"],
    axis=1,
)

#cca_data = cca_data.drop(columns=["pillars_2d", "pillars_1d"])
cca_data["target"] = cca_data.apply(
    lambda x: str(
        flatten(
            [
                x[col]
                for col in [
                    "sectors",
                    "subpillars_2d",
                    "subpillars_1d",
                    #"specific needs groups",
                    #"demographic groups",
                ]
            ]
        )
    ),
    axis=1,
)


In [120]:
cca_data.shape

(25442, 14)

In [121]:
cca_data.title.unique()

array(['CCA/DEEP Central African Republic', 'CCA/DEEP Ghana',
       'CCA/DEEP Somalia', 'CCA/DEEP Philippines', 'CCA/DEEP Ukraine',
       'CCA/DEEP Namibia'], dtype=object)

In [130]:
cca_data["sectors"] = cca_data.sectors.apply(
    lambda x: list(set([f"first_level_tags->sectors->{item}" for item in x]))
)

cca_data["pillars"] = cca_data.pillars.apply(
    lambda x: list(set([f"first_level_tags->pillars->{item}" for item in x]))
)

first_level_tags = [
    "sectors",
    "pillars"
]

cca_data["first_level_tags"] = cca_data.apply(
    lambda x: flatten([x[tag] for tag in first_level_tags]), axis=1
)

sorted(list(set(flatten(cca_data['first_level_tags']))))

['first_level_tags->pillars->COVID-19 Impact',
 'first_level_tags->pillars->High Level tags',
 'first_level_tags->pillars->Progress towards 2030 Agenda',
 'first_level_tags->pillars->Risks',
 'first_level_tags->pillars->Stakeholder/ Partnerships',
 'first_level_tags->sectors->Partnership',
 'first_level_tags->sectors->Peace',
 'first_level_tags->sectors->People',
 'first_level_tags->sectors->Planet',
 'first_level_tags->sectors->Prosperity']

In [131]:
cca_data["subpillars"] = cca_data.subpillars.apply(
    lambda x: list(set([f"subpillars->{item}" for item in x]))
)

sorted(list(set(flatten(cca_data['subpillars']))))

['subpillars->subpillars->COVID-19 Impact->Negative Coping Strategies',
 'subpillars->subpillars->COVID-19 Impact->Socio-Economic Impact',
 'subpillars->subpillars->COVID-19 Impact->Status of Recovery',
 'subpillars->subpillars->High Level tags->Capacities',
 'subpillars->subpillars->High Level tags->Challenges / Opportunities towards 2030 Agenda',
 'subpillars->subpillars->High Level tags->Information Gaps',
 'subpillars->subpillars->High Level tags->Progress towards 2030 Agenda',
 'subpillars->subpillars->High Level tags->Recommendations from Stakeholders',
 'subpillars->subpillars->Progress towards 2030 Agenda->Achievements / Situational Snapshot',
 'subpillars->subpillars->Progress towards 2030 Agenda->Challenges & Barriers',
 'subpillars->subpillars->Progress towards 2030 Agenda->Development Plans',
 'subpillars->subpillars->Progress towards 2030 Agenda->Enabling Factors',
 'subpillars->subpillars->Progress towards 2030 Agenda->National Capacities',
 'subpillars->subpillars->Risks

In [132]:
cca_data['target'] = cca_data.apply(
    lambda x: x['first_level_tags'] + x['subpillars'], axis=1
)

In [133]:
def get_deleted_duplicates_ids(df):
    # find if there are any duplicates
    counts = cca_data['en'].value_counts().rename('en_counts')

    treated_df_with_counts = cca_data.merge(counts.to_frame(),
                                    left_on='en',
                                    right_index=True)

    duplicates_df = treated_df_with_counts[treated_df_with_counts.en_counts>1]
    duplicates_df.drop(columns=['fr', 'es', 'pt'])

    duplicates_tmp = duplicates_df.copy()
    
    duplicates_tmp = duplicates_tmp[["en", "target"]].drop_duplicates()
    counts_tmp = duplicates_tmp['en'].value_counts().rename('en_counts').to_frame()

    kept_entries = list(counts_tmp[counts_tmp.en_counts==1].index)

    same_duplicates_df = duplicates_df[duplicates_df.en.isin(kept_entries)]
    kept_ids = same_duplicates_df.groupby('en').agg({'entry_id': lambda x: list(x)[0]}).entry_id.tolist()

    deleted_ids = list(set(duplicates_df.entry_id.tolist()) - set(kept_ids))

    return deleted_ids

In [134]:
deleted_ids = get_deleted_duplicates_ids(cca_data)
cca_data = cca_data[~cca_data.entry_id.isin(deleted_ids)]
cca_data.shape 

TypeError: unhashable type: 'list'

In [135]:
def custom_stratified_train_test_split(df, ratios):
    """
    custom function for stratified train test splitting
    1) take unique sub-tags (example: ['Health'])
    2) For each unique subtag:
        i) take all indexes that have that specific subtag
        ii) split them randomly to train and test sets
    """
    train_ids = []
    val_ids = []
    positive_df = df.copy()
    positive_df["target"] = positive_df["target"].apply(str)
    ids = positive_df.groupby("target")["entry_id"].agg(list).values
    unique_ids = [list(set(list_)) for list_ in ids]

    for ids_entry in unique_ids:

        train_ids_entry = random.sample(
            ids_entry, int(len(ids_entry) * ratios["train"])
        )
        val_ids_entry = list(set(ids_entry) - set(train_ids_entry))

        train_ids.append(train_ids_entry)
        val_ids.append(val_ids_entry)

    return flatten(train_ids), flatten(val_ids)

In [136]:
fr_df = cca_data[['entry_id', 'fr']]\
        .rename(columns={'fr':'excerpt'}).dropna()

en_df = cca_data[['entry_id', 'en']]\
        .rename(columns={'en':'excerpt'}).dropna()

es_df = cca_data[['entry_id', 'es']]\
        .rename(columns={'es':'excerpt'}).dropna()

pt_df = cca_data[['entry_id', 'pt']]\
        .rename(columns={'pt':'excerpt'}).dropna()

augmented_data = pd.concat([en_df, fr_df, es_df, pt_df])

augmented_data = pd.merge(
    right=cca_data.drop(columns=[
        'en', 'fr', 'es', 'pt']
                           ),
    left=augmented_data[['entry_id', 'excerpt']],
    on='entry_id',
    how='right'
)

ratios = {'train': 0.95, 'test': 0.05}
train_val_ids, test_ids = custom_stratified_train_test_split(cca_data, ratios)

train_val_data = augmented_data[augmented_data.entry_id.isin(train_val_ids)]
ratios = {'train': 0.9, 'test': 0.1}
train_ids, val_ids = custom_stratified_train_test_split(train_val_data, ratios)

test_data = augmented_data[augmented_data.entry_id.isin(test_ids)]
train_data = augmented_data[augmented_data.entry_id.isin(train_ids)]
val_data = augmented_data[augmented_data.entry_id.isin(val_ids)]

In [137]:
train_data.shape[0], val_data.shape[0], test_data.shape[0]

(74072, 12208, 15488)

In [138]:
train_val_data[['entry_id', 'excerpt', 'target']].to_csv(os.path.join(DATA_PATH, 'train_val_data.csv') , index=None)
test_data[['entry_id', 'excerpt', 'target']].to_csv(os.path.join(DATA_PATH, 'test_data.csv') , index=None)