In [1]:
import random
from ast import literal_eval

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
# Register tqdm's pandas integration (adds `progress_apply`-style methods to
# pandas objects). NOTE(review): no cell visible here calls `progress_apply`;
# confirm this registration is still needed.
tqdm.pandas()

In [3]:
# Seed both the stdlib and the NumPy global generators with one shared value.
# The later `DataFrame.sample(...)` calls pass no `random_state`, so they draw
# from NumPy's global state; re-run this cell before re-running them to
# reproduce the same draws.
SEED = 2021
random.seed(SEED)
np.random.seed(SEED)

In [4]:
# Load the train and validation splits (train first, then val) from the
# working directory.
df_train, df_val = (
    pd.read_csv(csv_path)
    for csv_path in ("train_0.7.1_kw_gender.csv", "val_0.7.1_kw_gender.csv")
)

In [5]:
def _parse_label_list(value):
    """Turn one raw cell of a label column into a Python list.

    - a list is returned unchanged, so re-running this cell on frames that
      were already converted does not fail inside `literal_eval`;
    - NaN (a missing cell) becomes an empty list;
    - anything else is passed to `literal_eval`, which raises on input it
      cannot parse.
    """
    if isinstance(value, list):
        return value
    if value != value:  # NaN is the only value that is not equal to itself
        return []
    return literal_eval(value)


# Presumably these columns hold the string repr of a list of labels in the
# CSV (later cells test `'Male' in x`) -- TODO confirm against the data.
cols = ["gender", "gender_kw_pred"]
for col in cols:
    for df in [df_train, df_val]:
        # Column assignment mutates df_train / df_val in place.
        df[col] = df[col].apply(_parse_label_list)

In [6]:
# Keep only the training rows whose `gender` value equals `gender_kw_pred`
# (element-wise `==`, so list order matters), working on a copy so the new
# columns below do not write into a slice of the loaded frame.
train_labels_agree = df_train['gender'] == df_train['gender_kw_pred']
df_train = df_train[train_labels_agree].copy()

# One 0/1 indicator column per label: 1 when the label is in the row's list.
for column, label in (('male', 'Male'), ('female', 'Female')):
    df_train[column] = df_train['gender'].apply(
        lambda tags, label=label: int(label in tags))

In [7]:
# Keep only the validation rows whose `gender` value equals `gender_kw_pred`
# (element-wise `==`, so list order matters), working on a copy so the new
# columns below do not write into a slice of the loaded frame.
val_labels_agree = df_val['gender'] == df_val['gender_kw_pred']
df_val = df_val[val_labels_agree].copy()

# One 0/1 indicator column per label: 1 when the label is in the row's list.
for column, label in (('male', 'Male'), ('female', 'Female')):
    df_val[column] = df_val['gender'].apply(
        lambda tags, label=label: int(label in tags))

In [8]:
# Per label, pair the positive rows with an equally sized random draw of
# negative rows. The draw size is clamped to the number of available
# negatives: `sample(n=...)` without replacement raises ValueError when asked
# for more rows than exist, which would happen whenever positives outnumber
# negatives. When the clamp does not bind, the draw is the same as before.
# No `random_state` is passed, so the draws depend on NumPy's global seed.
df_train_male_pos = df_train[df_train["male"].eq(1)]
_train_male_neg_pool = df_train[df_train["male"].eq(0)]
df_train_male_neg = _train_male_neg_pool.sample(
    n=min(len(df_train_male_pos), len(_train_male_neg_pool)))
##
df_train_female_pos = df_train[df_train["female"].eq(1)]
_train_female_neg_pool = df_train[df_train["female"].eq(0)]
df_train_female_neg = _train_female_neg_pool.sample(
    n=min(len(df_train_female_pos), len(_train_female_neg_pool)))

In [9]:
# Per label, pair the positive rows with an equally sized random draw of
# negative rows. The draw size is clamped to the number of available
# negatives: `sample(n=...)` without replacement raises ValueError when asked
# for more rows than exist, which would happen whenever positives outnumber
# negatives. When the clamp does not bind, the draw is the same as before.
# No `random_state` is passed, so the draws depend on NumPy's global seed.
df_val_male_pos = df_val[df_val["male"].eq(1)]
_val_male_neg_pool = df_val[df_val["male"].eq(0)]
df_val_male_neg = _val_male_neg_pool.sample(
    n=min(len(df_val_male_pos), len(_val_male_neg_pool)))
##
df_val_female_pos = df_val[df_val["female"].eq(1)]
_val_female_neg_pool = df_val[df_val["female"].eq(0)]
df_val_female_neg = _val_female_neg_pool.sample(
    n=min(len(df_val_female_pos), len(_val_female_neg_pool)))

In [10]:
# Male label: stack the positives on top of the sampled negatives, then
# shuffle every row (`frac=1.` returns all rows in random order).
# Train is shuffled before val, so the global RNG is consumed in that order.
df_train_male_balanced = pd.concat(
    [df_train_male_pos, df_train_male_neg]).sample(frac=1.)
##
df_val_male_balanced = pd.concat(
    [df_val_male_pos, df_val_male_neg]).sample(frac=1.)

In [11]:
# Female label: stack the positives on top of the sampled negatives, then
# shuffle every row (`frac=1.` returns all rows in random order).
# Train is shuffled before val, so the global RNG is consumed in that order.
df_train_female_balanced = pd.concat(
    [df_train_female_pos, df_train_female_neg]).sample(frac=1.)
##
df_val_female_balanced = pd.concat(
    [df_val_female_pos, df_val_female_neg]).sample(frac=1.)

### Merge

In [12]:
# Merge the two balanced training sets. A row can be in both (for example a
# male positive that was also drawn as a female negative), so keep a single
# row per `entry_id`; `drop_duplicates` keeps the first occurrence, i.e. the
# one from the female set.
train_parts = [df_train_female_balanced, df_train_male_balanced]
df_train_final = pd.concat(train_parts).drop_duplicates(subset=['entry_id'])

# Displayed: merged shape, then the positive count after the merge next to
# the count in the per-label balanced set, for male and then for female.
(df_train_final.shape,
 df_train_final["male"].sum(),
 df_train_male_balanced["male"].sum(),
 df_train_final["female"].sum(),
 df_train_female_balanced["female"].sum())

((14517, 10), 2205, 2205, 6132, 6132)

In [13]:
# Merge the two balanced validation sets. A row can be in both (for example a
# male positive that was also drawn as a female negative), so keep a single
# row per `entry_id`; `drop_duplicates` keeps the first occurrence, i.e. the
# one from the female set.
val_parts = [df_val_female_balanced, df_val_male_balanced]
df_val_final = pd.concat(val_parts).drop_duplicates(subset=['entry_id'])

# Displayed: merged shape, then the positive count after the merge next to
# the count in the per-label balanced set, for male and then for female.
(df_val_final.shape,
 df_val_final["male"].sum(),
 df_val_male_balanced["male"].sum(),
 df_val_final["female"].sum(),
 df_val_female_balanced["female"].sum())

((1396, 10), 226, 226, 584, 584)

In [15]:
# Write the merged train / val sets to CSV in the working directory.
# `index=False` is the documented way to omit the row index; the previous
# `index=None` only worked because None is falsy.
df_train_final.to_csv("df_train_balanced_gender.csv", index=False)
df_val_final.to_csv("df_val_balanced_gender.csv", index=False)