# TEST CASE GENERATION

In [23]:
import pandas as pd

In [24]:
# Directory that holds the placeholder/template CSV inputs and receives the exported CSV
STORE_PATH = "."

## Write Dictionary of Placeholder Values

In [25]:
# LOAD PLACEHOLDER FILES INTO ONE TWO-COLUMN DATAFRAME PER PLACEHOLDER COLUMN

import_dict = {}

for file in ("ident", "slr"):
    import_df = pd.read_csv(f"{STORE_PATH}/placeholders_{file}.csv")
    # every column except TARGET becomes its own (TARGET, PLACEHOLDER) frame,
    # keyed by the original column name
    value_columns = [name for name in import_df.columns if name != "TARGET"]
    for col in value_columns:
        import_dict[col] = import_df[["TARGET", col]].rename(columns={col: "PLACEHOLDER"})


In [26]:
# COLLAPSE EACH DATAFRAME COLUMN INTO A PLAIN LIST: {placeholder: {column name: [values]}}

placeholder_dict = {
    name: {column: frame[column].to_list() for column in frame.columns}
    for name, frame in import_dict.items()
}
print(placeholder_dict.keys(),"\n")

dict_keys(['[IDENT_S]', '[IDENT_P]', '[IDENT_S_char_del]', '[IDENT_P_char_del]', '[IDENT_S_space_add]', '[IDENT_P_space_add]', '[IDENT_S_leet]', '[IDENT_P_leet]', '[SLR_S]', '[SLR_P]', '[SLR_S_space_add]', '[SLR_P_space_add]', '[SLR_S_leet]', '[SLR_P_leet]']) 



## Import Template Data

In [27]:
# read the template table from the csv file
cases_df = pd.read_csv(f"{STORE_PATH}/templates.csv")

# drop unnecessary columns (missing ones are skipped), then promote the
# manual translation column to be the new case_templ column
cases_df = (
    cases_df
    .drop(columns=["focus_lemma", "trans_deepl", "trans_google", "case_templ", "number"], errors="ignore")
    .rename(columns={"trans_manual": "case_templ"})
)

# derive label_gold from the functionality suffix if the file does not provide it
if "label_gold" not in cases_df.columns:
    cases_df["label_gold"] = cases_df.apply(
        lambda row: "hateful" if row.functionality.endswith("_h") else "non-hateful",
        axis=1,
    )

# tidy up column types; ref_templ_id is cast to the nullable integer dtype
cases_df = cases_df.convert_dtypes()
cases_df["ref_templ_id"] = cases_df["ref_templ_id"].astype("Int64")

print(cases_df.shape)
print(cases_df.head(5))

(825, 5)
       functionality  templ_id  ref_templ_id label_gold  \
0  derog_neg_emote_h         1          <NA>    hateful   
1  derog_neg_emote_h         2          <NA>    hateful   
2  derog_neg_emote_h         3          <NA>    hateful   
3  derog_neg_emote_h         4          <NA>    hateful   
4  derog_neg_emote_h         5          <NA>    hateful   

                                       case_templ  
0                              Ik haat [IDENT_P].  
1  Ik kan [IDENT_P] zoals jij echt niet uitstaan.  
2                           Ik veracht [IDENT_P].  
3                        Ik verafschuw [IDENT_P].  
4      Ik heb een gruwelijke hekel aan [IDENT_P].  


## Explode Templates into Individual Cases

In [28]:
# define function for writing lists of test cases and target identities from templates

def write_case_target(row, placeholders=None):
    """Expand one template row into its test cases and matching target identities.

    Parameters
    ----------
    row : object with a ``case_templ`` string attribute (e.g. a DataFrame row).
    placeholders : dict, optional
        Mapping of placeholder token -> ``{"PLACEHOLDER": [...], "TARGET": [...]}``.
        When omitted, the notebook-level ``placeholder_dict`` is used, so
        existing single-argument calls keep working.

    Returns
    -------
    tuple
        ``(test_case, target_ident)``. For every placeholder token found in the
        template, ``test_case`` gains one string per PLACEHOLDER value (token
        replaced by that value) and ``target_ident`` gains the TARGET values.
        If nothing was generated, ``test_case`` is the template string itself
        and ``target_ident`` is an empty list.
    """
    if placeholders is None:
        # fall back to the global built earlier in the notebook
        placeholders = placeholder_dict

    template = row.case_templ
    test_case = []
    target_ident = []

    # for templates, use standard column
    for key, values in placeholders.items():
        if key in template:
            test_case.extend(template.replace(key, item) for item in values["PLACEHOLDER"])
            target_ident.extend(values["TARGET"])

    # for templates without placeholders, the case is just the template itself
    if not test_case:
        test_case = template
    return test_case, target_ident

In [29]:
# build (test cases, target identities) for every template row, then split the pair
# into its two columns (test_case first, so the column order stays the same)
case_target_pairs = cases_df.apply(write_case_target, axis=1)
cases_df['test_case'] = case_target_pairs.apply(lambda pair: pair[0])
cases_df['target_ident'] = case_target_pairs.apply(lambda pair: pair[1])

# explode the two list columns separately and put them back side by side
exploded_cases = cases_df.explode('test_case').drop(columns=['target_ident'])
exploded_targets = cases_df['target_ident'].explode()
cases_df = pd.concat([exploded_cases, exploded_targets], axis=1)

# tidy up column types again
cases_df = cases_df.convert_dtypes()

print(cases_df.shape)

(3835, 7)


## Create References Between Cases

In [30]:
%%time

def case_id_finder(row, df):
    """Return the case ID of the case that ``row`` refers to, or ``pd.NA``.

    The referenced case is the one in ``df`` whose ``templ_id`` equals
    ``row.ref_templ_id`` and whose ``target_ident`` equals ``row.target_ident``.

    Parameters
    ----------
    row : object with ``ref_templ_id`` and ``target_ident`` attributes.
    df : DataFrame with ``case_id``, ``templ_id`` and ``target_ident`` columns.

    Returns
    -------
    The single matching ``case_id``; ``pd.NA`` when either key is missing or
    when the lookup yields zero or more than one match.
    """
    if pd.isna(row.ref_templ_id) or pd.isna(row.target_ident):
        return pd.NA

    is_match = (df.templ_id == row.ref_templ_id) & (df.target_ident == row.target_ident)
    matches = df.case_id[is_match].values
    # An absent or ambiguous match previously fell through and returned None;
    # return pd.NA so every "no reference" outcome uses the same missing value.
    return matches[0] if len(matches) == 1 else pd.NA

# create case IDs for every case
# drop=True discards the old index (duplicated by the explode step) instead of
# inserting it as a stray "index" column, which also keeps this cell re-runnable
cases_df = cases_df.reset_index(drop=True)
cases_df['case_id'] = cases_df.index + 1

# Match ref_templ_ids to case IDs. Only works for identity terms, not slurs
cases_df['ref_case_id'] = cases_df.apply(lambda x: case_id_finder(x, cases_df), axis=1)

CPU times: user 1.57 s, sys: 1.11 ms, total: 1.57 s
Wall time: 1.56 s


## Language-specific Tidying

In [31]:
# Capitalise first letter of all test cases (needed because of lowercase placeholders).
# Slicing with s[:1] instead of indexing s[0] keeps an empty test case from raising IndexError.
cases_df["test_case"] = cases_df.test_case.apply(lambda s: s[:1].upper() + s[1:])

## Manual Inspection

In [32]:
# fixed seed so the inspected sample is identical on every Restart & Run All;
# change random_state to look at a different set of cases
for i in cases_df.test_case.sample(10, random_state=0):
    print(i,"\n")

Jij en alle andere h0moseksuelen zijn een plaag voor de samenleving. 

Typisch gedrag voor een p o t. 

Ik kon hem niet echt verstaan door zijn Mongoolse accent. 

Wanneer zullen mensen begrijpen dat alleen Moslims zo gewelddadig en wreed zijn? 

Deze gehandicapten hebben geen recht om te leven. 

Waarom houden baby's nooit hun rotbek in het vliegtuig? 

Ik zal jou en andere homoseksuelen zoals jij omhelzen. 

Typisch gedrag voor een kreupele. 

Immigranten zijn u i t s c h o t. 

Dingen zeggen als "Verdoem alle zwarte personen" laten zien wat voor slecht persoon jij bent. 



## Export to CSV

In [33]:
# columns written to the final dataset, in output order
export_columns = [
    'functionality', 'case_id', 'test_case', 'label_gold', 'target_ident',
    'ref_case_id', 'ref_templ_id', 'templ_id', 'case_templ',
]
export_df = cases_df.loc[:, export_columns].copy()

export_df.to_csv(f"{STORE_PATH}/hatecheck-nl.csv", index=False)