## Ablation - modify (a,b), (c,d) to (a,d) and (c,b) 

In [40]:
import pandas as pd
import numpy as np

In [41]:
import random

In [42]:
data_path = "../../datasets/fb15k-237/"

In [43]:
relations_df = pd.read_csv(f"{data_path}/derived/relations_with_ids.csv")

In [44]:
freebase_vs_wiki_df =  pd.read_csv(f"{data_path}/fb_wiki_mapping.tsv", sep='\t')

In [45]:
triplets_with_src_dst_wiki_ids_df = pd.read_csv(f"{data_path}/derived/triplets_with_src_dst_wiki_ids.csv")

In [46]:
# add test split with empty labels
df_test = pd.read_csv(f"{data_path}/derived/triplets_with_src_dst_wiki_ids.test.csv")

In [65]:
def perturb_graph(df):
    n = len(df)
    num_errors = int(0.25*n)

    triple_ids = list(df.triple_id.values)
    # permute triple_ids and select top 2*num_errors
    # rather select small 
    selected_edges = random.sample(triple_ids, 2*num_errors)

    perturbed_rows_dicts = []
    for i in range(num_errors):
        edge_ab = int(selected_edges[2*i])
        edge_cd = int(selected_edges[2*i+1])
        if i < 5:
            print(f"swapping {edge_ab} and {edge_cd}")
    
        row_ab = df.loc[df['triple_id']==edge_ab]
        row_cd = df.loc[df['triple_id']==edge_cd]
    
        row_ab_dict = row_ab.to_dict(orient='records')[0]
        row_cd_dict = row_cd.to_dict(orient='records')[0]
    
        a = row_ab_dict['src_wikidata_id']
        b = row_ab_dict['dest_wikidata_id']
    
        c = row_cd_dict['src_wikidata_id']
        d = row_cd_dict['dest_wikidata_id']
    
        if i < 5:
            print(f"{a}->{b} , {c}->{d}")
    
        row_ad_new = row_ab_dict.copy()
        row_ad_new['dest_wikidata_id'] = d
    
        row_ad_new['freebase_id_y'] = row_cd_dict['freebase_id_y']
        row_ad_new['label_y'] = row_cd_dict['label_y']
        
        row_cb_new = row_cd_dict.copy()
        row_cb_new['dest_wikidata_id'] = b
    
        row_cb_new['freebase_id_y'] = row_ab_dict['freebase_id_y']
        row_cb_new['label_y'] = row_ab_dict['label_y']
    
        perturbed_rows_dicts.append(row_ad_new)
        perturbed_rows_dicts.append(row_cb_new)
    
    df_errors = pd.DataFrame.from_dict(perturbed_rows_dicts)
    print(len(df_errors))

    df_clean = df[~df['triple_id'].isin(selected_edges)]
    print(len(df_clean))

    # For train dataset
    df_errors['disapprove'] = 1
    df_clean['disapprove'] = 0
    
    return pd.concat([df_clean, df_errors])

In [66]:
df_train_perturbed = perturb_graph(triplets_with_src_dst_wiki_ids_df)

swapping 131286 and 250881
280658->8723 , 60->376880
swapping 168257 and 155864
151599->213 , 822946->884
swapping 15180 and 124388
71243->242729 , 455545->639669
swapping 201681 and 47172
128581->7212330 , 180224->220192
swapping 55061 and 1544
233868->3218669 , 336865->60
121818
120912


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['disapprove'] = 0


In [67]:
df_train_perturbed.to_csv(f"{data_path}/exp3/triplets_with_src_dst_wiki_ids.csv", index=False)

In [69]:
df_test_perturbed = perturb_graph(df_test)

swapping 283384 and 280120
155653->16 , 122563->811595
swapping 275023 and 275870
7414->132863 , 305250->213
swapping 289246 and 273087
8567->236 , 1215884->224164
swapping 273401 and 288610
240869->223316 , 6607->858388
swapping 285753 and 276360
34069->104000 , 937->70
9142
9144


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['disapprove'] = 0


In [70]:
df_test_perturbed.to_csv(f"{data_path}/exp3/triplets_with_src_dst_wiki_ids.test.csv", index=False)

In [62]:
# Evaluation on test set, do the same perturbation
# for them label to be empty per guidelines - https://kumo.ai/docs/examples/fraud-chargeback-abuse/

In [63]:
# permute as earlier
