## Ablation - modify (a,b), (c,d) to (a,d) and (c,b) 

In [1]:
import pandas as pd
import numpy as np
import glob

In [2]:
import random

In [2]:
data_path = "../../datasets/fb15k-237/"

In [4]:
relations_df = pd.read_csv(f"{data_path}/derived/relations_with_ids.csv")

In [5]:
freebase_vs_wiki_df =  pd.read_csv(f"{data_path}/fb_wiki_mapping.tsv", sep='\t')

In [3]:
test = True

df_test = pd.read_csv(f"{data_path}/derived/triplets_with_src_dst_wiki_extracts.test.csv")

In [5]:
file_list = glob.glob(f"{data_path}/derived/triplets_with_src_dst_wiki_extracts.*0*.csv")
df_list = [pd.read_csv(file) for file in file_list]

triplets_with_src_dst_wiki_ids_df = pd.concat(df_list, ignore_index=True)

triplets_with_src_dst_wiki_ids_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 239880 entries, 0 to 239879
Data columns (total 16 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   Unnamed: 0             239880 non-null  int64 
 1   src_freebase_id        239880 non-null  object
 2   relation               239880 non-null  object
 3   dest_freebase_id       239880 non-null  object
 4   triple_id              239880 non-null  int64 
 5   timestamp              239880 non-null  object
 6   relation_name          239880 non-null  object
 7   relation_id            239880 non-null  int64 
 8   src_wikidata_label     239880 non-null  object
 9   src_wikidata_id        239880 non-null  int64 
 10  dest_wikidata_label    239880 non-null  object
 11  dest_wikidata_id       239880 non-null  int64 
 12  src_wikidata_title     239880 non-null  object
 13  src_wikidata_extract   239880 non-null  object
 14  dest_wikidata_title    239880 non-null  object
 15  

In [8]:
# triplets_with_src_dst_wiki_ids_df = pd.read_csv(f"{data_path}/derived/triplets_with_src_dst_wiki_extracts.csv.gz")
# triplets_with_src_dst_wiki_ids_df.info()

In [11]:
def perturb_graph(df):
    n = len(df)
    num_errors = int(0.25*n)

    triple_ids = list(df.triple_id.values)
    # permute triple_ids and select top 2*num_errors
    # rather select small 
    selected_edges = random.sample(triple_ids, 2*num_errors)

    perturbed_rows_dicts = []
    for i in range(num_errors):
        edge_ab = int(selected_edges[2*i])
        edge_cd = int(selected_edges[2*i+1])
        if i < 5:
            print(f"swapping {edge_ab} and {edge_cd}")
    
        row_ab = df.loc[df['triple_id']==edge_ab]
        row_cd = df.loc[df['triple_id']==edge_cd]
    
        row_ab_dict = row_ab.to_dict(orient='records')[0]
        row_cd_dict = row_cd.to_dict(orient='records')[0]
    
        a = row_ab_dict['src_wikidata_id']
        b = row_ab_dict['dest_wikidata_id']
    
        c = row_cd_dict['src_wikidata_id']
        d = row_cd_dict['dest_wikidata_id']
    
        if i < 5:
            print(f"{a}->{b} , {c}->{d}")
    
        row_ad_new = row_ab_dict.copy()
        row_ad_new['dest_wikidata_id'] = d
    
        row_ad_new['dest_wikidata_title'] = row_cd_dict['dest_wikidata_title']
        row_ad_new['dest_wikidata_label'] = row_cd_dict['dest_wikidata_label']
        row_ad_new['dest_wikidata_extract'] = row_cd_dict['dest_wikidata_extract']
        
        row_cb_new = row_cd_dict.copy()
        row_cb_new['dest_wikidata_id'] = b
    
        row_cb_new['dest_wikidata_title'] = row_ab_dict['dest_wikidata_title']
        row_cb_new['dest_wikidata_label'] = row_ab_dict['dest_wikidata_label']
        row_ad_new['dest_wikidata_extract'] = row_cd_dict['dest_wikidata_extract']
    
        perturbed_rows_dicts.append(row_ad_new)
        perturbed_rows_dicts.append(row_cb_new)
    
    df_errors = pd.DataFrame.from_dict(perturbed_rows_dicts)
    print(len(df_errors))

    df_clean = df[~df['triple_id'].isin(selected_edges)]
    print(len(df_clean))

    # For train dataset
    df_errors['disapprove'] = 1
    df_clean['disapprove'] = 0
    
    return pd.concat([df_clean, df_errors])

In [12]:
df_train_perturbed = perturb_graph(triplets_with_src_dst_wiki_ids_df)

swapping 55643 and 152715
922402->48337 , 46717->884
swapping 170009 and 119076
316709->244333 , 393686->924339
swapping 236138 and 194859
369292->163 , 3312129->218
swapping 244999 and 194663
191084->728488 , 463313->674456
swapping 2539 and 163696
180338->518675 , 2487->80702
119940
119407


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['disapprove'] = 0


In [14]:
# df_train_perturbed.to_csv(f"{data_path}/exp4/triplets_with_src_dst_wiki_extracts.csv", index=False)

In [11]:
# df_train_perturbed = pd.read_csv(f"{data_path}/exp4/triplets_with_src_dst_wiki_extracts.csv.gz")

In [12]:
prefix = f"{data_path}/exp4/triplets_with_src_dst_wiki_extracts"

In [13]:
# split file into segments to overcome git limitations with large files

# split file into 10k chunks each
k = 10000
df = df_train_perturbed

num_rows = len(df)
for i in range(0, num_rows, k):
    chunk = df.iloc[i:i + k]
    print(len(chunk))
    filename = f"{prefix}.{i}.csv"
    chunk.to_csv(filename, index=False)

10000
10000
10000
10000
10000
10000
10000
10000
10000
10000
10000
10000
10000
10000
10000
10000
10000
10000
10000
10000
10000
10000
10000
9347


In [15]:
df_test_perturbed = perturb_graph(df_test)

swapping 279059 and 286411
220376->4500 , 915845->182015
swapping 283248 and 283794
168504->816 , 76717->1198
swapping 286282 and 281807
168010->8311 , 237072->319221
swapping 272467 and 275231
372514->182015 , 374065->28389
swapping 285623 and 289822
105031->334 , 108745->180645
8970
8970


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['disapprove'] = 0


In [16]:
df_test_perturbed.to_csv(f"{data_path}/exp4/triplets_with_src_dst_wiki_extracts.test.csv", index=False)

In [62]:
# Evaluation on test set, do the same perturbation
# for them label to be empty per guidelines - https://kumo.ai/docs/examples/fraud-chargeback-abuse/

In [31]:
triplets_with_src_dst_wiki_ids_df.loc[triplets_with_src_dst_wiki_ids_df.triple_id.isin( [2539, 163696])]

# [55643, 152715]
# [170009, 119076]
# [236138, 194859]
# [244999, 194663]
# [2539, 163696]


Unnamed: 0.1,Unnamed: 0,src_freebase_id,relation,dest_freebase_id,triple_id,timestamp,relation_name,relation_id,src_wikidata_label,src_wikidata_id,dest_wikidata_label,dest_wikidata_id,src_wikidata_title,src_wikidata_extract,dest_wikidata_title,dest_wikidata_extract
2240,2240,/m/0f0kz,/award/award_nominee/award_nominations./award/...,/m/09sb52,2539,2024-11-10,/award/award_nominee/award_nominations./award/...,19,Christopher Lee,180338,Screen Actors Guild Award for Outstanding Perf...,518675,Christopher Lee,Sir Christopher Frank Carandini Lee (27 May 19...,Screen Actors Guild Award for Outstanding Perf...,The Screen Actors Guild Award for Outstanding ...
144228,144228,/m/07j9n,/base/culturalevent/event/entity_involved,/m/01s47p,163696,2024-10-06,/base/culturalevent/event/entity_involved,88,Thirty Years' War,2487,Spanish Empire,80702,Thirty Years' War,"The Thirty Years' War, fought primarily in Cen...",Spanish Empire,"The Spanish Empire, sometimes referred to as t..."


In [23]:
df_test.loc[df_test.triple_id.isin([285623, 289822])]
# [279059, 286411]

# [283248, 283794]
# [286282, 281807]
# [272467, 275231]
# [285623, 289822]


Unnamed: 0.1,Unnamed: 0,src_freebase_id,relation,dest_freebase_id,timestamp,relation_name,relation_id,src_wikidata_label,src_wikidata_id,dest_wikidata_label,dest_wikidata_id,triple_id,src_wikidata_title,src_wikidata_extract,dest_wikidata_title,dest_wikidata_extract
13243,13243,/m/01fmys,/film/film/release_date_s./film/film_regional_...,/m/06t2t,2025-01-02,/film/film/release_date_s./film/film_regional_...,13,Home Alone,105031,Singapore,334,285623,Home Alone,Home Alone is a 1990 American Christmas comedy...,Singapore,"Singapore, officially the Republic of Singapor..."
17368,17368,/m/01d2v1,/film/film/film_production_design_by,/m/0fqjks,2025-01-02,/film/film/film_production_design_by,184,Ghostbusters,108745,"John DeCuir, Sr.",180645,289822,Ghostbusters,Ghostbusters is a 1984 American supernatural c...,John DeCuir,"John DeCuir (June 4, 1918 – October 29, 1991) ..."
