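# Cut and Reformat Gold Standards

Balances each gold standard (equal numbers of matching and non-matching pairs), splits it 70/30 into train and test sets, and writes the results as header-less CSV files.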

In [1]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle


In [2]:
# Directory holding the raw gold standards and directory the Java project reads from.
path_gs_dir = '../Data/goldstandard/'
path_ir_gs = '../JavaProjectUsingWinter/data/goldstandard/'

# Inputs: the Wikidata (wd) spreadsheet and the endangered-species (es) CSV.
path_wd_input = f'{path_gs_dir}intermediates/gs_biodiversity_wikidata.xlsx'
path_es_input = f'{path_ir_gs}gs_biodiversity_endangeredSpecies.csv'

# Outputs: full / test / train variants for the Wikidata gold standard ...
path_wd_full_output = f'{path_ir_gs}gs_biodiversity_wikidata_full.csv'
path_wd_test_output = f'{path_ir_gs}gs_biodiversity_wikidata_test.csv'
path_wd_train_output = f'{path_ir_gs}gs_biodiversity_wikidata_train.csv'

# ... and for the endangered-species gold standard.
path_es_full_output = f'{path_ir_gs}gs_biodiversity_endangeredSpecies_full.csv'
path_es_test_output = f'{path_ir_gs}gs_biodiversity_endangeredSpecies_test.csv'
path_es_train_output = f'{path_ir_gs}gs_biodiversity_endangeredSpecies_train.csv'

In [3]:
# Both inputs are header-less three-column tables, so the column names are supplied here.
columns = ['id1', 'id2', 'match']
read_options = dict(header=None, names=columns)

df_wd = pd.read_excel(path_wd_input, **read_options)
df_es = pd.read_csv(path_es_input, **read_options)

# Quick sanity check: shape plus the first three rows of each gold standard.
# NOTE(review): the endangered-species preview shows a repeated pair - confirm whether
# duplicate rows in that input are intended.
for frame in (df_wd, df_es):
    display(frame.shape, frame.head(3))

(282, 3)

Unnamed: 0,id1,id2,match
0,BIO31710,http://www.wikidata.org/entity/q794337,True
1,BIO40264,http://www.wikidata.org/entity/q1770032,True
2,BIO11144,http://www.wikidata.org/entity/q941102,True


(385, 3)

Unnamed: 0,id1,id2,match
0,BIO05795,ES1416,True
1,BIO05795,ES1416,True
2,BIO07214,ES1627,True


In [4]:
def convertMatch(val):
    """Return the string 'TRUE' for a truthy ``val`` and 'FALSE' for a falsy one."""
    return 'TRUE' if val else 'FALSE'

def createGS(df, split=0.7, random_state=None):
    """Build a class-balanced gold standard and split it into train and test parts.

    Every label found in ``df['match']`` is downsampled to the size of the smaller
    of the True / False classes, the balanced rows are shuffled, and each class is
    then cut into a train part and a test part that do not share any row.

    Parameters
    ----------
    df : pandas.DataFrame
        Gold standard containing a ``match`` column; all other columns are
        carried through unchanged.
    split : float, default 0.7
        Fraction of each class assigned to the train part
        (``int(split * class_size)`` rows per class).
    random_state : int, numpy RandomState/Generator or None, default None
        Forwarded to ``DataFrame.sample`` for the downsampling and the shuffle.
        With ``None`` the draws are not reproducible unless the caller seeds
        NumPy's global random state.

    Returns
    -------
    (df_out_full, df_out_test, df_out_train) : tuple of pandas.DataFrame
        The balanced and shuffled full set, its test part and its train part.
        In all three the ``match`` column holds the strings produced by
        ``convertMatch``.

    Raises
    ------
    ValueError
        If ``split`` is outside ``[0, 1]``.
    """
    if not 0 <= split <= 1:
        raise ValueError(f'split must be between 0 and 1, got {split!r}')

    labels = df['match'].unique()

    # Element-wise comparison on purpose: count the rows per boolean class.
    nr_true = int((df['match'] == True).sum())
    nr_false = int((df['match'] == False).sum())
    nr_balanced_matches = min(nr_true, nr_false)

    if nr_balanced_matches == 0:
        # At least one class is missing (or df is empty): nothing can be balanced.
        empty = df.iloc[0:0].reset_index(drop=True)
        return empty.copy(), empty.copy(), empty.copy()

    # Downsample every class to the same size, then shuffle the combined rows.
    sampled = [
        df[df['match'] == label].sample(n=nr_balanced_matches, random_state=random_state)
        for label in labels
    ]
    df_out_full = (
        pd.concat(sampled)
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )

    nr_train = int(split * nr_balanced_matches)

    train_parts, test_parts = [], []
    for label in labels:
        df_label = df_out_full[df_out_full['match'] == label].reset_index(drop=True)
        # Positional, half-open slices: row `nr_train` belongs to the test part only.
        # A label-based `.loc[:nr_train]` would include it in the train part as well.
        train_parts.append(df_label.iloc[:nr_train])
        test_parts.append(df_label.iloc[nr_train:])

    df_out_train = pd.concat(train_parts)
    df_out_test = pd.concat(test_parts)

    # Convert the labels to their upper-case string form in all three outputs.
    df_out_full['match'] = df_out_full['match'].apply(convertMatch)
    df_out_test['match'] = df_out_test['match'].apply(convertMatch)
    df_out_train['match'] = df_out_train['match'].apply(convertMatch)

    return df_out_full, df_out_test, df_out_train


# The sampling and shuffling done by createGS draw from NumPy's global random state
# when no explicit random state is given; seed it so that re-running the notebook
# regenerates the same full / test / train files.
SEED = 42
np.random.seed(SEED)

df_out_full_wd, df_out_test_wd, df_out_train_wd = createGS(df_wd)
df_out_full_es, df_out_test_es, df_out_train_es = createGS(df_es)

In [5]:
def saveToCSV(df, path):
    """Write ``df`` to ``path`` as CSV, leaving out both the header row and the index."""
    df.to_csv(path, index=False, header=False)
    
# Write every generated frame to its configured output file, Wikidata first.
outputs = [
    (df_out_full_wd, path_wd_full_output),
    (df_out_test_wd, path_wd_test_output),
    (df_out_train_wd, path_wd_train_output),
    (df_out_full_es, path_es_full_output),
    (df_out_test_es, path_es_test_output),
    (df_out_train_es, path_es_train_output),
]
for frame, target in outputs:
    saveToCSV(frame, target)

In [6]:
# Inspect the balanced Wikidata gold standard; `match` now holds the 'TRUE'/'FALSE' strings.
df_out_full_wd

Unnamed: 0,id1,id2,match
0,BIO07840,http://www.wikidata.org/entity/q839349,FALSE
1,BIO06384,http://www.wikidata.org/entity/q1260070,TRUE
2,BIO06888,http://www.wikidata.org/entity/q579713,TRUE
3,BIO36966,http://www.wikidata.org/entity/q312119,TRUE
4,BIO29500,http://www.wikidata.org/entity/q76471,TRUE
...,...,...,...
277,BIO38864,http://www.wikidata.org/entity/q550382,TRUE
278,BIO38300,http://www.wikidata.org/entity/q1523894,TRUE
279,BIO00236,http://www.wikidata.org/entity/q2038905,FALSE
280,BIO40215,http://www.wikidata.org/entity/q2099113,FALSE


In [7]:
# Row counts of the two balanced gold standards, followed by their combined total.
wd_nr, es_nr = len(df_out_full_wd), len(df_out_full_es)
print(f'{wd_nr} {es_nr} {wd_nr + es_nr}')

282 236 518
