Split into train and test pairs based on connected components.
Ensures that there are no records referring to the same real world entity in train and test sets.


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
import networkx as nx
from collections import Counter
import random

pairs_fv= pd.read_csv(output_path+"pairs_fv.csv")

matching_pairs = pairs_fv[pairs_fv.label]
non_matching_pairs = pairs_fv[~pairs_fv.label]

print("Matching Pairs: ", matching_pairs.shape[0])
print("Non-Matching Pairs: ", non_matching_pairs.shape[0])


Graphtype = nx.Graph()
G = nx.from_pandas_edgelist(matching_pairs, source= 'source', target='target', create_using=Graphtype)

con_components = list(nx.connected_components(G))
subgraphs =  [G.subgraph(c).copy() for c in nx.connected_components(G)]
con_components_lengths = [len(x) for x in con_components]
print(Counter(con_components_lengths))
print(len(subgraphs))

random.Random(42).shuffle(con_components)
train_components = con_components[:int(0.7*len(con_components))]


test_components = con_components[int(0.7*len(con_components)):]

print("Components train: ", len(train_components))
print("Components test: ", len(test_components))

subgraph_train = [G.subgraph(c).copy() for c in train_components]
train_graph = nx.compose_all(subgraph_train)
subgraph_test = [G.subgraph(c).copy() for c in test_components]
test_graph = nx.compose_all(subgraph_test)

pairs_fv['train_or_test'] = 'not_assigned'
clean_train_neg=0
clean_test_neg=0
for ind, row in pairs_fv.iterrows():
    is_match = row['label']
    assigned=False
    if is_match and train_graph.has_edge(row.source,row.target):
        pairs_fv.at[ind, 'train_or_test']='train'
        assigned=True
    if is_match and test_graph.has_edge(row.source,row.target):
        if assigned: 
            import pdb;pdb.set_trace();
            print("Already assigned")
        pairs_fv.at[ind, 'train_or_test']='test'
        assigned=True
    if not(is_match) and train_graph.has_node(row.source) and train_graph.has_node(row.target):
        clean_train_neg +=1 
        pairs_fv.at[ind, 'train_or_test']='train'
        if assigned:
            import pdb;pdb.set_trace();
            print("Already assigned")
        assigned=True
    if not(is_match) and test_graph.has_node(row.source) and test_graph.has_node(row.target):
        clean_test_neg +=1 
        pairs_fv.at[ind, 'train_or_test']='test'
        if assigned: 
            import pdb;pdb.set_trace();
            print("Already assigned")
        assigned=True
    elif not(is_match):
        flip = random.randint(1, 10)
        if flip<=3: pairs_fv.at[ind, 'train_or_test']='test'
        else:  pairs_fv.at[ind, 'train_or_test']='train'
        

Matching Pairs:  4827
Non-Matching Pairs:  69680
Counter({2: 1005, 3: 309, 4: 224, 5: 131, 6: 74, 7: 53})
1796
Components train:  1257
Components test:  539


In [6]:
#save
print("positives distribution")
display(Counter(pairs_fv[pairs_fv.label]['train_or_test']))
print("negatives destribution")
display(Counter(pairs_fv[~pairs_fv.label]['train_or_test']))
print("clean train negatives:", clean_train_neg)
print("clean test negatives:", clean_test_neg)

train_fv= pairs_fv[pairs_fv.train_or_test=='train'].drop(columns=['train_or_test'])
test_fv= pairs_fv[pairs_fv.train_or_test=='test'].drop(columns=['train_or_test'])

train_fv.to_csv(output_path+"train_pairs_fv.csv", index= False)
test_fv.to_csv(output_path+"test_pairs_fv.csv", index= False)

positives distribution


Counter({'train': 3424, 'test': 1403})

negatives destribution


Counter({'train': 47307, 'test': 22373})

clean train negatives: 11451
clean test negatives: 1963
