In [37]:
import pandas as pd
from tqdm import tqdm
import pickle

columns = ["node_id"]

for i in range(128):
    columns.append("dim"+str(i+1))

emb = pd.read_csv('embeddings/node2vec/cora.emb', delim_whitespace=True, names=columns).sort_values(by=['node_id'], ascending=True)

dimensions = []
for i in range(128):
    dimensions.append("dim"+str(i+1))

nodes = []
for index, row in emb.iterrows():
    nodes.append(row["node_id"])

pairs = []
for node1 in nodes:
    for node2 in nodes:
        pairs.append([node1, node2])

pairs = pd.DataFrame.from_records(pairs)
pairs.columns=["node1", "node2"]

node_embeddings = {}
for index, row in emb.iterrows():
    node_embeddings[row["node_id"]] = [row[dim] for dim in dimensions]

edges = pd.read_csv('edgelists/cora.edgelist', delim_whitespace=True, names=["node1", "node2"])

In [33]:
# ones = [[1]] * len(edges)
# ones = pd.DataFrame.from_records(ones)
# ones.columns=["edge"]
# ones

Unnamed: 0,edge
0,1
1,1
2,1
3,1
4,1
...,...
5424,1
5425,1
5426,1
5427,1


In [42]:
edges

Unnamed: 0,node1,node2
0,35,1033
1,35,103482
2,35,103515
3,35,1050679
4,35,1103960
...,...,...
5424,853116,19621
5425,853116,853155
5426,853118,1140289
5427,853155,853118


In [43]:
pairs

Unnamed: 0,node1,node2
0,35.0,35.0
1,35.0,40.0
2,35.0,114.0
3,35.0,117.0
4,35.0,128.0
...,...,...
7333259,1155073.0,1154500.0
7333260,1155073.0,1154520.0
7333261,1155073.0,1154524.0
7333262,1155073.0,1154525.0


In [40]:
negative = pd.concat([edges, pairs]).drop_duplicates(keep=False)

In [82]:
negative = negative.sample(len(edges))
negative = negative.reset_index(drop=True)

In [76]:
ones = [[1]] * len(edges)
ones = pd.DataFrame.from_records(ones)
ones.columns=["edge"]

positive_edges = pd.concat([edges,ones], axis=1)

In [84]:
zeros = [[0]] * len(negative)
zeros = pd.DataFrame.from_records(zeros)
zeros.columns=["edge"]

negative_edges = pd.concat([negative,zeros], axis=1)

In [88]:
labeled_dataset = pd.concat([positive_edges, negative_edges])

In [93]:
labeled_dataset = labeled_dataset.sample(frac=1)
labeled_dataset['node1'] = labeled_dataset['node1'].map(node_embeddings)
labeled_dataset['node2'] = labeled_dataset['node2'].map(node_embeddings)

In [94]:
labeled_dataset

Unnamed: 0,node1,node2,edge
271,"[0.01682152, 0.036482435, -0.10163992, 0.17839...","[0.0266073, 0.01914338, -0.13107838, 0.2468097...",0
2006,"[0.0120655885, 0.030571472, -0.114680365, 0.18...","[0.034957357, 0.033103924, -0.07110345, 0.1556...",1
3108,"[0.09055799, 0.021777663, -0.062177338, 0.1894...","[0.030534532, 0.03675201, -0.09628818, 0.19239...",1
1552,"[-0.0036284549, 0.026947137, -0.15846787, 0.23...","[-0.00033028593, 0.028075011, -0.09929825, 0.2...",1
5287,"[-0.11126907, 0.049645636, -0.17567395, 0.7479...","[0.02974692, 0.017634995, -0.08224627, 0.15496...",0
...,...,...,...
897,"[0.033211906, 0.045847543, -0.106993936, 0.168...","[0.13800791, 0.22947067, -0.07696886, 0.179571...",0
2862,"[0.027448405, 0.042384673, -0.108271345, 0.209...","[0.0041442034, 0.036308967, -0.0942077, 0.2047...",1
655,"[0.042097338, 0.025081744, -0.12576607, 0.2173...","[0.030789873, 0.0019824253, -0.09344356, 0.222...",0
1057,"[0.028951548, 0.079896666, -0.15622036, 0.2238...","[0.036439802, 0.08807214, -0.12908758, 0.26387...",1


In [None]:
for index_edge, row_edge in tqdm(edges.iterrows(), total=edges.shape[0]):
    pair_row = pairs.loc[(pairs['node1'] == row_edge['node1']) & (pairs['node2'] == row_edge['node2'])]
    pairs.loc[pair_row.index, "edge"] = 1

labeled_dataset = pairs

labeled_dataset['node1'] = labeled_dataset['node1'].map(node_embeddings)
labeled_dataset['node2'] = labeled_dataset['node2'].map(node_embeddings)

with open('blogcat-labelled.pkl', 'wb') as f:  # Python 3: open(..., 'wb')
    pickle.dump(labeled_dataset,f)

In [5]:
import pandas as pd
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

labeled_dataset = pd.read_pickle('cora-labelled.pkl', compression='infer')
labeled_sample = labeled_dataset.sample(1000000)

training_set, validation_set = train_test_split(labeled_sample, test_size = 0.5, random_state = 21)

X_train = training_set[["node1", "node2"]]
Y_train = training_set[["edge"]]
X_val = validation_set[["node1", "node2"]]
Y_val = validation_set[["edge"]]

print("Splitting...")
X_train_split = pd.concat([pd.DataFrame(X_train['node1'].to_list()),pd.DataFrame(X_train['node2'].to_list())], axis=1)
# with open('airport-X_train_split.pkl', 'wb') as f:
#     pickle.dump(X_train_split,f)

logisticRegr = LogisticRegression()
logisticRegr.fit(X_train_split, Y_train.values.ravel())

X_val_split = pd.concat([pd.DataFrame(X_val['node1'].to_list()),pd.DataFrame(X_val['node2'].to_list())], axis=1)
# with open('airport-X_val_split.pkl', 'wb') as f:
#     pickle.dump(X_val_split,f)

Y_pred = logisticRegr.predict(X_val_split)

Splitting...


In [9]:
import numpy as np

In [10]:
result = np.where(Y_pred == 1)
result

(array([], dtype=int64),)

In [19]:
Ynp = Y_val.to_numpy()
result = np.where(Ynp == 1)
result

(array([  4305,   6775,   6991,   7469,   9576,  12613,  12683,  18430,
         19779,  20233,  21198,  22764,  25730,  27080,  30076,  30784,
         35207,  35420,  36289,  39443,  41143,  41390,  47305,  48031,
         49554,  50274,  50837,  51610,  51635,  52249,  53898,  56665,
         59546,  59609,  60251,  60546,  60620,  60813,  60890,  61787,
         64003,  65391,  66390,  67613,  69062,  69796,  70070,  72153,
         73481,  75179,  76928,  77211,  79940,  80363,  81060,  81649,
         81686,  83824,  88110,  97652,  99792, 104202, 108105, 108523,
        111550, 115581, 116534, 119015, 119221, 121151, 121283, 122445,
        123348, 123737, 124877, 125797, 125884, 127172, 127238, 129646,
        131070, 131798, 132958, 133442, 136330, 137509, 142321, 144329,
        144994, 145076, 145702, 145935, 146215, 147307, 148250, 148838,
        150463, 150625, 151167, 151346, 151512, 151606, 151845, 152926,
        158037, 158247, 158405, 159311, 159945, 160195, 161382, 

In [14]:
labeled_sample.loc[labeled_sample.edge==1]

Unnamed: 0,node1,node2,edge
746631,"[-0.056155603, -0.0409242, -0.16373701, 0.3488...","[-0.04251378, -0.022469688, -0.20103596, 0.345...",1
552614,"[0.14025748, 0.03694469, -0.28000578, 0.271357...","[0.043592736, 0.015466789, -0.1431284, 0.22636...",1
1088264,"[-0.00796651, 0.008717331, -0.12601106, 0.2215...","[0.008620377, 0.025781916, -0.061466355, 0.119...",1
766267,"[0.059516735, 0.05293871, -0.18680592, 0.22133...","[0.02890071, -0.013273703, -0.14632192, 0.2305...",1
337920,"[0.08806988, 0.11127243, 0.03586152, 0.0456522...","[0.03359271, 0.052589666, -0.10384312, 0.19832...",1
...,...,...,...
1562,"[0.049718346, 0.017121674, -0.10700412, 0.0593...","[0.050701216, 0.012143207, -0.24918853, 0.3414...",1
161010,"[0.08755281, -0.4173949, -0.22374095, 0.184246...","[0.08039461, -0.13120776, -0.19063137, 0.08743...",1
3334939,"[0.019068811, 0.027376289, -0.12192372, 0.2285...","[0.0044769943, 0.0067409165, -0.089271404, 0.2...",1
117251,"[0.09169634, 0.02237468, -0.15178661, 0.056438...","[0.053917963, -0.1275742, -0.16802923, 0.21672...",1


In [16]:
Y_train.loc[Y_train.edge==1]

Unnamed: 0,edge
1285287,1
281647,1
2124669,1
433331,1
2687797,1
...,...
193433,1
880,1
1846546,1
1014251,1


In [17]:
Y_val.loc[Y_val.edge==1]

Unnamed: 0,edge
2489572,1
127891,1
157520,1
1696487,1
7553,1
...,...
961851,1
406233,1
142395,1
215299,1


In [95]:
import pandas as pd
from tqdm import tqdm
import pickle

columns = ["node_id"]

for i in range(128):
    columns.append("dim"+str(i+1))

emb = pd.read_csv('embeddings/node2vec/cora.emb', delim_whitespace=True, names=columns).sort_values(by=['node_id'], ascending=True)

dimensions = []
for i in range(128):
    dimensions.append("dim"+str(i+1))

nodes = []
for index, row in emb.iterrows():
    nodes.append(row["node_id"])

pairs = []
for node1 in nodes:
    for node2 in nodes:
        pairs.append([node1, node2])

pairs = pd.DataFrame.from_records(pairs)
pairs.columns=["node1", "node2"]

node_embeddings = {}
for index, row in emb.iterrows():
    node_embeddings[row["node_id"]] = [row[dim] for dim in dimensions]

edges = pd.read_csv('edgelists/cora.edgelist', delim_whitespace=True, names=["node1", "node2"])

negative = pd.concat([edges, pairs]).drop_duplicates(keep=False)
negative = negative.sample(len(edges))
negative = negative.reset_index(drop=True)

ones = [[1]] * len(edges)
ones = pd.DataFrame.from_records(ones)
ones.columns=["edge"]
positive_edges = pd.concat([edges,ones], axis=1)

zeros = [[0]] * len(negative)
zeros = pd.DataFrame.from_records(zeros)
zeros.columns=["edge"]
negative_edges = pd.concat([negative,zeros], axis=1)

labeled_dataset = pd.concat([positive_edges, negative_edges])

labeled_dataset = labeled_dataset.sample(frac=1)
labeled_dataset['node1'] = labeled_dataset['node1'].map(node_embeddings)
labeled_dataset['node2'] = labeled_dataset['node2'].map(node_embeddings)

In [96]:
labeled_dataset

Unnamed: 0,node1,node2,edge
770,"[0.27945423, -0.09129877, -0.22053136, 0.33313...","[-0.0052444474, 0.02012093, -0.14122427, 0.121...",0
5219,"[0.08804947, 0.16897835, 0.08490682, 0.1910532...","[0.27409813, 0.5427561, 0.14548661, 0.4609664,...",1
2432,"[0.062199514, -0.07723861, -0.09374827, 0.1485...","[0.5856943, 0.36951804, -0.72969574, 1.3730264...",0
4624,"[0.035656866, 0.06126999, -0.07505023, 0.20985...","[0.039302018, 0.031640153, -0.11187381, 0.1350...",0
1222,"[0.091123484, -0.0029778294, -0.21096633, 0.10...","[0.011907396, 0.040668964, -0.106543854, 0.238...",0
...,...,...,...
5058,"[-0.040316377, 0.07048303, -0.12592244, 0.2272...","[0.052579906, -0.01971727, -0.13999438, 0.2022...",0
4285,"[-0.026079508, -0.024073303, -0.109025784, 0.3...","[-0.02103864, -0.02807019, -0.19095017, 0.2586...",1
856,"[-0.008264931, 0.021719176, -0.09764295, 0.238...","[0.068003766, -0.067958936, -0.30492997, 0.157...",0
4770,"[0.108664565, 0.011761833, -0.13443775, 0.1243...","[0.057032518, 0.064249866, -0.1413549, 0.19318...",1
