In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [7]:
columns = ["node_id"]

for i in range(128):
    columns.append("dim"+str(i+1))

cora_emb = pd.read_csv('embeddings/node2vec/cora.emb', delim_whitespace=True, names=columns).sort_values(by=['node_id'], ascending=True)

In [8]:
dimensions = []
for i in range(128):
    dimensions.append("dim"+str(i+1))

In [9]:
nodes = []
for index, row in cora_emb.iterrows():
    nodes.append(row["node_id"])

In [10]:
pairs = []
for node1 in nodes:
    for node2 in nodes:
        pairs.append([node1, node2, 0])

In [11]:
pairs = pd.DataFrame.from_records(pairs)

In [12]:
pairs.columns=["node1", "node2", "edge"]
pairs

Unnamed: 0,node1,node2,edge
0,35.0,35.0,0
1,35.0,40.0,0
2,35.0,114.0,0
3,35.0,117.0,0
4,35.0,128.0,0
...,...,...,...
7333259,1155073.0,1154500.0,0
7333260,1155073.0,1154520.0,0
7333261,1155073.0,1154524.0,0
7333262,1155073.0,1154525.0,0


In [13]:
node_embeddings = {}
for index, row in cora_emb.iterrows():
    node_embeddings[row["node_id"]] = [row[dim] for dim in dimensions]

In [14]:
cora_edges = pd.read_csv('edgelists/cora.edgelist', delim_whitespace=True, names=["node1", "node2"])

In [15]:
cora_edges

Unnamed: 0,node1,node2
0,35,1033
1,35,103482
2,35,103515
3,35,1050679
4,35,1103960
...,...,...
5424,853116,19621
5425,853116,853155
5426,853118,1140289
5427,853155,853118


In [16]:
for index_edge, row_edge in tqdm(cora_edges.iterrows(), total=cora_edges.shape[0]):
    pair_row = pairs.loc[(pairs['node1'] == row_edge['node1']) & (pairs['node2'] == row_edge['node2'])]
    pairs.loc[pair_row.index, "edge"] = 1

100%|██████████| 5429/5429 [06:01<00:00, 15.01it/s]


In [17]:
labeled_dataset = pairs

In [18]:
labeled_dataset

Unnamed: 0,node1,node2,edge
0,35.0,35.0,0
1,35.0,40.0,0
2,35.0,114.0,0
3,35.0,117.0,0
4,35.0,128.0,0
...,...,...,...
7333259,1155073.0,1154500.0,0
7333260,1155073.0,1154520.0,0
7333261,1155073.0,1154524.0,0
7333262,1155073.0,1154525.0,0


In [19]:
labeled_dataset['node1'] = labeled_dataset['node1'].map(node_embeddings)
labeled_dataset['node2'] = labeled_dataset['node2'].map(node_embeddings)

In [20]:
labeled_dataset

Unnamed: 0,node1,node2,edge
0,"[0.049718346, 0.017121674, -0.10700412, 0.0593...","[0.049718346, 0.017121674, -0.10700412, 0.0593...",0
1,"[0.049718346, 0.017121674, -0.10700412, 0.0593...","[0.10003702, 0.1625829, -0.032033097, 0.253132...",0
2,"[0.049718346, 0.017121674, -0.10700412, 0.0593...","[0.050540783, 0.046772532, -0.10019523, 0.2197...",0
3,"[0.049718346, 0.017121674, -0.10700412, 0.0593...","[0.02804115, 0.014030917, -0.08220115, 0.22559...",0
4,"[0.049718346, 0.017121674, -0.10700412, 0.0593...","[0.09402196, -0.01262425, -0.09879128, 0.35140...",0
...,...,...,...
7333259,"[-0.10776496, -0.06716024, -0.51714647, 0.6913...","[0.013109966, 0.0022050457, -0.035664182, 0.05...",0
7333260,"[-0.10776496, -0.06716024, -0.51714647, 0.6913...","[0.012644181, 0.000939504, -0.038704194, 0.080...",0
7333261,"[-0.10776496, -0.06716024, -0.51714647, 0.6913...","[0.059567977, 0.023053506, -0.119109556, 0.226...",0
7333262,"[-0.10776496, -0.06716024, -0.51714647, 0.6913...","[0.040649313, 0.037171587, -0.08658076, 0.1742...",0


In [21]:
labeled_dataset.loc[labeled_dataset["edge"]==1]

Unnamed: 0,node1,node2,edge
13,"[0.049718346, 0.017121674, -0.10700412, 0.0593...","[0.016674066, 0.016503723, -0.23380418, 0.3222...",1
21,"[0.049718346, 0.017121674, -0.10700412, 0.0593...","[0.038677208, 0.053517453, -0.09404679, 0.1733...",1
31,"[0.049718346, 0.017121674, -0.10700412, 0.0593...","[0.09174433, 0.054115757, 0.00805601, 0.292683...",1
42,"[0.049718346, 0.017121674, -0.10700412, 0.0593...","[0.06267538, -0.0022035136, -0.105001695, 0.11...",1
186,"[0.049718346, 0.017121674, -0.10700412, 0.0593...","[0.027513662, 0.012208716, -0.085136905, 0.158...",1
...,...,...,...
5072412,"[0.04776552, -0.10905557, -0.10127855, 0.17384...","[0.11933631, 0.018491413, -0.050032403, 0.0992...",1
5073960,"[0.04776552, -0.10905557, -0.10127855, 0.17384...","[-0.061998803, -0.2530384, -0.17531598, 0.1260...",1
5077378,"[-0.09576055, -0.26426095, -0.1826489, 0.19715...","[-0.016698055, -0.29936907, -0.20215443, 0.165...",1
5082082,"[-0.061998803, -0.2530384, -0.17531598, 0.1260...","[-0.09576055, -0.26426095, -0.1826489, 0.19715...",1


In [22]:
import pickle

In [23]:
with open('cora-labelled.pkl', 'wb') as f:  # Python 3: open(..., 'wb')
    pickle.dump(labeled_dataset,f)

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pickle
import pandas as pd

In [25]:
labeled_dataset = pd.read_pickle('cora-labelled.pkl', compression='infer')

In [26]:
labeled_dataset

Unnamed: 0,node1,node2,edge
0,"[0.049718346, 0.017121674, -0.10700412, 0.0593...","[0.049718346, 0.017121674, -0.10700412, 0.0593...",0
1,"[0.049718346, 0.017121674, -0.10700412, 0.0593...","[0.10003702, 0.1625829, -0.032033097, 0.253132...",0
2,"[0.049718346, 0.017121674, -0.10700412, 0.0593...","[0.050540783, 0.046772532, -0.10019523, 0.2197...",0
3,"[0.049718346, 0.017121674, -0.10700412, 0.0593...","[0.02804115, 0.014030917, -0.08220115, 0.22559...",0
4,"[0.049718346, 0.017121674, -0.10700412, 0.0593...","[0.09402196, -0.01262425, -0.09879128, 0.35140...",0
...,...,...,...
7333259,"[-0.10776496, -0.06716024, -0.51714647, 0.6913...","[0.013109966, 0.0022050457, -0.035664182, 0.05...",0
7333260,"[-0.10776496, -0.06716024, -0.51714647, 0.6913...","[0.012644181, 0.000939504, -0.038704194, 0.080...",0
7333261,"[-0.10776496, -0.06716024, -0.51714647, 0.6913...","[0.059567977, 0.023053506, -0.119109556, 0.226...",0
7333262,"[-0.10776496, -0.06716024, -0.51714647, 0.6913...","[0.040649313, 0.037171587, -0.08658076, 0.1742...",0


In [27]:
training_set, validation_set = train_test_split(labeled_dataset, test_size = 0.5, random_state = 21)

In [28]:
X_train = training_set[["node1", "node2"]]
Y_train = training_set[["edge"]]
X_val = validation_set[["node1", "node2"]]
Y_val = validation_set[["edge"]]

In [29]:
X_val

Unnamed: 0,node1,node2
7121352,"[-0.0036314505, 0.03355813, -0.066738226, 0.27...","[0.0149269095, 0.029361753, -0.14075868, 0.180..."
4655171,"[-0.107240416, 0.027921598, -0.09334998, 0.280...","[0.07802603, 0.12697582, 0.03943992, -0.006449..."
4024277,"[0.068003766, -0.067958936, -0.30492997, 0.157...","[0.022536533, 0.04097203, -0.08608644, 0.16932..."
3730060,"[0.0058743665, 0.02687376, -0.08828144, 0.1739...","[0.029191626, 0.021335661, -0.07608572, 0.1594..."
1585776,"[0.038796052, 0.0418337, -0.12889533, 0.191343...","[-0.02103864, -0.02807019, -0.19095017, 0.2586..."
...,...,...
1262785,"[0.019746494, 0.039092906, -0.120412245, 0.173...","[0.044477746, 0.032185547, -0.14004585, 0.1793..."
3950919,"[-0.07643254, -0.022629486, -0.18565226, 0.311...","[0.029919442, 0.0148792155, -0.062127583, 0.10..."
737323,"[-0.016018555, -0.026522364, -0.132579, 0.3664...","[-0.086224645, 0.094683096, -0.6987286, 1.1129..."
5452211,"[0.009246149, 0.0010840088, -0.05304365, 0.076...","[-0.11697028, -0.007520222, -0.18937013, 0.211..."


In [58]:
X_train

Unnamed: 0,node1,node2
2471465,"[0.028648688, 0.009297014, -0.09225189, 0.2060...","[0.051186897, 0.001078017, -0.15021047, 0.2270..."
167523,"[0.09138024, -0.4578878, -0.26863584, 0.377749...","[0.035193503, 0.03204156, -0.09881593, 0.22268..."
5137758,"[-0.05146452, -0.1333054, -0.5529091, 0.691860...","[0.031913318, 0.031520072, -0.06453789, 0.1265..."
2751041,"[-0.020355348, -0.037732005, 0.15861453, 0.338...","[-0.038505025, -0.08984008, -0.09109558, 0.267..."
6344691,"[0.07439266, -0.023156188, -0.068742216, 0.202...","[0.0029483652, 0.054076955, -0.105595544, 0.19..."
...,...,...
4258160,"[0.15614457, 0.1087287, -0.16523542, 0.3564600...","[0.1051597, 0.019845676, -0.09455794, 0.188343..."
1130544,"[-0.040316377, 0.07048303, -0.12592244, 0.2272...","[-0.01624268, 0.031974223, -0.09564581, 0.2225..."
664324,"[0.042837206, 0.16202438, -0.09681678, 0.21246...","[-0.003891104, 0.0640967, -0.100903735, 0.1912..."
202552,"[-0.0030463585, 0.13365391, -0.21486509, 0.309...","[0.0372443, 0.011462707, -0.07036886, 0.131608..."


In [32]:
from sklearn.linear_model import LogisticRegression

In [33]:
logisticRegr = LogisticRegression()

In [59]:
X_train_split = pd.concat([pd.DataFrame(X_train['node1'].to_list()),pd.DataFrame(X_train['node2'].to_list())], axis=1)
with open('X_train_split.pkl', 'wb') as f:
    pickle.dump(X_train_split,f)

In [None]:
X_train_split

In [None]:
logisticRegr.fit(X_train_split, Y_train.values.ravel())

Error: Session cannot generate requests

In [55]:
X_val_split = pd.concat([pd.DataFrame(X_val['node1'].to_list()),pd.DataFrame(X_val['node2'].to_list())], axis=1)

In [None]:
X_val_split

In [57]:
logisticRegr.predict(X_val_split)

array([0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 1])

In [None]:
import pandas as pd
from tqdm import tqdm
import pickle

columns = ["node_id"]

for i in range(128):
    columns.append("dim"+str(i+1))

emb = pd.read_csv('embeddings/node2vec/blogcat.emb', delim_whitespace=True, names=columns).sort_values(by=['node_id'], ascending=True)

Error: Session cannot generate requests