In [3]:
import pandas as pd
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [4]:
labeled_dataset = pd.read_pickle('cora-labelled.pkl', compression='infer')
labeled_sample = labeled_dataset.sample(1000000)

training_set, validation_set = train_test_split(labeled_sample, test_size = 0.5, random_state = 21)

X_train = training_set[["node1", "node2"]]
Y_train = training_set[["edge"]]
X_val = validation_set[["node1", "node2"]]
Y_val = validation_set[["edge"]]

In [5]:
X_train_split = pd.read_pickle('X_train_split.pkl', compression='infer')
X_val_split = pd.read_pickle('X_val_split.pkl', compression='infer')

In [6]:
logisticRegr = LogisticRegression()
logisticRegr.fit(X_train_split, Y_train.values.ravel())

LogisticRegression()

In [7]:
Y_pred = logisticRegr.predict(X_val_split)

In [8]:
score = logisticRegr.score(X_val_split, Y_val)

In [9]:
score

0.999308

In [10]:
from sklearn.metrics import confusion_matrix

In [11]:
cm = confusion_matrix(Y_pred, Y_val)

In [12]:
def accuracy(confusion_matrix):
   diagonal_sum = confusion_matrix.trace()
   sum_of_all_elements = confusion_matrix.sum()
   return diagonal_sum / sum_of_all_elements

In [13]:
print(f"Accuracy of Link Prediction :{accuracy(cm)}")

Accuracy of Link Prediction :0.999308


In [1]:
import pandas as pd
from tqdm import tqdm
import pickle

columns = ["node_id"]

for i in range(128):
    columns.append("dim"+str(i+1))

emb = pd.read_csv('embeddings/node2vec/blogcat.emb', delim_whitespace=True, names=columns).sort_values(by=['node_id'], ascending=True)

In [3]:
emb = emb.sample(5000)

In [4]:
emb

Unnamed: 0,node_id,dim1,dim2,dim3,dim4,dim5,dim6,dim7,dim8,dim9,...,dim119,dim120,dim121,dim122,dim123,dim124,dim125,dim126,dim127,dim128
1527,2066,0.052074,-0.429777,0.215035,0.199201,0.206047,0.114223,0.241008,0.088216,0.157998,...,0.156034,-0.068149,0.009941,-0.208166,-0.300069,0.156915,-0.137772,-0.049340,-0.317405,0.112589
64,6950,0.022492,0.108998,-0.071975,0.168154,0.421048,0.230287,0.006817,-0.375041,0.184413,...,0.103182,-0.200783,-0.250971,0.191417,-0.109690,0.199817,-0.238718,0.160501,-0.118080,-0.282179
6884,10256,0.126145,-0.152589,0.060298,0.032160,0.188941,-0.384820,-0.408019,-0.172406,0.091635,...,-0.166827,0.243925,0.247281,0.281483,0.020363,-0.218705,-0.113981,0.273330,0.094058,-0.024915
9682,9214,0.249940,0.222552,-0.150548,-0.021193,-0.209373,-0.070403,0.040943,-0.291925,-0.189060,...,0.705777,-0.019503,-0.049223,-0.047085,-0.197041,0.105464,-0.315041,0.045481,-0.013897,0.060941
5651,3491,0.155980,-0.241860,0.146430,0.211675,0.151393,-0.249648,-0.658807,0.320382,-0.143971,...,0.155417,-0.181873,-0.062734,-0.282360,0.137320,-0.035122,-0.163720,0.082120,-0.126070,0.394341
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1593,-0.058861,-0.317037,0.050183,0.431725,0.139852,0.213652,0.268744,0.105128,0.013668,...,0.107539,-0.083106,0.041312,-0.514029,0.035096,0.134221,-0.067839,0.171409,-0.169376,-0.011365
6369,352,0.124507,-0.015024,-0.110246,0.224241,0.678592,-0.108148,-0.057218,-0.077695,-0.050989,...,-0.079311,-0.257443,-0.149401,0.027160,-0.085700,0.253258,-0.057287,-0.246299,-0.217165,0.312101
5350,4299,0.169887,-0.337741,-0.016692,-0.250782,-0.020639,-0.386869,0.346148,-0.297523,-0.153134,...,0.207065,0.068094,-0.405422,0.077596,0.117720,-0.243272,0.049892,-0.026708,-0.183935,0.223543
6482,4895,0.464559,-0.104694,0.151273,-0.451966,0.239214,-0.173488,0.367109,-0.022892,0.293441,...,-0.346831,-0.297553,0.109841,0.243500,0.079850,-0.058421,-0.244479,-0.380137,-0.100433,-0.264750


In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

columns = ["node_id"]

for i in range(128):
    columns.append("dim"+str(i+1))

emb = pd.read_csv('embeddings/node2vec/airport.emb', delim_whitespace=True, names=columns).sort_values(by=['node_id'], ascending=True)

dimensions = []
for i in range(128):
    dimensions.append("dim"+str(i+1))

nodes = []
for index, row in emb.iterrows():
    nodes.append(row["node_id"])

pairs = []
for node1 in nodes:
    for node2 in nodes:
        pairs.append([node1, node2])

pairs = pd.DataFrame.from_records(pairs)
pairs.columns=["node1", "node2"]

node_embeddings = {}
for index, row in emb.iterrows():
    node_embeddings[row["node_id"]] = [row[dim] for dim in dimensions]

edges = pd.read_csv('edgelists/airport.edgelist', delim_whitespace=True, names=["node1", "node2"])

negative = pd.concat([edges, pairs]).drop_duplicates(keep=False)
negative = negative.sample(len(edges))
negative = negative.reset_index(drop=True)

In [7]:
edges

Unnamed: 0,node1,node2
1,2,1
1,3,7
4,5,3
6,7,5
6,8,14137
...,...,...
959,958,682
1566,958,365
232,216,3
232,10,1


In [8]:
edges = edges.reset_index(drop=True)

In [9]:
edges

Unnamed: 0,node1,node2
0,2,1
1,3,7
2,5,3
3,7,5
4,8,14137
...,...,...
28231,958,682
28232,958,365
28233,216,3
28234,10,1


In [10]:
ones = [[1]] * len(edges)
ones = pd.DataFrame.from_records(ones)
ones.columns=["edge"]
positive_edges = pd.concat([edges,ones], axis=1)