In [17]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

columns = ["node_id"]

for i in range(128):
    columns.append("dim"+str(i+1))

emb = pd.read_csv('embeddings/node2vec/airport.emb', delim_whitespace=True, names=columns).sort_values(by=['node_id'], ascending=True)

dimensions = []
for i in range(128):
    dimensions.append("dim"+str(i+1))

nodes = []
for index, row in emb.iterrows():
    nodes.append(row["node_id"])

pairs = []
for node1 in nodes:
    for node2 in nodes:
        pairs.append([node1, node2])

pairs = pd.DataFrame.from_records(pairs)
pairs.columns=["node1", "node2"]

node_embeddings = {}
for index, row in emb.iterrows():
    node_embeddings[row["node_id"]] = [row[dim] for dim in dimensions]

edges = pd.read_csv('edgelists/airport.edgelist', delim_whitespace=True, names=["node1", "node2"])
edges = edges.reset_index(drop=True)

negative = pd.concat([edges, pairs]).drop_duplicates(keep=False)
negative = negative.sample(len(edges))
negative = negative.reset_index(drop=True)

ones = [[1]] * len(edges)
ones = pd.DataFrame.from_records(ones)
ones.columns=["edge"]
positive_edges = pd.concat([edges,ones], axis=1)

zeros = [[0]] * len(negative)
zeros = pd.DataFrame.from_records(zeros)
zeros.columns=["edge"]
negative_edges = pd.concat([negative,zeros], axis=1)

labeled_dataset = pd.concat([positive_edges, negative_edges])
labeled_dataset = labeled_dataset.sample(frac=1)
labeled_dataset['node1'] = labeled_dataset['node1'].map(node_embeddings)
labeled_dataset['node2'] = labeled_dataset['node2'].map(node_embeddings)

In [14]:
labeled_dataset.dropna()

Unnamed: 0,node1,node2,edge
3653,671.0,725.0,0
12696,563.0,540.0,0
6920,198.0,108116.0,1
9467,95.0,802.0,0
3001,141.0,32.0,1
...,...,...,...
27021,410.0,952.0,0
20694,1533.0,1315.0,0
17132,893.0,498.0,1
778,33.0,1314.0,0


In [15]:
training_set, validation_set = train_test_split(labeled_dataset, test_size = 0.5, random_state = 21)

X_train = training_set[["node1", "node2"]]
Y_train = training_set[["edge"]]
X_val = validation_set[["node1", "node2"]]
Y_val = validation_set[["edge"]]

In [16]:
X_train_split = pd.concat([pd.DataFrame(X_train['node1'].to_list()),pd.DataFrame(X_train['node2'].to_list())], axis=1)