Berner Fachhochschule BFH - MAS Data Science - Graph Machine Learning - Master Thesis FS/2022 Thomas Iten

# Experiment 8 - Node2Vec Mitarbeiter vs. Mitarbeiter X Tests

**Referenzen**<br />
[1] https://snap.stanford.edu/node2vec<br />
[2] https://stellargraph.readthedocs.io/en/stable/demos/link-prediction/node2vec-link-prediction.html<br />



In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from gml.graph.data_factory import TestTrainDataFactory, EdgeLabelFactory
from gml.graph.graph_embedding import EdgeEmbedding

## 8.1 Data Structure

### Datensammlung - Mitarbeiter mit Organisation

<img src="img/test-6.png" alt="Test Scenario 4" width="800"/>

## 8.2 Test Mitarbeiter


In [2]:
# Size argument handed to the test data factory
# (presumably the number of generated employees -- confirm against TestTrainDataFactory).
n = 2000

# The factory returns the full graph plus the test and train graphs,
# each of the latter with its edge samples and labels.
factory = TestTrainDataFactory()
(
    graph,
    test_graph,
    test_samples,
    test_labels,
    train_graph,
    train_samples,
    train_labels,
) = factory.create_testdata(n, add_id=False, add_predict_edges=True)

# Report node and edge counts of the full graph.
graph.print_dimemsions()

Graph dimensions:
  order : 12002 (number of nodes)
  size  : 24000 (number of edges)


In [3]:
# Hyper-parameters passed to the edge embedding.
window = 8
dimensions = 64
embeddings = EdgeEmbedding(
    graph.graph,
    window=window,
    dimensions=dimensions,
).embeddings

# Look up one embedding per sampled edge; node ids are used as string keys.
test_embeddings = [embeddings[str(edge[0]), str(edge[1])] for edge in test_samples]
train_embeddings = [embeddings[str(edge[0]), str(edge[1])] for edge in train_samples]

# Fit the classifier on the training edges and predict the test edges.
classifier = RandomForestClassifier
c = classifier(n_estimators=1000).fit(train_embeddings, train_labels)
y_pred = c.predict(test_embeddings)

# Collect the scores in a one-column table named after the classifier.
name = classifier.__name__
index = ["Precision", "Recall", "F1-Score"]
metric_functions = (
    metrics.precision_score,
    metrics.recall_score,
    metrics.f1_score,
)
score = {name: [metric(test_labels, y_pred) for metric in metric_functions]}

df = pd.DataFrame(score, index=index)
print(df)

Computing transition probabilities:   0%|          | 0/12002 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [14:03<00:00, 84.33s/it]


           RandomForestClassifier
Precision                     1.0
Recall                        1.0
F1-Score                      1.0


## 8.3 Test mit Mitarbeiter X

### Generate link predictions EM-DC (positive) and EMx-DC (negative)

In [4]:
# Build a new test set from the existing one:
#   * positives: the test edges labelled 1 (unchanged)
#   * negatives: the same edges with an "x" appended to every node id that
#     starts with "EM", i.e. an employee id that differs from the original.
# NOTE: this cell replaces test_samples, test_labels and test_embeddings,
# which the following evaluation cell reads.

# Filter positive edges
pos_samples = [
    sample
    for sample, label in zip(test_samples, test_labels)
    if label == 1
]

# Generate negative samples by appending an x to the employee
neg_samples = [
    tuple(
        node + "x" if node.startswith("EM") else node
        for node in (sample[0], sample[1])
    )
    for sample in pos_samples
]

# Generate new test data set
pos_labels = [1] * len(pos_samples)
neg_labels = [0] * len(neg_samples)

# Combine and shuffle samples.
# Concatenate into new lists instead of extending in place, so that
# pos_samples / pos_labels keep holding only the positives afterwards.
samples = pos_samples + neg_samples
labels = pos_labels + neg_labels
test_samples, test_labels = EdgeLabelFactory().shuffle(samples, labels)

# Generate new test embeddings
test_embeddings = [embeddings[str(x[0]), str(x[1])] for x in test_samples]

print("New Testdata set with EM and EMx")
print(test_samples[:10], "...")
print(test_labels[:10], "...")

New Testdata set with EM and EMx
[('DC1325', 'EM1325x'), ('DC1735', 'EM1735'), ('DC1217', 'EM1217x'), ('DC1279', 'EM1279x'), ('EM1684', 'DC1684'), ('DC1383', 'EM1383x'), ('EM670x', 'DC670'), ('EM1688x', 'DC1688'), ('DC1613', 'EM1613x'), ('EM1466x', 'DC1466')] ...
[0, 1, 0, 0, 1, 0, 0, 0, 0, 0] ...


### Train and Test

In [5]:
# Fit a fresh classifier on the training edges and predict the current test set.
classifier = RandomForestClassifier
c = classifier(n_estimators=1000).fit(train_embeddings, train_labels)
y_pred = c.predict(test_embeddings)

# Collect the scores in a one-column table named after the classifier.
name = classifier.__name__
index = ["Precision", "Recall", "F1-Score"]
metric_functions = (
    metrics.precision_score,
    metrics.recall_score,
    metrics.f1_score,
)
score = {name: [metric(test_labels, y_pred) for metric in metric_functions]}

df = pd.DataFrame(score, index=index)
print(df)

           RandomForestClassifier
Precision                0.604961
Recall                   1.000000
F1-Score                 0.753864


## 8.4 Test mit Mitarbeiter X ohne Organisation

In [6]:
# Regenerate the data set, this time with add_org=False, reusing n from above.
# All graph / sample / label variables of the previous experiment are replaced.
factory = TestTrainDataFactory()
(
    graph,
    test_graph,
    test_samples,
    test_labels,
    train_graph,
    train_samples,
    train_labels,
) = factory.create_testdata(n, add_id=False, add_predict_edges=True, add_org=False)

# Report node and edge counts of the full graph.
graph.print_dimemsions()

Graph dimensions:
  order : 12000 (number of nodes)
  size  : 16000 (number of edges)


In [7]:
# Hyper-parameters passed to the edge embedding.
window = 8
dimensions = 64
embeddings = EdgeEmbedding(
    graph.graph,
    window=window,
    dimensions=dimensions,
).embeddings

# Look up one embedding per sampled edge; node ids are used as string keys.
test_embeddings = [embeddings[str(edge[0]), str(edge[1])] for edge in test_samples]
train_embeddings = [embeddings[str(edge[0]), str(edge[1])] for edge in train_samples]

# Show the size of the underlying vector table (rows, columns).
vector_shape = embeddings.kv.vectors.shape
print("Embedding shape:")
print("Nodes    =", vector_shape[0], "(number of nodes)")
print("Features =", vector_shape[1], "(number of features per node)")

Computing transition probabilities:   0%|          | 0/12000 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:35<00:00,  3.60s/it]


Embedding shape:
Nodes    = 12000 (number of nodes)
Features = 64 (number of features per node)


In [8]:
# Build a new test set from the existing one:
#   * positives: the test edges labelled 1 (unchanged)
#   * negatives: the same edges with an "x" appended to every node id that
#     starts with "EM", i.e. an employee id that differs from the original.
# NOTE: this cell replaces test_samples, test_labels and test_embeddings,
# which the following evaluation cell reads.

# Filter positive edges
pos_samples = [
    sample
    for sample, label in zip(test_samples, test_labels)
    if label == 1
]

# Generate negative samples by appending an x to the employee
neg_samples = [
    tuple(
        node + "x" if node.startswith("EM") else node
        for node in (sample[0], sample[1])
    )
    for sample in pos_samples
]

# Generate new test data set
pos_labels = [1] * len(pos_samples)
neg_labels = [0] * len(neg_samples)

# Combine and shuffle samples.
# Concatenate into new lists instead of extending in place, so that
# pos_samples / pos_labels keep holding only the positives afterwards.
samples = pos_samples + neg_samples
labels = pos_labels + neg_labels
test_samples, test_labels = EdgeLabelFactory().shuffle(samples, labels)

# Generate new test embeddings
test_embeddings = [embeddings[str(x[0]), str(x[1])] for x in test_samples]

print("New Testdata set with EM and EMx")
print(test_samples[:10], "...")
print(test_labels[:10], "...")

New Testdata set with EM and EMx
[('DC1157', 'EM1157x'), ('DC555', 'EM555'), ('EM1878', 'DC1878'), ('DC125', 'EM125x'), ('DC341', 'EM341'), ('EM616x', 'DC616'), ('DC769', 'EM769x'), ('DC771', 'EM771'), ('EM1366x', 'DC1366'), ('EM1302', 'DC1302')] ...
[0, 1, 1, 0, 1, 0, 0, 1, 0, 1] ...


In [9]:
# Fit a fresh classifier on the training edges and predict the current test set.
classifier = RandomForestClassifier
c = classifier(n_estimators=1000).fit(train_embeddings, train_labels)
y_pred = c.predict(test_embeddings)

# Collect the scores in a one-column table named after the classifier.
name = classifier.__name__
index = ["Precision", "Recall", "F1-Score"]
metric_functions = (
    metrics.precision_score,
    metrics.recall_score,
    metrics.f1_score,
)
score = {name: [metric(test_labels, y_pred) for metric in metric_functions]}

df = pd.DataFrame(score, index=index)
print(df)

           RandomForestClassifier
Precision                0.500753
Recall                   0.997000
F1-Score                 0.666667



---
_The end._