In [1]:
import sys
# Extend the import search path two directories up — presumably so that the
# project-local `models` package imported in the next cell resolves (TODO confirm).
sys.path.append('../../')

In [2]:
import torch
import itertools
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import numpy as np
from tqdm.auto import tqdm, trange

from models.train import train, test
from models.models import GCN, GAT, GraphSAGE
from models.utils import ContagionDataset

Using backend: pytorch


In [3]:
# Gate for the hyper-parameter sweep cells: they only train when this is True.
do_train = True
# Seed forwarded to the dataset constructors, the train/test split and the
# scikit-learn models that accept a `random_state`.
seed = 4444

# Small dataset

In [4]:
# Directory handed to ContagionDataset as `raw_dir`.
data_dir = './data'
# Prefixes for the per-model log and checkpoint locations; each model section
# appends its own suffix (e.g. f'{log_path}_gcn', f'{save_path}_gcn').
log_path = './logs'
save_path = './saved'

## Target: additional_stress

In [5]:
# Collects the test accuracy of each scikit-learn baseline, keyed by model name.
dict_small_total = {}
# Forwarded to ContagionDataset as `sets_lengths`;
# presumably the (train, validation, test) fractions — TODO confirm.
sets_lengths = (0.7, 0.15, 0.15)
# Name of the prediction target forwarded to ContagionDataset.
target = 'additional_stress'

In [6]:
# Dataset used by the scikit-learn baselines (built with drop_edges=0).
dataset = ContagionDataset(
    raw_dir=data_dir,
    target=target,
    sets_lengths=sets_lengths,
    drop_edges=0,
    seed=seed,
)

# Hold out 20% of the rows of the first feature table / target vector.
# NOTE(review): this random split is independent of the dataset's own
# `test_mask`, so the baselines and the GNNs may be scored on different
# nodes — confirm whether they are meant to share a test set.
x_train, x_test, y_train, y_test = train_test_split(
    dataset.node_features[0].to_numpy(),
    dataset.targets[0],
    test_size=0.2,
    random_state=seed,
)

### Logistic Regression

In [7]:
# Logistic-regression baseline on the raw node features.
# `multi_class='multinomial'` is intentionally not passed: the argument is
# deprecated in recent scikit-learn releases, and the default lbfgs solver
# already fits a multinomial model whenever the target has more than two
# classes, so the fitted model is the same for this multi-class target.
model_lr = LogisticRegression(random_state=seed).fit(x_train, y_train)

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_true=y_test, y_pred=model_lr.predict(x_test)))

              precision    recall  f1-score   support

           0       0.67      1.00      0.80         4
           1       0.50      0.12      0.20         8
           2       0.00      0.00      0.00         8
           3       0.27      0.80      0.40         5

    accuracy                           0.36        25
   macro avg       0.36      0.48      0.35        25
weighted avg       0.32      0.36      0.27        25



In [8]:
# Score the logistic-regression baseline and record its held-out accuracy.
lr_train_acc = model_lr.score(x_train, y_train)
test_acc = model_lr.score(x_test, y_test)
dict_small_total.update(logistic_regression=test_acc)
print(f"Train accuracy: {lr_train_acc}", f"Test accuracy: {test_acc}", sep="\n")

Train accuracy: 0.45
Test accuracy: 0.36


### Random Forest

In [9]:
# Random-forest baseline: sweep the number of trees from 1 up to (but not
# including) the number of training rows and keep the best-scoring forest.
# NOTE(review): the forest is selected by its accuracy on (x_test, y_test), so
# the reported test accuracy is optimistically biased — consider selecting on a
# separate validation split instead.
n = 10  # the sweep visits roughly n + 1 values of n_estimators
num_nodes = x_train.shape[0]
# Guard the stride: with fewer than n + 1 training rows the integer division
# yields 0, and range()/trange() raise ValueError on a zero step.
step = max(1, (num_nodes - 1) // n)
model_rf = None
test_acc = 0.0
for k in trange(1, num_nodes, step):
    candidate = RandomForestClassifier(random_state=seed, n_estimators=k).fit(x_train, y_train)
    candidate_acc = candidate.score(x_test, y_test)
    # Always accept the first candidate so `model_rf` is set even when every
    # forest scores 0.0; afterwards only a strictly better score replaces the
    # incumbent, so the earliest best forest wins exactly as before.
    if model_rf is None or test_acc < candidate_acc:
        test_acc = candidate_acc
        model_rf = candidate

print(classification_report(y_true=y_test, y_pred=model_rf.predict(x_test)))

100%|██████████| 11/11 [00:00<00:00, 23.92it/s]

              precision    recall  f1-score   support

           0       0.80      1.00      0.89         4
           1       1.00      0.25      0.40         8
           2       0.50      0.38      0.43         8
           3       0.25      0.60      0.35         5

    accuracy                           0.48        25
   macro avg       0.64      0.56      0.52        25
weighted avg       0.66      0.48      0.48        25






In [10]:
# Re-score the selected forest here instead of relying on the `test_acc` global
# left behind by the sweep cell: every baseline section overwrites `test_acc`,
# so running cells out of order would silently record another model's score.
# This mirrors the logistic-regression and KNN cells, which also recompute it.
test_acc = model_rf.score(x_test, y_test)
dict_small_total['random_forest'] = test_acc
print(f"Train accuracy: {model_rf.score(x_train, y_train)}")
print(f"Test accuracy: {test_acc}")

Train accuracy: 1.0
Test accuracy: 0.48


### KNN Classifier

In [11]:
# k-nearest-neighbours baseline; the neighbourhood size is set to the number
# of target classes reported by the dataset.
knn_estimator = KNeighborsClassifier(n_neighbors=dataset.num_classes)
model_knn = knn_estimator.fit(x_train, y_train)

# Per-class precision / recall / F1 on the held-out split.
knn_predictions = model_knn.predict(x_test)
print(classification_report(y_true=y_test, y_pred=knn_predictions))

              precision    recall  f1-score   support

           0       0.44      1.00      0.62         4
           1       0.50      0.12      0.20         8
           2       0.50      0.50      0.50         8
           3       0.33      0.40      0.36         5

    accuracy                           0.44        25
   macro avg       0.44      0.51      0.42        25
weighted avg       0.46      0.44      0.40        25



In [12]:
# Score the KNN baseline and record its held-out accuracy.
knn_train_acc = model_knn.score(x_train, y_train)
test_acc = model_knn.score(x_test, y_test)
dict_small_total.update(knn_classifier=test_acc)
print(f"Train accuracy: {knn_train_acc}", f"Test accuracy: {test_acc}", sep="\n")

Train accuracy: 0.61
Test accuracy: 0.44


In [35]:
# Display the baseline test accuracies collected by the cells above.
dict_small_total

{'logistic_regression': 0.4, 'random_forest': 0.48, 'knn_classifier': 0.32}

### GCN

In [58]:
# Checkpoint and log locations for the GCN sweep.
save_model = f'{save_path}_gcn'
log_dir = f'{log_path}_gcn'

# Dataset used when evaluating the GCN runs: no dropped edges, self-loops added.
dataset_val = ContagionDataset(
    raw_dir=data_dir,
    target=target,
    sets_lengths=sets_lengths,
    drop_edges=0,
    add_self_loop=True,
    seed=seed,
)

# Hyper-parameter grid: every key maps to the list of values to try.
gcn_model = {
    # arguments forwarded to the GCN constructor
    'in_features': [dataset_val.num_node_features],
    'h_features': [[5, 10], [10, 15], [5, 5, 5], [5, 5, 5, 5], [5, 10, 15], [5, 10, 15, 20], [5], [10], [15]],
    'out_features': [dataset_val.num_classes],
    'activation': [torch.nn.ReLU()],
    'norm_edges': ['both', 'none'],
    'norm_nodes': [None, 'bn', 'gn'],
    'dropout': [0.2, 0.5, 0.0],
    # training / dataset options, popped by the training cell before the model is built
    'lr': [1],
    'label_smoothing': [0.0, 0.2, 0.4],
    'use_edge_weight': [True, False],
    'add_self_loop': [True, False],
    'drop_edges': [0, 0.2, 0.4],
}

# One dict per point of the Cartesian product of all value lists.
list_model = [
    dict(zip(gcn_model, combination))
    for combination in itertools.product(*gcn_model.values())
]

In [28]:
if do_train:
    for d in tqdm(list_model):
        # Work on a copy so the grid entry stored in `list_model` is left intact.
        d = dict(d)

        # Strip the training / dataset options; what remains in `d` is passed
        # to the GCN constructor.
        lr, ls, add_self_loop, drop_edges, use_edge_weight = (
            d.pop(option)
            for option in ('lr', 'label_smoothing', 'add_self_loop', 'drop_edges', 'use_edge_weight')
        )

        # Arguments shared by the validation and training datasets.
        dataset_kwargs = dict(
            raw_dir=data_dir,
            sets_lengths=sets_lengths,
            add_self_loop=add_self_loop,
            target=target,
            seed=seed,
        )
        # Validation data is built with drop_edges=0; only the training data
        # uses the sampled `drop_edges` value.
        dataset_valid = ContagionDataset(drop_edges=0, **dataset_kwargs)
        dataset_train = ContagionDataset(drop_edges=drop_edges, **dataset_kwargs)

        train(
            model=GCN(**d),
            dict_model=d,
            dataset_train=dataset_train,
            dataset_val=dataset_valid,
            log_dir=log_dir,
            save_path=save_model,
            # per-configuration options
            lr=lr,
            label_smoothing=ls,
            use_edge_weight=use_edge_weight,
            # settings fixed for the whole sweep
            optimizer_name="adamw",
            scheduler_mode='max_val_acc',
            n_epochs=100,
            steps_validate=1,
            debug_mode=False,
            use_cpu=False,
        )

100%|██████████| 540/540 [25:50<00:00,  2.87s/it]
100%|██████████| 540/540 [28:38<00:00,  3.18s/it]
100%|██████████| 540/540 [28:58<00:00,  3.22s/it]
100%|██████████| 3/3 [1:23:27<00:00, 1669.18s/it]


In [36]:
# Number of nodes flagged by the `test_mask` of the dataset's first item.
dataset[0].ndata['test_mask'].sum()

tensor(13)

In [45]:
# Run the project's `test` routine against the current `save_model` location
# with edge weights enabled.
res_edges = test(
    save_path=save_model,
    dataset=dataset_val,
    use_edge_weight=True,
    n_runs=1,
    save=True,
    use_cpu=False,
    debug_mode=False,
)
# Second element of the result — presumably the best test accuracy
# (TODO confirm against models.train.test).
res_edges[1]

1944it [00:49, 39.05it/s]


0.6999996900558472

In [36]:
# NOTE(review): `all` shadows the built-in all(); the name is kept because the
# following cells read it.
all = res_edges[2]
# Descending order: argsort ranks the second element of each entry from
# smallest to largest and [::-1] reverses that, so the largest comes first.
sort_idx = np.argsort([k[1] for k in all])[::-1]
# First element (the configuration / metrics dict) of the top-ranked entry.
all[sort_idx[0]][0]

{'in_features': 2,
 'h_features': [5],
 'out_features': 4,
 'activation': ReLU(),
 'norm_edges': 'both',
 'norm_nodes': 'bn',
 'dropout': 0.0,
 'train_lr': 1,
 'train_optimizer_name': 'adamw',
 'train_scheduler_mode': 'max_val_acc',
 'train_label_smoothing': 0.2,
 'train_use_edge_weight': True,
 'train_self_loop': True,
 'train_drop_edges': 0,
 'train_loss': 2.8970888,
 'train_acc': 0.4712643325328827,
 'val_acc': 0.4444442093372345,
 'epoch': 4,
 'model_class': 'gcn',
 'path_name': '2_[5]_4_ReLU()_both_bn_0.0_1_adamw_max_val_acc_0.2_True_True_0',
 'test_acc': 0.6999996900558472}

In [48]:
# Highest validation accuracy recorded across all evaluated runs.
max(entry[0]['val_acc'] for entry in all)

0.722221851348877

In [41]:
# Configuration / metrics dicts of the ten top-ranked runs.
top_ten_idx = sort_idx[:10]
[all[position][0] for position in top_ten_idx]

[{'in_features': 2,
  'h_features': [5],
  'out_features': 4,
  'activation': ReLU(),
  'norm_edges': 'both',
  'norm_nodes': 'bn',
  'dropout': 0.0,
  'train_lr': 1,
  'train_optimizer_name': 'adamw',
  'train_scheduler_mode': 'max_val_acc',
  'train_label_smoothing': 0.2,
  'train_use_edge_weight': True,
  'train_self_loop': True,
  'train_drop_edges': 0,
  'train_loss': 2.8970888,
  'train_acc': 0.4712643325328827,
  'val_acc': 0.4444442093372345,
  'epoch': 4,
  'model_class': 'gcn',
  'path_name': '2_[5]_4_ReLU()_both_bn_0.0_1_adamw_max_val_acc_0.2_True_True_0',
  'test_acc': 0.6999996900558472},
 {'in_features': 2,
  'h_features': [5, 10],
  'out_features': 4,
  'activation': ReLU(),
  'norm_edges': 'none',
  'norm_nodes': 'bn',
  'dropout': 0.2,
  'train_lr': 1,
  'train_optimizer_name': 'adamw',
  'train_scheduler_mode': 'max_val_acc',
  'train_label_smoothing': 0.0,
  'train_use_edge_weight': False,
  'train_self_loop': True,
  'train_drop_edges': 0.2,
  'train_loss': 3.730532

In [42]:
# Same evaluation as the edge-weighted run, but with edge weights disabled.
res_no_edges = test(
    save_path=save_model,
    dataset=dataset_val,
    use_edge_weight=False,
    n_runs=1,
    save=True,
    use_cpu=False,
    debug_mode=False,
)
# Second element of the result — presumably the best test accuracy
# (TODO confirm against models.train.test).
res_no_edges[1]

1296it [00:18, 71.53it/s]


0.6999996900558472

### GraphSAGE

In [53]:
# Checkpoint and log locations for the GraphSAGE sweep.
save_model = f'{save_path}_sage'
log_dir = f'{log_path}_sage'

# Dataset used when evaluating the GraphSAGE runs: no dropped edges, self-loops added.
dataset_val = ContagionDataset(
    raw_dir=data_dir,
    drop_edges=0,
    sets_lengths=sets_lengths,
    add_self_loop=True,
    target=target,
    seed=seed,
)

# Hyper-parameter grid: every key maps to the list of values to try.
# Feature and class counts are read from `dataset_val`, built just above, as in
# the GCN section, so this cell no longer depends on the `dataset` object left
# over from the baseline cells.
# NOTE: the full product is 23,328 configurations; the recorded run of the
# training cell estimated roughly 18 hours for it.
sage_model = dict(
    # arguments forwarded to the GraphSAGE constructor
    in_features=[dataset_val.num_node_features],
    h_features=[[5, 10], [10, 15], [5, 5, 5], [5, 10, 15], [5], [10]],
    out_features=[dataset_val.num_classes],
    aggregator_type=['mean', 'gcn', 'pool', 'lstm'],
    norm_edges=['right', 'left', 'none'],
    norm_nodes=[None, 'bn', 'gn'],
    activation=[torch.nn.ReLU()],
    feat_drop=[0.2],
    # training / dataset options, popped by the training cell before the model is built
    lr=[1e-2, 1, 1e-3],
    label_smoothing=[0.0, 0.2, 0.4],
    use_edge_weight=[True, False],
    add_self_loop=[True, False],
    drop_edges=[0, 0.2, 0.4],
)

# One dict per point of the Cartesian product of all value lists.
list_model = [dict(zip(sage_model.keys(), k)) for k in itertools.product(*sage_model.values())]

In [54]:
if do_train:
    for d in tqdm(list_model):
        # Work on a copy so the grid entry stored in `list_model` is left intact.
        d = dict(d)

        # Strip the training / dataset options; what remains in `d` is passed
        # to the GraphSAGE constructor.
        lr, ls, add_self_loop, drop_edges, use_edge_weight = (
            d.pop(option)
            for option in ('lr', 'label_smoothing', 'add_self_loop', 'drop_edges', 'use_edge_weight')
        )

        # Arguments shared by the validation and training datasets.
        dataset_kwargs = dict(
            raw_dir=data_dir,
            sets_lengths=sets_lengths,
            add_self_loop=add_self_loop,
            target=target,
            seed=seed,
        )
        # Validation data is built with drop_edges=0; only the training data
        # uses the sampled `drop_edges` value.
        dataset_valid = ContagionDataset(drop_edges=0, **dataset_kwargs)
        dataset_train = ContagionDataset(drop_edges=drop_edges, **dataset_kwargs)

        train(
            model=GraphSAGE(**d),
            dict_model=d,
            dataset_train=dataset_train,
            dataset_val=dataset_valid,
            log_dir=log_dir,
            save_path=save_model,
            # per-configuration options
            lr=lr,
            label_smoothing=ls,
            use_edge_weight=use_edge_weight,
            # settings fixed for the whole sweep
            optimizer_name="adamw",
            scheduler_mode='max_val_acc',
            n_epochs=100,
            steps_validate=1,
            debug_mode=False,
            use_cpu=False,
        )

  0%|          | 38/23328 [01:45<17:55:24,  2.77s/it]


KeyboardInterrupt: 

### GAT

In [18]:
# Checkpoint and log locations for the GAT sweep.
save_model = f'{save_path}_gat'
log_dir = f'{log_path}_gat'

# Self-loops are fixed for the whole GAT sweep (not part of the grid); the
# training cell reads this module-level value.
add_self_loop = True

# Dataset used when evaluating the GAT runs: no dropped edges.
dataset_val = ContagionDataset(
    raw_dir=data_dir,
    drop_edges=0,
    sets_lengths=sets_lengths,
    add_self_loop=add_self_loop,
    target=target,
    seed=seed,
)

# Hyper-parameter grid: every key maps to the list of values to try.
# Feature and class counts are read from `dataset_val`, built just above, as in
# the GCN section, so this cell no longer depends on the `dataset` object left
# over from the baseline cells.
gat_model = dict(
    # arguments forwarded to the GAT constructor
    in_features=[dataset_val.num_node_features],
    h_features=[[5, 10], [10, 15], [5, 5, 5], [5, 10, 15], [5], [10]],
    out_features=[dataset_val.num_classes],
    # NOTE(review): the head count is tied to the number of node features and
    # the list length is fixed at 4 regardless of len(h_features) — confirm
    # this is what the GAT constructor expects.
    num_heads=[[dataset_val.num_node_features] * 4],
    norm_nodes=[None, 'bn', 'gn'],
    activation=[None, torch.nn.ReLU()],
    negative_slope=[0.2],
    feat_drop=[0.2],
    attn_drop=[0.2],
    residual=[False, True],
    # training / dataset options, popped by the training cell before the model is built
    lr=[1e-2, 1, 1e-3],
    label_smoothing=[0.0, 0.2, 0.4],
    use_edge_weight=[True, False],
    drop_edges=[0, 0.2, 0.4],
)

# One dict per point of the Cartesian product of all value lists.
list_model = [dict(zip(gat_model.keys(), k)) for k in itertools.product(*gat_model.values())]

In [19]:
if do_train:
    for d in tqdm(list_model):
        # Work on a copy so the grid entry stored in `list_model` is left intact.
        d = dict(d)

        # Strip the training / dataset options; what remains in `d` is passed
        # to the GAT constructor.
        lr, ls, drop_edges, use_edge_weight = (
            d.pop(option)
            for option in ('lr', 'label_smoothing', 'drop_edges', 'use_edge_weight')
        )

        # Arguments shared by the validation and training datasets;
        # `add_self_loop` is the module-level value, not a grid entry.
        dataset_kwargs = dict(
            raw_dir=data_dir,
            sets_lengths=sets_lengths,
            add_self_loop=add_self_loop,
            target=target,
            seed=seed,
        )
        # Validation data is built with drop_edges=0; only the training data
        # uses the sampled `drop_edges` value.
        dataset_valid = ContagionDataset(drop_edges=0, **dataset_kwargs)
        dataset_train = ContagionDataset(drop_edges=drop_edges, **dataset_kwargs)

        train(
            model=GAT(**d),
            dict_model=d,
            dataset_train=dataset_train,
            dataset_val=dataset_valid,
            log_dir=log_dir,
            save_path=save_model,
            # per-configuration options
            lr=lr,
            label_smoothing=ls,
            use_edge_weight=use_edge_weight,
            # settings fixed for the whole sweep
            optimizer_name="adamw",
            scheduler_mode='max_val_acc',
            n_epochs=100,
            steps_validate=1,
            debug_mode=False,
            use_cpu=False,
        )

  0%|          | 2/3888 [00:07<4:15:28,  3.94s/it]


KeyboardInterrupt: 

### Overall