# Imports

In [2]:
import tensorflow as tf
import larq as lq

from utils import prepare_dataset, generate_data_loaders, run_experiment
from models import generate_quantized_gcn, generate_standard_gcn

from tqdm import trange

import pandas as pd

# Setup

In [3]:
import warnings

# Load both citation datasets with warnings silenced. The "ignore" filter is
# scoped with catch_warnings() so it is removed again even if prepare_dataset
# raises, instead of staying installed in the kernel.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    cora_dataset = prepare_dataset("Cora")
    pubmed_dataset = prepare_dataset("PubMed")

# For the rest of the notebook, report each distinct warning only once.
warnings.filterwarnings("once")

Pre-processing node features
Dataset:  cora
Size of train set: 140
Size of val set: 500
Size of test set: 1000
Num classes: 7
Num features: 1433
Pre-processing node features
Dataset:  pubmed
Size of train set: 60
Size of val set: 500
Size of test set: 1000
Num classes: 3
Num features: 500


# Experiment 1: GCN baseline (no quantization)

Following the architecture and hyperparameter settings of Kipf & Welling (2016).

## Cora

In [4]:
# Model input sizes, read from the first graph of the Cora dataset.
input_shapes = (
    cora_dataset.graphs[0].x.shape[1],
    cora_dataset.graphs[0].a.shape[1],
)


def model_factory():
    """Build a new standard (non-quantized) 2-layer GCN for Cora."""
    # generate_standard_gcn returns a sequence; its first element is the model.
    return generate_standard_gcn(
        channels=64,
        input_shapes=input_shapes,
        dataset=cora_dataset,
        dropout_rate=0.5,
        layers=2,
    )[0]


# Build one instance only to print the architecture.
model_factory().summary()

Instructions for updating:
Use `tf.linalg.matmul` instead
Model: "BiGCN"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 1433)]       0           []                               
                                                                                                  
 dropout (Dropout)              (None, 1433)         0           ['input_1[0][0]']                
                                                                                                  
 dense (Dense)                  (None, 64)           91776       ['dropout[0][0]']                
                                                                                                  
 graph_conv (GraphConv)         (2708, 64)           0           ['dense[0][0]']                  
                                    

In [5]:
# Training configuration for the standard GCN baseline on Cora.
num_runs = 20
epochs = 1000

early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    patience=100,
    restore_best_weights=True,
)
# Note: reduction="sum" has to be requested explicitly so the loss is scaled
# correctly; otherwise it would be divided by the batch size.
loss_function = tf.keras.losses.CategoricalCrossentropy(
    from_logits=False,
    reduction="sum",
)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

run_experiment(
    experiment_name="standard_gcn_baseline",
    num_runs=num_runs,
    epochs=epochs,
    optimizer=optimizer,
    dataset=cora_dataset,
    model_factory=model_factory,
    loss_function=loss_function,
    callbacks=[early_stopping_callback],
)

  0%|          | 0/20 [00:00<?, ?it/s]2023-01-10 10:35:28.465139: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
100%|██████████| 20/20 [01:33<00:00,  4.67s/it]

20 runs completed: 1.0000 mean train acc, 0.8037 mean val acc, 0.8175 mean test acc





## PubMed

In [None]:
# Standard GCN baseline on PubMed, same architecture and settings as for Cora.
dataset = pubmed_dataset
input_shapes = (
    dataset.graphs[0].x.shape[1],
    dataset.graphs[0].a.shape[1],
)


def model_factory():
    """Build a new standard (non-quantized) 2-layer GCN for the current dataset."""
    return generate_standard_gcn(
        channels=64,
        input_shapes=input_shapes,
        dataset=dataset,
        dropout_rate=0.5,
        layers=2,
    )[0]


num_runs = 5
epochs = 1000

early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    patience=100,
    restore_best_weights=True,
)
loss_function = tf.keras.losses.CategoricalCrossentropy(
    from_logits=False,
    reduction="sum",
)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

# NOTE(review): this reuses the experiment name of the Cora baseline; if
# run_experiment stores results keyed by name the two would collide -- confirm.
run_experiment(
    experiment_name="standard_gcn_baseline",
    num_runs=num_runs,
    epochs=epochs,
    optimizer=optimizer,
    dataset=dataset,
    model_factory=model_factory,
    loss_function=loss_function,
    callbacks=[early_stopping_callback],
)

# Experiment 2: Quantized GCN Baseline (Same as paper)

## Cora

In [6]:
# Quantized GCN on Cora: binarized inputs and kernels, batch norm without
# scale or center parameters.
dataset = cora_dataset
input_shapes = (
    dataset.graphs[0].x.shape[1],
    dataset.graphs[0].a.shape[1],
)


def model_factory():
    """Build a new quantized 2-layer GCN using MagnitudeAwareSign quantizers."""
    return generate_quantized_gcn(
        channels=64,
        input_shapes=input_shapes,
        dataset=dataset,
        dropout_rate=0.4,
        layers=2,
        input_quantizer=lq.quantizers.MagnitudeAwareSign,
        kernel_quantizer=lq.quantizers.MagnitudeAwareSign,
        kernel_regularizer=None,
        batch_norm_scale=False,
        batch_norm_center=False,
    )[0]


# Build one instance only to print the architecture.
model_factory().summary()

Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).


Model: "BiGCN"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_43 (InputLayer)          [(None, 1433)]       0           []                               
                                                                                                  
 batch_normalization (BatchNorm  (None, 1433)        2866        ['input_43[0][0]']               
 alization)                                                                                       
                                                                                                  
 magnitude_aware_sign (Magnitud  (None, 1433)        0           ['batch_normalization[0][0]']    
 eAwareSign)                                                                                      
                                                                                              

In [8]:
# Silence Python warnings for the remainder of the session.
warnings.filterwarnings('ignore')

# Training configuration for the quantized baseline.
num_runs = 20
epochs = 1000

# patience equals the epoch budget, so early stopping cannot end a run before
# the last epoch.
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    patience=1000,
    restore_best_weights=True,
)
loss_function = tf.keras.losses.CategoricalCrossentropy(
    from_logits=False,
    reduction="sum",
)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, epsilon=1e-8)

run_experiment(
    experiment_name="binary_gcn_baseline",
    num_runs=num_runs,
    epochs=epochs,
    optimizer=optimizer,
    dataset=dataset,
    model_factory=model_factory,
    loss_function=loss_function,
    callbacks=[early_stopping_callback],
    verbose=0,
)

  0%|          | 0/20 [00:00<?, ?it/s]Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
  5%|▌         | 1/20 [00:57<18:19, 57.89s/it]Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
 10%|█         | 2/20 [01:59<18:06, 60.36s/it]Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
 15%|█▌        | 3/20 [03:02<17:22, 61.31s/it]Using a binary

20 runs completed: 0.9607 mean train acc, 0.7702 mean val acc, 0.7737 mean test acc





# Experiment 3: Dropout - reducing the dropout rate, removing dropout, and putting dropout before vs. after the binarization

In [9]:
# Sweep the dropout rate of the quantized GCN on Cora.
dataset = cora_dataset
input_shapes = (
    dataset.graphs[0].x.shape[1],
    dataset.graphs[0].a.shape[1],
)
num_runs = 20
epochs = 1000

for dropout_rate in [0.5, 0.4, 0.3, 0.2, 0.1, 0.0]:
    print("Dropout rate = ", dropout_rate)

    def model_factory():
        """Build a quantized GCN using the dropout rate of the current sweep step."""
        return generate_quantized_gcn(
            channels=64,
            input_shapes=input_shapes,
            dataset=dataset,
            dropout_rate=dropout_rate,
            layers=2,
            input_quantizer=lq.quantizers.MagnitudeAwareSign,
            kernel_quantizer=lq.quantizers.MagnitudeAwareSign,
            kernel_regularizer=None,
            batch_norm_scale=False,
            batch_norm_center=False,
        )[0]

    # New optimizer, loss and callback objects for every dropout setting.
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, epsilon=1e-8)
    loss_function = tf.keras.losses.CategoricalCrossentropy(
        from_logits=False,
        reduction="sum",
    )
    early_stopping_callback = tf.keras.callbacks.EarlyStopping(
        patience=200,
        restore_best_weights=True,
    )

    run_experiment(
        experiment_name=f"binary_gcn_dropout={dropout_rate}",
        num_runs=num_runs,
        epochs=epochs,
        optimizer=optimizer,
        dataset=dataset,
        model_factory=model_factory,
        loss_function=loss_function,
        callbacks=[early_stopping_callback],
        verbose=0,
    )
    

Dropout rate =  0.5


  0%|          | 0/20 [00:00<?, ?it/s]Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
  5%|▌         | 1/20 [00:12<03:54, 12.35s/it]Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
 10%|█         | 2/20 [01:13<12:14, 40.82s/it]Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
 15%|█▌        | 3/20 [02:12<13:59, 49.37s/it]Using a binary

20 runs completed: 0.9118 mean train acc, 0.7402 mean val acc, 0.7356 mean test acc
Dropout rate =  0.4


  0%|          | 0/20 [00:00<?, ?it/s]Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
  5%|▌         | 1/20 [00:10<03:26, 10.86s/it]Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
 10%|█         | 2/20 [01:02<10:27, 34.86s/it]Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
 15%|█▌        | 3/20 [01:52<11:46, 41.55s/it]Using a binary

20 runs completed: 0.9511 mean train acc, 0.7439 mean val acc, 0.7331 mean test acc
Dropout rate =  0.3


  0%|          | 0/20 [00:00<?, ?it/s]Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
  5%|▌         | 1/20 [00:41<13:00, 41.10s/it]Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
 10%|█         | 2/20 [01:20<12:02, 40.12s/it]Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
 15%|█▌        | 3/20 [02:03<11:47, 41.65s/it]Using a binary

20 runs completed: 0.9854 mean train acc, 0.7688 mean val acc, 0.7752 mean test acc
Dropout rate =  0.2


  0%|          | 0/20 [00:00<?, ?it/s]Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
  5%|▌         | 1/20 [00:41<13:00, 41.08s/it]Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
 10%|█         | 2/20 [01:22<12:25, 41.42s/it]Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
 15%|█▌        | 3/20 [02:00<11:12, 39.58s/it]Using a binary

20 runs completed: 0.9996 mean train acc, 0.7619 mean val acc, 0.7752 mean test acc
Dropout rate =  0.1


  0%|          | 0/20 [00:00<?, ?it/s]Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
  5%|▌         | 1/20 [00:42<13:27, 42.51s/it]Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
 10%|█         | 2/20 [01:22<12:16, 40.90s/it]Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
 15%|█▌        | 3/20 [01:58<10:58, 38.75s/it]Using a binary

# Experiment 4: Allow batchnorm to be trained

In [None]:
# Sweep the batch-norm momentum with trainable scale and center enabled.
dataset = cora_dataset
input_shapes = (
    dataset.graphs[0].x.shape[1],
    dataset.graphs[0].a.shape[1],
)

num_runs = 20
epochs = 1000

for batch_norm_momentum in [0.5, 0.75, 0.9, 0.95, 0.99]:

    def model_factory():
        """Build a quantized GCN using the batch-norm momentum of the current sweep step."""
        return generate_quantized_gcn(
            channels=64,
            input_shapes=input_shapes,
            dataset=dataset,
            dropout_rate=0.4,
            layers=2,
            input_quantizer=lq.quantizers.MagnitudeAwareSign,
            kernel_quantizer=lq.quantizers.MagnitudeAwareSign,
            kernel_regularizer=None,
            batch_norm_momentum=batch_norm_momentum,
            batch_norm_scale=True,
            batch_norm_center=True,
        )[0]

    # New optimizer, loss and callback objects for every momentum setting.
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, epsilon=1e-8)
    loss_function = tf.keras.losses.CategoricalCrossentropy(
        from_logits=False,
        reduction="sum",
    )
    early_stopping_callback = tf.keras.callbacks.EarlyStopping(
        patience=1000,
        restore_best_weights=True,
    )

    run_experiment(
        experiment_name=f"binary_gcn_batchnorm_momentum={batch_norm_momentum}",
        num_runs=num_runs,
        epochs=epochs,
        optimizer=optimizer,
        dataset=dataset,
        model_factory=model_factory,
        loss_function=loss_function,
        callbacks=[early_stopping_callback],
        verbose=1,
    )

# Combine batchnorm training and optimal dropout rate

# Experiment 5: Batchnorm before every layer

In [162]:
# Variant with MagnitudeAwareSign as the input quantizer and
# single_batch_norm switched off.
dataset = cora_dataset
input_shapes = (
    dataset.graphs[0].x.shape[1],
    dataset.graphs[0].a.shape[1],
)


def model_factory():
    """Build a quantized GCN with single_batch_norm=False and MagnitudeAwareSign inputs."""
    return generate_quantized_gcn(
        channels=64,
        input_shapes=input_shapes,
        dataset=dataset,
        dropout_rate=0.2,
        layers=2,
        input_quantizer=lq.quantizers.MagnitudeAwareSign,
        kernel_quantizer=lq.quantizers.MagnitudeAwareSign,
        kernel_regularizer=None,
        batch_norm_momentum=0.8,
        batch_norm_scale=True,
        batch_norm_center=True,
        single_batch_norm=False,
    )[0]


num_runs = 20
epochs = 2000

early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    patience=400,
    restore_best_weights=True,
)
loss_function = tf.keras.losses.CategoricalCrossentropy(
    from_logits=False,
    reduction="sum",
)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, epsilon=1e-8)

run_experiment(
    experiment_name="binary_gcn_multiple_batchnorm",
    num_runs=num_runs,
    epochs=epochs,
    optimizer=optimizer,
    dataset=dataset,
    model_factory=model_factory,
    loss_function=loss_function,
    callbacks=[early_stopping_callback],
    verbose=2,
)

  0%|          | 0/1 [00:00<?, ?it/s]Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).


Epoch 1/1000
1/1 - 1s - loss: 2.0372 - accuracy: 0.1571 - val_loss: 1.9416 - val_accuracy: 0.3160 - 575ms/epoch - 575ms/step
Epoch 2/1000
1/1 - 0s - loss: 1.9815 - accuracy: 0.1786 - val_loss: 1.9388 - val_accuracy: 0.3160 - 43ms/epoch - 43ms/step
Epoch 3/1000
1/1 - 0s - loss: 1.9166 - accuracy: 0.2000 - val_loss: 1.9368 - val_accuracy: 0.3160 - 38ms/epoch - 38ms/step
Epoch 4/1000
1/1 - 0s - loss: 1.8373 - accuracy: 0.2714 - val_loss: 1.9351 - val_accuracy: 0.3160 - 45ms/epoch - 45ms/step
Epoch 5/1000
1/1 - 0s - loss: 1.7520 - accuracy: 0.3357 - val_loss: 1.9350 - val_accuracy: 0.3160 - 44ms/epoch - 44ms/step
Epoch 6/1000
1/1 - 0s - loss: 1.6813 - accuracy: 0.4143 - val_loss: 1.9324 - val_accuracy: 0.3160 - 46ms/epoch - 46ms/step
Epoch 7/1000
1/1 - 0s - loss: 1.6388 - accuracy: 0.4286 - val_loss: 1.9334 - val_accuracy: 0.1780 - 37ms/epoch - 37ms/step
Epoch 8/1000
1/1 - 0s - loss: 1.5556 - accuracy: 0.5071 - val_loss: 1.9360 - val_accuracy: 0.0780 - 37ms/epoch - 37ms/step
Epoch 9/1000
1

  0%|          | 0/1 [00:02<?, ?it/s]


KeyboardInterrupt: 

In [171]:
# Variant with SteSign as the input quantizer (kernels keep MagnitudeAwareSign)
# and single_batch_norm switched off.
dataset = cora_dataset
input_shapes = (
    dataset.graphs[0].x.shape[1],
    dataset.graphs[0].a.shape[1],
)


def model_factory():
    """Build a quantized GCN with single_batch_norm=False and SteSign inputs."""
    return generate_quantized_gcn(
        channels=64,
        input_shapes=input_shapes,
        dataset=dataset,
        dropout_rate=0.2,
        layers=2,
        input_quantizer=lq.quantizers.SteSign,
        kernel_quantizer=lq.quantizers.MagnitudeAwareSign,
        kernel_regularizer=None,
        batch_norm_momentum=0.7,
        batch_norm_scale=True,
        batch_norm_center=False,
        single_batch_norm=False,
    )[0]


num_runs = 1
epochs = 2000

early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    patience=400,
    restore_best_weights=True,
)
# Disabled alternative: halve the learning rate when progress stalls.
#lr_callback = tf.keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=200)
loss_function = tf.keras.losses.CategoricalCrossentropy(
    from_logits=False,
    reduction="sum",
)
# weight_decay is zero here, so no decay is actually requested from AdamW.
optimizer = tf.keras.optimizers.experimental.AdamW(
    learning_rate=0.001,
    epsilon=1e-8,
    weight_decay=0.000,
)

run_experiment(
    experiment_name="binary_gcn_multiple_batchnorm_ste_sign",
    num_runs=num_runs,
    epochs=epochs,
    optimizer=optimizer,
    dataset=dataset,
    model_factory=model_factory,
    loss_function=loss_function,
    callbacks=[early_stopping_callback],
    verbose=2,
)

  0%|          | 0/1 [00:00<?, ?it/s]Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).
Using a binary weight quantizer without setting `kernel_constraint` may result in starved weights (where the gradient is always zero).


Epoch 1/2000
1/1 - 1s - loss: 2.3258 - accuracy: 0.1000 - val_loss: 2.2232 - val_accuracy: 0.1120 - lr: 0.0010 - 706ms/epoch - 706ms/step
Epoch 2/2000
1/1 - 0s - loss: 2.1727 - accuracy: 0.1714 - val_loss: 2.2935 - val_accuracy: 0.0560 - lr: 0.0010 - 52ms/epoch - 52ms/step
Epoch 3/2000
1/1 - 0s - loss: 1.9992 - accuracy: 0.1929 - val_loss: 2.2615 - val_accuracy: 0.0740 - lr: 0.0010 - 44ms/epoch - 44ms/step
Epoch 4/2000
1/1 - 0s - loss: 2.0667 - accuracy: 0.1786 - val_loss: 2.3084 - val_accuracy: 0.1040 - lr: 0.0010 - 40ms/epoch - 40ms/step
Epoch 5/2000
1/1 - 0s - loss: 1.9024 - accuracy: 0.2500 - val_loss: 2.2976 - val_accuracy: 0.1060 - lr: 0.0010 - 40ms/epoch - 40ms/step
Epoch 6/2000
1/1 - 0s - loss: 1.9347 - accuracy: 0.2643 - val_loss: 2.0803 - val_accuracy: 0.1460 - lr: 0.0010 - 43ms/epoch - 43ms/step
Epoch 7/2000
1/1 - 0s - loss: 1.8294 - accuracy: 0.2857 - val_loss: 2.2274 - val_accuracy: 0.1340 - lr: 0.0010 - 41ms/epoch - 41ms/step
Epoch 8/2000
1/1 - 0s - loss: 1.7676 - accurac

100%|██████████| 1/1 [00:45<00:00, 45.43s/it]

1 runs completed: 1.0000 mean train acc, 0.7680 mean val acc, 0.7760 mean test acc





# Experiment 6: Effect of l2 regularization vs AdamW, using binary regularizers

In [None]:
# Shared setup for the regularization experiments below: work on Cora and
# read the model input sizes from its first graph.
dataset = cora_dataset
input_shapes = (
    dataset.graphs[0].x.shape[1],
    dataset.graphs[0].a.shape[1],
)

## Using AdamW

In [None]:

def model_factory():
    """Build a quantized GCN with a single batch norm and no kernel regularizer."""
    return generate_quantized_gcn(
        channels=64,
        input_shapes=input_shapes,
        dataset=dataset,
        dropout_rate=0.4,
        layers=2,
        input_quantizer=lq.quantizers.MagnitudeAwareSign,
        kernel_quantizer=lq.quantizers.MagnitudeAwareSign,
        kernel_regularizer=None,
        batch_norm_scale=True,
        batch_norm_center=True,
        single_batch_norm=True,
    )[0]


num_runs = 5
epochs = 1000

# Sweep the decoupled weight decay applied by AdamW.
for weight_decay in [0.01, 0.001, 0.0005, 0.0001]:
    optimizer = tf.keras.optimizers.experimental.AdamW(
        learning_rate=0.001,
        epsilon=1e-8,
        weight_decay=weight_decay,
    )
    loss_function = tf.keras.losses.CategoricalCrossentropy(
        from_logits=False,
        reduction="sum",
    )
    early_stopping_callback = tf.keras.callbacks.EarlyStopping(
        patience=1000,
        restore_best_weights=True,
    )

    run_experiment(
        experiment_name=f"binary_gcn_adamw_decay={weight_decay}",
        num_runs=num_runs,
        epochs=epochs,
        optimizer=optimizer,
        dataset=dataset,
        model_factory=model_factory,
        loss_function=loss_function,
        callbacks=[early_stopping_callback],
        verbose=0,
    )



## Using binary regularizers

In [None]:
from regularizers import BinaryL1Regularizer, BinaryL2Regularizer

In [None]:

# Regularization strength handed to BinaryL1Regularizer.
weight_decay = 0.00001


def model_factory():
    """Build a quantized GCN whose kernels use a BinaryL1Regularizer."""
    return generate_quantized_gcn(
        channels=64,
        input_shapes=input_shapes,
        dataset=dataset,
        dropout_rate=0.4,
        layers=2,
        input_quantizer=lq.quantizers.MagnitudeAwareSign,
        kernel_quantizer=lq.quantizers.MagnitudeAwareSign,
        kernel_regularizer=BinaryL1Regularizer(strength=weight_decay),
        batch_norm_momentum=0.9,
        batch_norm_scale=True,
        batch_norm_center=True,
        single_batch_norm=True,
    )[0]


num_runs = 1
epochs = 1000

early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    patience=200,
    restore_best_weights=True,
)
loss_function = tf.keras.losses.CategoricalCrossentropy(
    from_logits=False,
    reduction="sum",
)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.002, epsilon=1e-8)

run_experiment(
    experiment_name=f"binary_gcn_binaryl1_wd={weight_decay}",
    num_runs=num_runs,
    epochs=epochs,
    optimizer=optimizer,
    dataset=dataset,
    model_factory=model_factory,
    loss_function=loss_function,
    callbacks=[early_stopping_callback],
    verbose=2,
)

In [None]:
# Regularization strength handed to BinaryL2Regularizer.
weight_decay = 0.00001


def model_factory():
    """Build a quantized GCN whose kernels use a BinaryL2Regularizer."""
    return generate_quantized_gcn(
        channels=64,
        input_shapes=input_shapes,
        dataset=dataset,
        dropout_rate=0.4,
        layers=2,
        input_quantizer=lq.quantizers.MagnitudeAwareSign,
        kernel_quantizer=lq.quantizers.MagnitudeAwareSign,
        kernel_regularizer=BinaryL2Regularizer(strength=weight_decay),
        batch_norm_momentum=0.9,
        batch_norm_scale=True,
        batch_norm_center=True,
        single_batch_norm=True,
    )[0]


optimizer = tf.keras.optimizers.Adam(learning_rate=0.002, epsilon=1e-8)
loss_function = tf.keras.losses.CategoricalCrossentropy(from_logits=False, reduction="sum")
early_stopping_callback = tf.keras.callbacks.EarlyStopping(patience=200, restore_best_weights=True)

num_runs = 1
epochs = 1000

# The name identifies the L2 regularizer so this run is not confused with
# the BinaryL1Regularizer run, which uses "binaryl1" in its name.
run_experiment(
    experiment_name=f"binary_gcn_binaryl2_wd={weight_decay}",
    num_runs=num_runs,
    epochs=epochs,
    optimizer=optimizer,
    dataset=dataset,
    model_factory=model_factory,
    loss_function=loss_function,
    callbacks=[early_stopping_callback],
    verbose=2,
)

# Experiment 8: Increase number or width of layers (compare to standard GCN)

In [None]:
# Sweep the hidden width (number of channels) of the quantized GCN on Cora.
dataset = cora_dataset
input_shapes = (dataset.graphs[0].x.shape[1], dataset.graphs[0].a.shape[1])

# Pin the batch-norm momentum here instead of relying on whatever value an
# earlier cell's loop variable left in the kernel. 0.99 is the value the
# Experiment 4 momentum sweep ends on, i.e. what a top-to-bottom run would
# have picked up implicitly.
batch_norm_momentum = 0.99

num_runs = 20
epochs = 1000

for num_channels in [64, 128, 256]:

    def model_factory():
        """Build a quantized GCN with the channel count of the current sweep step."""
        return generate_quantized_gcn(
            channels=num_channels,
            input_shapes=input_shapes,
            dataset=dataset,
            dropout_rate=0.4,
            layers=2,
            input_quantizer=lq.quantizers.MagnitudeAwareSign,
            kernel_quantizer=lq.quantizers.MagnitudeAwareSign,
            kernel_regularizer=None,
            batch_norm_momentum=batch_norm_momentum,
            batch_norm_scale=True,
            batch_norm_center=True,
        )[0]

    # New optimizer, loss and callback objects for every width.
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, epsilon=1e-8)
    loss_function = tf.keras.losses.CategoricalCrossentropy(from_logits=False, reduction="sum")
    early_stopping_callback = tf.keras.callbacks.EarlyStopping(patience=1000, restore_best_weights=True)

    # Name each run after the swept quantity so the three widths stay
    # distinguishable (the name previously encoded the batch-norm momentum,
    # which is constant across this loop).
    run_experiment(
        experiment_name=f"binary_gcn_channels={num_channels}",
        num_runs=num_runs,
        epochs=epochs,
        optimizer=optimizer,
        dataset=dataset,
        model_factory=model_factory,
        loss_function=loss_function,
        callbacks=[early_stopping_callback],
        verbose=1,
    )

# Experiment: Use SGD with momentum and a LR schedule

In [None]:
from lr_schedules import WarmUpCosineDecayScheduler

In [None]:
# Quantized GCN trained with SGD + momentum and a warm-up / cosine-decay
# learning-rate schedule.
dataset = cora_dataset
input_shapes = (dataset.graphs[0].x.shape[1], dataset.graphs[0].a.shape[1])

# Pin the batch-norm momentum here instead of relying on whatever value an
# earlier cell's loop variable left in the kernel. 0.99 is the value the
# Experiment 4 momentum sweep ends on, i.e. what a top-to-bottom run would
# have picked up implicitly.
batch_norm_momentum = 0.99


def model_factory():
    """Build a quantized GCN with trainable batch-norm scale and center."""
    return generate_quantized_gcn(
        channels=64,
        input_shapes=input_shapes,
        dataset=dataset,
        dropout_rate=0.4,
        layers=2,
        input_quantizer=lq.quantizers.MagnitudeAwareSign,
        kernel_quantizer=lq.quantizers.MagnitudeAwareSign,
        kernel_regularizer=None,
        batch_norm_momentum=batch_norm_momentum,
        batch_norm_scale=True,
        batch_norm_center=True,
    )[0]


num_runs = 1
epochs = 1000
sgd_momentum = 0.9
# No learning rate is passed to SGD; presumably the scheduler callback below
# sets it during training -- confirm against WarmUpCosineDecayScheduler.
optimizer = tf.keras.optimizers.SGD(momentum=sgd_momentum)
loss_function = tf.keras.losses.CategoricalCrossentropy(from_logits=False, reduction="sum")
early_stopping_callback = tf.keras.callbacks.EarlyStopping(patience=1000, restore_best_weights=True)
warmup_cosine_scheduler = WarmUpCosineDecayScheduler(
    learning_rate_base=0.1,
    global_step_init=0,
    warmup_steps=50,
    verbose=1,
)

run_experiment(
    experiment_name=f"binary_gcn_sgd_momentum={sgd_momentum}",
    num_runs=num_runs,
    epochs=epochs,
    optimizer=optimizer,
    dataset=dataset,
    model_factory=model_factory,
    loss_function=loss_function,
    callbacks=[early_stopping_callback, warmup_cosine_scheduler],
    verbose=2,
)

# Experiment 9: Use Bop Optimizer

In [None]:
# Bop training notes:
# Warming up and cooling down gamma seems to have a good effect
# Also, resetting the optimizer every x train steps helps with the accumulated gradients
# Small adam learning rate seems to be helpful
# Some ok param settings:
    # gamma_initial = 0
    # gamma_target = 1e-08
    # gamma_decay = 0.99
    # warmup = 5
    # threshold = 1e-8
    # threshold_increase = 1.0



In [None]:
# Train a quantized GCN with larq's Bop optimizer for the binary variables and
# Adam for everything else. Training proceeds in short rounds; the optimizer is
# rebuilt and the model recompiled at the start of every round, with gamma
# warmed up over the first rounds and decayed afterwards.
dataset = cora_dataset
input_shapes = (dataset.graphs[0].x.shape[1], dataset.graphs[0].a.shape[1])
loader_tr, loader_va, loader_te = generate_data_loaders(dataset)
model = generate_quantized_gcn(
    channels=64,
    input_shapes=input_shapes,
    dataset=dataset,
    dropout_rate=0.,
    layers=2,
    input_quantizer=lq.quantizers.MagnitudeAwareSign,
    kernel_quantizer=lq.quantizers.MagnitudeAwareSign,
    batch_norm_momentum=0.9,
    batch_norm_scale=True,
    batch_norm_center=True,
    single_batch_norm=True,
    kernel_constraint="weight_clip",
    kernel_initializer="he_uniform")[0]
# initial_optimizer and early_stopping_callback are not used in this cell.
initial_optimizer=tf.keras.optimizers.Adam(0.01)
loss_function = tf.keras.losses.CategoricalCrossentropy(from_logits=False, reduction="sum")
early_stopping_callback = tf.keras.callbacks.EarlyStopping(patience=200, restore_best_weights=True)


# Gamma schedule: linear warm-up from gamma_initial to gamma_target over the
# first `warmup` rounds, then multiplied by gamma_decay after every round.
gamma_initial = 0
gamma_target = 1e-08
gamma_decay = 0.992
warmup = 5

# Threshold schedule: multiplied by threshold_decay after every round.
threshold = 4e-8
threshold_decay = 1.0
epochs = 20  # epochs per round

training_history = {"loss":[], "accuracy":[], "val_loss":[], "val_accuracy":[], "gamma":[], "threshold":[]}

# Give gamma a value up front so the loop also works with warmup == 0
# (it was previously only assigned inside the warm-up branch).
gamma = gamma_target
for i in range(100):

    if i < warmup:
        # With gamma_initial == 0 this equals gamma_target * (i + 1) / warmup.
        gamma = gamma_initial + (gamma_target - gamma_initial) * (i + 1) / warmup

    print(f"train run {i}, threshold={threshold:.3}, gamma={gamma:.3}" )

    # Bop handles the variables it recognises as binary; Adam handles the rest.
    case_optimizer = lq.optimizers.CaseOptimizer(
        (
            lq.optimizers.Bop.is_binary_variable,  # predicate
            lq.optimizers.Bop(threshold=threshold, gamma=gamma),  # optimizer
        ),
        default_optimizer=tf.keras.optimizers.Adam(0.0001))

    model.compile(
        optimizer=case_optimizer,
        loss=loss_function,
        weighted_metrics=["accuracy"])

    history = model.fit(
        loader_tr.load(),
        steps_per_epoch=loader_tr.steps_per_epoch,
        validation_data=loader_va.load(),
        validation_steps=loader_va.steps_per_epoch,
        epochs=epochs,
        callbacks=[],
        verbose=1
    )
    # Record the schedule values once per epoch actually logged, so these
    # columns always line up with the loss/accuracy columns.
    epochs_logged = len(history.history["loss"])
    history.history["gamma"] = [gamma] * epochs_logged
    history.history["threshold"] = [threshold] * epochs_logged
    #model.evaluate(loader_va.load(), steps=loader_va.steps_per_epoch, verbose=2)
    gamma *= gamma_decay
    threshold *= threshold_decay
    # Append this round's per-epoch values to the running history.
    for k, v in history.history.items():
        training_history[k] += v



In [None]:
import matplotlib.pyplot as plt
import pandas as pd  # the code below uses the `pd` alias

# Plot the accumulated training history: accuracy curves in one figure, and
# loss curves (log scale) with threshold/gamma on a secondary y-axis in another.
history_df = pd.DataFrame(training_history)
history_df[["accuracy", "val_accuracy"]].plot()
ax = history_df[["loss", "val_loss"]].plot()
ax.set_yscale("log")
history_df[["threshold", "gamma"]].plot(ax=ax.twinx(), )

In [None]:
# Snapshot the model's current weights so they can be restored with model.set_weights.
checkpoint = model.get_weights()

In [None]:
import numpy as np

# Go back to the saved weights before continuing training.
model.set_weights(checkpoint)

# Bop is applied to the variables it recognises as binary. No default
# optimizer is given -- presumably the remaining variables are then left
# untrained; confirm against the larq CaseOptimizer documentation.
case_optimizer = lq.optimizers.CaseOptimizer(
    (
        lq.optimizers.Bop.is_binary_variable,  # predicate
        lq.optimizers.Bop(threshold=1e-8, gamma=0.5e-8),  # optimizer
    ),
    default_optimizer=None,
)

model.compile(
    optimizer=case_optimizer,
    loss=loss_function,
    weighted_metrics=["accuracy"],
)

history = model.fit(
    loader_tr.load(),
    steps_per_epoch=loader_tr.steps_per_epoch,
    validation_data=loader_va.load(),
    validation_steps=loader_va.steps_per_epoch,
    epochs=200,
    callbacks=[],
    verbose=0,
)

# Training vs. validation loss for this fine-tuning phase.
pd.DataFrame(history.history)[["loss", "val_loss"]].plot()
pd.DataFrame(history.history)[["loss", "val_loss"]].plot()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd  # the code below uses the `pd` alias

# Plot the accumulated training history: accuracy curves in one figure, and
# loss curves (log scale) with threshold/gamma on a secondary y-axis in another.
history_df = pd.DataFrame(training_history)
history_df[["accuracy", "val_accuracy"]].plot()
ax = history_df[["loss", "val_loss"]].plot()
ax.set_yscale("log")
history_df[["threshold", "gamma"]].plot(ax=ax.twinx(), )

In [None]:
import matplotlib.pyplot as plt
import pandas as pd  # the code below uses the `pd` alias

# Plot the accumulated training history in three figures: accuracy curves,
# loss curves on a log scale, and the threshold/gamma schedule.
history_df = pd.DataFrame(training_history)
history_df[["accuracy", "val_accuracy"]].plot()
ax = history_df[["loss", "val_loss"]].plot()
ax.set_yscale("log")
history_df[["threshold", "gamma"]].plot()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd  # the code below uses the `pd` alias

# Plot the accumulated training history: accuracy curves and loss curves.
history_df = pd.DataFrame(training_history)
history_df[["accuracy", "val_accuracy"]].plot()
history_df[["loss", "val_loss"]].plot()

In [None]:
import matplotlib.pyplot as plt

# Draw training and validation accuracy on the current pyplot figure.
train_accuracy = training_history["accuracy"]
val_accuracy = training_history["val_accuracy"]
plt.plot(train_accuracy)
plt.plot(val_accuracy)

In [None]:
# Hyperp

# Experiment 7: Hyperparameter tuning (allow the quantizers to change as part of this)

# Experiment 10: Use two-part learning, start with full precision, then do second phase of binarized learning

In [None]:
# Dataset used by the two-phase experiment; change this one line to switch datasets.
dataset = cora_dataset

# Derive the input shapes from `dataset` (previously hard-wired to `cora_dataset`)
# so that they cannot drift apart from the dataset handed to the model builder.
input_shapes = (dataset.graphs[0].x.shape[1], dataset.graphs[0].a.shape[1])

# Phase 1 model: built with the standard (non-quantized) GCN generator, using
# the MagnitudeAwareSign quantizer as activation and batch norm enabled.
# `generate_standard_gcn` returns a sequence; only its first element is kept.
full_precision_model = generate_standard_gcn(
    channels=64,
    input_shapes=input_shapes,
    dataset=dataset,
    dropout_rate=0.4,
    layers=2,
    activation=lq.quantizers.MagnitudeAwareSign,
    batch_norm_momentum=0.9,
    batch_norm_scale=True,
    batch_norm_center=True,
    single_batch_norm=True,
    use_batch_norm=True,
    preactivation=True,
)[0]

In [None]:
# Training setup for the full-precision phase.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
# reduction="sum" must be given explicitly so the loss is scaled correctly;
# otherwise it would be divided by the batch size.
loss_function = tf.keras.losses.CategoricalCrossentropy(
    from_logits=False,
    reduction="sum",
)
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    patience=100,
    restore_best_weights=True,
)

epochs = 1000
loader_tr, loader_va, loader_te = generate_data_loaders(dataset)

full_precision_model.compile(
    optimizer=optimizer,
    loss=loss_function,
    weighted_metrics=["accuracy"],
)

# Fit silently; early stopping may end the run before `epochs` is reached.
history = full_precision_model.fit(
    loader_tr.load(),
    steps_per_epoch=loader_tr.steps_per_epoch,
    validation_data=loader_va.load(),
    validation_steps=loader_va.steps_per_epoch,
    epochs=epochs,
    callbacks=[early_stopping_callback],
    verbose=0,
)

# Number of epochs actually run, followed by the validation metrics.
epochs_run = len(history.history["loss"])
print("Num epochs: ", epochs_run)
full_precision_model.evaluate(
    loader_va.load(),
    steps=loader_va.steps_per_epoch,
    verbose=2,
)

In [None]:
def initialize_from_full_precision(full_precision_model):
    """Create a quantized GCN whose weights are copied from a trained model.

    A new model is built with `generate_quantized_gcn`, run once on the test
    loader so that it is built, and then every layer receives the weights of
    the layer at the same position in `full_precision_model`.

    Reads the notebook-level names `input_shapes`, `dataset` and `loader_te`.

    Args:
        full_precision_model: Model whose per-layer weights are copied.

    Returns:
        The newly created quantized model (not compiled).

    Note:
        Layers are paired positionally with `zip`, which stops at the shorter
        of the two layer lists.
    """
    quantized_model = generate_quantized_gcn(
        channels=64,
        input_shapes=input_shapes,
        dataset=dataset,
        dropout_rate=0.4,
        layers=2,
        input_quantizer=lq.quantizers.MagnitudeAwareSign,
        kernel_quantizer=lq.quantizers.MagnitudeAwareSign,
        batch_norm_momentum=0.9,
        batch_norm_scale=True,
        batch_norm_center=True,
        single_batch_norm=True,
    )[0]

    # A forward pass is required to build the model before weights can be set.
    quantized_model.predict(loader_te.load(), steps=loader_te.steps_per_epoch)

    layer_pairs = zip(quantized_model.layers, full_precision_model.layers)
    for target_layer, source_layer in layer_pairs:
        target_layer.set_weights(source_layer.get_weights())

    return quantized_model

In [None]:
# Quantized model initialised from the trained full-precision weights.
bin_model = initialize_from_full_precision(full_precision_model)

# Compile only so that the model can be evaluated right after the weight copy.
bin_model_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005, epsilon=1e-8)
bin_model_loss = tf.keras.losses.CategoricalCrossentropy(
    from_logits=False,
    reduction="sum",
)
bin_model.compile(
    optimizer=bin_model_optimizer,
    loss=bin_model_loss,
    weighted_metrics=["accuracy"],
)

# Validation metrics before any further training.
bin_model.evaluate(
    loader_va.load(),
    steps=loader_va.steps_per_epoch,
    verbose=2,
)

In [None]:
def model_factory():
    """Return a fresh quantized model initialised from `full_precision_model`."""
    return initialize_from_full_precision(full_precision_model)


# NOTE(review): `no_op_quantizer`, `layer` and `case_optimizer` are created
# here but are not passed to `run_experiment` below, which receives the plain
# Adam `optimizer` instead -- confirm whether Bop was meant to be used.
no_op_quantizer = lq.quantizers.NoOpQuantizer(precision=1)
layer = lq.layers.QuantDense(16, kernel_quantizer=no_op_quantizer)
bop_pair = (
    lq.optimizers.Bop.is_binary_variable,  # predicate
    lq.optimizers.Bop(threshold=1e-7, gamma=1e-4),  # optimizer
)
case_optimizer = lq.optimizers.CaseOptimizer(
    bop_pair,
    default_optimizer=tf.keras.optimizers.Adam(0.001),
)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, epsilon=1e-8)
loss_function = tf.keras.losses.CategoricalCrossentropy(
    from_logits=False,
    reduction="sum",
)
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    patience=200,
    restore_best_weights=True,
)

num_runs = 1
epochs = 1000

# Second phase: train the quantized model, logging to TensorBoard as well.
run_experiment(
    experiment_name="binary_gcn_fp_init",
    num_runs=num_runs,
    epochs=epochs,
    optimizer=optimizer,
    dataset=dataset,
    model_factory=model_factory,
    loss_function=loss_function,
    callbacks=[
        early_stopping_callback,
        tf.keras.callbacks.TensorBoard("tensorboard"),
    ],
    verbose=2,
)

# Experiment 11: Learning rate schedules

In [None]:
def model_factory():
    """Build a fresh quantized GCN for the learning-rate-schedule experiment."""
    return generate_quantized_gcn(
        channels=64,
        input_shapes=input_shapes,
        dataset=dataset,
        dropout_rate=0.4,
        layers=2,
        input_quantizer=lq.quantizers.MagnitudeAwareSign,
        kernel_quantizer=lq.quantizers.MagnitudeAwareSign,
        batch_norm_momentum=0.9,
        batch_norm_scale=True,
        batch_norm_center=True,
        single_batch_norm=True,
    )[0]


optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, epsilon=1e-8)
loss_function = tf.keras.losses.CategoricalCrossentropy(from_logits=False, reduction="sum")
# Multiply the learning rate by 0.2 after 100 epochs without improvement.
learning_rate_callback = tf.keras.callbacks.ReduceLROnPlateau(factor=0.2, patience=100)

num_runs = 1
epochs = 1000

# Build the model from the factory defined in this cell. Previously the cell
# trained whatever `model` happened to be left over from an earlier cell, so
# `model_factory` was never used.
model = model_factory()

model.compile(
    optimizer=optimizer,
    loss=loss_function,
    weighted_metrics=["accuracy"],
)

# Pass the schedule callback to `fit`. Previously `callbacks=[]` meant the
# learning rate never changed, despite the experiment's title.
# `loader_tr` / `loader_va` come from the Experiment 10 cells.
history = model.fit(
    loader_tr.load(),
    steps_per_epoch=loader_tr.steps_per_epoch,
    validation_data=loader_va.load(),
    validation_steps=loader_va.steps_per_epoch,
    epochs=epochs,
    callbacks=[learning_rate_callback],
    verbose=1,
)

# Experiment 12: Ternary version

In [None]:
def model_factory():
    """Build a fresh quantized GCN for the `ternary_gcn` experiment."""
    # NOTE(review): both quantizers are still MagnitudeAwareSign, the same as
    # in the binary experiments -- confirm whether a ternary quantizer was
    # meant to be used here.
    return generate_quantized_gcn(
        channels=64,
        input_shapes=input_shapes,
        dataset=dataset,
        dropout_rate=0.4,
        layers=2,
        input_quantizer=lq.quantizers.MagnitudeAwareSign,
        kernel_quantizer=lq.quantizers.MagnitudeAwareSign,
        batch_norm_momentum=0.9,
        batch_norm_scale=True,
        batch_norm_center=True,
        single_batch_norm=True,
    )[0]


optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, epsilon=1e-8)
loss_function = tf.keras.losses.CategoricalCrossentropy(
    from_logits=False,
    reduction="sum",
)
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    patience=500,
    restore_best_weights=True,
)

num_runs = 1
epochs = 1000

run_experiment(
    experiment_name="ternary_gcn",
    num_runs=num_runs,
    epochs=epochs,
    optimizer=optimizer,
    dataset=dataset,
    model_factory=model_factory,
    loss_function=loss_function,
    callbacks=[early_stopping_callback],
    verbose=2,
)

# Additional Experiment: Sign quantizers
SteSign seems to do much worse than MagnitudeAwareSign. Is this because of the scale factor, or just bad hyperparameter tuning?
TODO: look into whether the input quantizer can be changed to `ste_sign`.