# Learning from Heterogeneous Graphs

In [1]:
import torch
!pip install -q torch-scatter~=2.1.0 torch-sparse~=0.6.16 torch-cluster~=1.6.0 torch-spline-conv~=1.2.1 torch-geometric==2.2.0 -f https://data.pyg.org/whl/torch-{torch.__version__}.html

torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [2]:
import numpy as np
np.random.seed(0)

import torch
torch.manual_seed(0)
from torch.nn import Linear
from torch_geometric.nn import MessagePassing
from torch_geometric.utils import add_self_loops, degree

class GCNConv(MessagePassing):
    def __init__(self, dim_in, dim_h):
        super().__init__(aggr='add')
        self.linear = Linear(dim_in, dim_h, bias=False)

    def forward(self, x, edge_index):
        edge_index, _ = add_self_loops(edge_index, num_nodes=x.size(0))

        x = self.linear(x)

        row, col = edge_index
        deg = degree(col, x.size(0), dtype=x.dtype)
        deg_inv_sqrt = deg.pow(-0.5)
        deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
        norm = deg_inv_sqrt[row] * deg_inv_sqrt[col]

        out = self.propagate(edge_index, x=x, norm=norm)

        return out

    def message(self, x, norm):
        return norm.view(-1, 1) * x

In [3]:
conv = GCNConv(16, 32)

## Heterogeneous graphs

In [4]:
from torch_geometric.data import HeteroData

data = HeteroData()

data['user'].x = torch.Tensor([[1, 1, 1, 1], [2, 2, 2, 2], [3, 3, 3, 3]]) # [num_users, num_features_users]
data['game'].x = torch.Tensor([[1, 1], [2, 2]])
data['dev'].x = torch.Tensor([[1], [2]])

data['user', 'follows', 'user'].edge_index = torch.Tensor([[0, 1], [1, 2]]) # [2, num_edges_follows]
data['user', 'plays', 'game'].edge_index = torch.Tensor([[0, 1, 1, 2], [0, 0, 1, 1]])
data['dev', 'develops', 'game'].edge_index = torch.Tensor([[0, 1], [0, 1]])

data['user', 'plays', 'game'].edge_attr = torch.Tensor([[2], [0.5], [10], [12]])

data

HeteroData(
  [1muser[0m={ x=[3, 4] },
  [1mgame[0m={ x=[2, 2] },
  [1mdev[0m={ x=[2, 1] },
  [1m(user, follows, user)[0m={ edge_index=[2, 2] },
  [1m(user, plays, game)[0m={
    edge_index=[2, 4],
    edge_attr=[4, 1]
  },
  [1m(dev, develops, game)[0m={ edge_index=[2, 2] }
)

In [5]:
from torch import nn
import torch.nn.functional as F

import torch_geometric.transforms as T
from torch_geometric.datasets import DBLP
from torch_geometric.nn import GAT

metapaths = [[('author', 'paper'), ('paper', 'author')]]
transform = T.AddMetaPaths(metapaths=metapaths, drop_orig_edge_types=True)
dataset = DBLP('.', transform=transform)
data = dataset[0]
print(data)

model = GAT(in_channels=-1, hidden_channels=64, out_channels=4, num_layers=1)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.001)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data, model = data.to(device), model.to(device)

@torch.no_grad()
def test(mask):
    model.eval()
    pred = model(data.x_dict['author'], data.edge_index_dict[('author', 'metapath_0', 'author')]).argmax(dim=-1)
    acc = (pred[mask] == data['author'].y[mask]).sum() / mask.sum()
    return float(acc)

for epoch in range(101):
    model.train()
    optimizer.zero_grad()
    out = model(data.x_dict['author'], data.edge_index_dict[('author', 'metapath_0', 'author')])
    mask = data['author'].train_mask
    loss = F.cross_entropy(out[mask], data['author'].y[mask])
    loss.backward()
    optimizer.step()

    if epoch % 20 == 0:
        train_acc = test(data['author'].train_mask)
        val_acc = test(data['author'].val_mask)
        print(f'Epoch: {epoch:>3} | Train Loss: {loss:.4f} | Train Acc: {train_acc*100:.2f}% | Val Acc: {val_acc*100:.2f}%')

test_acc = test(data['author'].test_mask)
print(f'Test accuracy: {test_acc*100:.2f}%')

Downloading https://www.dropbox.com/s/yh4grpeks87ugr2/DBLP_processed.zip?dl=1


Extracting ./raw/DBLP_processed.zip
Processing...
Done!
  C = torch.sparse.mm(A, B)


HeteroData(
  metapath_dict={ (author, metapath_0, author)=[2] },
  [1mauthor[0m={
    x=[4057, 334],
    y=[4057],
    train_mask=[4057],
    val_mask=[4057],
    test_mask=[4057]
  },
  [1mpaper[0m={ x=[14328, 4231] },
  [1mterm[0m={ x=[7723, 50] },
  [1mconference[0m={ num_nodes=20 },
  [1m(author, metapath_0, author)[0m={ edge_index=[2, 11113] }
)
Epoch:   0 | Train Loss: 1.4351 | Train Acc: 25.25% | Val Acc: 22.00%
Epoch:  20 | Train Loss: 1.2815 | Train Acc: 46.50% | Val Acc: 37.50%
Epoch:  40 | Train Loss: 1.1641 | Train Acc: 63.75% | Val Acc: 53.25%
Epoch:  60 | Train Loss: 1.0628 | Train Acc: 76.50% | Val Acc: 63.25%
Epoch:  80 | Train Loss: 0.9771 | Train Acc: 81.00% | Val Acc: 66.25%
Epoch: 100 | Train Loss: 0.9040 | Train Acc: 83.50% | Val Acc: 67.75%
Test accuracy: 72.43%


In [6]:
from torch_geometric.nn import GATConv, Linear, to_hetero

dataset = DBLP(root='.')
data = dataset[0]

data['conference'].x = torch.zeros(20, 1)

class GAT(torch.nn.Module):
    def __init__(self, dim_h, dim_out):
        super().__init__()
        self.conv = GATConv((-1, -1), dim_h, add_self_loops=False)
        self.linear = nn.Linear(dim_h, dim_out)

    def forward(self, x, edge_index):
        h = self.conv(x, edge_index).relu()
        h = self.linear(h)
        return h

model = GAT(dim_h=64, dim_out=4)
model = to_hetero(model, data.metadata(), aggr='sum')
print(model)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.001)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data, model = data.to(device), model.to(device)

@torch.no_grad()
def test(mask):
    model.eval()
    pred = model(data.x_dict, data.edge_index_dict)['author'].argmax(dim=-1)
    acc = (pred[mask] == data['author'].y[mask]).sum() / mask.sum()
    return float(acc)

for epoch in range(101):
    model.train()
    optimizer.zero_grad()
    out = model(data.x_dict, data.edge_index_dict)['author']
    mask = data['author'].train_mask
    loss = F.cross_entropy(out[mask], data['author'].y[mask])
    loss.backward()
    optimizer.step()

    if epoch % 20 == 0:
        train_acc = test(data['author'].train_mask)
        val_acc = test(data['author'].val_mask)
        print(f'Epoch: {epoch:>3} | Train Loss: {loss:.4f} | Train Acc: {train_acc*100:.2f}% | Val Acc: {val_acc*100:.2f}%')

test_acc = test(data['author'].test_mask)
print(f'Test accuracy: {test_acc*100:.2f}%')

GraphModule(
  (conv): ModuleDict(
    (author__to__paper): GATConv((-1, -1), 64, heads=1)
    (paper__to__author): GATConv((-1, -1), 64, heads=1)
    (paper__to__term): GATConv((-1, -1), 64, heads=1)
    (paper__to__conference): GATConv((-1, -1), 64, heads=1)
    (term__to__paper): GATConv((-1, -1), 64, heads=1)
    (conference__to__paper): GATConv((-1, -1), 64, heads=1)
  )
  (linear): ModuleDict(
    (author): Linear(in_features=64, out_features=4, bias=True)
    (paper): Linear(in_features=64, out_features=4, bias=True)
    (term): Linear(in_features=64, out_features=4, bias=True)
    (conference): Linear(in_features=64, out_features=4, bias=True)
  )
)



def forward(self, x, edge_index):
    x_dict = torch_geometric_nn_to_hetero_transformer_get_dict(x);  x = None
    x__author = x_dict.get('author', None)
    x__paper = x_dict.get('paper', None)
    x__term = x_dict.get('term', None)
    x__conference = x_dict.get('conference', None);  x_dict = None
    edge_index_dict = torch_ge

Epoch:   0 | Train Loss: 1.3974 | Train Acc: 20.75% | Val Acc: 23.00%
Epoch:  20 | Train Loss: 1.2047 | Train Acc: 95.25% | Val Acc: 68.00%
Epoch:  40 | Train Loss: 0.8654 | Train Acc: 96.75% | Val Acc: 67.50%
Epoch:  60 | Train Loss: 0.5061 | Train Acc: 98.75% | Val Acc: 73.50%
Epoch:  80 | Train Loss: 0.2580 | Train Acc: 99.50% | Val Acc: 73.50%
Epoch: 100 | Train Loss: 0.1384 | Train Acc: 100.00% | Val Acc: 74.00%
Test accuracy: 78.63%


## Hierarchical Self-Attention Network (HAN)

original

In [7]:
import torch
import torch.nn.functional as F
from torch import nn

import torch_geometric.transforms as T
from torch_geometric.datasets import DBLP
from torch_geometric.nn import HANConv, Linear


dataset = DBLP('.')
data = dataset[0]
print(data)

data['conference'].x = torch.zeros(20, 1)

class HAN(nn.Module):
    def __init__(self, dim_in, dim_out, dim_h=128, heads=8):
        super().__init__()
        self.han = HANConv(dim_in, dim_h, heads=heads, dropout=0.6, metadata=data.metadata())
        self.linear = nn.Linear(dim_h, dim_out)

    def forward(self, x_dict, edge_index_dict):
        out = self.han(x_dict, edge_index_dict)
        out = self.linear(out['author'])
        return out

model = HAN(dim_in=-1, dim_out=4)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.001)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data, model = data.to(device), model.to(device)

@torch.no_grad()
def test(mask):
    model.eval()
    pred = model(data.x_dict, data.edge_index_dict).argmax(dim=-1)
    acc = (pred[mask] == data['author'].y[mask]).sum() / mask.sum()
    return float(acc)

for epoch in range(101):
    model.train()
    optimizer.zero_grad()
    out = model(data.x_dict, data.edge_index_dict)
    mask = data['author'].train_mask
    loss = F.cross_entropy(out[mask], data['author'].y[mask])
    loss.backward()
    optimizer.step()

    if epoch % 20 == 0:
        train_acc = test(data['author'].train_mask)
        val_acc = test(data['author'].val_mask)
        print(f'Epoch: {epoch:>3} | Train Loss: {loss:.4f} | Train Acc: {train_acc*100:.2f}% | Val Acc: {val_acc*100:.2f}%')

test_acc = test(data['author'].test_mask)
print(f'Test accuracy: {test_acc*100:.2f}%')

HeteroData(
  [1mauthor[0m={
    x=[4057, 334],
    y=[4057],
    train_mask=[4057],
    val_mask=[4057],
    test_mask=[4057]
  },
  [1mpaper[0m={ x=[14328, 4231] },
  [1mterm[0m={ x=[7723, 50] },
  [1mconference[0m={ num_nodes=20 },
  [1m(author, to, paper)[0m={ edge_index=[2, 19645] },
  [1m(paper, to, author)[0m={ edge_index=[2, 19645] },
  [1m(paper, to, term)[0m={ edge_index=[2, 85810] },
  [1m(paper, to, conference)[0m={ edge_index=[2, 14328] },
  [1m(term, to, paper)[0m={ edge_index=[2, 85810] },
  [1m(conference, to, paper)[0m={ edge_index=[2, 14328] }
)
Epoch:   0 | Train Loss: 1.3867 | Train Acc: 32.75% | Val Acc: 26.25%
Epoch:  20 | Train Loss: 1.1576 | Train Acc: 94.75% | Val Acc: 69.25%
Epoch:  40 | Train Loss: 0.7842 | Train Acc: 96.75% | Val Acc: 74.00%
Epoch:  60 | Train Loss: 0.4900 | Train Acc: 98.50% | Val Acc: 78.00%
Epoch:  80 | Train Loss: 0.2945 | Train Acc: 99.25% | Val Acc: 80.00%
Epoch: 100 | Train Loss: 0.2175 | Train Acc: 100.00% | Val A

cải tiến mô hình HAN

version A

In [10]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

import torch_geometric.transforms as T
from torch_geometric.datasets import DBLP
from torch_geometric.nn import HANConv

##############################
# 1. Set Seed for Reproducibility
##############################
def set_seed(seed=42):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Ensure deterministic behavior
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

##############################
# 2. Load the DBLP Dataset
##############################
# We load the full heterogeneous graph. Note that we are not filtering out any node types.
dataset = DBLP(root='.')
data = dataset[0]
print("=== DBLP Dataset Information ===")
print(data)

# For node types missing features, e.g., 'conference', assign dummy features.
if 'conference' in data and 'x' not in data['conference']:
    num_conferences = data['conference'].num_nodes
    # Create a dummy feature vector (here, zeros with dimension 1).
    data['conference'].x = torch.zeros((num_conferences, 1))

# (Optional) Check if any other node type is missing features:
for node_type in data.node_types:
    if 'x' not in data[node_type]:
        num_nodes = data[node_type].num_nodes
        print(f"Node type '{node_type}' is missing features. Assigning dummy features of shape ({num_nodes}, 1).")
        data[node_type].x = torch.zeros((num_nodes, 1))

##############################
# 3. Define the HAN Model
##############################
class HAN(torch.nn.Module):
    def __init__(self, dim_in, dim_out, dim_h=128, heads=8, dropout=0.6):
        """
        Args:
            dim_in (int): Input feature dimension. Set to -1 to let PyG infer per node type.
            dim_out (int): Number of output classes (for DBLP author classification, 4 classes).
            dim_h (int): Hidden dimension.
            heads (int): Number of attention heads.
            dropout (float): Dropout probability.
        """
        super().__init__()
        # HANConv leverages all available meta-relations in the heterogeneous graph.
        self.han_conv = HANConv(dim_in, dim_h, heads=heads, dropout=dropout, metadata=data.metadata())
        self.linear = nn.Linear(dim_h, dim_out)
        self.dropout = dropout

    def forward(self, x_dict, edge_index_dict):
        # HANConv returns a dict mapping each node type to its learned embedding.
        out_dict = self.han_conv(x_dict, edge_index_dict)
        # We only need the 'author' embeddings for our classification task.
        author_emb = out_dict['author']
        author_emb = F.dropout(author_emb, p=self.dropout, training=self.training)
        out = self.linear(author_emb)
        return out

##############################
# 4. Initialize Model, Optimizer, and Device
##############################
model = HAN(dim_in=-1, dim_out=4, dim_h=128, heads=8, dropout=0.6)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
data = data.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.001)

##############################
# 5. Define the Test Function
##############################
@torch.no_grad()
def test(mask):
    model.eval()
    out = model(data.x_dict, data.edge_index_dict)
    pred = out.argmax(dim=-1)
    correct = (pred[mask] == data['author'].y[mask]).sum().item()
    acc = correct / mask.sum().item()
    return acc

##############################
# 6. Training Loop
##############################
num_epochs = 101
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    
    out = model(data.x_dict, data.edge_index_dict)
    loss = F.cross_entropy(out[data['author'].train_mask],
                           data['author'].y[data['author'].train_mask])
    loss.backward()
    optimizer.step()
    
    if epoch % 20 == 0:
        train_acc = test(data['author'].train_mask)
        val_acc = test(data['author'].val_mask)
        print(f"Epoch: {epoch:>3} | Loss: {loss:.4f} | Train Acc: {train_acc*100:.2f}% | Val Acc: {val_acc*100:.2f}%")

##############################
# 7. Final Evaluation on Test Set
##############################
test_acc = test(data['author'].test_mask)
print(f"\nTest Accuracy: {test_acc*100:.2f}%")

=== DBLP Dataset Information ===
HeteroData(
  [1mauthor[0m={
    x=[4057, 334],
    y=[4057],
    train_mask=[4057],
    val_mask=[4057],
    test_mask=[4057]
  },
  [1mpaper[0m={ x=[14328, 4231] },
  [1mterm[0m={ x=[7723, 50] },
  [1mconference[0m={ num_nodes=20 },
  [1m(author, to, paper)[0m={ edge_index=[2, 19645] },
  [1m(paper, to, author)[0m={ edge_index=[2, 19645] },
  [1m(paper, to, term)[0m={ edge_index=[2, 85810] },
  [1m(paper, to, conference)[0m={ edge_index=[2, 14328] },
  [1m(term, to, paper)[0m={ edge_index=[2, 85810] },
  [1m(conference, to, paper)[0m={ edge_index=[2, 14328] }
)
Node type 'conference' is missing features. Assigning dummy features of shape (20, 1).


Epoch:   0 | Loss: 1.3822 | Train Acc: 30.25% | Val Acc: 26.25%
Epoch:  20 | Loss: 1.2255 | Train Acc: 81.25% | Val Acc: 59.50%
Epoch:  40 | Loss: 0.9450 | Train Acc: 91.50% | Val Acc: 67.50%
Epoch:  60 | Loss: 0.6405 | Train Acc: 96.50% | Val Acc: 72.50%
Epoch:  80 | Loss: 0.4617 | Train Acc: 99.00% | Val Acc: 78.75%
Epoch: 100 | Loss: 0.3363 | Train Acc: 99.25% | Val Acc: 78.50%

Test Accuracy: 82.01%


version B

In [8]:
import torch
import torch.nn.functional as F
from torch import nn
import torch_geometric.transforms as T
from torch_geometric.datasets import DBLP
from torch_geometric.nn import HANConv, Linear
from torch.nn import LayerNorm, Dropout

class EnhancedHAN(nn.Module):
    def __init__(self, dim_in, dim_out, dim_h=192, heads=12, num_layers=2):
        super().__init__()
        self.dropout = Dropout(p=0.5)  # Increased dropout
        
        # Fewer layers but with careful regularization
        self.hans = nn.ModuleList([
            HANConv(
                in_channels=-1 if i == 0 else dim_h,
                out_channels=dim_h,
                heads=heads,
                dropout=0.5,  # Increased dropout in attention
                metadata=data.metadata()
            ) for i in range(num_layers)
        ])
        
        self.layer_norms = nn.ModuleList([
            LayerNorm(dim_h) for _ in range(num_layers)
        ])
        
        # Simplified classifier with strong regularization
        self.classifier = nn.Sequential(
            Linear(dim_h, dim_h),
            nn.GELU(),  # Changed to GELU for better regularization
            LayerNorm(dim_h),
            Dropout(0.5),
            Linear(dim_h, dim_h // 2),
            nn.GELU(),
            LayerNorm(dim_h // 2),
            Dropout(0.5),
            Linear(dim_h // 2, dim_out)
        )
        
        # L2 regularization on attention weights
        self.attention_l2 = 0.01

    def forward(self, x_dict, edge_index_dict):
        hidden = x_dict
        attention_weights = []
        
        for i, (han, norm) in enumerate(zip(self.hans, self.layer_norms)):
            new_hidden = han(hidden, edge_index_dict)
            
            # Store attention weights for regularization
            if hasattr(han, 'alpha'):
                attention_weights.append(han.alpha)
            
            # Residual connection with scaling
            if i > 0:
                for node_type in new_hidden.keys():
                    new_hidden[node_type] = 0.8 * new_hidden[node_type] + 0.2 * hidden[node_type]
            
            # Strong regularization after each layer
            for node_type in new_hidden.keys():
                new_hidden[node_type] = self.dropout(norm(new_hidden[node_type]))
            
            hidden = new_hidden
        
        out = self.classifier(hidden['author'])
        
        # L2 regularization on attention
        self.attention_reg = sum(w.pow(2).mean() for w in attention_weights) * self.attention_l2
        
        return out

# Enhanced data preprocessing
transform = T.Compose([
    T.NormalizeFeatures(),
    T.ToUndirected(),
    T.AddSelfLoops(),
    T.RandomNodeSplit(split='train_rest', num_val=0.2, num_test=0.2),  # Stratified split
])

# Load and preprocess dataset
dataset = DBLP('.', transform=transform)
data = dataset[0]
data['conference'].x = torch.zeros(20, 1)

# Model initialization
model = EnhancedHAN(dim_in=-1, dim_out=4)

# Optimizer with reduced learning rate and increased weight decay
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=0.001,  # Reduced learning rate
    weight_decay=0.05,  # Increased weight decay
    betas=(0.9, 0.999)
)

# Cosine annealing scheduler with warm restarts
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer,
    T_0=20,  # Reset every 20 epochs
    T_mult=2,  # Double the reset interval each time
    eta_min=1e-6
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data, model = data.to(device), model.to(device)

@torch.no_grad()
def test(mask):
    model.eval()
    out = model(data.x_dict, data.edge_index_dict)
    pred = out[mask].argmax(dim=-1)
    acc = (pred == data['author'].y[mask]).sum() / mask.sum()
    return float(acc)

# Training with early stopping
best_val_acc = 0
patience = 15
patience_counter = 0
best_state = None

for epoch in range(200):  # Increased max epochs
    model.train()
    optimizer.zero_grad()
    
    out = model(data.x_dict, data.edge_index_dict)
    mask = data['author'].train_mask
    
    # Combined loss with attention regularization
    main_loss = F.cross_entropy(out[mask], data['author'].y[mask])
    reg_loss = model.attention_reg
    loss = main_loss + reg_loss
    
    loss.backward()
    
    # Gradient clipping
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
    
    optimizer.step()
    scheduler.step()
    
    if epoch % 20 == 0:
        train_acc = test(data['author'].train_mask)
        val_acc = test(data['author'].val_mask)
        print(f'Epoch: {epoch:>3} | Train Loss: {loss:.4f} | Train Acc: {train_acc*100:.2f}% | Val Acc: {val_acc*100:.2f}%')
        
        # Early stopping check
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_state = model.state_dict().copy()
            patience_counter = 0
        else:
            patience_counter += 1
            
        if patience_counter >= patience:
            print(f'Early stopping at epoch {epoch}')
            break

# Load best model for final testing
if best_state is not None:
    model.load_state_dict(best_state)
    
test_acc = test(data['author'].test_mask)
print(f'Test accuracy: {test_acc*100:.2f}%')

Epoch:   0 | Train Loss: 1.6305 | Train Acc: 30.10% | Val Acc: 27.13%
Epoch:  20 | Train Loss: 1.3850 | Train Acc: 42.22% | Val Acc: 36.74%
Epoch:  40 | Train Loss: 0.6900 | Train Acc: 91.42% | Val Acc: 87.18%
Epoch:  60 | Train Loss: 0.5256 | Train Acc: 94.29% | Val Acc: 88.90%
Epoch:  80 | Train Loss: 0.1428 | Train Acc: 98.77% | Val Acc: 88.53%
Epoch: 100 | Train Loss: 0.0869 | Train Acc: 99.71% | Val Acc: 88.04%
Epoch: 120 | Train Loss: 0.0609 | Train Acc: 99.88% | Val Acc: 88.41%
Epoch: 140 | Train Loss: 0.0517 | Train Acc: 99.88% | Val Acc: 87.92%
Epoch: 160 | Train Loss: 0.0452 | Train Acc: 99.92% | Val Acc: 87.05%
Epoch: 180 | Train Loss: 0.0314 | Train Acc: 99.92% | Val Acc: 87.30%
Test accuracy: 85.94%
