In [1]:
from pft_encoder import read_data, PreFTDataset, Encoder, evaluate_space, pre_fine_tune, inference
from edge_sampling import SBMGraph, AdaptiveBMGraph
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch_geometric.nn import SAGEConv, GCNConv, GATConv, TransformerConv
from sklearn.metrics import accuracy_score, f1_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Dataset descriptor whose training file is read from the "Noisy/" tree;
# the validation and test files are the same ones real_config points at.
noisy_config = dict(
    name="Structured/Amazon-GoogleBert",
    task_type="classification",
    vocab=["0", "1"],
    trainset="Noisy/er_magellan/Structured/Amazon-Google/Amazon-GoogleBert/train.txt",
    validset="data/er_magellan/Structured/Amazon-Google/valid.txt",
    testset="data/er_magellan/Structured/Amazon-Google/test.txt",
)

# Dataset descriptor whose train/valid/test files all live under "data/".
real_config = dict(
    name="Structured/Amazon-Google",
    task_type="classification",
    vocab=["0", "1"],
    trainset="data/er_magellan/Structured/Amazon-Google/train.txt",
    validset="data/er_magellan/Structured/Amazon-Google/valid.txt",
    testset="data/er_magellan/Structured/Amazon-Google/test.txt",
)

In [3]:
# Training file used for the rest of the notebook (same location as
# noisy_config["trainset"]); the commented line selects the file under data/ instead.
path = "Noisy/er_magellan/Structured/Amazon-Google/Amazon-GoogleBert/train.txt"
# path = 'data/er_magellan/Structured/Amazon-Google/train.txt'

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# read_data yields three values for the file: left records, right records, labels.
left_samples, right_samples, labels = read_data(path)

In [None]:
# One dataset instance backs both loaders (max_len=128 is handed to PreFTDataset).
pft_dataset = PreFTDataset(path, max_len=128)

# Same data and batch size twice: shuffled batches for training, and
# fixed-order batches under the name inference_dataloader.
pft_dataloader = DataLoader(dataset=pft_dataset, batch_size=32, shuffle=True)
inference_dataloader = DataLoader(dataset=pft_dataset, batch_size=32, shuffle=False)

# Encoder plus the loss and optimizer passed to pre_fine_tune later on.
model = Encoder(device=device)
optimizer = optim.Adam(params=model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss()

# Kept as the final expression of the cell: moves the encoder onto `device`
# and, because Module.to returns the module, the notebook displays it.
model.to(device)

In [5]:
# Call pre_fine_tune for a single epoch (epochs=1) on the shuffled training
# loader, using the Adam optimizer and cross-entropy criterion built above.
pre_fine_tune(model, pft_dataloader, optimizer, criterion, device, epochs=1)


100%|██████████| 215/215 [00:21<00:00, 10.03it/s]

Epoch [1/1], Loss: 0.203425990460917





In [6]:
# evaluate_space hands back five values; of these only `gap` is inspected in
# the following cell.
# NOTE(review): presumably positive/negative are per-class embeddings and gap
# is the distance between the two centroids — confirm in pft_encoder.
positive, negative, pos_centroid, neg_centroid, gap = evaluate_space(model, pft_dataloader, device)

100%|██████████| 215/215 [00:06<00:00, 31.42it/s]


In [7]:
# Show the gap as a NumPy value: drop the autograd graph, then copy to host memory.
gap.detach().cpu().numpy()

array(12.397679, dtype=float32)

In [8]:
# Compute the encoder outputs with the unshuffled loader (shuffle=False) so the
# rows of X come back in dataset order. The training loader is built with
# shuffle=True, which would return the rows in a different random order on every
# run; X is later indexed with train_mask and scored against y in the GNN cell,
# so a shuffled X would no longer line up with those labels.
# NOTE(review): assumes the graph's node order follows the dataset order — confirm
# in edge_sampling.
X = inference(model, inference_dataloader)

In [None]:
class GNN(nn.Module):
    """A plain stack of GATConv layers applied back to back.

    The first layer maps ``in_channels`` to ``out_channels``; each of the
    remaining ``num_layers - 1`` layers maps ``out_channels`` to
    ``out_channels``. No activation or dropout is inserted between layers.
    """

    def __init__(self, in_channels, out_channels, num_layers):
        """Create the layer stack.

        Args:
            in_channels: feature width expected by the first layer.
            out_channels: feature width produced by every layer.
            num_layers: requested depth; at least one layer is always built,
                even for values below 1.
        """
        super().__init__()
        # Input width of each layer, in order: the first one differs, the rest repeat.
        input_widths = [in_channels] + [out_channels] * max(num_layers - 1, 0)
        self.convs = nn.ModuleList(
            [GATConv(width, out_channels) for width in input_widths]
        )

    def forward(self, x, edge_index):
        """Pass ``x`` through every layer in order, reusing ``edge_index`` each time."""
        hidden = x
        for layer in self.convs:
            hidden = layer(hidden, edge_index)
        return hidden

In [None]:
# Configure an AdaptiveBMGraph from the two dataset descriptors defined earlier.
sbm = AdaptiveBMGraph(
    p=0.005,
    q=0.0005,
    config_true=real_config,
    config_noisy=noisy_config,
    c0=50,
    c1=50,
    beta=25,
)

# Compute and show the community probabilities before sampling any edges.
sbm.calc_community_probs()
print('Community probabilities:')
print(sbm.probs)

print('Generating graph...')
data_object = sbm.generate_graph()

print('Analyzing graph...')
sbm.analyze_graph()

# Pull the tensors needed downstream off the graph object and move each one to
# the compute device, in the same order as they are named on the left.
edge_index, train_mask, val_mask, test_mask, y, labels_clean = (
    getattr(data_object, field).to(device)
    for field in (
        "edge_index",
        "train_mask",
        "val_mask",
        "test_mask",
        "y",
        "labels_clean",
    )
)


In [None]:
# Two-layer GNN with 768 input and 768 output channels, placed on the compute device.
graph_net = GNN(in_channels=768, out_channels=768, num_layers=2)
graph_net = graph_net.to(device)

# NOTE: these rebind the `criterion` and `optimizer` names that were created
# for the encoder in an earlier cell.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(graph_net.parameters(), lr=1e-5)

# Nothing inside the loop leaves training mode, so it is enough to set it once.
graph_net.train()

# Five full-graph updates; the loss is computed on the training nodes only.
for epoch in range(1, 6):
    out = graph_net(X, edge_index)
    loss = criterion(out[train_mask], y[train_mask])
    # Clear gradients left over from the previous step right before backprop.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(f'Epoch {epoch}, Loss: {loss.item()}')
