In [1]:
import os

os.chdir("/home/hack12/ensemble-ai2024/")

In [2]:
import sys

sys.path.append("..")

In [3]:
import random
from pathlib import Path
from taskdataset import TaskDataset
import hydra
import numpy as np
import torch
import wandb
from omegaconf import omegaconf, OmegaConf
from torch import optim, nn
from torch.utils.data import DataLoader
from torchvision.transforms import RandomAffine
from tqdm import tqdm
import random
from src.config import Config

# from src.data.custom_dataset import EncodingsToLabels, EncodingsDataset
from src.models.linear_head import Net, MapperNet
from src.transforms.affine import (
    AffineTransform,
    AffineAndPadAndShuffleTransform,
    PadAndShuffleTransform,
)
from src.transforms.binary import BinaryTransform
import json
from typing import List
import requests
import argparse
from torch.utils.data import Dataset

# import end2end_stealing.vision_transformer as vits
import torchvision.models as models
import torchvision
import numpy as np

In [8]:
def train_binary(epoch, model, train_loader, criterion, optimizer, device):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(data.to(device))
        # output = nn.Sigmoid()(output)
        # pred = (output > 0.5).float()
        target = target.to(device)

        # print('pred', pred)
        # print('target', target)
        loss = criterion(output, target.to(device))

        # print(loss)
        loss.backward()
        optimizer.step()
    print(
        "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
            epoch,
            batch_idx * len(data),
            len(train_loader.dataset),
            100.0 * batch_idx / len(train_loader),
            loss.item(),
        )
    )

In [9]:
class EncodingsDataset(Dataset):
    def __init__(self, transformed_encodings, encodings):
        self.transformed_encodings = torch.from_numpy(
            transformed_encodings.astype("float32")
        )
        self.encodings = torch.from_numpy(encodings.astype("float32"))

    def __len__(self):
        return len(self.transformed_encodings)

    def __getitem__(self, idx):
        return self.transformed_encodings[idx], self.encodings[idx]

In [10]:
class MapperNet(nn.Module):
    def __init__(self, n_inputs, n_outputs, n_hidden):
        super().__init__()
        self.fc1 = nn.Linear(n_inputs, n_hidden)
        self.fc2 = nn.Linear(n_hidden, n_outputs)

    def forward(self, x):
        x = self.fc1(x)
        x = self.fc2(x)
        return x

In [11]:
SEED = 0
torch.manual_seed(SEED)
# torch.use_deterministic_algorithms(True)
torch.backends.cudnn.benchmark = False
random.seed(SEED)
np.random.seed(SEED)

device = torch.device(f"cuda") if torch.cuda.is_available() else torch.device("cpu")

In [12]:
dataset = torch.load(
    "/home/bartosz.cywinski@aiclearing.com/code/ensemble-ai2024/end2end-stealing/checkpoints/SybilAttack.pt"
)
dataset_ids = dataset.ids

In [32]:
learning_rate = 3e-5
batch_size = 64
mapper_epochs = 2500

In [14]:
overlap = 1000

In [15]:
results = {}

In [17]:
dataset_ids = dataset.ids

In [19]:
directory_path = (
    "/home/bartosz.cywinski@aiclearing.com/code/ensemble-ai2024/sybil_affine"
)

In [21]:
import os

In [22]:
files = os.listdir(directory_path)

# Filter files that start with 'B'
filtered_files = [file for file in files if file.startswith("B")]

In [23]:
A_data = np.load(
    "/home/bartosz.cywinski@aiclearing.com/code/ensemble-ai2024/sybil_affine/A.npy"
)

In [25]:
B_data_queries = [
    np.load(directory_path + "/" + file_name) for file_name in filtered_files
]

In [26]:
B_data_train = [B_data[:overlap] for B_data in B_data_queries]

In [27]:
B_data_test = [B_data[overlap:] for B_data in B_data_queries]

In [28]:
A_data_train = A_data[:overlap]

In [29]:
results["ids"] = dataset_ids
results["features"] = A_data

In [30]:
MODEL_OUT_IN_SIZE = A_data_train.shape[1]
HIDDEN = 2 * 384

In [33]:
for B_data_train, B_data_test in zip(B_data_train, B_data_test):
    train_dataset = EncodingsDataset(B_data_train, A_data_train)

    train_loader = torch.utils.data.DataLoader(
        dataset=train_dataset, batch_size=batch_size, shuffle=True
    )

    mapper = MapperNet(MODEL_OUT_IN_SIZE, MODEL_OUT_IN_SIZE, n_hidden=HIDDEN).to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(mapper.parameters(), lr=learning_rate)

    for epoch in range(1, mapper_epochs):
        train_binary(epoch, mapper, train_loader, criterion, optimizer, device=device)

    reconstructed_encodings_test = []
    with torch.no_grad():
        for encoding in tqdm(B_data_test):
            encoding = torch.Tensor(encoding)
            reconstructed_encoding = mapper(encoding.to(device))
            reconstructed_encoding = nn.Sigmoid()(reconstructed_encoding)
            reconstructed_encoding_pred = (reconstructed_encoding > 0.5).float()
            reconstructed_encodings_test.append(
                reconstructed_encoding_pred.detach().to("cpu")
            )
        reconstructed_encodings_test = torch.cat(reconstructed_encodings_test).reshape(
            len(B_data_test), -1
        )

    results["features"] = np.concatenate(
        (results["features"], reconstructed_encodings_test.numpy())
    )



100%|██████████| 1000/1000 [00:00<00:00, 5255.14it/s]




100%|██████████| 1000/1000 [00:00<00:00, 4867.84it/s]




100%|██████████| 1000/1000 [00:00<00:00, 4879.10it/s]




100%|██████████| 1000/1000 [00:00<00:00, 4803.15it/s]




100%|██████████| 1000/1000 [00:00<00:00, 5002.68it/s]




100%|██████████| 1000/1000 [00:00<00:00, 6347.16it/s]




100%|██████████| 1000/1000 [00:00<00:00, 6557.16it/s]




100%|██████████| 1000/1000 [00:01<00:00, 857.81it/s]




100%|██████████| 1000/1000 [00:00<00:00, 5681.98it/s]




100%|██████████| 1000/1000 [00:00<00:00, 5366.68it/s]




100%|██████████| 1000/1000 [00:00<00:00, 5607.80it/s]




100%|██████████| 1000/1000 [00:00<00:00, 6385.10it/s]




100%|██████████| 1000/1000 [00:00<00:00, 6172.15it/s]




100%|██████████| 1000/1000 [00:00<00:00, 5545.84it/s]




100%|██████████| 1000/1000 [00:02<00:00, 380.56it/s]




100%|██████████| 1000/1000 [00:02<00:00, 367.80it/s]




100%|██████████| 1000/1000 [00:00<00:00, 2433.48it/s]




100%|██████████| 1000/1000 [00:02<00:00, 498.38it/s]


In [34]:
len(results["features"])

20000

In [35]:
np.savez(
    "example_submission_afajniak.npz",
    ids=results["ids"],
    representations=results["features"],
)

In [36]:
def sybil_submit(binary_or_affine: str, path_to_npz_file: str):
    TEAM_TOKEN = "8J40ASDQOjfeeSKL"
    SERVER_URL = "http://34.71.138.79:9090"

    if binary_or_affine not in ["binary", "affine"]:
        raise Exception("Invalid endpoint")

    endpoint = f"/sybil/{binary_or_affine}/submit"
    url = SERVER_URL + endpoint

    with open(path_to_npz_file, "rb") as f:
        response = requests.post(url, files={"file": f}, headers={"token": TEAM_TOKEN})

    if response.status_code == 200:
        print("OK")
        print(response.json())
    else:
        print(
            f"Request submit failed. Status code: {response.status_code}, content: {response.json()}"
        )

In [37]:
sybil_submit(
    "affine",
    "/home/bartosz.cywinski@aiclearing.com/code/ensemble-ai2024/src/example_submission_afajniak.npz",
)

OK
{'score': 26.573927318776168}
