In [41]:
import os

import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, AlbertForSequenceClassification

from apk_analysis.dataset import load_dataset, TextDataset


In [4]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
[torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())]


['NVIDIA GeForce RTX 3070 Ti Laptop GPU']

In [5]:
DATASET_DIR = os.path.join(".", "resources", "dataset")
PRIVACY_TYPES_COSINE_DATASET_PATH = os.path.join(DATASET_DIR, "privacy_types_cosine_dataset.json")
PRIVACY_TYPES_N_GRAM_DATASET_PATH = os.path.join(DATASET_DIR, "privacy_types_n_gram_dataset.json")

PRIVACY_TYPES_COSINE_DATASET_JSON: TextDataset = load_dataset(PRIVACY_TYPES_COSINE_DATASET_PATH)
PRIVACY_TYPES_N_GRAM_DATASET_JSON: TextDataset = load_dataset(PRIVACY_TYPES_N_GRAM_DATASET_PATH)

USING_TEXT_DATASET = PRIVACY_TYPES_N_GRAM_DATASET_JSON


In [16]:
TOKENIZER = AutoTokenizer.from_pretrained("albert-base-v2")


In [33]:
class SimpleDataset(Dataset):
    def __init__(self, texts: list[str], labels: list[float], max_length: int):
        self._texts = texts
        self._labels = labels
        self._max_length = max_length

    def __getitem__(self, index: int):
        input_tensors = TOKENIZER.encode_plus(self._texts[index], max_length=self._max_length, padding="max_length", truncation=True, return_tensors="pt")
        label_tensor = torch.tensor(self._labels[index], dtype=torch.float32)
        return input_tensors['input_ids'].squeeze(), input_tensors['attention_mask'].squeeze(), label_tensor

    def __len__(self) -> int:
        return len(self._texts)


In [34]:
train_texts, test_texts, train_labels, test_labels = train_test_split(USING_TEXT_DATASET.contents, USING_TEXT_DATASET.labels, test_size=0.4, random_state=9326)
test_texts, val_texts, test_labels, val_labels = train_test_split(test_texts, test_labels, test_size=0.5, random_state=22912)


In [35]:
MAX_TEXT_LENGTH = 50

CATEGORIES_NAMES = USING_TEXT_DATASET.categories
TRAIN_DATASET = SimpleDataset(train_texts, train_labels, MAX_TEXT_LENGTH)
TEST_DATASET = SimpleDataset(test_texts, test_labels, MAX_TEXT_LENGTH)
VAL_DATASET = SimpleDataset(val_texts, val_labels, MAX_TEXT_LENGTH)

BATCH_SIZE = 64
TRAIN_LOADER = DataLoader(TRAIN_DATASET, batch_size=BATCH_SIZE, shuffle=True)
TEST_LOADER = DataLoader(TEST_DATASET, batch_size=BATCH_SIZE, shuffle=True)
VAL_LOADER = DataLoader(VAL_DATASET, batch_size=BATCH_SIZE, shuffle=True)


In [38]:
LEARNING_RATE = 0.001

MODEL = AlbertForSequenceClassification.from_pretrained("albert-base-v2", num_labels=len(CATEGORIES_NAMES), problem_type="multi_label_classification").to(DEVICE)
CRITERION = torch.nn.BCEWithLogitsLoss().to(DEVICE)
OPTIMIZER = torch.optim.AdamW(MODEL.parameters(), lr=LEARNING_RATE)


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
EPOCH_NUM = 100
MODEL.train()

for epoch in range(EPOCH_NUM):
    with tqdm(total=len(TRAIN_LOADER), desc=f"Epoch {epoch + 1}/{EPOCH_NUM}") as pbar:
        total_loss = []
        for input_ids, attention_mask, target_labels in TRAIN_LOADER:
            input_ids, attention_mask, target_labels = input_ids.to(DEVICE), attention_mask.to(DEVICE), target_labels.to(DEVICE)
            OPTIMIZER.zero_grad()

            outputs = MODEL(input_ids, attention_mask=attention_mask)[0]

            loss = CRITERION(outputs, target_labels.float())
            loss.backward()

            OPTIMIZER.step()

            total_loss.append(loss.item())

            pbar.set_postfix({"Loss": loss.item(), "Avg loss": sum(total_loss) / len(total_loss)})


Epoch 1/100:   0%|          | 0/4571 [00:41<?, ?it/s, Loss=0.00141, Avg loss=0.00153] 


KeyboardInterrupt: 