In [1]:
import os
import pickle
import torch
import pandas as pd

from tqdm import tqdm
from torch import nn
from torch import optim
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import f1_score, accuracy_score
from datasets import *


In [2]:
data_folder = 'data/ClassifyAppDataset'
num_epochs = 100
batch_size = 64
dense_layer_size = 32
print_summary = False
out_folder = 'output/inst2vec_for_classifyapp'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
log_step = 10
max_length = 512
emb_path = 'src/observation/inst2vec/pickle/embeddings.pickle'

if not os.path.exists(out_folder):
    os.makedirs(out_folder)

## 加载数据集

In [3]:
def collate_fn(batch, padding_value=8564, max_length=max_length):
    input_ids, labels = [item['input_ids'] for item in batch], [item['labels'] for item in batch]
    padded_batch = []
    if max_length == None:
        max_length = max(len(item) for item in input_ids)
    
    for item in input_ids:
        padded_item = item + [padding_value] * max(0, (max_length - len(item)))
        padded_item = padded_item[:max_length]
        padded_batch.append(padded_item)
    return {"input_ids": torch.tensor(padded_batch), "labels": torch.tensor(labels)}

dataset = load_from_disk("/root/Compiler-master/data/ClassifyAppDataset")
train_loader = DataLoader(dataset['train'], batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
test_loader = DataLoader(dataset['test'], batch_size=batch_size, collate_fn=collate_fn)
val_loader = DataLoader(dataset['val'], batch_size=batch_size, collate_fn=collate_fn)

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 221344
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 9227
    })
    val: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 9155
    })
})

In [5]:
next(iter(train_loader))

{'input_ids': tensor([[   0, 8564,   30,  ..., 8564, 8564, 8564],
         [   0, 8564,   40,  ..., 8564, 8564, 8564],
         [   0, 8564,   68,  ..., 8564, 8564, 8564],
         ...,
         [   0, 8564,   40,  ..., 8564, 8564, 8564],
         [   0, 8564,   40,  ..., 8564, 8564, 8564],
         [8564, 8564, 8564,  ..., 8564, 8564, 8564]]),
 'labels': tensor([ 10,  30,  47,  74,  87,  93,  38,  98,   1,   0,  27,  98,   7,  11,
          19,  22,  26,  22,  43,  48,  22,  27,  42,  56,  46,  37, 102,  47,
          57,  52,  38,  21,  66,  55,  26,   9,  19,  65,  32,  85,  92,  54,
           9,  11,  62,  35,   8,  84,  10, 103,  15,  59,  63, 101,  64,  96,
           9,  84,  83,  12,  84,  19,  64,  71])}

## 定义网络模型

In [11]:
# 定义网络结构
class ClassifyAppLSTM(nn.Module):
    def __init__(self, embedding_dim, dense_layer_size, num_classes):
        super(ClassifyAppLSTM, self).__init__()
        # Embedding 
        with open(emb_path, "rb") as f:
            embeddings = pickle.load(f)
        embeddings = torch.tensor(embeddings, dtype=torch.float32)
        embedding_matrix_normalized = F.normalize(embeddings, p=2, dim=1)
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix_normalized, freeze=False)

        # LSTM layers
        self.lstm1 = nn.LSTM(input_size=embedding_dim, hidden_size=embedding_dim, batch_first=True)
        self.lstm2 = nn.LSTM(input_size=embedding_dim, hidden_size=embedding_dim, batch_first=True)

        # Batch normalization
        self.batch_norm = nn.BatchNorm1d(embedding_dim)

        # Dense layers
        self.dense1 = nn.Linear(embedding_dim, dense_layer_size)
        self.dense2 = nn.Linear(dense_layer_size, num_classes)

        # Activation functions
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        # Embedding
        x = self.embedding(x)
        
        # LSTM layers
        x, _ = self.lstm1(x)
        x, _ = self.lstm2(x)

        # Take the output of the last time step
        x = x[:, 0, :]

        # Batch normalization
        x = self.batch_norm(x)

        # Dense layers
        x = self.relu(self.dense1(x))
        x = self.dense2(x)

        return x
    

model = ClassifyAppLSTM(200, dense_layer_size, 104)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [12]:
data = {k: v.to(device) for k, v in next(iter(train_loader)).items()}
model(data['input_ids'])

tensor([[-0.1570, -0.1559, -0.1372,  ..., -0.1717, -0.2645,  0.0525],
        [-0.1388, -0.0986, -0.0710,  ..., -0.0540, -0.1758, -0.0632],
        [-0.1388, -0.0986, -0.0710,  ..., -0.0540, -0.1758, -0.0632],
        ...,
        [-0.1388, -0.0986, -0.0710,  ..., -0.0540, -0.1758, -0.0632],
        [-0.1388, -0.0986, -0.0710,  ..., -0.0540, -0.1758, -0.0632],
        [-0.1388, -0.0986, -0.0710,  ..., -0.0540, -0.1758, -0.0632]],
       device='cuda:0', grad_fn=<AddmmBackward0>)

In [14]:
def eval_model(model, loader):
    model.eval()
    correct = 0
    y_true, y_pred = [], []
    with torch.no_grad():
        for batch in tqdm(loader, desc='Eval'):
            data = {k: v.to(device) for k, v in batch.items()}
            outputs = model(data['input_ids'])
            preds = outputs.argmax(dim=1)
            y_pred += preds.tolist()
            y_true += data['labels'].tolist()
            
    f1_weighted = f1_score(y_true, y_pred, average='weighted')
    acc = accuracy_score(y_true, y_pred)
    return f1_weighted, acc

def train_model(model, train_loader, val_loader,  criterion, optimizer, num_epochs):
    # 模型训练
    writer = SummaryWriter(out_folder)
    pre_val_f1 = 0
    gloabl_step = 0
    for epoch in range(num_epochs):
        epoch_loss = 0
        correct = 0
        step = 0
        model.train()
        y_true, y_pred = [], []

        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}", leave=False)

        running_loss = 0.0
        for idx, batch in enumerate(progress_bar):
            data = {k: v.to(device) for k, v in batch.items()}
            
            outputs = model(data['input_ids'])
            loss = criterion(outputs, data['labels'])

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            preds = outputs.argmax(dim=1)
            y_pred += preds.tolist()
            y_true += data['labels'].tolist()

            if gloabl_step % log_step == 0:
                writer.add_scalar('train_loss', loss.item(), gloabl_step)
            gloabl_step += 1

            running_loss += loss.item()
            progress_bar.set_postfix(loss=running_loss / (idx + 1))
            # progress_bar.set_postfix(loss=loss.item())
                    
        train_f1, train_acc = f1_score(y_true, y_pred, average='weighted'), accuracy_score(y_true, y_pred)
        val_f1, val_acc = eval_model(model, val_loader)
        writer.add_scalar('train_f1', train_f1, epoch)
        writer.add_scalar('train_acc', train_acc, epoch)
        writer.add_scalar('val_f1', val_f1, epoch)
        writer.add_scalar('val_acc', val_acc, epoch)

        if val_f1 > pre_val_f1:
            pre_val_f1 = val_f1
            torch.save(model.state_dict(), out_folder + f'/best_epoch_{epoch}_eval_f1_{int(val_f1*100)}_acc_{int(val_acc*100)}.pth')
            
        torch.save(model.state_dict(), out_folder + f'/best_epoch_{epoch}_eval_f1_{int(val_f1*100)}_acc_{int(val_acc*100)}.pth')
        


In [15]:
train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs)
# eval_model(model, val_loader)

Epoch 1/100:   0%|          | 0/3459 [00:00<?, ?it/s]

KeyboardInterrupt: 