In [4]:
import os
import math
import torch
import pickle

import numpy as np
import pandas as pd
from tqdm import tqdm

from torch import nn
from torch import optim
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import KFold

from src.utils.system import read_ir_from_file
from sklearn.model_selection import StratifiedKFold
from src.observation.inst2vec import Inst2vecEncoder
from scipy.stats import gmean
from sklearn.utils import resample

In [18]:
data_folder = 'data/classifyapp_data'
num_epochs = 10
batch_size = 64
dense_layer_size = 32
# train_samples = 1500
train_samples = 150
vsamples = 0
ring_size = 5
print_summary = False
out_folder = 'output/inst2vec_for_classifyapp'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
encoder = Inst2vecEncoder()  # inst2vec 编码器
unk_idx = encoder.unknown_vocab_element

if not os.path.exists(out_folder):
    os.makedirs(out_folder)

## Load data

In [6]:
# Data acquisition
num_classes = 104
y_train = np.empty(0, dtype=np.float32)  # training
X_train = list()
folder_data_train = os.path.join(data_folder, 'ir_train')
y_val = np.empty(0, dtype=np.float32)  # validation
X_val = list()
folder_data_val = os.path.join(data_folder, 'ir_val')
y_test = np.empty(0, dtype=np.float32)  # testing
X_test = list()
folder_data_test = os.path.join(data_folder, 'ir_test')
print('Getting file names for', num_classes, 'classes from folders:')
print(folder_data_train)
print(folder_data_val)
print(folder_data_test)

Getting file names for 104 classes from folders:
data/classifyapp_data/ir_train
data/classifyapp_data/ir_val
data/classifyapp_data/ir_test


In [7]:
# 获取每个class中的ir文件列表
seed = 2025
for i in range(1, num_classes + 1):
    folder = os.path.join(folder_data_train, str(i))  # index i marks the target class
    assert os.path.exists(folder), "Folder: " + folder + ' does not exist'
    print('\ttraining  : Read file names from folder ', folder)
    listing = os.listdir(folder)
    seq_files = [os.path.join(folder, f) for f in listing]

    # training: Randomly pick programs
    assert len(seq_files) >= train_samples, "Cannot sample " + str(train_samples) + " from " + str(
        len(seq_files)) + " files found in " + folder
    X_train += resample(seq_files, replace=False, n_samples=train_samples, random_state=seed)
    y_train = np.concatenate([y_train, np.array([int(i)] * train_samples, dtype=np.int32)])  # i becomes target

    # validation: Read data file names
    folder = os.path.join(folder_data_val, str(i))
    assert os.path.exists(folder), "Folder: " + folder + ' does not exist'
    print('\tvalidation: Read file names from folder ', folder)
    listing = os.listdir(folder + '/')
    seq_files = [os.path.join(folder, f) for f in listing]

    # validation: Randomly pick programs
    if vsamples > 0:
        assert len(seq_files) >= vsamples, "Cannot sample " + str(vsamples) + " from " + str(
            len(seq_files)) + " files found in " + folder
        X_val += resample(seq_files, replace=False, n_samples=vsamples, random_state=seed)
        y_val = np.concatenate([y_val, np.array([int(i)] * vsamples, dtype=np.int32)])
    else:
        assert len(seq_files) > 0, "No .rec files found in" + folder
        X_val += seq_files
        y_val = np.concatenate([y_val, np.array([int(i)] * len(seq_files), dtype=np.int32)])


    # test: Read data file names
    folder = os.path.join(folder_data_test, str(i))
    assert os.path.exists(folder), "Folder: " + folder + ' does not exist'
    print('\ttest      : Read file names from folder ', folder)
    listing = os.listdir(folder + '/')
    seq_files = [os.path.join(folder, f) for f in listing]
    assert len(seq_files) > 0, "No .rec files found in" + folder
    X_test += seq_files
    y_test = np.concatenate([y_test, np.array([int(i)] * len(seq_files), dtype=np.int32)])


	training  : Read file names from folder  data/classifyapp_data/ir_train/1
	validation: Read file names from folder  data/classifyapp_data/ir_val/1
	test      : Read file names from folder  data/classifyapp_data/ir_test/1
	training  : Read file names from folder  data/classifyapp_data/ir_train/2
	validation: Read file names from folder  data/classifyapp_data/ir_val/2
	test      : Read file names from folder  data/classifyapp_data/ir_test/2
	training  : Read file names from folder  data/classifyapp_data/ir_train/3
	validation: Read file names from folder  data/classifyapp_data/ir_val/3
	test      : Read file names from folder  data/classifyapp_data/ir_test/3
	training  : Read file names from folder  data/classifyapp_data/ir_train/4
	validation: Read file names from folder  data/classifyapp_data/ir_val/4
	test      : Read file names from folder  data/classifyapp_data/ir_test/4
	training  : Read file names from folder  data/classifyapp_data/ir_train/5
	validation: Read file names from fol

### Read ir file

In [8]:
def encode_srcs(input_files, dataset_name):
    """
    encode and pad source code for learning
    data_folder: folder from which to read input files
    input_files: list of strings of file names
    """

    # Get list of source file names
    num_files = len(input_files)
    num_unks = 0
    seq_lengths = list()   

    print('\n--- Preparing to read', num_files, 'input files for', dataset_name, 'data set')
    seqs = list()
    for file in tqdm(input_files, desc='Reading IR'):
        ir = encoder.preprocess(file)
        encode_ir = encoder.encode(ir)  # inst2vec编码
        seq_lengths.append(len(encode_ir))
        num_unks += encode_ir.count(unk_idx)
        seqs.append([int(s) for s in encode_ir])

    print('\tShortest sequence    : {:>5}'.format(min(seq_lengths)))
    maxlen = max(seq_lengths)
    print('\tLongest sequence     : {:>5}'.format(maxlen))
    print('\tMean sequence length : {:>5} (rounded down)'.format(math.floor(np.mean(seq_lengths))))
    print('\tNumber of \'UNK\'      : {:>5}'.format(num_unks))
    print('\tPercentage of \'UNK\'  : {:>8.4} (% among all stmts)'.format((num_unks * 100) / sum(seq_lengths)))
    print('\t\'UNK\' index          : {:>5}'.format(unk_idx))

    return seqs, maxlen

def pad_src(seqs, maxlen, unk_index):
    padded_sequences = []
    for seq in seqs:
        if len(seq) < maxlen:
            # Pad sequence if it is shorter than maxlen
            seq = seq + [unk_index] * (maxlen - len(seq))
        padded_sequences.append(seq)

    # Convert to np.array
    encoded = np.array(padded_sequences)
    return encoded


In [9]:
X_seq_train, maxlen_train = encode_srcs(X_train, 'training')
X_seq_val, maxlen_val = encode_srcs(X_val, 'validation')
X_seq_test, maxlen_test = encode_srcs(X_test, 'testing')


--- Preparing to read 15600 input files for training data set


Reading IR: 100%|██████████| 15600/15600 [01:12<00:00, 215.34it/s]


	Shortest sequence    :    11
	Longest sequence     :  4002
	Mean sequence length :   189 (rounded down)
	Number of 'UNK'      : 930288
	Percentage of 'UNK'  :    31.43 (% among all stmts)
	'UNK' index          :  8564

--- Preparing to read 9155 input files for validation data set


Reading IR: 100%|██████████| 9155/9155 [00:37<00:00, 244.62it/s]


	Shortest sequence    :    24
	Longest sequence     :  5053
	Mean sequence length :   189 (rounded down)
	Number of 'UNK'      : 532337
	Percentage of 'UNK'  :    30.69 (% among all stmts)
	'UNK' index          :  8564

--- Preparing to read 9227 input files for testing data set


Reading IR: 100%|██████████| 9227/9227 [00:38<00:00, 242.02it/s]

	Shortest sequence    :    27
	Longest sequence     :  3016
	Mean sequence length :   185 (rounded down)
	Number of 'UNK'      : 525984
	Percentage of 'UNK'  :    30.66 (% among all stmts)
	'UNK' index          :  8564





In [10]:
maxlen = max(maxlen_train, maxlen_test, maxlen_val)
print('Max. sequence length overall:', maxlen)
X_seq_train_pad = pad_src(X_seq_train, maxlen, unk_idx)
X_seq_val_pad = pad_src(X_seq_val, maxlen, unk_idx)
X_seq_test_pad = pad_src(X_seq_test, maxlen, unk_idx)

Max. sequence length overall: 5053


In [11]:
y_train = torch.tensor(y_train, dtype=torch.long)
y_val = torch.tensor(y_val, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)

## 定义Dataset

In [12]:
class ClassifyAppDataset(Dataset):
    def __init__(self, seqs, label, maxlen, embeddings):
        super().__init__()
        self.sequences = seqs
        self.y = label
        self.embeddings = embeddings
        
    def __getitem__(self, index):
        seqs = self.embeddings[self.sequences[index]]
        label = self.y[index]
        return seqs, label

    def __len__(self):
        return len(self.y)


embeddings = torch.tensor(encoder.embeddings, dtype=torch.float32)
embedding_matrix_normalized = F.normalize(embeddings, p=2, dim=1)

train_dataset = ClassifyAppDataset(X_seq_train_pad, y_train, maxlen, embedding_matrix_normalized)
test_dataset = ClassifyAppDataset(X_seq_test_pad, y_test, maxlen, embedding_matrix_normalized)
val_dataset = ClassifyAppDataset(X_seq_val_pad, y_val, maxlen, embedding_matrix_normalized)

In [19]:
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

## 定义网络模型

In [20]:
# 定义网络结构
class ClassifyAppLSTM(nn.Module):
    def __init__(self, embedding_dim, dense_layer_size, num_classes):
        super(ClassifyAppLSTM, self).__init__()
        # LSTM layers
        self.lstm1 = nn.LSTM(input_size=embedding_dim, hidden_size=embedding_dim, batch_first=True)
        self.lstm2 = nn.LSTM(input_size=embedding_dim, hidden_size=embedding_dim, batch_first=True)

        # Batch normalization
        self.batch_norm = nn.BatchNorm1d(embedding_dim)

        # Dense layers
        self.dense1 = nn.Linear(embedding_dim, dense_layer_size)
        self.dense2 = nn.Linear(dense_layer_size, num_classes)

        # Activation functions
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        # LSTM layers
        x, _ = self.lstm1(x)
        x, _ = self.lstm2(x)

        # Take the output of the last time step
        x = x[:, -1, :]

        # Batch normalization
        x = self.batch_norm(x)

        # Dense layers
        x = self.relu(self.dense1(x))
        x = self.dense2(x)

        return x

model = ClassifyAppLSTM(embedding_matrix_normalized.shape[1], dense_layer_size, 104)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [21]:
def eval_model(model, loader):
    model.eval()
    correct = 0
    with torch.no_grad():
        for batch in loader:
            sequences, labels = [b.to(device) for b in batch]
            outputs = model(sequences)
            preds = outputs.argmax(dim=1)
            labels = labels - 1
            correct += (preds == labels).sum().item()
            
    accuracy = correct / len(loader.dataset)
    return accuracy

def train_model(model, train_loader, test_loader,  criterion, optimizer, num_epochs):
    # 模型训练
    pre_eval_acc = -1
    for epoch in range(num_epochs):
        epoch_loss = 0
        correct = 0
        step = 0
        model.train()
        for batch in train_loader:
            sequences, labels = [b.to(device) for b in batch]
            labels = labels - 1
            
            optimizer.zero_grad()
            outputs = model(sequences)

            # 计算loss值 由output和lang_outputs与label计算CrossEntropyLoss
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()

            step += 1
            if step % 50 == 0:
                print(f'step {step}, loss: {loss.item():.4f}')
                    
        accuracy = correct / len(train_loader.dataset)
        eval_acc = eval_model(model, test_loader)
        print(f"epoch {epoch+1}/{num_epochs}, loss: {epoch_loss:.4f}, train_acc: {accuracy:.4f}, eval_acc: {eval_acc:.4f}")


In [22]:
train_model(model, train_loader, test_loader, criterion, optimizer, num_epochs)

step 50, loss: 4.6583
step 100, loss: 4.6509
step 150, loss: 4.6275
step 200, loss: 4.6419
epoch 1/10, loss: 1134.4791, train_acc: 0.0083, eval_acc: 0.0089
step 50, loss: 4.6372
step 100, loss: 4.6535
step 150, loss: 4.6468
step 200, loss: 4.6540
epoch 2/10, loss: 1134.0360, train_acc: 0.0078, eval_acc: 0.0089
step 50, loss: 4.6412
step 100, loss: 4.6546
step 150, loss: 4.6428
step 200, loss: 4.6532
epoch 3/10, loss: 1133.7538, train_acc: 0.0084, eval_acc: 0.0076
step 50, loss: 4.6432
step 100, loss: 4.6424


KeyboardInterrupt: 