In [2]:
from src.observation.inst2vec import Inst2vecEncoder
from tqdm import tqdm
import numpy as np
from sklearn.utils import resample
import os
import math

In [3]:
# --- Data location and sampling knobs ---
data_folder = 'data/classifyapp_data'
train_samples = 1500  # per-class training sample size (NOTE(review): not applied by the loading loop as written -- confirm)
vsamples = 0          # per-class validation sample size; 0 keeps every validation file

# --- inst2vec encoder and the vocabulary element it reports as "unknown" ---
encoder = Inst2vecEncoder()
unk_idx = encoder.unknown_vocab_element



In [5]:
# Data acquisition: per-split folders plus empty containers to be filled later
num_classes = 104

# One IR folder per split, all rooted at `data_folder`
folder_data_train = os.path.join(data_folder, 'ir_train')
folder_data_val = os.path.join(data_folder, 'ir_val')
folder_data_test = os.path.join(data_folder, 'ir_test')

# X_* collect file paths, y_* collect int32 class labels
X_train, X_val, X_test = [], [], []
y_train, y_val, y_test = (np.empty(0, dtype=np.int32) for _ in range(3))

print('Getting file names for', num_classes, 'classes from folders:')
for split_folder in (folder_data_train, folder_data_val, folder_data_test):
    print(split_folder)

Getting file names for 104 classes from folders:
data/classifyapp_data/ir_train
data/classifyapp_data/ir_val
data/classifyapp_data/ir_test


In [6]:
# Collect the IR file paths and the 0-based class label of every program
seed = 2025


def _list_class_files(split_folder, class_id, split_name):
    """Return the sorted file paths found in ``split_folder/<class_id>``.

    Args:
        split_folder: root folder of one split (train / val / test).
        class_id: 1-based class index; it is also the sub-folder name.
        split_name: label used in the progress message.

    Raises:
        AssertionError: if the folder is missing or contains no files.
    """
    folder = os.path.join(split_folder, str(class_id))
    assert os.path.exists(folder), "Folder: " + folder + ' does not exist'
    print('\t{:<10}: Read file names from folder '.format(split_name), folder)
    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order (and the seeded resampling below) reproducible across runs.
    seq_files = [os.path.join(folder, f) for f in sorted(os.listdir(folder))]
    assert len(seq_files) > 0, "No files found in " + folder
    return seq_files


# Start from fresh containers so that re-running this cell does not append
# the same files a second time.
X_train, X_val, X_test = [], [], []
train_labels, val_labels, test_labels = [], [], []

for i in range(1, num_classes + 1):
    target = i - 1  # folder index i (1-based) becomes the 0-based target class

    # training: every program in the class folder is used
    seq_files = _list_class_files(folder_data_train, i, 'training')
    X_train += seq_files
    train_labels += [target] * len(seq_files)

    # validation: optionally pick `vsamples` programs at random per class
    seq_files = _list_class_files(folder_data_val, i, 'validation')
    if vsamples > 0:
        assert len(seq_files) >= vsamples, "Cannot sample " + str(vsamples) + " from " + str(
            len(seq_files)) + " files found in " + os.path.join(folder_data_val, str(i))
        seq_files = list(resample(seq_files, replace=False, n_samples=vsamples, random_state=seed))
    X_val += seq_files
    val_labels += [target] * len(seq_files)

    # test: every program in the class folder is used
    seq_files = _list_class_files(folder_data_test, i, 'test')
    X_test += seq_files
    test_labels += [target] * len(seq_files)

# Build each label array once instead of concatenating inside the loop
y_train = np.array(train_labels, dtype=np.int32)
y_val = np.array(val_labels, dtype=np.int32)
y_test = np.array(test_labels, dtype=np.int32)


	training  : Read file names from folder  data/classifyapp_data/ir_train/1
	validation: Read file names from folder  data/classifyapp_data/ir_val/1
	test      : Read file names from folder  data/classifyapp_data/ir_test/1
	training  : Read file names from folder  data/classifyapp_data/ir_train/2
	validation: Read file names from folder  data/classifyapp_data/ir_val/2
	test      : Read file names from folder  data/classifyapp_data/ir_test/2
	training  : Read file names from folder  data/classifyapp_data/ir_train/3
	validation: Read file names from folder  data/classifyapp_data/ir_val/3
	test      : Read file names from folder  data/classifyapp_data/ir_test/3
	training  : Read file names from folder  data/classifyapp_data/ir_train/4
	validation: Read file names from folder  data/classifyapp_data/ir_val/4
	test      : Read file names from folder  data/classifyapp_data/ir_test/4
	training  : Read file names from folder  data/classifyapp_data/ir_train/5
	validation: Read file names from fol

In [7]:
# Every training file must have exactly one label; fail loudly otherwise
assert len(X_train) == len(y_train), "X_train and y_train are out of sync"
len(X_train), len(y_train)

(221344, 221344)

In [8]:
def encode_srcs(input_files, dataset_name):
    """
    Encode a list of IR files with the module-level inst2vec ``encoder``
    and print summary statistics. Sequences are returned unpadded.

    input_files: list of strings of file names
    dataset_name: label used in the progress and statistics messages
    returns: (seqs, maxlen) -- ``seqs`` holds one list of int token ids per
        input file, ``maxlen`` is the length of the longest sequence
        (0 when ``input_files`` is empty)
    """
    num_files = len(input_files)
    print('\n--- Preparing to read', num_files, 'input files for', dataset_name, 'data set')
    if num_files == 0:
        # min()/max() below would raise on an empty list; nothing to encode
        return [], 0

    num_unks = 0
    seq_lengths = list()
    seqs = list()
    for file in tqdm(input_files, desc='Reading IR'):
        ir = encoder.preprocess(file)
        encode_ir = encoder.encode(ir)  # inst2vec encoding
        seq_lengths.append(len(encode_ir))
        num_unks += encode_ir.count(unk_idx)
        seqs.append([int(s) for s in encode_ir])

    maxlen = max(seq_lengths)
    total_stmts = sum(seq_lengths)
    # Guard the percentage against files that all encode to empty sequences
    unk_percentage = (num_unks * 100) / total_stmts if total_stmts else 0.0

    print('\tShortest sequence    : {:>5}'.format(min(seq_lengths)))
    print('\tLongest sequence     : {:>5}'.format(maxlen))
    print('\tMean sequence length : {:>5} (rounded down)'.format(math.floor(np.mean(seq_lengths))))
    print('\tNumber of \'UNK\'      : {:>5}'.format(num_unks))
    print('\tPercentage of \'UNK\'  : {:>8.4} (% among all stmts)'.format(unk_percentage))
    print('\t\'UNK\' index          : {:>5}'.format(unk_idx))

    return seqs, maxlen

def pad_src(seqs, maxlen, unk_index):
    """Right-pad every sequence to ``maxlen`` with ``unk_index`` and stack them.

    Sequences that are already at least ``maxlen`` long are passed through
    unchanged (they are not truncated).

    Returns the result of ``np.array`` over the padded rows.
    """
    def _fill(row):
        # Number of padding slots still needed for this row
        missing = maxlen - len(row)
        return row + [unk_index] * missing if missing > 0 else row

    return np.array([_fill(row) for row in seqs])


In [9]:
# Encode each split; every call also prints that split's sequence statistics
X_seq_train, maxlen_train = encode_srcs(input_files=X_train, dataset_name='training')
X_seq_val, maxlen_val = encode_srcs(input_files=X_val, dataset_name='validation')
X_seq_test, maxlen_test = encode_srcs(input_files=X_test, dataset_name='testing')


--- Preparing to read 221344 input files for training data set


Reading IR: 100%|██████████| 221344/221344 [18:08<00:00, 203.26it/s]


	Shortest sequence    :    11
	Longest sequence     :  5177
	Mean sequence length :   188 (rounded down)
	Number of 'UNK'      : 13098724
	Percentage of 'UNK'  :    31.35 (% among all stmts)
	'UNK' index          :  8564

--- Preparing to read 9155 input files for validation data set


Reading IR: 100%|██████████| 9155/9155 [00:41<00:00, 221.98it/s]


	Shortest sequence    :    24
	Longest sequence     :  5053
	Mean sequence length :   189 (rounded down)
	Number of 'UNK'      : 532337
	Percentage of 'UNK'  :    30.69 (% among all stmts)
	'UNK' index          :  8564

--- Preparing to read 9227 input files for testing data set


Reading IR: 100%|██████████| 9227/9227 [00:41<00:00, 221.22it/s]

	Shortest sequence    :    27
	Longest sequence     :  3016
	Mean sequence length :   185 (rounded down)
	Number of 'UNK'      : 525984
	Percentage of 'UNK'  :    30.66 (% among all stmts)
	'UNK' index          :  8564





In [10]:
def _as_records(sequences, targets):
    """Pair each encoded sequence with its label as an ``input_ids``/``labels`` dict."""
    return [dict(input_ids=seq, labels=int(target)) for seq, target in zip(sequences, targets)]


train_data = _as_records(X_seq_train, y_train)
test_data = _as_records(X_seq_test, y_test)
val_data = _as_records(X_seq_val, y_val)

In [11]:
from datasets import *
import pandas as pd

# Wrap each list of records in a `Dataset` (same order: train, test, val)
train_dataset, test_dataset, val_dataset = (
    Dataset.from_list(records) for records in (train_data, test_data, val_data)
)

In [12]:
# Bundle the three splits under their split names
splits = {'train': train_dataset, 'test': test_dataset, 'val': val_dataset}
dataset = DatasetDict(splits)

In [13]:
# Display the split summary (feature names and row counts per split)
dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 221344
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 9227
    })
    val: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 9155
    })
})

In [14]:
# Persist all splits to disk
dataset_dir = "/root/Compiler-master/data/ClassifyAppDataset"
dataset.save_to_disk(dataset_dir)

Saving the dataset (0/1 shards):   0%|          | 0/221344 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9227 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9155 [00:00<?, ? examples/s]

In [15]:
# Reload the saved splits from disk
dataset_dir = "/root/Compiler-master/data/ClassifyAppDataset"
dataset = load_from_disk(dataset_dir)

In [16]:
# Inspect the first training example (its token ids and label)
dataset['train'][0]

{'input_ids': [8564,
  8564,
  8564,
  8564,
  0,
  8564,
  8564,
  8564,
  71,
  8564,
  8564,
  8564,
  78,
  263,
  8564,
  8564,
  425,
  204,
  8564,
  346,
  904,
  679,
  198,
  757,
  536,
  412,
  204,
  8564,
  341,
  257,
  8564,
  8564,
  1012,
  198,
  204,
  8564,
  496,
  204,
  8564,
  248,
  347,
  679,
  198,
  204,
  8564,
  525,
  8564,
  695,
  216,
  8564,
  347,
  431,
  394,
  204,
  8564,
  953,
  198,
  424,
  204,
  8564,
  341,
  257,
  8564,
  966,
  216,
  1313,
  8564,
  289,
  289,
  293,
  3032,
  293,
  3032,
  8564,
  295,
  425,
  204,
  8564,
  248,
  8564,
  295,
  8564,
  105,
  8564,
  536,
  295,
  394,
  204,
  8564,
  3032,
  3032,
  481,
  8564,
  8564,
  8564,
  5468,
  8564,
  8564,
  217,
  213,
  220,
  221,
  182,
  218,
  210,
  8564,
  8564,
  258,
  8564,
  5468,
  8564,
  8564,
  262,
  5468,
  8564,
  8564,
  8564,
  224,
  8564,
  8564,
  951,
  8564,
  8564,
  225,
  204,
  8564,
  261,
  232,
  8564,
  8564,
  8564,
  8564,
  226

In [97]:
from transformers import DataCollatorWithPadding
from torch.utils.data import Dataset, DataLoader
import torch

In [114]:
def collate_fn(batch, padding_value=8564, max_length=512):
    """Pad or truncate every example of a batch to one length and tensorize it.

    Args:
        batch: list of dicts, each with an ``'input_ids'`` sequence of ints
            (list or tuple) and a ``'labels'`` value.
        padding_value: id appended to sequences shorter than ``max_length``.
            The default matches the 'UNK' index reported when encoding.
        max_length: target length of every row; ``None`` pads to the longest
            sequence in the batch.

    Returns:
        dict with ``'input_ids'`` (tensor of shape ``(len(batch), max_length)``)
        and ``'label'`` (tensor built from the per-example labels).
    """
    # list(...) lets sequences arrive as tuples as well as plain lists
    input_ids = [list(item['input_ids']) for item in batch]
    labels = [item['labels'] for item in batch]
    if max_length is None:
        max_length = max((len(ids) for ids in input_ids), default=0)

    # Truncate first, then pad; a negative repeat count yields an empty list,
    # so rows that are already long enough get no padding.
    padded_batch = [
        ids[:max_length] + [padding_value] * (max_length - len(ids))
        for ids in input_ids
    ]
    # NOTE(review): the output key is 'label' while the dataset column is
    # 'labels' -- confirm which name downstream code expects.
    return {"input_ids": torch.tensor(padded_batch), "label": torch.tensor(labels)}

# Batch the training split, padding/truncating each batch via `collate_fn`
dataloader = DataLoader(
    dataset=dataset['train'],
    batch_size=8,
    collate_fn=collate_fn,
)

In [115]:
# Pull one batch and check the shape of its padded token ids
first_batch = next(iter(dataloader))
first_batch['input_ids'].shape

torch.Size([8, 512])