In [1]:
import os
import pandas as pd
import javalang
import ast

In [None]:
# ===============================
# PART 1: Extract tokens from Java files
# ===============================
def extract_tokens(node):
    """
    Collect a flat, pre-order list of tokens from a javalang AST node.

    The node itself contributes at most one token (emitted before any token
    from its descendants):
      * method invocations         -> the invoked member name
      * class creators             -> the created type's name
      * method/class/enum declarations -> the declared name
      * if / while / for / throw / catch -> a fixed "<...>" marker

    Every entry of ``node.children`` that is a Node, or a Node sitting
    directly inside a list entry, is then visited recursively. Anything else
    (strings, None, sets, lists nested inside lists, ...) is ignored.

    Args:
        node: a javalang.tree.Node (anything exposing ``children``).

    Returns:
        list of tokens for ``node`` and all visited descendants.
    """
    tree = javalang.tree

    # Ordered (node class, token getter) rules; only the first match is used,
    # mirroring an if/elif chain.
    rules = (
        (tree.MethodInvocation, lambda n: n.member),
        (tree.ClassCreator, lambda n: n.type.name),
        (tree.MethodDeclaration, lambda n: n.name),
        (tree.ClassDeclaration, lambda n: n.name),
        (tree.EnumDeclaration, lambda n: n.name),
        (tree.IfStatement, lambda n: "<IF>"),
        (tree.WhileStatement, lambda n: "<WHILE>"),
        (tree.ForStatement, lambda n: "<FOR>"),
        (tree.ThrowStatement, lambda n: "<THROW>"),
        (tree.CatchClause, lambda n: "<CATCH>"),
    )

    collected = []
    for node_class, token_of in rules:
        if isinstance(node, node_class):
            collected.append(token_of(node))
            break

    for entry in node.children:
        # Treat a single child like a one-element list so both shapes share
        # the same loop body.
        candidates = entry if isinstance(entry, list) else [entry]
        for candidate in candidates:
            if isinstance(candidate, tree.Node):
                collected.extend(extract_tokens(candidate))
    return collected

def parse_java_file_to_token_vector(file_path, encoding='cp1252'):
    """
    Read a .java file, parse it into an AST, and extract its token vector.

    Args:
        file_path: path of the Java source file to read.
        encoding: text encoding used to decode the file. Defaults to
            'cp1252', the value that used to be hard-coded, so existing
            callers behave exactly as before; pass e.g. 'utf-8' or 'latin-1'
            for sources stored differently.

    Returns:
        The token list produced by ``extract_tokens`` for the parsed tree, or
        an empty list when the file does not exist or cannot be decoded or
        parsed. In both failure cases a message is printed instead of raising,
        so a batch run over many files keeps going.
    """
    try:
        with open(file_path, 'r', encoding=encoding) as f:
            code = f.read()
        return extract_tokens(javalang.parse.parse(code))
    except FileNotFoundError:
        print(f"[LỖI] Không tìm thấy file: {file_path}")
        return []
    except Exception as e:
        # Deliberate best-effort: decode errors, javalang syntax errors, etc.
        # are reported and turned into an empty vector.
        print(f"[LỖI] Không parse được file: {file_path}, lý do: {e}")
        return []

def build_java_path(java_name, base_path):
    """
    Turn a dotted package + class name into the path of its .java file.

    Example: ``org.apache.tools.ant.input.PropertyFileInputHandler`` becomes
    ``<base_path>/org/apache/tools/ant/input/PropertyFileInputHandler.java``.

    Args:
        java_name: dotted name, every '.' becomes a '/' separator.
        base_path: directory the relative path is joined onto.

    Returns:
        The joined path with every backslash replaced by a forward slash.
    """
    package_parts = java_name.split('.')
    source_file = '/'.join(package_parts) + ".java"
    joined = os.path.join(base_path, source_file)
    return joined.replace('\\', '/')

def extract_tokens_and_save(
    csv_path=r"C:\Users\Acer\Desktop\2024\lab\DP-CNN\ant-1.6.csv",
    base_path=r"C:/Users/Acer/Desktop/2024/lab/DP-CNN/apache-ant-1.6.0-src/apache-ant-1.6.0/src/main",
    output_excel=r"C:\Users\Acer\Desktop\2024\lab\DP-CNN\ant_tokens.xlsx",
):
    """
    Parse every Java file listed in a CSV and save the token vectors to Excel.

    For each value in the CSV's 'name' column the matching .java file under
    ``base_path`` is located with ``build_java_path`` and tokenised with
    ``parse_java_file_to_token_vector``. Files that are missing or fail to
    parse contribute an empty list.

    Args:
        csv_path: CSV file with a 'name' column of dotted class names.
        base_path: root directory of the Java sources.
        output_excel: destination .xlsx file; it gets the columns 'name' and
            'token_vector'.

    All three arguments default to the locations that were previously
    hard-coded, so calling the function with no arguments is unchanged.

    NOTE(review): each token list ends up in a single spreadsheet cell as its
    string form; Excel limits cell text to 32,767 characters, so very long
    token lists may not round-trip -- confirm for large source files.
    """
    df = pd.read_csv(csv_path)

    # Only the 'name' column is needed, so iterate over it directly instead of
    # materialising every row with iterrows().
    all_token_vectors = [
        parse_java_file_to_token_vector(build_java_path(java_name, base_path))
        for java_name in df['name']
    ]

    df_tokens = pd.DataFrame({
        'name': df['name'],
        'token_vector': all_token_vectors
    })

    df_tokens.to_excel(output_excel, index=False)
    print(f"[DONE] ant_tokens.xlsx saved at {output_excel}")

# Run part 1: tokenise the Java sources and write the token spreadsheet.
extract_tokens_and_save()

[DONE] ant_tokens.xlsx saved at C:\Users\Acer\Desktop\2024\lab\DP-CNN\ant_tokens.xlsx


In [None]:
# ===============================
# PART 2: Map tokens to integer IDs and apply padding/truncation
# ===============================
def pad_or_truncate(int_list, fixed_length=213):
    """
    Force a list to exactly ``fixed_length`` items.

    Shorter lists are extended with trailing zeros; lists that are already
    long enough are cut at ``fixed_length``. A new list is returned either
    way, the input is not modified.
    """
    missing = fixed_length - len(int_list)
    if missing > 0:
        return int_list + [0] * missing
    return int_list[:fixed_length]

def map_tokens_and_save(
    input_excel=r"C:\Users\Acer\Desktop\2024\lab\DP-CNN\ant_tokens.xlsx",
    output_csv=r"C:\Users\Acer\Desktop\2024\lab\DP-CNN\final_vectors.csv",
    mapping_csv=r"C:\Users\Acer\Desktop\2024\lab\DP-CNN\token2id_mapping.csv",
    fixed_length=213,
):
    """
    Build a token -> ID mapping and save fixed-length integer vectors.

    Steps:
      1. Read ``input_excel`` and turn each 'token_vector' cell (the string
         form of a list) back into a Python list.
      2. Assign IDs 1..N to the distinct tokens in sorted order; 0 is left
         free and is the value used for padding.
      3. Convert every token list to IDs and pad/truncate it to
         ``fixed_length`` with ``pad_or_truncate``.
      4. Write 'name' + 'int_vector_fixed' to ``output_csv`` and the mapping
         ('token', 'id') to ``mapping_csv``.

    Args:
        input_excel: spreadsheet with 'name' and 'token_vector' columns.
        output_csv: destination for the fixed-length integer vectors.
        mapping_csv: destination for the token -> ID table.
        fixed_length: length every integer vector is forced to.

    All arguments default to the values that were previously hard-coded, so
    calling the function with no arguments is unchanged.
    """
    df = pd.read_excel(input_excel)
    # Cells hold the string form of a list; literal_eval rebuilds the list
    # without executing arbitrary code.
    token_lists = df['token_vector'].apply(ast.literal_eval)

    all_tokens = set()
    for tokens in token_lists:
        all_tokens.update(tokens)

    # Sorting makes the IDs deterministic across runs.
    token2id = {token: idx for idx, token in enumerate(sorted(all_tokens), start=1)}

    int_vectors_fixed = token_lists.apply(
        lambda tokens: pad_or_truncate(
            [token2id.get(t, 0) for t in tokens], fixed_length=fixed_length
        )
    )

    df_final = pd.DataFrame({'name': df['name'], 'int_vector_fixed': int_vectors_fixed})
    df_final.to_csv(output_csv, index=False)
    print(f"[DONE] final_vectors.csv saved at {output_csv}")

    mapping_df = pd.DataFrame(list(token2id.items()), columns=['token', 'id'])
    mapping_df.to_csv(mapping_csv, index=False)
    print(f"[DONE] token2id_mapping.csv saved at {mapping_csv}")

# Run part 2: write the fixed-length integer vectors and the token -> ID table.
map_tokens_and_save()

[DONE] final_vectors.csv saved at C:\Users\Acer\Desktop\2024\lab\DP-CNN\final_vectors.csv
[DONE] token2id_mapping.csv saved at C:\Users\Acer\Desktop\2024\lab\DP-CNN\token2id_mapping.csv


In [None]:
# ===============================
# PART 3: Merge with ant-1.6.csv, convert the bug column to labels, duplicate the buggy rows, save train.csv
# ===============================
def merge_and_duplicate(
    ant_csv=r"C:\Users\Acer\Desktop\2024\lab\DP-CNN\ant-1.6.csv",
    final_vectors_csv=r"C:\Users\Acer\Desktop\2024\lab\DP-CNN\final_vectors.csv",
    output_csv=r"C:\Users\Acer\Desktop\2024\lab\DP-CNN\train.csv",
):
    """
    Join bug labels with the integer vectors and oversample the buggy rows.

    Steps:
      1. Read ``ant_csv`` and relabel its 'bug' column: a value equal to 0
         becomes "clean", anything else becomes "buggy".
      2. Inner-join the ('name', 'bug') pairs with ``final_vectors_csv`` on
         'name'.
      3. Append one extra copy of every "buggy" row after the merged rows, so
         each buggy row appears twice in the result.
      4. Write the result to ``output_csv`` without the index.

    Args:
        ant_csv: CSV with at least the columns 'name' and 'bug'.
        final_vectors_csv: CSV with a 'name' column plus the vector column(s).
        output_csv: destination of the training table.

    All arguments default to the locations that were previously hard-coded,
    so calling the function with no arguments is unchanged.
    """
    df_ant = pd.read_csv(ant_csv)

    # Vectorised relabelling: only values equal to 0 count as clean.
    labels = df_ant['bug'].eq(0).map({True: "clean", False: "buggy"})
    df_ant_minimal = df_ant[['name']].assign(bug=labels)

    df_vectors = pd.read_csv(final_vectors_csv)
    df_merged = pd.merge(df_ant_minimal, df_vectors, on='name', how='inner')

    df_buggy = df_merged[df_merged['bug'] == 'buggy']
    df_merged_doubled = pd.concat([df_merged, df_buggy], ignore_index=True)

    df_merged_doubled.to_csv(output_csv, index=False)
    print(f"[DONE] train.csv saved at {output_csv}")

# Run part 3: build the labelled training table with buggy rows duplicated.
merge_and_duplicate()

[DONE] train.csv saved at C:\Users\Acer\Desktop\2024\lab\DP-CNN\train.csv
