## 1. Load dữ liệu đã xử lí

In [1]:
!pip install nltk




In [2]:
import nltk
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [3]:
# English stop words
stop_words = set(
    ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your',
     'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her',
     'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs',
     'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those',
     'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
     'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if',
     'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
     'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above',
     'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
     'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
     'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some',
     'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
     's', 't', 'can', 'will', 'just', 'don', 'should', 'now', 'd', 'll', 'm', 'o',
     're', 've', 'y', 'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven',
     'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn', 'weren', 'won',
     'wouldn', 'b', 'c', 'e', 'f', 'g', 'h', 'j', 'k', 'l', 'n', 'p', 'q', 'u', 'v',
     'w', 'x', 'z', 'us'])

# Java language keywords
java_keywords = set(
    ['abstract', 'assert', 'boolean', 'break', 'byte', 'case',
     'catch', 'char', 'class', 'const', 'continue', 'default', 'do', 'double',
     'else', 'enum', 'extends', 'false', 'final', 'finally', 'float', 'for', 'goto',
     'if', 'implements', 'import', 'instanceof', 'int', 'interface', 'long',
     'native', 'new', 'null', 'package', 'private', 'protected', 'public', 'return',
     'short', 'static', 'strictfp', 'super', 'switch', 'synchronized', 'this',
     'throw', 'throws', 'transient', 'true', 'try', 'void', 'volatile', 'while'])

from collections import namedtuple
from pathlib import Path

# Dataset root directory (điều chỉnh đường dẫn nếu cần)
_DATASET_ROOT = Path('/content/drive/MyDrive/Colab Notebooks/NLP/Task bug localization/')

Dataset = namedtuple('Dataset', ['name', 'src', 'bug_repo', 'repo_url', 'features'])

# Các dataset được định nghĩa
aspectj = Dataset(
    'aspectj',
    _DATASET_ROOT / 'source files/org.aspectj',
    _DATASET_ROOT / 'bug reports/AspectJ.txt',
    "https://github.com/eclipse/org.aspectj/tree/bug433351.git",
    _DATASET_ROOT / 'bug reports/AspectJ.xlsx'
)

eclipse = Dataset(
    'eclipse',
    _DATASET_ROOT / 'source files/eclipse.platform.ui-johna-402445',
    _DATASET_ROOT / 'bug reports/Eclipse_Platform_UI.txt',
    "https://github.com/eclipse/eclipse.platform.ui.git",
    _DATASET_ROOT / 'bug reports/Eclipse_Platform_UI.xlsx'
)

swt = Dataset(
    'swt',
    _DATASET_ROOT / 'source files/eclipse.platform.swt-xulrunner-31',
    _DATASET_ROOT / 'bug reports/SWT.txt',
    "https://github.com/eclipse/eclipse.platform.swt.git",
    _DATASET_ROOT / 'bug reports/SWT.xlsx'
)

tomcat = Dataset(
    'tomcat',
    _DATASET_ROOT / 'source files/tomcat-7.0.51',
    _DATASET_ROOT / 'bug reports/Tomcat.txt',
    "https://github.com/apache/tomcat.git",
    _DATASET_ROOT / 'bug reports/Tomcat.xlsx'
)

birt = Dataset(
    'birt',
    _DATASET_ROOT / 'source files/birt-20140211-1400',
    _DATASET_ROOT / 'bug reports/Birt.txt',
    "https://github.com/apache/birt.git",
    _DATASET_ROOT / 'bug reports/Birt.xlsx'
)


### Current dataset in use. (change this name to change the dataset)
DATASET = tomcat

class BugReport:
    """Class representing each bug report"""
    __slots__ = ['summary', 'description', 'fixed_files', 'report_time', 'pos_tagged_summary', 'pos_tagged_description','stack_traces','stack_traces_remove']

    def __init__(self, summary, description, fixed_files, report_time):
        self.summary = summary
        self.description = description
        self.fixed_files = fixed_files
        self.report_time = report_time
        self.pos_tagged_summary = None
        self.pos_tagged_description = None
        self.stack_traces = None
        self.stack_traces_remove = None

class SourceFile:
    """Class representing each source file"""
    __slots__ = ['all_content', 'comments', 'class_names', 'attributes', 'method_names', 'variables', 'file_name',
                 'pos_tagged_comments', 'exact_file_name', 'package_name']

    def __init__(self, all_content, comments, class_names, attributes, method_names, variables, file_name,
                 package_name):
        self.all_content = all_content
        self.comments = comments
        self.class_names = class_names
        self.attributes = attributes
        self.method_names = method_names
        self.variables = variables
        self.file_name = file_name
        self.exact_file_name = file_name[0]
        self.package_name = package_name
        self.pos_tagged_comments = None


class Parser:
    """Class containing different parsers"""
    __slots__ = ['name', 'src', 'bug_repo']

    def __init__(self, pro):
        self.name = pro.name
        self.src = pro.src
        self.bug_repo = pro.bug_repo

    def report_parser(self):
        reader = csv.DictReader(open(self.bug_repo, "r"), delimiter="\t")
        bug_reports = OrderedDict()
        # raw_texts = []
        # fixed_files = []
        for line in reader:
            # line["raw_text"] = line["summary"] + ' ' + line["description"]
            line["report_time"] = datetime.strptime(line["report_time"], "%Y-%m-%d %H:%M:%S")
            temp = line["files"].strip().split(".java")
            length = len(temp)
            x = []
            for i, f in enumerate(temp):
                if i == (length - 1):
                    x.append(os.path.normpath(f))
                    continue
                x.append(os.path.normpath(f + ".java"))
            line["files"] = x
            bug_reports[line["bug_id"]] = BugReport(line["summary"], line["description"], line["files"],
                                                    line["report_time"])
        # bug_reports = tsv2dict(self.bug_repo)

        return bug_reports

    def src_parser(self):
        """Parse source code directory of a program and colect its java files"""

        # Gettting the list of source files recursively from the source directory
        src_addresses = glob.glob(str(self.src) + '/**/*.java', recursive=True)
        print(src_addresses)
        # Creating a java lexer instance for pygments.lex() method
        java_lexer = JavaLexer()
        src_files = OrderedDict()
        # src_files = dict()
        # Looping to parse each source file
        for src_file in src_addresses:
            with open(src_file, encoding='latin-1') as file:
                src = file.read()

            # Placeholder for different parts of a source file
            comments = ''
            class_names = []
            attributes = []
            method_names = []
            variables = []

            # Source parsing
            parse_tree = None
            try:
                parse_tree = javalang.parse.parse(src)
                for path, node in parse_tree.filter(javalang.tree.VariableDeclarator):
                    if isinstance(path[-2], javalang.tree.FieldDeclaration):
                        attributes.append(node.name)
                    elif isinstance(path[-2], javalang.tree.VariableDeclaration):
                        variables.append(node.name)
            except:
                pass

            # Triming the source file
            ind = False
            if parse_tree:
                if parse_tree.imports:
                    last_imp_path = parse_tree.imports[-1].path
                    src = src[src.index(last_imp_path) + len(last_imp_path) + 1:]
                elif parse_tree.package:
                    package_name = parse_tree.package.name
                    src = src[src.index(package_name) + len(package_name) + 1:]
                else:  # no import and no package declaration
                    ind = True
            # javalang can't parse the source file
            else:
                ind = True

            # Lexically tokenize the source file
            lexed_src = pygments.lex(src, java_lexer)

            for i, token in enumerate(lexed_src):
                if token[0] in Token.Comment:
                    if ind and i == 0 and token[0] is Token.Comment.Multiline:
                        src = src[src.index(token[1]) + len(token[1]):]
                        continue
                    comments = comments + token[1]
                elif token[0] is Token.Name.Class:
                    class_names.append(token[1])
                elif token[0] is Token.Name.Function:
                    method_names.append(token[1])

            # get the package declaration if exists
            if parse_tree and parse_tree.package:
                package_name = parse_tree.package.name
            else:
                package_name = None

            if self.name == 'aspectj' or 'tomcat' or 'eclipse' or 'swt':
                src_files[os.path.relpath(src_file, start=self.src)] = SourceFile(src, comments, class_names,
                                                                                  attributes, method_names, variables, [
                                                                                      os.path.basename(src_file).split(
                                                                                          '.')[0]], package_name)
            else:
                # If source files has package declaration
                if package_name:
                    src_id = (package_name + '.' + os.path.basename(src_file))
                else:
                    src_id = os.path.basename(src_file)
                src_files[src_id] = SourceFile(src, comments, class_names, attributes, method_names, variables,
                                               [os.path.basename(src_file).split('.')[0]], package_name)
            # print(src_files)
            # print("===========")
        return src_files


class ReportPreprocessing:
    """Class preprocess bug reports"""
    __slots__ = ['bug_reports']

    def __init__(self, bug_reports):
        self.bug_reports = bug_reports

    def extract_stack_traces(self):
        """Extract stack traces from bug reports"""
        pattern = re.compile(r' at (.*?)\((.*?)\)')
        signs = ['.java', 'Unknown Source', 'Native Method']
        for report in self.bug_reports.values():
            st_canid = re.findall(pattern, report.description)
            st = [x for x in st_canid if any(s in x[1] for s in signs)]
            report.stack_traces = st

    def extract_stack_traces_remove(self):
        pattern = re.compile(r' at (.*?)\((.*?)\)')
        signs = ['.java', 'Unknown Source', 'Native Method']
        for report in self.bug_reports.values():
            st_canid = re.findall(pattern, report.description)
            st = [x for x in st_canid if any(s in x[1] for s in signs)]
            at = []
            for x in st:
                if (x[1] == 'Unknown Source'):
                    temp = 'Unknown'
                    y = x[0]+ '(' + temp
                else:
                    y = x[0] + '(' + x[1] + ')'
                at.append(y)
            report.stack_traces_remove = at

    def pos_tagging(self):
        """Extracing specific pos tags from bug reports raw_text"""
        for report in self.bug_reports.values():
            # Tokenizing using word_tokeize for more accurate pos-tagging
            sum_tok = nltk.word_tokenize(report.summary)
            desc_tok = nltk.word_tokenize(report.description)
            sum_pos = nltk.pos_tag(sum_tok)
            desc_pos = nltk.pos_tag(desc_tok)
            report.pos_tagged_summary = [token for token, pos in sum_pos if 'NN' in pos or 'VB' in pos]
            report.pos_tagged_description = [token for token, pos in desc_pos if 'NN' in pos or 'VB' in pos]

    def tokenize(self):
        """Tokenize bug report intro tokens"""
        for report in self.bug_reports.values():
            report.summary = nltk.wordpunct_tokenize(report.summary)
            report.description = nltk.wordpunct_tokenize(report.description)

    def _split_camelcase(self, tokens):
        # copy tokens
        returning_tokens = tokens[:]
        for token in tokens:
            split_tokens = re.split(fr'[{string.punctuation}]+', token)
            # if token is split into some other tokens
            if len(split_tokens) > 1:
                returning_tokens.remove(token)
                # camel case detection for new tokens
                for st in split_tokens:
                    camel_split = inflection.underscore(st).split('_')
                    if len(camel_split) > 1:
                        returning_tokens.append(st)
                        returning_tokens = returning_tokens + camel_split
                    else:
                        returning_tokens.append(st)
            else:
                camel_split = inflection.underscore(token).split('_')
                if len(camel_split) > 1:
                    returning_tokens = returning_tokens + camel_split
        return returning_tokens

    def split_camelcase(self):
        """Split camelcase indentifiers"""
        for report in self.bug_reports.values():
            report.summary = self._split_camelcase(report.summary)
            report.description = self._split_camelcase(report.description)
            report.pos_tagged_summary = self._split_camelcase(report.pos_tagged_summary)
            report.pos_tagged_description = self._split_camelcase(report.pos_tagged_description)

    def normalize(self):
        """remove punctuation, numbers and lowecase conversion"""
        # build a translate table for punctuation and number removal
        punctnum_table = str.maketrans({c: None for c in string.punctuation + string.digits})

        for report in self.bug_reports.values():
            summary_punctnum_rem = [token.translate(punctnum_table) for token in report.summary]
            desc_punctnum_rem = [token.translate(punctnum_table) for token in report.description]
            pos_sum_punctnum_rem = [token.translate(punctnum_table) for token in report.pos_tagged_summary]
            pos_desc_punctnum_rem = [token.translate(punctnum_table) for token in report.pos_tagged_description]
            report.summary = [token.lower() for token in summary_punctnum_rem if token]
            report.description = [token.lower() for token in desc_punctnum_rem if token]
            report.pos_tagged_summary = [token.lower() for token in pos_sum_punctnum_rem if token]
            report.pos_tagged_description = [token.lower() for token in pos_desc_punctnum_rem if token]

    def remove_stopwords(self):
        """removing stop word from tokens"""
        for report in self.bug_reports.values():
            report.summary = [token for token in report.summary if token not in stop_words]
            report.description = [token for token in report.description if token not in stop_words]
            report.pos_tagged_summary = [token for token in report.pos_tagged_summary if token not in stop_words]
            report.pos_tagged_description = [token for token in report.pos_tagged_description if token not in stop_words]

    def remove_java_keywords(self):
        """removing java language keywords from tokens"""
        for report in self.bug_reports.values():
            report.summary = [token for token in report.summary if token not in java_keywords]
            report.description = [token for token in report.description if token not in java_keywords]
            report.pos_tagged_summary = [token for token in report.pos_tagged_summary if token not in java_keywords]
            report.pos_tagged_description = [token for token in report.pos_tagged_description if token not in java_keywords]

    def stem(self):
        # stemming tokens
        stemmer = PorterStemmer()
        for report in self.bug_reports.values():
            report.summary = dict(
                zip(['stemmed', 'unstemmed'], [[stemmer.stem(token) for token in report.summary], report.summary]))
            report.description = dict(
                zip(['stemmed', 'unstemmed'], [[stemmer.stem(token) for token in report.description], report.description]))
            report.pos_tagged_summary = dict(
                zip(['stemmed', 'unstemmed'], [[stemmer.stem(token) for token in report.pos_tagged_summary], report.pos_tagged_summary]))
            report.pos_tagged_description = dict(
                zip(['stemmed', 'unstemmed'], [[stemmer.stem(token) for token in report.pos_tagged_description], report.pos_tagged_description]))

    def preprocess(self):
        self.extract_stack_traces()
        self.extract_stack_traces_remove()
        self.pos_tagging()
        self.tokenize()
        self.split_camelcase()
        self.normalize()
        self.remove_stopwords()
        self.remove_java_keywords()
        self.stem()

class SrcPreprocessing:
    """class to preprocess source code"""
    __slots__ = ['src_files']

    def __init__(self, src_files):
        self.src_files = src_files

    def pos_tagging(self):
        """Extracing specific pos tags from comments"""
        for src in self.src_files.values():
            # tokenize using word_tokenize
            comments_tok = nltk.word_tokenize(src.comments)
            comments_pos = nltk.pos_tag(comments_tok)
            src.pos_tagged_comments = [token for token, pos in comments_pos if 'NN' in pos or 'VB' in pos]

    def tokenize(self):
        """tokenize source code to tokens"""
        for src in self.src_files.values():
            src.all_content = nltk.wordpunct_tokenize(src.all_content)
            src.comments = nltk.wordpunct_tokenize(src.comments)

    def _split_camelcase(self, tokens):
        # copy token
        returning_tokens = tokens[:]
        for token in tokens:
            split_tokens = re.split(fr'[{string.punctuation}]+', token)
            # if token is split into some other tokens
            if len(split_tokens) > 1:
                returning_tokens.remove(token)
                # camelcase defect for new tokens
                for st in split_tokens:
                    camel_split = inflection.underscore(st).split('_')
                    if len(camel_split) > 1:
                        returning_tokens.append(st)
                        returning_tokens = returning_tokens + camel_split
                    else:
                        returning_tokens.append(st)
            else:
                camel_split = inflection.underscore(token).split('_')
                if len(camel_split) > 1:
                    returning_tokens = returning_tokens + camel_split
        return returning_tokens

    def split_camelcase(self):
        # Split camelcase indenti
        for src in self.src_files.values():
            src.all_content = self._split_camelcase(src.all_content)
            src.comments = self._split_camelcase(src.comments)
            src.class_names = self._split_camelcase(src.class_names)
            src.attributes = self._split_camelcase(src.attributes)
            src.method_names = self._split_camelcase(src.method_names)
            src.variables = self._split_camelcase(src.variables)
            src.pos_tagged_comments = self._split_camelcase(src.pos_tagged_comments)

    def normalize(self):
        "remove punctuation, number and lowercase conversion"
        # build a translate table for punctuation and number
        punctnum_table = str.maketrans({c: None for c in string.punctuation + string.digits})
        for src in self.src_files.values():
            content_punctnum_rem = [token.translate(punctnum_table) for token in src.all_content]
            comments_punctnum_rem = [token.translate(punctnum_table) for token in src.comments]
            classnames_punctnum_rem = [token.translate(punctnum_table) for token in src.class_names]
            attributes_punctnum_rem = [token.translate(punctnum_table) for token in src.attributes]
            methodnames_punctnum_rem = [token.translate(punctnum_table) for token in src.method_names]
            variables_punctnum_rem = [token.translate(punctnum_table) for token in src.variables]
            filename_punctnum_rem = [token.translate(punctnum_table) for token in src.file_name]
            pos_comments_punctnum_rem = [token.translate(punctnum_table) for token in src.pos_tagged_comments]

            src.all_content = [token.lower() for token in content_punctnum_rem if token]
            src.comments = [token.lower() for token in comments_punctnum_rem if token]
            src.class_names = [token.lower() for token in classnames_punctnum_rem if token]
            src.attributes = [token.lower() for token in attributes_punctnum_rem if token]
            src.method_names = [token.lower() for token in methodnames_punctnum_rem if token]
            src.variables = [token.lower() for token in variables_punctnum_rem if token]
            src.file_name = [token.lower() for token in filename_punctnum_rem if token]
            src.pos_tagged_comments = [token.lower() for token in pos_comments_punctnum_rem if token]

    def remove_stopwords(self):
        for src in self.src_files.values():
            src.all_content = [token for token in src.all_content if token not in stop_words]
            src.comments = [token for token in src.comments if token not in stop_words]
            src.class_names = [token for token in src.class_names if token not in stop_words]
            src.attributes = [token for token in src.attributes if token not in stop_words]
            src.method_names = [token for token in src.method_names if token not in stop_words]
            src.variables = [token for token in src.variables if token not in stop_words]
            src.file_name = [token for token in src.file_name if token not in stop_words]
            src.pos_tagged_comments = [token for token in src.pos_tagged_comments if token not in stop_words]

    def remove_javakeywords(self):
        for src in self.src_files.values():
            src.all_content = [token for token in src.all_content if token not in java_keywords]
            src.comments = [token for token in src.comments if token not in java_keywords]
            src.class_names = [token for token in src.class_names if token not in java_keywords]
            src.attributes = [token for token in src.attributes if token not in java_keywords]
            src.method_names = [token for token in src.method_names if token not in java_keywords]
            src.variables = [token for token in src.variables if token not in java_keywords]
            src.file_name = [token for token in src.file_name if token not in java_keywords]
            src.pos_tagged_comments = [token for token in src.pos_tagged_comments if token not in java_keywords]

    def stem(self):
        # stemming tokens
        stemmer = PorterStemmer()
        for src in self.src_files.values():
            src.all_content = dict(zip(['stemmed', 'unstemmed'], [[stemmer.stem(token) for token in src.all_content], src.all_content]))
            src.comments = dict(zip(['stemmed', 'unstemmed'], [[stemmer.stem(token) for token in src.comments], src.comments]))
            src.class_names = dict(zip(['stemmed', 'unstemmed'], [[stemmer.stem(token) for token in src.class_names], src.class_names]))
            src.attributes = dict(zip(['stemmed', 'unstemmed'], [[stemmer.stem(token) for token in src.attributes], src.attributes]))
            src.method_names = dict(zip(['stemmed', 'unstemmed'], [[stemmer.stem(token) for token in src.method_names], src.method_names]))
            src.variables = dict(zip(['stemmed', 'unstemmed'], [[stemmer.stem(token) for token in src.variables], src.variables]))
            src.file_name = dict(zip(['stemmed', 'unstemmed'], [[stemmer.stem(token) for token in src.file_name], src.file_name]))
            src.pos_tagged_comments = dict(zip(['stemmed', 'unstemmed'], [[stemmer.stem(token) for token in src.pos_tagged_comments], src.pos_tagged_comments]))


    def preprocess(self):
        self.pos_tagging()
        self.tokenize()
        self.split_camelcase()
        self.normalize()
        self.remove_stopwords()
        self.remove_javakeywords()
        self.stem()

In [4]:
!pip install inflection
import inflection


Collecting inflection
  Downloading inflection-0.5.1-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading inflection-0.5.1-py2.py3-none-any.whl (9.5 kB)
Installing collected packages: inflection
Successfully installed inflection-0.5.1


In [5]:
import nltk
nltk.download('punkt_tab')
import pickle
from google.colab import drive
import csv
from collections import OrderedDict
from datetime import datetime
import re
import string
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Load dữ liệu

In [6]:
import pickle

# Đường dẫn đến các file pickle
file_paths = {
    'aspectj': '/kaggle/input/bug-localization-data/aspectj_src_processed.pkl',
    'eclipse': '/kaggle/input/bug-localization-data/eclipse_src_processed.pkl',
    'swt': '/kaggle/input/bug-localization-data/swt_src_processed.pkl',
    'tomcat': '/kaggle/input/bug-localization-data/tomcat_src_processed.pkl',
    'birt': '/kaggle/input/bug-localization-data/birt_src_processed.pkl'
}

# Load từng file và lưu vào các biến tương ứng
datasets = {}

for name, path in file_paths.items():
    with open(path, 'rb') as f:
        datasets[name] = pickle.load(f)

# Kiểm tra dữ liệu đã được load vào các biến
for name, data in datasets.items():
    print(f"Data for {name}:")


Data for aspectj:
Data for eclipse:
Data for swt:
Data for tomcat:
Data for birt:


In [7]:
eclipse_src = datasets['eclipse']
birt_src = datasets['birt']
swt_src = datasets['swt']
tomcat_src = datasets['tomcat']
aspectj_src = datasets['aspectj']

In [8]:
# Load dữ liệu từ các file pickle đã lưu
file_paths = {
    'aspectj': '/kaggle/input/bug-localization-data/aspectj_reports_processed.pkl',
    'eclipse': '/kaggle/input/bug-localization-data/eclipse_reports_processed.pkl',
    'swt': '/kaggle/input/bug-localization-data/swt_reports_processed.pkl',
    'tomcat': '/kaggle/input/bug-localization-data/tomcat_reports_processed.pkl',
    'birt': '/kaggle/input/bug-localization-data/birt_reports_processed.pkl'
}

# Load từng dataset và lưu vào các biến
all_processed_reports = {}

for name, path in file_paths.items():
    with open(path, 'rb') as f:
        all_processed_reports[name] = pickle.load(f)

# Kiểm tra dữ liệu đã load vào
for dataset, reports in all_processed_reports.items():
    print(f"Processed reports for {dataset}:")

Processed reports for aspectj:
Processed reports for eclipse:
Processed reports for swt:
Processed reports for tomcat:
Processed reports for birt:


In [9]:
eclipse_reports = all_processed_reports['eclipse']
birt_reports = all_processed_reports['birt']
swt_reports = all_processed_reports['swt']
tomcat_reports = all_processed_reports['tomcat']
aspectj_reports = all_processed_reports['aspectj']

## 2. Xử lí data, gán nhãn
- Sắp xếp bug report theo thời gian (report_time)
- Chia thành 10 folds
- Tạo training/test dataset theo kiểu fold i → fold i+1
- Gán nhãn cho từng cặp (bug report, source file)

In [10]:
# B1: Lấy danh sách (bug_id, bug_report), sau đó sắp xếp theo report_time
sorted_bug_reports = sorted(aspectj_reports.items(), key=lambda x: x[1].report_time)
data_src = aspectj_src

In [11]:
def split_into_folds(sorted_reports, num_folds=10):
    fold_size = len(sorted_reports) // num_folds
    folds = [sorted_reports[i*fold_size:(i+1)*fold_size] for i in range(num_folds)]

    # Nếu còn dư, rải đều vào các fold đầu
    remainder = sorted_reports[num_folds*fold_size:]
    for i, extra in enumerate(remainder):
        folds[i].append(extra)
    return folds

data_folds = split_into_folds(sorted_bug_reports, num_folds=3)


In [12]:
i = 0 # thử với fold 0 → 1
train_fold = data_folds[i]
test_fold = data_folds[i+1]

In [13]:
import random

def generate_balanced_pairs(bug_fold, source_files, num_negatives_per_positive=50):
    data = []
    for bug_id, bug in bug_fold:
        # Danh sách file chứa bug (poszqitive)
        positive_paths = set(bug.fixed_files)
        positive = [
            (bug_id, bug, src_path, source_files[src_path], 1)
            for src_path in positive_paths if src_path in source_files
        ]

        # Danh sách file còn lại để lấy negative
        all_paths = list(source_files.keys())
        negative_paths = list(set(all_paths) - positive_paths)
        sampled_negatives = random.sample(negative_paths, min(num_negatives_per_positive * len(positive), len(negative_paths)))

        negative = [
            (bug_id, bug, src_path, source_files[src_path], 0)
            for src_path in sampled_negatives if src_path in source_files
        ]

        data.extend(positive + negative)
    return data
def generate_all_negatives_pairs(bug_fold, source_files):
    data = []
    for bug_id, bug in bug_fold:
        positive_paths = set(bug.fixed_files)
        positive = [
            (bug_id, bug, src_path, source_files[src_path], 1)
            for src_path in positive_paths if src_path in source_files
        ]

        all_paths = list(source_files.keys())
        negative_paths = list(set(all_paths) - positive_paths)

        negative = [
            (bug_id, bug, src_path, source_files[src_path], 0)
            for src_path in negative_paths if src_path in source_files
        ]

        data.extend(positive + negative)
    return data


train_pairs = generate_balanced_pairs(train_fold, data_src, num_negatives_per_positive=50)
#test_pairs = generate_balanced_pairs(test_fold, data_src, num_negatives_per_positive=50)
test_pairs = generate_all_negatives_pairs(test_fold, data_src)



Xử lí mất cân bằng

In [14]:
def compute_stats(pairs):
    total = len(pairs)
    pos = sum(1 for _, _, _, _, label in pairs if label == 1)
    neg = total - pos
    ratio = pos / total if total > 0 else 0
    return total, pos, neg, ratio

  
total, pos, neg, ratio = compute_stats(train_pairs)
print("📊 Train Set:")
print(f"  ➤ Tổng cặp: {total}")
print(f"  ✅ Positive (label=1): {pos}")
print(f"  ❌ Negative (label=0): {neg}")
print(f"  ⚖️ Tỷ lệ positive: {ratio:.4f}")

total, pos, neg, ratio = compute_stats(test_pairs)
print("\n🧪 Test Set:")
print(f"  ➤ Tổng cặp: {total}")
print(f"  ✅ Positive (label=1): {pos}")
print(f"  ❌ Negative (label=0): {neg}")
print(f"  ⚖️ Tỷ lệ positive: {ratio:.4f}")


📊 Train Set:
  ➤ Tổng cặp: 8619
  ✅ Positive (label=1): 169
  ❌ Negative (label=0): 8450
  ⚖️ Tỷ lệ positive: 0.0196

🧪 Test Set:
  ➤ Tổng cặp: 1368180
  ✅ Positive (label=1): 168
  ❌ Negative (label=0): 1368012
  ⚖️ Tỷ lệ positive: 0.0001


### Hàm 1: Tạo batches có bootstrapping (luôn chứa ít nhất 1 positive sample)

In [15]:
import random
def create_bootstrapped_batches(pairs, batch_size=128):
    # Tách positive và negative
    positives = [p for p in pairs if p[-1] == 1]
    negatives = [p for p in pairs if p[-1] == 0]

    batches = []
    # Tính số batch có thể tạo
    total_batches = len(pairs) // batch_size

    for _ in range(total_batches):
        # Luôn chọn ít nhất 1 positive
        pos_sample = random.choice(positives)

        # Chọn ngẫu nhiên batch_size - 1 negative samples
        neg_samples = random.sample(negatives, batch_size - 1)

        # Gộp lại, shuffle để positive không đứng đầu
        batch = [pos_sample] + neg_samples
        random.shuffle(batch)

        batches.append(batch)

    return batches


### Focal Loss Function

In [16]:

def focal_loss(predictions, targets, alpha=0.999, gamma=2.0, eps=1e-6):
    """
    predictions: tensor (batch_size,) - output sigmoid from model
    targets: tensor (batch_size,) - true labels (0 or 1)
    """
    # Avoid log(0)
    predictions = predictions.clamp(min=eps, max=1.0 - eps)

    # Compute focal loss
    loss = -alpha * (1 - predictions)**gamma * targets * predictions.log() \
           - (1 - alpha) * predictions**gamma * (1 - targets) * (1 - predictions).log()
    return loss.mean()

# 4. Trích xuất đặc trưng

In [17]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Hàm xử lý text gộp lại từ bug report
def bug_to_text(bug):
    summary = bug.summary['unstemmed'] if isinstance(bug.summary, dict) else bug.summary
    desc = bug.description['unstemmed'] if isinstance(bug.description, dict) else bug.description
    return " ".join(summary + desc)

# Hàm xử lý text từ source file
def src_to_text(src):
    content = src.all_content['unstemmed'] if isinstance(src.all_content, dict) else src.all_content
    comments = src.comments['unstemmed'] if isinstance(src.comments, dict) else src.comments
    return " ".join(content + comments)


## Đặc trưng 1: Tính toán độ tương đồng từ vựng (lexical similarity)
- Phương pháp: sử dụng TF-IDF và cosine similarity.
- Input: Cặp dữ liệu (bug report, source file)
- Output: mảng numpy chứa các giá trị độ tương đồng cosine giữa bug report và source file cho mỗi cặp.

In [18]:
def compute_lexical_similarity(pairs):
    bug_texts = [bug_to_text(bug) for _, bug, _, _, _ in pairs]
    src_texts = [src_to_text(src) for _, _, _, src, _ in pairs]

    # Gộp cả bug + src lại để fit chung vectorizer
    combined = bug_texts + src_texts
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(combined)

    # Tách riêng lại từng phần
    bug_vecs = tfidf_matrix[:len(pairs)]
    src_vecs = tfidf_matrix[len(pairs):]

    # Tính cosine cho từng cặp (theo hàng tương ứng)
    similarities = cosine_similarity(bug_vecs, src_vecs).diagonal()

    return similarities

# Demo: tính feature lexical similarity cho train_pairs (giới hạn 500 mẫu vì tốc độ)
sampled_pairs = train_pairs[:500]
lexical_sim = compute_lexical_similarity(sampled_pairs)

# Trả về dưới dạng numpy array
lexical_sim[:10]  # Trích 10 giá trị đầu tiên để test

array([0.00133791, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.0100989 , 0.00052573, 0.00201953, 0.        ])

In [19]:
glove_path = "/kaggle/input/glove-embedding/glove.6B.100d.txt"
# Load GloVe 100d vào dictionary
import numpy as np

def load_glove_embeddings(filepath):
    embeddings = {}
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.strip().split()
            word = values[0]
            vector = np.array(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings
    
glove_embeddings = load_glove_embeddings(glove_path)

## Đặc trưng 2: Tính toán độ tương đồng ngữ nghĩa (semantic similarity)
- Phương pháp: TF-IDF weighted average của GloVe vectors và cosine similarity
- Input:  (bug report, source file).
- Output: Một mảng numpy chứa các giá trị độ tương đồng cosine giữa bug report và source file cho mỗi cặp, dựa trên GloVe vectors và trọng số TF-IDF.

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

def compute_semantic_similarity(pairs, glove_dict, dim=100):
    bug_texts = [bug_to_text(bug) for _, bug, _, _, _ in pairs]
    src_texts = [src_to_text(src) for _, _, _, src, _ in pairs]

    # Dùng TF-IDF để lấy trọng số từ
    tfidf = TfidfVectorizer()
    tfidf.fit(bug_texts + src_texts)
    vocab = tfidf.vocabulary_
    idf_weights = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))

    def embed_text(text):
        tokens = text.split()
        vecs = []
        weights = []
        for token in tokens:
            if token in glove_dict and token in vocab:
                vecs.append(glove_dict[token])
                weights.append(idf_weights[token])
        if not vecs:
            return np.zeros(dim)
        vecs = np.array(vecs)
        weights = np.array(weights).reshape(-1, 1)
        weighted_vecs = vecs * weights
        return weighted_vecs.sum(axis=0) / weights.sum()

    # Tính vector trung bình cho bug và src
    bug_vecs = [embed_text(text) for text in bug_texts]
    src_vecs = [embed_text(text) for text in src_texts]

    # Tính cosine similarity giữa từng cặp
    similarities = [cosine_similarity([b], [s])[0][0] for b, s in zip(bug_vecs, src_vecs)]

    return np.array(similarities)
sampled_pairs = train_pairs[:500]
# Demo trên 500 cặp như lexical
semantic_sim = compute_semantic_similarity(sampled_pairs, glove_embeddings)
semantic_sim[:10]


array([0.77527147, 0.69421516, 0.        , 0.50986771, 0.74005518,
       0.06775918, 0.81621045, 0.77655908, 0.67360652, 0.48345688])

### Đặc trưng 3: Similar Bug Report Score 

→ Kiểm tra xem bug report này có giống **những bug report cũ từng sửa cùng file đó** không?

- `build_bug_fix_history(pairs)` → XD lịch sử chỉnh sửa theo từng file
- `compute_similar_bug_score(pairs, history)`
    - Input: pairs, history
    - So sánh bug hiện tại và bug cũ:
    
    cosine_similarity(TfidfVectorizer().fit_transform([bug_now, bug_old]))[0, 1]
    
    - Lấy giá trị tương đồng cao nhất vừa tìm được

In [21]:
from datetime import datetime

# Bổ trợ: tạo map file_path -> list of (bug_id, report_time, bug_text)
def build_bug_fix_history(pairs):
    history = {}
    for bug_id, bug, src_path, _, label in pairs:
        if label == 1:  # chỉ tính các bug thật sự sửa file
            if src_path not in history:
                history[src_path] = []
            history[src_path].append((bug_id, bug.report_time, bug_to_text(bug)))
    return history

# Đặc trưng 3: Similar Bug Report Score
def compute_similar_bug_score(pairs, history):
    scores = []
    for bug_id, bug, src_path, _, _ in pairs:
        current_time = bug.report_time
        current_text = bug_to_text(bug)

        sim_scores = []
        if src_path in history:
            for hist_bug_id, hist_time, hist_text in history[src_path]:
                if hist_time < current_time:  # chỉ tính bug trong quá khứ
                    sim = cosine_similarity(
                        TfidfVectorizer().fit_transform([current_text, hist_text])
                    )[0, 1]
                    sim_scores.append(sim)
        scores.append(max(sim_scores) if sim_scores else 0.0)
    return np.array(scores)


### Đặc trung 4: Time Since Last Fix (ngày, normalize)
- Kiểm tra với mỗi `(bug report, source file)` xem từng được sửa trước đó không và lần cuối khi nào
    - Đã lâu k sửa → Ít lỗi → Điểm thấp
    - Mới sửa → có thể liên quan tới lỗi → Điểm cao
- Cách hđ:
    - Tìm thời điểm bug current_time
    - Tìm history các lần sửa file trong quá khứ
    - Tính khoảng cách time giữa current và history gần nhất
    - Chưa sửa → Gán số delta_days=9999
    - Chuẩn hoá

In [22]:
# Đặc trưng 4: Time Since Last Fix (ngày, normalize)
def compute_time_since_last_fix(pairs, history):
    scores = []
    for _, bug, src_path, _, _ in pairs:
        current_time = bug.report_time
        if src_path in history:
            past_times = [hist_time for _, hist_time, _ in history[src_path] if hist_time < current_time]
            if past_times:
                delta_days = (current_time - max(past_times)).days
            else:
                delta_days = 9999  # Cực lớn nếu chưa từng sửa
        else:
            delta_days = 9999
        scores.append(delta_days)

    # Normalize về [0,1]
    max_days = max(scores) if max(scores) != 0 else 1  # Tránh chia cho 0

    return np.array([1 - (s / max_days) for s in scores])



### Đặc trưng 5: Fix Frequency (số lần bị sửa trong quá khứ, normalize)


- Kiểm tra xme mỗi cặp được sửa bao nhiêu lần

→ Sửa nhiều → File dễ dính lỗi → Điểm cao

In [23]:
# Đặc trưng 5: Fix Frequency (số lần bị sửa trong quá khứ, normalize)
def compute_fix_frequency(pairs, history):
    scores = []
    for _, bug, src_path, _, _ in pairs:
        current_time = bug.report_time
        if src_path in history:
            past_fixes = [1 for _, hist_time, _ in history[src_path] if hist_time < current_time]
            freq = len(past_fixes)
        else:
            freq = 0
        scores.append(freq)
    # Normalize về [0,1] an toàn
    max_freq = max(scores)
    max_freq = max(max_freq, 1)  # tránh chia 0
    return np.array([s / max_freq for s in scores])


# Dùng cho 500 cặp mẫu
sampled_pairs = train_pairs[:5000]
bug_history = build_bug_fix_history(train_pairs)

similar_bug_score = compute_similar_bug_score(sampled_pairs, bug_history)
time_since_last_fix = compute_time_since_last_fix(sampled_pairs, bug_history)
fix_frequency = compute_fix_frequency(sampled_pairs, bug_history)

# Trích 5 giá trị đầu mỗi feature
similar_bug_score[:50], time_since_last_fix[:50], fix_frequency[:50]

(array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]))

# 4. Quá trình huấn luyện

## 4.1 Tạo ma trận train, test

In [24]:
# Gộp 5 đặc trưng lại thành feature matrix X (n_samples, 5)
def build_feature_matrix(pairs, glove_dict, history):
    lexical = compute_lexical_similarity(pairs)
    semantic = compute_semantic_similarity(pairs, glove_dict)
    similar_score = compute_similar_bug_score(pairs, history)
    recency = compute_time_since_last_fix(pairs, history)
    freq = compute_fix_frequency(pairs, history)

    # Gộp lại theo chiều dọc → ma trận (n_samples, 5)
    X = np.stack([lexical, semantic, similar_score, recency, freq], axis=1)

    return X

# Tạo nhãn y
def get_labels(pairs):
    return np.array([label for *_, label in pairs])

# Tạo dữ liệu train từ sampled_pairs
X_train = build_feature_matrix(train_pairs, glove_embeddings, bug_history)
y_train = get_labels(train_pairs)

# In shape để xác nhận
X_train.shape, y_train.shape



((8619, 5), (8619,))

In [25]:
def build_feature_matrix_batch(pairs_batch, glove_dict, history):
    lexical = compute_lexical_similarity(pairs_batch)
    semantic = compute_semantic_similarity(pairs_batch, glove_dict)
    similar_score = compute_similar_bug_score(pairs_batch, history)
    recency = compute_time_since_last_fix(pairs_batch, history)
    freq = compute_fix_frequency(pairs_batch, history)

    # Stack theo chiều dọc → (batch_size, 5)
    X = np.stack([lexical, semantic, similar_score, recency, freq], axis=1)
    return X

def get_labels(pairs):
    return np.array([label for *_, label in pairs])

In [26]:
def pair_generator(pairs, batch_size, glove_dict, history):
    for i in range(0, len(pairs), batch_size):
        batch = pairs[i:i + batch_size]
        X = build_feature_matrix_batch(batch, glove_dict, history)
        y = np.array([label for *_, label in batch])
        yield torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)


In [27]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset
# Chuẩn bị Dataset & Dataloader từ numpy
def create_dataloader(X, y, batch_size=128):
    X_tensor = torch.tensor(X, dtype=torch.float32)
    y_tensor = torch.tensor(y, dtype=torch.float32)
    dataset = TensorDataset(X_tensor, y_tensor)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

## 4.2 Xây dựng mô hình

In [28]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset

# Định nghĩa mô hình DNN giống bài báo
import torch
import torch.nn as nn

class BugLocalization(nn.Module):
        def __init__(self, input_dim=5, hidden_dims=[128, 64], output_dim=1):
            super(BugLocalization, self).__init__()

            # Define a series of fully connected (Dense) layers
            self.fc1 = nn.Linear(input_dim, hidden_dims[0])  # First hidden layer
            self.fc2 = nn.Linear(hidden_dims[0], hidden_dims[1])  # Second hidden layer
            self.fc3 = nn.Linear(hidden_dims[1], output_dim)  # Output layer

            # Define activation function (ReLU for hidden layers and Sigmoid for output)
            self.relu = nn.ReLU()
            self.sigmoid = nn.Sigmoid()

        def forward(self, x):
        # Forward pass through the DNN
            x = self.relu(self.fc1(x))  # Pass through the first hidden layer with ReLU activation
            x = self.relu(self.fc2(x))  # Pass through the second hidden layer with ReLU activation
            x = self.sigmoid(self.fc3(x))  # Output layer with Sigmoid activation for binary classification
            return x.squeeze()  # Remove extra dimension from the output (as it's a single value per input)


        
# Định nghĩa focal loss
class FocalLoss(nn.Module):
    def __init__(self, alpha=0.999, gamma=2.0, eps=1e-6):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.eps = eps

    def forward(self, preds, targets):
        preds = preds.clamp(min=self.eps, max=1. - self.eps)
        loss = -self.alpha * (1 - preds) ** self.gamma * targets * torch.log(preds) \
               - (1 - self.alpha) * preds ** self.gamma * (1 - targets) * torch.log(1 - preds)
        return loss.mean()


# Huấn luyện mô hình
def train_model_generator(model, train_gen, epochs=10, lr=1e-3):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = FocalLoss()

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch_X, batch_y in train_gen:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

            # 🧹 Dọn bộ nhớ mỗi batch
            del batch_X, batch_y, outputs, loss
            torch.cuda.empty_cache()
            import gc; gc.collect()

        print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss:.4f}")



In [29]:
# In thử 1 batch đầu
for X_batch, y_batch in pair_generator(train_pairs, 128, glove_embeddings, bug_history):
    print("👉 Feature Sample:", X_batch[2])
    print("👉 Label Sample:", y_batch[2])
    break


👉 Feature Sample: tensor([0., 0., 0., 0., 0.])
👉 Label Sample: tensor(0.)


In [30]:
def test_pair_generator(pairs, batch_size, glove_dict, history):
    for i in range(0, len(pairs), batch_size):
        batch = pairs[i:i + batch_size]
        X = build_feature_matrix_batch(batch, glove_dict, history)
        yield X


In [31]:

# Đánh giá các chỉ số (MAP, MRR, Top-k)
def compute_topk_accuracy(y_true, y_scores, k=10):
    bug_to_scores = {}
    for (bug_id, _, src_path, _, label), score in zip(test_pairs, y_scores):
        if bug_id not in bug_to_scores:
            bug_to_scores[bug_id] = []
        bug_to_scores[bug_id].append((score, label))

    correct_at_k = 0
    total = 0

    for bug_id, entries in bug_to_scores.items():
        sorted_entries = sorted(entries, key=lambda x: x[0], reverse=True)
        top_k = sorted_entries[:k]
        if any(label == 1 for _, label in top_k):
            correct_at_k += 1
        total += 1

    return correct_at_k / total if total > 0 else 0

# MRR (Mean Reciprocal Rank)
def mean_reciprocal_rank(pairs, scores):
    bug_to_scores = {}
    for (bug_id, _, _, _, label), score in zip(pairs, scores):
        if bug_id not in bug_to_scores:
            bug_to_scores[bug_id] = []
        bug_to_scores[bug_id].append((score, label))

    rr_sum = 0
    count = 0
    for bug_id, ranked in bug_to_scores.items():
        ranked = sorted(ranked, key=lambda x: x[0], reverse=True)
        for idx, (_, label) in enumerate(ranked):
            if label == 1:
                rr_sum += 1 / (idx + 1)
                break
        count += 1
    return rr_sum / count if count > 0 else 0

In [32]:
def run_kfold_training_and_eval(folds, source_files, glove_dict, k=3):
    results = {
        "fold": [],
        "MAP": [],
        "MRR": [],
        "Top1": [],
        "Top2": [],
        "Top3": [],
        "Top4": [],
        "Top5": [],
        "Top10": [],
        "Top15": []
    }

    for i in range(k - 1):
        print(f"\n📦 Fold {i} ➤ {i+1}")
        train_fold = folds[i]
        test_fold = folds[i + 1]

        # ✅ Train: lấy một số lượng negative cố định
        train_pairs = generate_balanced_pairs(train_fold, source_files, num_negatives_per_positive=50)

        # ✅ Test: lấy full negative (không sampling)
        test_pairs = generate_all_negatives_pairs(test_fold, source_files)

        # Bỏ qua nếu không có positive nào
        if sum(1 for p in train_pairs if p[-1] == 1) < 1:
            print("⚠️ Bỏ qua do quá ít positive samples")
            continue

        # ✅ Xây history từ train
        bug_history = build_bug_fix_history(train_pairs)

        # ✅ Khởi tạo model, optimizer, loss
        model = BugLocalization(input_dim=5).to(device)  # ⚠️ sửa lại input_dim nếu dùng đủ 5 đặc trưng
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        criterion = FocalLoss(alpha=0.25)

        # ✅ Train bằng generator
        model.train()
        for epoch in range(10):
            train_gen = pair_generator(train_pairs, batch_size=128, glove_dict=glove_dict, history=bug_history)
            total_loss = 0
            for X_batch, y_batch in train_gen:
                X_batch = X_batch.to(device)
                y_batch = y_batch.to(device)

                optimizer.zero_grad()
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

                # Clean up
                del X_batch, y_batch, outputs, loss
                torch.cuda.empty_cache()
                import gc; gc.collect()

            print(f"Epoch {epoch+1}: Loss = {total_loss:.4f}")

        # ✅ Dự đoán theo batch
        bug_history_test = build_bug_fix_history(test_pairs)
        y_test = get_labels(test_pairs)
        y_pred_probs = []

        model.eval()
        with torch.no_grad():
            for X_batch in test_pair_generator(test_pairs, batch_size=128, glove_dict=glove_dict, history=bug_history_test):
                X_tensor = torch.tensor(X_batch, dtype=torch.float32).to(device)
                batch_probs = model(X_tensor).cpu().numpy()
                y_pred_probs.extend(batch_probs)

        # ✅ Tính các chỉ số
        map_score = average_precision_score(y_test, y_pred_probs)
        mrr_score = mean_reciprocal_rank(test_pairs, y_pred_probs)
        top1 = compute_topk_accuracy(y_test, y_pred_probs, k=1)
        top2 = compute_topk_accuracy(y_test, y_pred_probs, k=2)
        top3 = compute_topk_accuracy(y_test, y_pred_probs, k=3)
        top4 = compute_topk_accuracy(y_test, y_pred_probs, k=4)
        top5 = compute_topk_accuracy(y_test, y_pred_probs, k=5)
        top10 = compute_topk_accuracy(y_test, y_pred_probs, k=10)
        top15 = compute_topk_accuracy(y_test, y_pred_probs, k=15)

        print(f"✅ Fold {i + 1} Results:")
        print(f"  ➤ MAP:   {map_score:.4f}")
        print(f"  ➤ MRR:   {mrr_score:.4f}")
        print(f"  ➤ Top@1: {top1:.4f} | Top@2: {top2:.4f} | Top@3: {top3:.4f}")
        print(f"  ➤ Top@4: {top4:.4f} | Top@5: {top5:.4f} | Top@10: {top10:.4f} | Top@15: {top15:.4f}")

        # ✅ Ghi lại kết quả
        results["fold"].append(i)
        results["MAP"].append(map_score)
        results["MRR"].append(mrr_score)
        results["Top1"].append(top1)
        results["Top2"].append(top2)
        results["Top3"].append(top3)
        results["Top4"].append(top4)
        results["Top5"].append(top5)
        results["Top10"].append(top10)
        results["Top15"].append(top15)

    return results


In [None]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim  # ✅ phần bị thiếu

# Run K-fold training and evaluation
full_results = run_kfold_training_and_eval(data_folds, data_src, glove_embeddings)

# Output full results
print("\nFull Results:")
for key, value in full_results.items():
    print(f"{key}: {value}")



📦 Fold 0 ➤ 1
Epoch 1: Loss = 2.6087
Epoch 2: Loss = 0.5380
Epoch 3: Loss = 0.4695
Epoch 4: Loss = 0.4240
Epoch 5: Loss = 0.3950
Epoch 6: Loss = 0.3853
Epoch 7: Loss = 0.3828
Epoch 8: Loss = 0.3817
Epoch 9: Loss = 0.3812
Epoch 10: Loss = 0.3807


In [None]:

# In kết quả tổng hợp sau khi chạy tất cả folds
print("\n📊 Kết quả tổng hợp:")
for i in range(len(full_results["fold"])):
    print(f"Fold {full_results['fold'][i]}:")
    print(f"  ➤ MAP: {full_results['MAP'][i]:.4f}")
    print(f"  ➤ MRR: {full_results['MRR'][i]:.4f}")
    print(f"  ➤ Top1: {full_results['Top1'][i]:.4f}")
    print(f"  ➤ Top2: {full_results['Top2'][i]:.4f}")
    print(f"  ➤ Top3: {full_results['Top3'][i]:.4f}")
    print(f"  ➤ Top4: {full_results['Top4'][i]:.4f}")
    print(f"  ➤ Top5: {full_results['Top5'][i]:.4f}")
    print(f"  ➤ Top10: {full_results['Top10'][i]:.4f}")
    print(f"  ➤ Top15: {full_results['Top15'][i]:.4f}")

# Tính trung bình cho tất cả các chỉ số
mean_map = np.mean(full_results["MAP"])
mean_mrr = np.mean(full_results["MRR"])
mean_top1 = np.mean(full_results["Top1"])
mean_top2 = np.mean(full_results["Top2"])
mean_top3 = np.mean(full_results["Top3"])
mean_top4 = np.mean(full_results["Top4"])
mean_top5 = np.mean(full_results["Top5"])
mean_top10 = np.mean(full_results["Top10"])
mean_top15 = np.mean(full_results["Top15"])

# In kết quả trung bình
print("\n📊 Kết quả trung bình trên toàn bộ k-folds:")
print(f"  ➤ MAP: {mean_map:.4f}")
print(f"  ➤ MRR: {mean_mrr:.4f}")
print(f"  ➤ Top1: {mean_top1:.4f}")
print(f"  ➤ Top2: {mean_top2:.4f}")
print(f"  ➤ Top3: {mean_top3:.4f}")
print(f"  ➤ Top4: {mean_top4:.4f}")
print(f"  ➤ Top5: {mean_top5:.4f}")
print(f"  ➤ Top10: {mean_top10:.4f}")
print(f"  ➤ Top15: {mean_top15:.4f}")



In [None]:
import pandas as pd

# Dữ liệu SOTA: ImbalancedBugLoc
data = {
    "Project": ["AspectJ", "Tomcat", "Eclipse", "SWT", "Birt"],
    "Top1_SOTA": [52.5, 53.2, 48.1, 40.2, 28.3],
    "Top2_SOTA": [68.7, 65.5, 62.1, 54.9, 39.3],
    "Top3_SOTA": [77.2, 71.0, 68.8, 64.2, 45.7],
    "Top4_SOTA": [81.0, 75.0, 73.0, 69.3, 51.0],
    "Top5_SOTA": [83.8, 78.3, 76.7, 73.4, 53.6],
    "Top10_SOTA": [89.0, 85.6, 84.7, 84.8, 63.2],
    "Top15_SOTA": [91.5, 88.9, 87.8, 89.1, 69.2],
    "MRR_SOTA": [0.66, 0.64, 0.60, 0.55, 0.40],
    "MAP_SOTA": [0.50, 0.59, 0.54, 0.50, 0.32],
    "Top1_New": [61.90, 56.75, 66.30, 60.63, 45.97],
    "Top2_New": [71.13, 69.58, 78.20, 74.70, 59.04],
    "Top3_New": [76.19, 77.48, 83.35, 81.28, 66.60],
    "Top4_New": [80.06, 82.71, 86.71, 84.77, 71.74],
    "Top5_New": [82.74, 85.79, 89.13, 87.49, 75.62],
    "Top10_New": [88.39, 92.90, 94.38, 93.91, 85.76],
    "Top15_New": [91.07, 94.90, 96.19, 96.07, 90.50],
    "MRR_New": [0.7109, 0.6899, 0.7644, 0.7183, 0.5921],
    "MAP_New": [0.5367, 0.4946, 0.5692, 0.4669, 0.3734]
}

# Tạo DataFrame từ dữ liệu
df = pd.DataFrame(data)

# Tách thành hai bảng: SOTA và New Model
df_sota = df[["Project", "Top1_SOTA", "Top2_SOTA", "Top3_SOTA", "Top4_SOTA", "Top5_SOTA", 
              "Top10_SOTA", "Top15_SOTA", "MRR_SOTA", "MAP_SOTA"]].copy()
df_sota["Model"] = "SOTA"

df_new = df[["Project", "Top1_New", "Top2_New", "Top3_New", "Top4_New", "Top5_New", 
             "Top10_New", "Top15_New", "MRR_New", "MAP_New"]].copy()
df_new["Model"] = "New Model"

# Đổi tên cột giống bảng trong ảnh
rename_cols = {
    "Top1_SOTA": "1", "Top2_SOTA": "2", "Top3_SOTA": "3", "Top4_SOTA": "4",
    "Top5_SOTA": "5", "Top10_SOTA": "10", "Top15_SOTA": "15", "MRR_SOTA": "MRR", "MAP_SOTA": "MAP",
    "Top1_New": "1", "Top2_New": "2", "Top3_New": "3", "Top4_New": "4",
    "Top5_New": "5", "Top10_New": "10", "Top15_New": "15", "MRR_New": "MRR", "MAP_New": "MAP"
}

df_sota.rename(columns=rename_cols, inplace=True)
df_new.rename(columns=rename_cols, inplace=True)

# Gộp lại
df_combined = pd.concat([df_sota, df_new], axis=0)
df_combined = df_combined.sort_values(by=["Project", "Model"]).reset_index(drop=True)

# Đưa cột 'Model' về sau 'Project'
cols = df_combined.columns.tolist()
cols.insert(1, cols.pop(cols.index('Model')))
df_combined = df_combined[cols]

# Hiển thị kết quả
df_combined


In [None]:
# Xuất ra file Excel
df_combined.to_excel("model_comparison.xlsx", index=False)


In [None]:
# Tạo lại df_combined để cập nhật MAP và MRR nhân 100
df_combined_scaled = df_combined.copy()
df_combined_scaled["MAP"] *= 100
df_combined_scaled["MRR"] *= 100

# Vẽ lại 5 biểu đồ cột với MAP và MRR nhân 100
projects = df_combined_scaled["Project"].unique()
metrics = ["1", "2", "3", "4", "5", "10", "15", "MRR", "MAP"]

plt.figure(figsize=(20, 25))

for i, project in enumerate(projects, 1):
    plt.subplot(3, 2, i)
    sota_vals = df_combined_scaled[(df_combined_scaled["Project"] == project) & (df_combined_scaled["Model"] == "SOTA")][metrics].values.flatten()
    new_vals = df_combined_scaled[(df_combined_scaled["Project"] == project) & (df_combined_scaled["Model"] == "New Model")][metrics].values.flatten()
    
    x = range(len(metrics))
    bar_width = 0.35

    plt.bar([xi - bar_width/2 for xi in x], sota_vals, width=bar_width, label="SOTA")
    plt.bar([xi + bar_width/2 for xi in x], new_vals, width=bar_width, label="New Model")

    plt.title(f"So sánh SOTA vs New Model - {project}")
    plt.xticks(ticks=x, labels=metrics)
    plt.ylabel("Giá trị (%)")
    plt.ylim(0, 110)
    plt.grid(axis='y')
    plt.legend()

plt.tight_layout()
plt.show()
