## 1. Load dữ liệu đã xử lí

In [1]:
!pip install nltk




In [2]:
import nltk
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [3]:
# English stop words
stop_words = set(
    ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your',
     'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her',
     'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs',
     'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those',
     'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
     'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if',
     'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
     'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above',
     'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
     'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
     'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some',
     'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
     's', 't', 'can', 'will', 'just', 'don', 'should', 'now', 'd', 'll', 'm', 'o',
     're', 've', 'y', 'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven',
     'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn', 'weren', 'won',
     'wouldn', 'b', 'c', 'e', 'f', 'g', 'h', 'j', 'k', 'l', 'n', 'p', 'q', 'u', 'v',
     'w', 'x', 'z', 'us'])

# Java language keywords
java_keywords = set(
    ['abstract', 'assert', 'boolean', 'break', 'byte', 'case',
     'catch', 'char', 'class', 'const', 'continue', 'default', 'do', 'double',
     'else', 'enum', 'extends', 'false', 'final', 'finally', 'float', 'for', 'goto',
     'if', 'implements', 'import', 'instanceof', 'int', 'interface', 'long',
     'native', 'new', 'null', 'package', 'private', 'protected', 'public', 'return',
     'short', 'static', 'strictfp', 'super', 'switch', 'synchronized', 'this',
     'throw', 'throws', 'transient', 'true', 'try', 'void', 'volatile', 'while'])

from collections import namedtuple
from pathlib import Path

# Dataset root directory (điều chỉnh đường dẫn nếu cần)
_DATASET_ROOT = Path('/content/drive/MyDrive/Colab Notebooks/NLP/Task bug localization/')

Dataset = namedtuple('Dataset', ['name', 'src', 'bug_repo', 'repo_url', 'features'])

# Các dataset được định nghĩa
aspectj = Dataset(
    'aspectj',
    _DATASET_ROOT / 'source files/org.aspectj',
    _DATASET_ROOT / 'bug reports/AspectJ.txt',
    "https://github.com/eclipse/org.aspectj/tree/bug433351.git",
    _DATASET_ROOT / 'bug reports/AspectJ.xlsx'
)

eclipse = Dataset(
    'eclipse',
    _DATASET_ROOT / 'source files/eclipse.platform.ui-johna-402445',
    _DATASET_ROOT / 'bug reports/Eclipse_Platform_UI.txt',
    "https://github.com/eclipse/eclipse.platform.ui.git",
    _DATASET_ROOT / 'bug reports/Eclipse_Platform_UI.xlsx'
)

swt = Dataset(
    'swt',
    _DATASET_ROOT / 'source files/eclipse.platform.swt-xulrunner-31',
    _DATASET_ROOT / 'bug reports/SWT.txt',
    "https://github.com/eclipse/eclipse.platform.swt.git",
    _DATASET_ROOT / 'bug reports/SWT.xlsx'
)

tomcat = Dataset(
    'tomcat',
    _DATASET_ROOT / 'source files/tomcat-7.0.51',
    _DATASET_ROOT / 'bug reports/Tomcat.txt',
    "https://github.com/apache/tomcat.git",
    _DATASET_ROOT / 'bug reports/Tomcat.xlsx'
)

birt = Dataset(
    'birt',
    _DATASET_ROOT / 'source files/birt-20140211-1400',
    _DATASET_ROOT / 'bug reports/Birt.txt',
    "https://github.com/apache/birt.git",
    _DATASET_ROOT / 'bug reports/Birt.xlsx'
)


### Current dataset in use. (change this name to change the dataset)
DATASET = tomcat

class BugReport:
    """Class representing each bug report"""
    __slots__ = ['summary', 'description', 'fixed_files', 'report_time', 'pos_tagged_summary', 'pos_tagged_description','stack_traces','stack_traces_remove']

    def __init__(self, summary, description, fixed_files, report_time):
        self.summary = summary
        self.description = description
        self.fixed_files = fixed_files
        self.report_time = report_time
        self.pos_tagged_summary = None
        self.pos_tagged_description = None
        self.stack_traces = None
        self.stack_traces_remove = None

class SourceFile:
    """Class representing each source file"""
    __slots__ = ['all_content', 'comments', 'class_names', 'attributes', 'method_names', 'variables', 'file_name',
                 'pos_tagged_comments', 'exact_file_name', 'package_name']

    def __init__(self, all_content, comments, class_names, attributes, method_names, variables, file_name,
                 package_name):
        self.all_content = all_content
        self.comments = comments
        self.class_names = class_names
        self.attributes = attributes
        self.method_names = method_names
        self.variables = variables
        self.file_name = file_name
        self.exact_file_name = file_name[0]
        self.package_name = package_name
        self.pos_tagged_comments = None


class Parser:
    """Class containing different parsers"""
    __slots__ = ['name', 'src', 'bug_repo']

    def __init__(self, pro):
        self.name = pro.name
        self.src = pro.src
        self.bug_repo = pro.bug_repo

    def report_parser(self):
        reader = csv.DictReader(open(self.bug_repo, "r"), delimiter="\t")
        bug_reports = OrderedDict()
        # raw_texts = []
        # fixed_files = []
        for line in reader:
            # line["raw_text"] = line["summary"] + ' ' + line["description"]
            line["report_time"] = datetime.strptime(line["report_time"], "%Y-%m-%d %H:%M:%S")
            temp = line["files"].strip().split(".java")
            length = len(temp)
            x = []
            for i, f in enumerate(temp):
                if i == (length - 1):
                    x.append(os.path.normpath(f))
                    continue
                x.append(os.path.normpath(f + ".java"))
            line["files"] = x
            bug_reports[line["bug_id"]] = BugReport(line["summary"], line["description"], line["files"],
                                                    line["report_time"])
        # bug_reports = tsv2dict(self.bug_repo)

        return bug_reports

    def src_parser(self):
        """Parse source code directory of a program and colect its java files"""

        # Gettting the list of source files recursively from the source directory
        src_addresses = glob.glob(str(self.src) + '/**/*.java', recursive=True)
        print(src_addresses)
        # Creating a java lexer instance for pygments.lex() method
        java_lexer = JavaLexer()
        src_files = OrderedDict()
        # src_files = dict()
        # Looping to parse each source file
        for src_file in src_addresses:
            with open(src_file, encoding='latin-1') as file:
                src = file.read()

            # Placeholder for different parts of a source file
            comments = ''
            class_names = []
            attributes = []
            method_names = []
            variables = []

            # Source parsing
            parse_tree = None
            try:
                parse_tree = javalang.parse.parse(src)
                for path, node in parse_tree.filter(javalang.tree.VariableDeclarator):
                    if isinstance(path[-2], javalang.tree.FieldDeclaration):
                        attributes.append(node.name)
                    elif isinstance(path[-2], javalang.tree.VariableDeclaration):
                        variables.append(node.name)
            except:
                pass

            # Triming the source file
            ind = False
            if parse_tree:
                if parse_tree.imports:
                    last_imp_path = parse_tree.imports[-1].path
                    src = src[src.index(last_imp_path) + len(last_imp_path) + 1:]
                elif parse_tree.package:
                    package_name = parse_tree.package.name
                    src = src[src.index(package_name) + len(package_name) + 1:]
                else:  # no import and no package declaration
                    ind = True
            # javalang can't parse the source file
            else:
                ind = True

            # Lexically tokenize the source file
            lexed_src = pygments.lex(src, java_lexer)

            for i, token in enumerate(lexed_src):
                if token[0] in Token.Comment:
                    if ind and i == 0 and token[0] is Token.Comment.Multiline:
                        src = src[src.index(token[1]) + len(token[1]):]
                        continue
                    comments = comments + token[1]
                elif token[0] is Token.Name.Class:
                    class_names.append(token[1])
                elif token[0] is Token.Name.Function:
                    method_names.append(token[1])

            # get the package declaration if exists
            if parse_tree and parse_tree.package:
                package_name = parse_tree.package.name
            else:
                package_name = None

            if self.name == 'aspectj' or 'tomcat' or 'eclipse' or 'swt':
                src_files[os.path.relpath(src_file, start=self.src)] = SourceFile(src, comments, class_names,
                                                                                  attributes, method_names, variables, [
                                                                                      os.path.basename(src_file).split(
                                                                                          '.')[0]], package_name)
            else:
                # If source files has package declaration
                if package_name:
                    src_id = (package_name + '.' + os.path.basename(src_file))
                else:
                    src_id = os.path.basename(src_file)
                src_files[src_id] = SourceFile(src, comments, class_names, attributes, method_names, variables,
                                               [os.path.basename(src_file).split('.')[0]], package_name)
            # print(src_files)
            # print("===========")
        return src_files


class ReportPreprocessing:
    """Class preprocess bug reports"""
    __slots__ = ['bug_reports']

    def __init__(self, bug_reports):
        self.bug_reports = bug_reports

    def extract_stack_traces(self):
        """Extract stack traces from bug reports"""
        pattern = re.compile(r' at (.*?)\((.*?)\)')
        signs = ['.java', 'Unknown Source', 'Native Method']
        for report in self.bug_reports.values():
            st_canid = re.findall(pattern, report.description)
            st = [x for x in st_canid if any(s in x[1] for s in signs)]
            report.stack_traces = st

    def extract_stack_traces_remove(self):
        pattern = re.compile(r' at (.*?)\((.*?)\)')
        signs = ['.java', 'Unknown Source', 'Native Method']
        for report in self.bug_reports.values():
            st_canid = re.findall(pattern, report.description)
            st = [x for x in st_canid if any(s in x[1] for s in signs)]
            at = []
            for x in st:
                if (x[1] == 'Unknown Source'):
                    temp = 'Unknown'
                    y = x[0]+ '(' + temp
                else:
                    y = x[0] + '(' + x[1] + ')'
                at.append(y)
            report.stack_traces_remove = at

    def pos_tagging(self):
        """Extracing specific pos tags from bug reports raw_text"""
        for report in self.bug_reports.values():
            # Tokenizing using word_tokeize for more accurate pos-tagging
            sum_tok = nltk.word_tokenize(report.summary)
            desc_tok = nltk.word_tokenize(report.description)
            sum_pos = nltk.pos_tag(sum_tok)
            desc_pos = nltk.pos_tag(desc_tok)
            report.pos_tagged_summary = [token for token, pos in sum_pos if 'NN' in pos or 'VB' in pos]
            report.pos_tagged_description = [token for token, pos in desc_pos if 'NN' in pos or 'VB' in pos]

    def tokenize(self):
        """Tokenize bug report intro tokens"""
        for report in self.bug_reports.values():
            report.summary = nltk.wordpunct_tokenize(report.summary)
            report.description = nltk.wordpunct_tokenize(report.description)

    def _split_camelcase(self, tokens):
        # copy tokens
        returning_tokens = tokens[:]
        for token in tokens:
            split_tokens = re.split(fr'[{string.punctuation}]+', token)
            # if token is split into some other tokens
            if len(split_tokens) > 1:
                returning_tokens.remove(token)
                # camel case detection for new tokens
                for st in split_tokens:
                    camel_split = inflection.underscore(st).split('_')
                    if len(camel_split) > 1:
                        returning_tokens.append(st)
                        returning_tokens = returning_tokens + camel_split
                    else:
                        returning_tokens.append(st)
            else:
                camel_split = inflection.underscore(token).split('_')
                if len(camel_split) > 1:
                    returning_tokens = returning_tokens + camel_split
        return returning_tokens

    def split_camelcase(self):
        """Split camelcase indentifiers"""
        for report in self.bug_reports.values():
            report.summary = self._split_camelcase(report.summary)
            report.description = self._split_camelcase(report.description)
            report.pos_tagged_summary = self._split_camelcase(report.pos_tagged_summary)
            report.pos_tagged_description = self._split_camelcase(report.pos_tagged_description)

    def normalize(self):
        """remove punctuation, numbers and lowecase conversion"""
        # build a translate table for punctuation and number removal
        punctnum_table = str.maketrans({c: None for c in string.punctuation + string.digits})

        for report in self.bug_reports.values():
            summary_punctnum_rem = [token.translate(punctnum_table) for token in report.summary]
            desc_punctnum_rem = [token.translate(punctnum_table) for token in report.description]
            pos_sum_punctnum_rem = [token.translate(punctnum_table) for token in report.pos_tagged_summary]
            pos_desc_punctnum_rem = [token.translate(punctnum_table) for token in report.pos_tagged_description]
            report.summary = [token.lower() for token in summary_punctnum_rem if token]
            report.description = [token.lower() for token in desc_punctnum_rem if token]
            report.pos_tagged_summary = [token.lower() for token in pos_sum_punctnum_rem if token]
            report.pos_tagged_description = [token.lower() for token in pos_desc_punctnum_rem if token]

    def remove_stopwords(self):
        """removing stop word from tokens"""
        for report in self.bug_reports.values():
            report.summary = [token for token in report.summary if token not in stop_words]
            report.description = [token for token in report.description if token not in stop_words]
            report.pos_tagged_summary = [token for token in report.pos_tagged_summary if token not in stop_words]
            report.pos_tagged_description = [token for token in report.pos_tagged_description if token not in stop_words]

    def remove_java_keywords(self):
        """removing java language keywords from tokens"""
        for report in self.bug_reports.values():
            report.summary = [token for token in report.summary if token not in java_keywords]
            report.description = [token for token in report.description if token not in java_keywords]
            report.pos_tagged_summary = [token for token in report.pos_tagged_summary if token not in java_keywords]
            report.pos_tagged_description = [token for token in report.pos_tagged_description if token not in java_keywords]

    def stem(self):
        # stemming tokens
        stemmer = PorterStemmer()
        for report in self.bug_reports.values():
            report.summary = dict(
                zip(['stemmed', 'unstemmed'], [[stemmer.stem(token) for token in report.summary], report.summary]))
            report.description = dict(
                zip(['stemmed', 'unstemmed'], [[stemmer.stem(token) for token in report.description], report.description]))
            report.pos_tagged_summary = dict(
                zip(['stemmed', 'unstemmed'], [[stemmer.stem(token) for token in report.pos_tagged_summary], report.pos_tagged_summary]))
            report.pos_tagged_description = dict(
                zip(['stemmed', 'unstemmed'], [[stemmer.stem(token) for token in report.pos_tagged_description], report.pos_tagged_description]))

    def preprocess(self):
        self.extract_stack_traces()
        self.extract_stack_traces_remove()
        self.pos_tagging()
        self.tokenize()
        self.split_camelcase()
        self.normalize()
        self.remove_stopwords()
        self.remove_java_keywords()
        self.stem()

class SrcPreprocessing:
    """class to preprocess source code"""
    __slots__ = ['src_files']

    def __init__(self, src_files):
        self.src_files = src_files

    def pos_tagging(self):
        """Extracing specific pos tags from comments"""
        for src in self.src_files.values():
            # tokenize using word_tokenize
            comments_tok = nltk.word_tokenize(src.comments)
            comments_pos = nltk.pos_tag(comments_tok)
            src.pos_tagged_comments = [token for token, pos in comments_pos if 'NN' in pos or 'VB' in pos]

    def tokenize(self):
        """tokenize source code to tokens"""
        for src in self.src_files.values():
            src.all_content = nltk.wordpunct_tokenize(src.all_content)
            src.comments = nltk.wordpunct_tokenize(src.comments)

    def _split_camelcase(self, tokens):
        # copy token
        returning_tokens = tokens[:]
        for token in tokens:
            split_tokens = re.split(fr'[{string.punctuation}]+', token)
            # if token is split into some other tokens
            if len(split_tokens) > 1:
                returning_tokens.remove(token)
                # camelcase defect for new tokens
                for st in split_tokens:
                    camel_split = inflection.underscore(st).split('_')
                    if len(camel_split) > 1:
                        returning_tokens.append(st)
                        returning_tokens = returning_tokens + camel_split
                    else:
                        returning_tokens.append(st)
            else:
                camel_split = inflection.underscore(token).split('_')
                if len(camel_split) > 1:
                    returning_tokens = returning_tokens + camel_split
        return returning_tokens

    def split_camelcase(self):
        # Split camelcase indenti
        for src in self.src_files.values():
            src.all_content = self._split_camelcase(src.all_content)
            src.comments = self._split_camelcase(src.comments)
            src.class_names = self._split_camelcase(src.class_names)
            src.attributes = self._split_camelcase(src.attributes)
            src.method_names = self._split_camelcase(src.method_names)
            src.variables = self._split_camelcase(src.variables)
            src.pos_tagged_comments = self._split_camelcase(src.pos_tagged_comments)

    def normalize(self):
        "remove punctuation, number and lowercase conversion"
        # build a translate table for punctuation and number
        punctnum_table = str.maketrans({c: None for c in string.punctuation + string.digits})
        for src in self.src_files.values():
            content_punctnum_rem = [token.translate(punctnum_table) for token in src.all_content]
            comments_punctnum_rem = [token.translate(punctnum_table) for token in src.comments]
            classnames_punctnum_rem = [token.translate(punctnum_table) for token in src.class_names]
            attributes_punctnum_rem = [token.translate(punctnum_table) for token in src.attributes]
            methodnames_punctnum_rem = [token.translate(punctnum_table) for token in src.method_names]
            variables_punctnum_rem = [token.translate(punctnum_table) for token in src.variables]
            filename_punctnum_rem = [token.translate(punctnum_table) for token in src.file_name]
            pos_comments_punctnum_rem = [token.translate(punctnum_table) for token in src.pos_tagged_comments]

            src.all_content = [token.lower() for token in content_punctnum_rem if token]
            src.comments = [token.lower() for token in comments_punctnum_rem if token]
            src.class_names = [token.lower() for token in classnames_punctnum_rem if token]
            src.attributes = [token.lower() for token in attributes_punctnum_rem if token]
            src.method_names = [token.lower() for token in methodnames_punctnum_rem if token]
            src.variables = [token.lower() for token in variables_punctnum_rem if token]
            src.file_name = [token.lower() for token in filename_punctnum_rem if token]
            src.pos_tagged_comments = [token.lower() for token in pos_comments_punctnum_rem if token]

    def remove_stopwords(self):
        for src in self.src_files.values():
            src.all_content = [token for token in src.all_content if token not in stop_words]
            src.comments = [token for token in src.comments if token not in stop_words]
            src.class_names = [token for token in src.class_names if token not in stop_words]
            src.attributes = [token for token in src.attributes if token not in stop_words]
            src.method_names = [token for token in src.method_names if token not in stop_words]
            src.variables = [token for token in src.variables if token not in stop_words]
            src.file_name = [token for token in src.file_name if token not in stop_words]
            src.pos_tagged_comments = [token for token in src.pos_tagged_comments if token not in stop_words]

    def remove_javakeywords(self):
        for src in self.src_files.values():
            src.all_content = [token for token in src.all_content if token not in java_keywords]
            src.comments = [token for token in src.comments if token not in java_keywords]
            src.class_names = [token for token in src.class_names if token not in java_keywords]
            src.attributes = [token for token in src.attributes if token not in java_keywords]
            src.method_names = [token for token in src.method_names if token not in java_keywords]
            src.variables = [token for token in src.variables if token not in java_keywords]
            src.file_name = [token for token in src.file_name if token not in java_keywords]
            src.pos_tagged_comments = [token for token in src.pos_tagged_comments if token not in java_keywords]

    def stem(self):
        # stemming tokens
        stemmer = PorterStemmer()
        for src in self.src_files.values():
            src.all_content = dict(zip(['stemmed', 'unstemmed'], [[stemmer.stem(token) for token in src.all_content], src.all_content]))
            src.comments = dict(zip(['stemmed', 'unstemmed'], [[stemmer.stem(token) for token in src.comments], src.comments]))
            src.class_names = dict(zip(['stemmed', 'unstemmed'], [[stemmer.stem(token) for token in src.class_names], src.class_names]))
            src.attributes = dict(zip(['stemmed', 'unstemmed'], [[stemmer.stem(token) for token in src.attributes], src.attributes]))
            src.method_names = dict(zip(['stemmed', 'unstemmed'], [[stemmer.stem(token) for token in src.method_names], src.method_names]))
            src.variables = dict(zip(['stemmed', 'unstemmed'], [[stemmer.stem(token) for token in src.variables], src.variables]))
            src.file_name = dict(zip(['stemmed', 'unstemmed'], [[stemmer.stem(token) for token in src.file_name], src.file_name]))
            src.pos_tagged_comments = dict(zip(['stemmed', 'unstemmed'], [[stemmer.stem(token) for token in src.pos_tagged_comments], src.pos_tagged_comments]))


    def preprocess(self):
        self.pos_tagging()
        self.tokenize()
        self.split_camelcase()
        self.normalize()
        self.remove_stopwords()
        self.remove_javakeywords()
        self.stem()

In [4]:
!pip install inflection
import inflection




In [5]:
import nltk
nltk.download('punkt_tab')
import pickle
from google.colab import drive
import csv
from collections import OrderedDict
from datetime import datetime
import re
import string
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Load dữ liệu

In [6]:
import pickle

# Đường dẫn đến các file pickle
file_paths = {
    'aspectj': '/kaggle/input/bug-localization-data/aspectj_src_processed.pkl',
    'eclipse': '/kaggle/input/bug-localization-data/eclipse_src_processed.pkl',
    'swt': '/kaggle/input/bug-localization-data/swt_src_processed.pkl',
    'tomcat': '/kaggle/input/bug-localization-data/tomcat_src_processed.pkl',
    'birt': '/kaggle/input/bug-localization-data/birt_src_processed.pkl'
}

# Load từng file và lưu vào các biến tương ứng
datasets = {}

for name, path in file_paths.items():
    with open(path, 'rb') as f:
        datasets[name] = pickle.load(f)

# Kiểm tra dữ liệu đã được load vào các biến
for name, data in datasets.items():
    print(f"Data for {name}:")


Data for aspectj:
Data for eclipse:
Data for swt:
Data for tomcat:
Data for birt:


In [7]:
eclipse_src = datasets['eclipse']
birt_src = datasets['birt']
swt_src = datasets['swt']
tomcat_src = datasets['tomcat']
aspectj_src = datasets['aspectj']

In [8]:
# Load dữ liệu từ các file pickle đã lưu
file_paths = {
    'aspectj': '/kaggle/input/bug-localization-data/aspectj_reports_processed.pkl',
    'eclipse': '/kaggle/input/bug-localization-data/eclipse_reports_processed.pkl',
    'swt': '/kaggle/input/bug-localization-data/swt_reports_processed.pkl',
    'tomcat': '/kaggle/input/bug-localization-data/tomcat_reports_processed.pkl',
    'birt': '/kaggle/input/bug-localization-data/birt_reports_processed.pkl'
}

# Load từng dataset và lưu vào các biến
all_processed_reports = {}

for name, path in file_paths.items():
    with open(path, 'rb') as f:
        all_processed_reports[name] = pickle.load(f)

# Kiểm tra dữ liệu đã load vào
for dataset, reports in all_processed_reports.items():
    print(f"Processed reports for {dataset}:")

Processed reports for aspectj:
Processed reports for eclipse:
Processed reports for swt:
Processed reports for tomcat:
Processed reports for birt:


In [9]:
eclipse_reports = all_processed_reports['eclipse']
birt_reports = all_processed_reports['birt']
swt_reports = all_processed_reports['swt']
tomcat_reports = all_processed_reports['tomcat']
aspectj_reports = all_processed_reports['aspectj']

## 2. Xử lí data, gán nhãn
- Sắp xếp bug report theo thời gian (report_time)
- Chia thành 10 folds
- Tạo training/test dataset theo kiểu fold i → fold i+1
- Gán nhãn cho từng cặp (bug report, source file)

In [10]:
# B1: Lấy danh sách (bug_id, bug_report), sau đó sắp xếp theo report_time
sorted_bug_reports = sorted(tomcat_reports.items(), key=lambda x: x[1].report_time)
data_src = tomcat_src

In [11]:
def split_into_folds(sorted_reports, num_folds=10):
    fold_size = len(sorted_reports) // num_folds
    folds = [sorted_reports[i*fold_size:(i+1)*fold_size] for i in range(num_folds)]

    # Nếu còn dư, rải đều vào các fold đầu
    remainder = sorted_reports[num_folds*fold_size:]
    for i, extra in enumerate(remainder):
        folds[i].append(extra)
    return folds

data_folds = split_into_folds(sorted_bug_reports, num_folds=10)


In [12]:
i = 0 # thử với fold 0 → 1
train_fold = data_folds[i]
test_fold = data_folds[i+1]

In [13]:
import random

def generate_balanced_pairs(bug_fold, source_files, num_negatives_per_positive=50):
    data = []
    for bug_id, bug in bug_fold:
        # Danh sách file chứa bug (poszqitive)
        positive_paths = set(bug.fixed_files)
        positive = [
            (bug_id, bug, src_path, source_files[src_path], 1)
            for src_path in positive_paths if src_path in source_files
        ]

        # Danh sách file còn lại để lấy negative
        all_paths = list(source_files.keys())
        negative_paths = list(set(all_paths) - positive_paths)
        sampled_negatives = random.sample(negative_paths, min(num_negatives_per_positive * len(positive), len(negative_paths)))

        negative = [
            (bug_id, bug, src_path, source_files[src_path], 0)
            for src_path in sampled_negatives if src_path in source_files
        ]

        data.extend(positive + negative)
    return data
def generate_all_negatives_pairs(bug_fold, source_files):
    data = []
    for bug_id, bug in bug_fold:
        positive_paths = set(bug.fixed_files)
        positive = [
            (bug_id, bug, src_path, source_files[src_path], 1)
            for src_path in positive_paths if src_path in source_files
        ]

        all_paths = list(source_files.keys())
        negative_paths = list(set(all_paths) - positive_paths)

        negative = [
            (bug_id, bug, src_path, source_files[src_path], 0)
            for src_path in negative_paths if src_path in source_files
        ]

        data.extend(positive + negative)
    return data


train_pairs = generate_balanced_pairs(train_fold, data_src, num_negatives_per_positive=50)
#test_pairs = generate_balanced_pairs(test_fold, data_src, num_negatives_per_positive=50)
test_pairs = generate_all_negatives_pairs(test_fold, data_src)




Xử lí mất cân bằng

In [14]:
def compute_stats(pairs):
    total = len(pairs)
    pos = sum(1 for _, _, _, _, label in pairs if label == 1)
    neg = total - pos
    ratio = pos / total if total > 0 else 0
    return total, pos, neg, ratio

  
total, pos, neg, ratio = compute_stats(train_pairs)
print("📊 Train Set:")
print(f"  ➤ Tổng cặp: {total}")
print(f"  ✅ Positive (label=1): {pos}")
print(f"  ❌ Negative (label=0): {neg}")
print(f"  ⚖️ Tỷ lệ positive: {ratio:.4f}")

total, pos, neg, ratio = compute_stats(test_pairs)
print("\n🧪 Test Set:")
print(f"  ➤ Tổng cặp: {total}")
print(f"  ✅ Positive (label=1): {pos}")
print(f"  ❌ Negative (label=0): {neg}")
print(f"  ⚖️ Tỷ lệ positive: {ratio:.4f}")


📊 Train Set:
  ➤ Tổng cặp: 5355
  ✅ Positive (label=1): 105
  ❌ Negative (label=0): 5250
  ⚖️ Tỷ lệ positive: 0.0196

🧪 Test Set:
  ➤ Tổng cặp: 190164
  ✅ Positive (label=1): 103
  ❌ Negative (label=0): 190061
  ⚖️ Tỷ lệ positive: 0.0005


### Hàm 1: Tạo batches có bootstrapping (luôn chứa ít nhất 1 positive sample)

In [15]:
def create_bootstrapped_batches(pairs, batch_size=128, pos_ratio=0.1):
    positives = [p for p in pairs if p[-1] == 1]
    negatives = [p for p in pairs if p[-1] == 0]

    pos_per_batch = max(1, int(batch_size * pos_ratio))
    neg_per_batch = batch_size - pos_per_batch

    random.shuffle(positives)
    random.shuffle(negatives)

    batches = []
    pos_idx, neg_idx = 0, 0

    while neg_idx + neg_per_batch <= len(negatives):
        pos_batch = []
        for _ in range(pos_per_batch):
            pos_batch.append(positives[pos_idx % len(positives)])
            pos_idx += 1

        neg_batch = negatives[neg_idx:neg_idx + neg_per_batch]
        neg_idx += neg_per_batch

        batch = pos_batch + neg_batch
        random.shuffle(batch)
        batches.append(batch)

    return batches


### Focal Loss Function

In [16]:

def focal_loss(predictions, targets, alpha=0.999, gamma=2.0, eps=1e-6):
    """
    predictions: tensor (batch_size,) - output sigmoid from model
    targets: tensor (batch_size,) - true labels (0 or 1)
    """
    # Avoid log(0)
    predictions = predictions.clamp(min=eps, max=1.0 - eps)

    # Compute focal loss
    loss = -alpha * (1 - predictions)**gamma * targets * predictions.log() \
           - (1 - alpha) * predictions**gamma * (1 - targets) * (1 - predictions).log()
    return loss.mean()

# 4. Trích xuất đặc trưng

In [17]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Hàm xử lý text gộp lại từ bug report
def bug_to_text(bug):
    summary = bug.summary['unstemmed'] if isinstance(bug.summary, dict) else bug.summary
    desc = bug.description['unstemmed'] if isinstance(bug.description, dict) else bug.description
    return " ".join(summary + desc)

# Hàm xử lý text từ source file
def src_to_text(src):
    content = src.all_content['unstemmed'] if isinstance(src.all_content, dict) else src.all_content
    comments = src.comments['unstemmed'] if isinstance(src.comments, dict) else src.comments
    return " ".join(content + comments)


## Đặc trưng 1: Tính toán độ tương đồng từ vựng (lexical similarity)
- Phương pháp: sử dụng TF-IDF và cosine similarity.
- Input: Cặp dữ liệu (bug report, source file)
- Output: mảng numpy chứa các giá trị độ tương đồng cosine giữa bug report và source file cho mỗi cặp.

In [18]:
def compute_lexical_similarity(pairs):
    bug_texts = [bug_to_text(bug) for _, bug, _, _, _ in pairs]
    src_texts = [src_to_text(src) for _, _, _, src, _ in pairs]

    # Gộp cả bug + src lại để fit chung vectorizer
    combined = bug_texts + src_texts
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(combined)

    # Tách riêng lại từng phần
    bug_vecs = tfidf_matrix[:len(pairs)]
    src_vecs = tfidf_matrix[len(pairs):]

    # Tính cosine cho từng cặp (theo hàng tương ứng)
    similarities = cosine_similarity(bug_vecs, src_vecs).diagonal()

    return similarities


In [19]:
glove_path = "/kaggle/input/glove-embedding/glove.6B.100d.txt"
# Load GloVe 100d vào dictionary
import numpy as np

def load_glove_embeddings(filepath):
    embeddings = {}
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.strip().split()
            word = values[0]
            vector = np.array(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings
    
glove_embeddings = load_glove_embeddings(glove_path)

## Đặc trưng 2: Tính toán độ tương đồng ngữ nghĩa (semantic similarity)
- Phương pháp: TF-IDF weighted average của GloVe vectors và cosine similarity
- Input:  (bug report, source file).
- Output: Một mảng numpy chứa các giá trị độ tương đồng cosine giữa bug report và source file cho mỗi cặp, dựa trên GloVe vectors và trọng số TF-IDF.

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

def compute_semantic_similarity(pairs, glove_dict, dim=100):
    bug_texts = [bug_to_text(bug) for _, bug, _, _, _ in pairs]
    src_texts = [src_to_text(src) for _, _, _, src, _ in pairs]

    # Dùng TF-IDF để lấy trọng số từ
    tfidf = TfidfVectorizer()
    tfidf.fit(bug_texts + src_texts)
    vocab = tfidf.vocabulary_
    idf_weights = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))

    def embed_text(text):
        tokens = text.split()
        vecs = []
        weights = []
        for token in tokens:
            if token in glove_dict and token in vocab:
                vecs.append(glove_dict[token])
                weights.append(idf_weights[token])
        if not vecs:
            return np.zeros(dim)
        vecs = np.array(vecs)
        weights = np.array(weights).reshape(-1, 1)
        weighted_vecs = vecs * weights
        return weighted_vecs.sum(axis=0) / weights.sum()

    # Tính vector trung bình cho bug và src
    bug_vecs = [embed_text(text) for text in bug_texts]
    src_vecs = [embed_text(text) for text in src_texts]

    # Tính cosine similarity giữa từng cặp
    similarities = [cosine_similarity([b], [s])[0][0] for b, s in zip(bug_vecs, src_vecs)]

    return np.array(similarities)


### Đặc trưng 3: Similar Bug Report Score 

→ Kiểm tra xem bug report này có giống **những bug report cũ từng sửa cùng file đó** không?

- `build_bug_fix_history(pairs)` → XD lịch sử chỉnh sửa theo từng file
- `compute_similar_bug_score(pairs, history)`
    - Input: pairs, history
    - So sánh bug hiện tại và bug cũ:
    
    cosine_similarity(TfidfVectorizer().fit_transform([bug_now, bug_old]))[0, 1]
    
    - Lấy giá trị tương đồng cao nhất vừa tìm được

In [21]:
def build_bug_fix_history(pairs):
    history = {}
    for bug_id, bug, src_path, _, label in pairs:
        if label == 1:  # chỉ tính các bug thật sự sửa file
            if src_path not in history:
                history[src_path] = []
            history[src_path].append((bug_id, bug.report_time, bug_to_text(bug)))
    return history

# Đặc trưng 3: Similar Bug Report Score
def compute_similar_bug_score(pairs, history):
    scores = []
    for bug_id, bug, src_path, _, _ in pairs:
        current_time = bug.report_time
        current_text = bug_to_text(bug)

        sim_scores = []
        if src_path in history:
            for hist_bug_id, hist_time, hist_text in history[src_path]:
                if hist_time < current_time:  # chỉ tính bug trong quá khứ
                    sim = cosine_similarity(
                        TfidfVectorizer().fit_transform([current_text, hist_text])
                    )[0, 1]
                    sim_scores.append(sim)
        scores.append(max(sim_scores) if sim_scores else 0.0)
    return np.array(scores)

### Đặc trung 4: Time Since Last Fix (ngày, normalize)
- Kiểm tra với mỗi `(bug report, source file)` xem từng được sửa trước đó không và lần cuối khi nào
    - Đã lâu k sửa → Ít lỗi → Điểm thấp
    - Mới sửa → có thể liên quan tới lỗi → Điểm cao
- Cách hđ:
    - Tìm thời điểm bug current_time
    - Tìm history các lần sửa file trong quá khứ
    - Tính khoảng cách time giữa current và history gần nhất
    - Chưa sửa → Gán số delta_days=9999
    - Chuẩn hoá

In [22]:
# Đặc trưng 4: Time Since Last Fix (ngày, normalize)
def compute_time_since_last_fix(pairs, history):
    scores = []
    for _, bug, src_path, _, _ in pairs:
        current_time = bug.report_time
        if src_path in history:
            past_times = [hist_time for _, hist_time, _ in history[src_path] if hist_time < current_time]
            if past_times:
                delta_days = (current_time - max(past_times)).days
            else:
                delta_days = 9999  # Cực lớn nếu chưa từng sửa
        else:
            delta_days = 9999
        scores.append(delta_days)

    # Normalize về [0,1]
    max_days = max(scores) if max(scores) != 0 else 1  # Tránh chia cho 0

    return np.array([1 - (s / max_days) for s in scores])



### Đặc trưng 5: Fix Frequency (số lần bị sửa trong quá khứ, normalize)


- Kiểm tra xme mỗi cặp được sửa bao nhiêu lần

→ Sửa nhiều → File dễ dính lỗi → Điểm cao

In [23]:
# Đặc trưng 5: Fix Frequency (số lần bị sửa trong quá khứ, normalize)
def compute_fix_frequency(pairs, history):
    scores = []
    for _, bug, src_path, _, _ in pairs:
        current_time = bug.report_time
        if src_path in history:
            past_fixes = [1 for _, hist_time, _ in history[src_path] if hist_time < current_time]
            freq = len(past_fixes)
        else:
            freq = 0
        scores.append(freq)
    # Normalize về [0,1] an toàn
    max_freq = max(scores)
    max_freq = max(max_freq, 1)  # tránh chia 0
    return np.array([s / max_freq for s in scores])


# Dùng cho 500 cặp mẫu
sampled_pairs = train_pairs[:5000]
bug_history = build_bug_fix_history(train_pairs)

similar_bug_score = compute_similar_bug_score(sampled_pairs, bug_history)
time_since_last_fix = compute_time_since_last_fix(sampled_pairs, bug_history)
fix_frequency = compute_fix_frequency(sampled_pairs, bug_history)

# Trích 5 giá trị đầu mỗi feature
similar_bug_score[:50], time_since_last_fix[:50], fix_frequency[:50]

(array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]))

In [24]:
def min_max_normalize(values):
    min_val = min(values)
    max_val = max(values)
    denom = max_val - min_val if max_val != min_val else 1
    return [(v - min_val) / denom for v in values]


Đặc trưng 6: Ngữ nghĩa học sâu từ CNN encoder

In [25]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

# CNN encoder dùng cho cả bug và source
class CNNEncoder(nn.Module):
    def __init__(self, embed_dim=100, num_filters=64, kernel_sizes=(3, 5), dropout=0.5):
        super(CNNEncoder, self).__init__()
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embed_dim, out_channels=num_filters, kernel_size=k)
            for k in kernel_sizes
        ])
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):  # x: (batch_size, seq_len, embed_dim)
        x = x.permute(0, 2, 1)  # (batch, embed_dim, seq_len)
        conv_outs = [F.relu(conv(x)) for conv in self.convs]
        pooled = [F.max_pool1d(c, c.size(2)).squeeze(2) for c in conv_outs]
        out = torch.cat(pooled, dim=1)
        return self.dropout(out)  # shape: (batch_size, num_filters * len(kernel_sizes))

# Token list → batch embedding tensor
def batch_tokens_to_embeddings(batch_tokens, glove_dict, dim=100, max_len=100):
    batch_embeddings = []
    for tokens in batch_tokens:
        emb = []
        for token in tokens[:max_len]:
            if token in glove_dict:
                emb.append(glove_dict[token])
            else:
                emb.append(np.zeros(dim))
        while len(emb) < max_len:
            emb.append(np.zeros(dim))
        batch_embeddings.append(emb)
    return torch.tensor(np.array(batch_embeddings), dtype=torch.float32)

# 🆕 CNN feature extraction – batch, nhanh, GPU, normalized
def extract_cnn_features_batch(pairs, glove_dict, bug_encoder, src_encoder, device="cuda", dim=100, max_len=100):
    bug_encoder = bug_encoder.to(device)
    src_encoder = src_encoder.to(device)
    bug_encoder.eval()
    src_encoder.eval()

    bug_token_list = []
    src_token_list = []

    for _, bug, _, src, _ in pairs:
        bug_tokens = bug.summary['stemmed'] + bug.description['stemmed']
        src_tokens = src.all_content['stemmed'] + src.comments['stemmed'] + \
                     src.class_names['stemmed'] + src.method_names['stemmed']
        bug_token_list.append(bug_tokens)
        src_token_list.append(src_tokens)

    # Embed → tensor
    bug_tensor = batch_tokens_to_embeddings(bug_token_list, glove_dict, dim, max_len).to(device)
    src_tensor = batch_tokens_to_embeddings(src_token_list, glove_dict, dim, max_len).to(device)

    # Forward CNN
    with torch.no_grad():
        bug_tensor = bug_tensor.to(device)
        src_tensor = src_tensor.to(device)
    
        bug_vec = bug_encoder(bug_tensor)
        src_vec = src_encoder(src_tensor)
    
        combined = torch.cat([bug_vec, src_vec], dim=1).cpu().numpy()  # chuyển về CPU để convert sang NumPy


    # Normalize từng chiều về [0, 1]
    min_vals = combined.min(axis=0)
    max_vals = combined.max(axis=0)
    denom = np.where(max_vals - min_vals == 0, 1, max_vals - min_vals)
    normalized = (combined - min_vals) / denom

    return normalized  # shape: (n_samples, 2*filters)


In [26]:
bug_encoder = CNNEncoder()
src_encoder = CNNEncoder()
glove_dict=glove_embeddings
cnn_combined_vector = extract_cnn_features_batch(train_pairs[:5000], glove_dict, bug_encoder, src_encoder)
print("✅ CNN đặc trưng đã chuẩn hóa:", cnn_combined_vector.shape)
print(cnn_combined_vector[:10])

✅ CNN đặc trưng đã chuẩn hóa: (5000, 256)
[[0.43916818 0.526737   0.78628516 ... 0.50926834 0.47500017 0.446147  ]
 [0.43916818 0.526737   0.78628516 ... 0.848012   0.49996847 0.41077703]
 [0.43916818 0.526737   0.78628516 ... 0.6389387  0.4253201  0.42294663]
 ...
 [0.43916818 0.526737   0.78628516 ... 0.34159052 0.47255486 0.30888104]
 [0.43916818 0.526737   0.78628516 ... 0.41852915 0.36961567 0.50590175]
 [0.43916818 0.526737   0.78628516 ... 0.30014607 0.33713138 0.18723792]]


ĐT7: identifier_overlap_count – Số tên hàm/biến trùng với từ trong bug report

In [27]:
def compute_identifier_overlap_count(pairs):
    counts = []
    for _, bug, _, src, _ in pairs:
        bug_tokens = set(bug.summary['stemmed'] + bug.description['stemmed'])
        identifiers = set(
            src.class_names['stemmed'] + src.method_names['stemmed'] + src.variables['stemmed']
        )
        overlap = bug_tokens & identifiers
        counts.append(len(overlap))
    counts = np.array(counts)
    # 🔄 Log normalization về [0,1]
    return np.log1p(counts) / np.log1p(np.max(counts)) if counts.max() > 0 else np.zeros_like(counts)


def compute_shared_token_ratio(pairs):
    ratios = []
    for _, bug, _, src, _ in pairs:
        bug_tokens = set(bug.summary['stemmed'] + bug.description['stemmed'])
        src_tokens = set(
            src.all_content['stemmed'] + src.comments['stemmed'] + 
            src.class_names['stemmed'] + src.method_names['stemmed']
        )
        if not src_tokens:
            ratios.append(0.0)
        else:
            ratios.append(len(bug_tokens & src_tokens) / len(src_tokens))
    ratios = np.array(ratios)
    # 🔄 Min-max normalization về [0,1]
    return (ratios - ratios.min()) / (ratios.max() - ratios.min()) if ratios.max() > ratios.min() else np.zeros_like(ratios)

sampled_pairs = train_pairs[:5000]
idf_overlap = compute_identifier_overlap_count(sampled_pairs)
shared_ratio = compute_shared_token_ratio(sampled_pairs)

print("✅ identifier_overlap_count:", idf_overlap[:10])
print("✅ shared_token_ratio:", shared_ratio[:10])




✅ identifier_overlap_count: [0.61284179 0.15686177 0.15686177 0.24862003 0.24862003 0.44036668
 0.15686177 0.44036668 0.15686177 0.        ]
✅ shared_token_ratio: [0.18360277 0.17647059 0.22881356 0.32142857 0.18       0.08559783
 0.18461538 0.19565217 0.0984556  0.1875    ]


# 4. Quá trình huấn luyện

## 4.1 Tạo ma trận train, test

In [28]:
def build_feature_matrix_batch(pairs_batch, glove_dict, bug_encoder, src_encoder, history, device="cuda"):
    # Đặc trưng vector thường (1 chiều)
    lexical = compute_lexical_similarity(pairs_batch)                         # (N,)
    semantic = compute_semantic_similarity(pairs_batch, glove_dict)          # (N,)
    idf_overlap = compute_identifier_overlap_count(pairs_batch)              # (N,)
    shared_ratio = compute_shared_token_ratio(pairs_batch)                   # (N,)

    # Đặc trưng học sâu (N, 256)
    cnn_vec = extract_cnn_features_batch(pairs_batch, glove_dict, bug_encoder, src_encoder)

    # Ghép toàn bộ lại
    X = np.hstack([
        lexical.reshape(-1, 1),         
        semantic.reshape(-1, 1),        
        idf_overlap.reshape(-1, 1),     
        shared_ratio.reshape(-1, 1),    
        cnn_vec                         
    ])  # → Tổng cộng (N, 263)

    return X


In [29]:
device = "cuda" if torch.cuda.is_available() else "cpu"
# Tạo nhãn y
def get_labels(pairs):
    return np.array([label for *_, label in pairs])
glove_dict = glove_embeddings
X_train = build_feature_matrix_batch(train_pairs, glove_dict, bug_encoder, src_encoder,bug_history, device=device)
y_train = get_labels(train_pairs)


In [30]:
def pair_generator(pairs, batch_size, glove_dict, history, bug_encoder, src_encoder, device="cuda"):
    for i in range(0, len(pairs), batch_size):
        batch = pairs[i:i + batch_size]
        X = build_feature_matrix_batch(batch, glove_dict, bug_encoder, src_encoder, history, device)
        y = np.array([label for *_, label in batch])

        yield torch.tensor(X, dtype=torch.float32).to(device), torch.tensor(y, dtype=torch.float32).to(device)


In [31]:
print("✅ X_train shape:", X_train.shape)  # (5000, 260)
print("✅ y_train shape:", y_train.shape)  # (5000,)

✅ X_train shape: (5355, 260)
✅ y_train shape: (5355,)


## 4.2 Xây dựng mô hình

In [32]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset

# Định nghĩa mô hình DNN giống bài báo
import torch
import torch.nn as nn

class BugLocalization(nn.Module):
    def __init__(self, input_dim=5, hidden_dim=64, output_dim=1):
        super(BugLocalization, self).__init__()
        self.rnn = nn.RNN(input_size=input_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = x.to(next(self.parameters()).device)  # auto move to model's device
        # Nếu x chỉ có 2 chiều (batch_size, input_dim), hãy thêm một chiều giả định (sequence_length=1)
        if len(x.shape) == 2:
            x = x.unsqueeze(1)  # Thêm một chiều giả định (batch_size, 1, input_dim)

        # x shape: (batch_size, sequence_length, input_dim)
        rnn_out, _ = self.rnn(x)
        # Lấy output của phần cuối cùng trong chuỗi
        final_rnn_out = rnn_out[:, -1, :]
        out = torch.sigmoid(self.fc(final_rnn_out)).squeeze()
        return out


        
# Định nghĩa focal loss
class FocalLoss(nn.Module):
    def __init__(self, alpha=0.999, gamma=2.0, eps=1e-6):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.eps = eps

    def forward(self, preds, targets):
        preds = preds.clamp(min=self.eps, max=1. - self.eps)
        loss = -self.alpha * (1 - preds) ** self.gamma * targets * torch.log(preds) \
               - (1 - self.alpha) * preds ** self.gamma * (1 - targets) * torch.log(1 - preds)
        return loss.mean()


# Huấn luyện mô hình
def train_model_generator(model, train_gen, epochs=10, lr=1e-3):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = FocalLoss()

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch_X, batch_y in train_gen:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

            # 🧹 Dọn bộ nhớ mỗi batch
            del batch_X, batch_y, outputs, loss
            torch.cuda.empty_cache()
            import gc; gc.collect()

        print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss:.4f}")



In [33]:
for X_batch, y_batch in pair_generator(
    train_pairs, batch_size=128,
    glove_dict=glove_embeddings,
    history=bug_history,
    bug_encoder=bug_encoder,
    src_encoder=src_encoder, 
    device=device
):
    print("👉 Feature shape:", X_batch.shape)
    print("👉 Label shape:", y_batch.shape)
    print("👉 Feature Sample [2]:", X_batch[2].cpu().numpy())
    print("👉 Label Sample [2]:", y_batch[2].item())
    break


👉 Feature shape: torch.Size([128, 260])
👉 Label shape: torch.Size([128])
👉 Feature Sample [2]: [0.0213706  0.77864593 0.25595802 0.49922958 1.         0.4471326
 1.         1.         0.         0.3159268  0.75139946 1.
 0.         0.35441217 0.         1.         0.311645   0.
 1.         0.10277365 0.01000413 0.89433575 1.         1.
 1.         1.         0.48448765 0.3146649  0.8492759  0.
 0.5602385  1.         0.         0.88617194 0.9309485  0.08908636
 0.         0.18085344 1.         0.19750349 1.         0.
 0.         1.         1.         1.         0.         0.
 1.         1.         0.32935524 1.         0.47296247 1.
 0.6549694  0.2940925  1.         0.35267025 0.         0.39856026
 1.         0.         1.         0.94961154 0.61076987 0.
 0.         0.         0.5091864  1.         0.60320026 0.6403919
 0.55791926 0.         1.         0.5802944  0.         0.
 0.24913223 0.12660448 1.         0.57677543 1.         0.6360219
 0.         0.         1.         1.      

In [34]:
def test_pair_generator(pairs, batch_size, glove_dict, history, bug_encoder, src_encoder, device="cuda"):
    for i in range(0, len(pairs), batch_size):
        batch = pairs[i:i + batch_size]
        X = build_feature_matrix_batch(batch, glove_dict, bug_encoder, src_encoder, history, device)

        yield torch.tensor(X, dtype=torch.float32, device=device)


In [35]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.metrics import average_precision_score
from collections import defaultdict
import numpy as np

# Đánh giá các chỉ số (MAP, MRR, Top-k)
def compute_topk_accuracy(test_pairs, y_scores, k=10):
    bug_to_scores = {}
    for (bug_id, _, src_path, _, label), score in zip(test_pairs, y_scores):
        if bug_id not in bug_to_scores:
            bug_to_scores[bug_id] = []
        bug_to_scores[bug_id].append((score, label))

    correct_at_k = 0
    total = 0

    for entries in bug_to_scores.values():
        sorted_entries = sorted(entries, key=lambda x: x[0], reverse=True)
        top_k = sorted_entries[:k]
        if any(label == 1 for _, label in top_k):
            correct_at_k += 1
        total += 1

    return correct_at_k / total if total > 0 else 0



def compute_MAP_per_bug(test_pairs, y_pred_probs):
    # Gom nhãn và score theo bug_id
    bug_to_ytrue = defaultdict(list)
    bug_to_yscore = defaultdict(list)

    for (bug_id, _, _, _, label), score in zip(test_pairs, y_pred_probs):
        bug_to_ytrue[bug_id].append(label)
        bug_to_yscore[bug_id].append(score)

    # Tính AP cho từng bug, chỉ giữ bug có ít nhất 1 label = 1
    ap_list = []
    for bug_id in bug_to_ytrue:
        y_true = np.array(bug_to_ytrue[bug_id])
        y_score = np.array(bug_to_yscore[bug_id])

        if np.sum(y_true) == 0:
            continue  # bỏ qua bug không có file liên quan

        ap = average_precision_score(y_true, y_score)
        ap_list.append(ap)

    # Tính MAP
    MAP = np.mean(ap_list) if ap_list else 0.0
    return MAP

# MRR (Mean Reciprocal Rank)
def mean_reciprocal_rank(pairs, scores):
    bug_to_scores = {}
    for (bug_id, _, _, _, label), score in zip(pairs, scores):
        if bug_id not in bug_to_scores:
            bug_to_scores[bug_id] = []
        bug_to_scores[bug_id].append((score, label))

    rr_sum = 0
    count = 0
    for bug_id, ranked in bug_to_scores.items():
        ranked = sorted(ranked, key=lambda x: x[0], reverse=True)
        for idx, (_, label) in enumerate(ranked):
            if label == 1:
                rr_sum += 1 / (idx + 1)
                break
        count += 1
    return rr_sum / count if count > 0 else 0

In [36]:
def build_feature_matrix_batch_large(
    pairs, 
    glove_dict, 
    bug_encoder, 
    src_encoder, 
    history, 
    batch_size=20000, 
    device="cuda"
):
    all_features = []

    for i in range(0, len(pairs), batch_size):
        sub_batch = pairs[i:i+batch_size]
        
        # 🔤 Các đặc trưng truyền thống
        lexical = compute_lexical_similarity(sub_batch)
        semantic = compute_semantic_similarity(sub_batch, glove_dict)
        idf_overlap = compute_identifier_overlap_count(sub_batch)
        shared_ratio = compute_shared_token_ratio(sub_batch)


        # 🔍 Đặc trưng học sâu
        cnn = extract_cnn_features_batch(sub_batch, glove_dict, bug_encoder, src_encoder, device=device)

        # 🧩 Ghép toàn bộ đặc trưng
        others = np.stack([
            lexical, semantic, idf_overlap, shared_ratio, 
        ], axis=1)

        combined = np.concatenate([others, cnn], axis=1)
        all_features.append(combined)

        print(f"✅ Done {i+len(sub_batch)}/{len(pairs)} samples")

    return np.vstack(all_features)


In [37]:
def run_kfold_training_and_eval(
    folds,
    source_files,
    glove_dict,
    bug_encoder,
    src_encoder,
    model_class,
    k=10,
    device="cuda",
    cache_dir="/kaggle/working"
):
    results = {
        "fold": [],
        "MAP": [],
        "MRR": [],
        "Top1": [],
        "Top2": [],
        "Top3": [],
        "Top4": [],
        "Top5": [],
        "Top10": [],
        "Top15": []
    }

    for i in range(7,k - 1):
        train_fold = [pair for j in range(i + 1) for pair in folds[j]]
        test_fold = folds[i + 1]
        

        print(f"\n📦 Fold 0..{i} ➤ {i+1}")

        train_pairs = generate_balanced_pairs(train_fold, source_files, num_negatives_per_positive=50)
        test_pairs = generate_all_negatives_pairs(test_fold, source_files)

        if sum(1 for p in train_pairs if p[-1] == 1) < 1:
            print("⚠️ Bỏ qua do quá ít positive samples")
            continue

        train_X_path = os.path.join(cache_dir, f"X_train_tomcat_fold{i}_260.npy")
        train_y_path = os.path.join(cache_dir, f"y_train_tomcat_fold{i}_260.npy")
        test_X_path = os.path.join(cache_dir, f"X_test_tomcat_fold{i+1}_260.npy")
        test_y_path = os.path.join(cache_dir, f"y_test_tomcat_fold{i+1}_260.npy")

        if os.path.exists(train_X_path) and os.path.exists(train_y_path):
            print("✅ Load đặc trưng train từ cache")
            X_train = np.load(train_X_path)
            y_train = np.load(train_y_path)
        else:
            print("🛠 Trích đặc trưng train...")
            X_train = build_feature_matrix_batch_large(train_pairs, glove_dict, bug_encoder, src_encoder, bug_history,batch_size=20000, device=device)
            y_train = get_labels(train_pairs)
            np.save(train_X_path, X_train)
            np.save(train_y_path, y_train)

        if os.path.exists(test_X_path) and os.path.exists(test_y_path):
            print("✅ Load đặc trưng test từ cache")
            X_test = np.load(test_X_path)
            y_test = np.load(test_y_path)
        else:
            print("🛠 Trích đặc trưng test...")
            X_test = build_feature_matrix_batch_large(test_pairs, glove_dict, bug_encoder, src_encoder, bug_history,batch_size=20000, device=device)
            y_test = get_labels(test_pairs)
            np.save(test_X_path, X_test)
            np.save(test_y_path, y_test)

        print("✅ Trích xuất test done")

        train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32),
                                      torch.tensor(y_train, dtype=torch.float32))
        train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)

        model = model_class(input_dim=X_train.shape[1]).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
        criterion = FocalLoss(alpha=0.99)

        model.train()
        print("🚀 Bắt đầu huấn luyện")
        for epoch in range(10):
            total_loss = 0
            for batch_X, batch_y in train_loader:
                batch_X = batch_X.to(device)
                batch_y = batch_y.to(device)

                optimizer.zero_grad()
                outputs = model(batch_X)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()

                total_loss += loss.item()

                del batch_X, batch_y, outputs, loss
                torch.cuda.empty_cache()
                import gc; gc.collect()

            print(f"Epoch {epoch+1}: Loss = {total_loss:.4f}")

        model.eval()
        y_pred_probs = []
        with torch.no_grad():
            for i in range(0, len(X_test), 128):
                batch_X = torch.tensor(X_test[i:i+128], dtype=torch.float32).to(device)
                probs = model(batch_X).cpu().numpy()
                y_pred_probs.extend(probs)

        map_score = compute_MAP_per_bug(test_pairs, y_pred_probs)
        mrr_score = mean_reciprocal_rank(test_pairs, y_pred_probs)
        top1 = compute_topk_accuracy(test_pairs, y_pred_probs, k=1)
        top2 = compute_topk_accuracy(test_pairs, y_pred_probs, k=2)
        top3 = compute_topk_accuracy(test_pairs, y_pred_probs, k=3)
        top4 = compute_topk_accuracy(test_pairs, y_pred_probs, k=4)
        top5 = compute_topk_accuracy(test_pairs, y_pred_probs, k=5)
        top10 = compute_topk_accuracy(test_pairs, y_pred_probs, k=10)
        top15 = compute_topk_accuracy(test_pairs, y_pred_probs, k=15)

        print(f"✅ Results:")
        print(f"  ➤ MAP:   {map_score:.4f}")
        print(f"  ➤ MRR:   {mrr_score:.4f}")
        print(f"  ➤ Top@1: {top1:.4f} | Top@2: {top2:.4f} | Top@3: {top3:.4f}")
        print(f"  ➤ Top@4: {top4:.4f} | Top@5: {top5:.4f} | Top@10: {top10:.4f} | Top@15: {top15:.4f}")

        results["fold"].append(i)
        results["MAP"].append(map_score)
        results["MRR"].append(mrr_score)
        results["Top1"].append(top1)
        results["Top2"].append(top2)
        results["Top3"].append(top3)
        results["Top4"].append(top4)
        results["Top5"].append(top5)
        results["Top10"].append(top10)
        results["Top15"].append(top15)

    return results


In [38]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim  # ✅ phần bị thiếu
from sklearn.metrics import average_precision_score
import os




results = run_kfold_training_and_eval(
    folds=data_folds,
    source_files=data_src,
    glove_dict=glove_dict,
    bug_encoder=bug_encoder,
    src_encoder=src_encoder,
    model_class=BugLocalization,  # hoặc ImprovedBugLocalization
    k=10,
    device="cuda"
)



# Output full results
print("\nFull Results:")
for key, value in results.items():
    print(f"{key}: {value}")



📦 Fold 0..7 ➤ 8
🛠 Trích đặc trưng train...
✅ Done 20000/41667 samples
✅ Done 40000/41667 samples
✅ Done 41667/41667 samples
🛠 Trích đặc trưng test...
✅ Done 20000/188370 samples
✅ Done 40000/188370 samples
✅ Done 60000/188370 samples
✅ Done 80000/188370 samples
✅ Done 100000/188370 samples
✅ Done 120000/188370 samples
✅ Done 140000/188370 samples
✅ Done 160000/188370 samples
✅ Done 180000/188370 samples
✅ Done 188370/188370 samples
✅ Trích xuất test done
🚀 Bắt đầu huấn luyện
Epoch 1: Loss = 1.1582
Epoch 2: Loss = 0.8709
Epoch 3: Loss = 0.7445
Epoch 4: Loss = 0.6673
Epoch 5: Loss = 0.6347
Epoch 6: Loss = 0.5877
Epoch 7: Loss = 0.5840
Epoch 8: Loss = 0.5619
Epoch 9: Loss = 0.5457
Epoch 10: Loss = 0.5907
✅ Results:
  ➤ MAP:   0.4089
  ➤ MRR:   0.3778
  ➤ Top@1: 0.2571 | Top@2: 0.3524 | Top@3: 0.4762
  ➤ Top@4: 0.5143 | Top@5: 0.5333 | Top@10: 0.6000 | Top@15: 0.6381

📦 Fold 0..8 ➤ 9
🛠 Trích đặc trưng train...
✅ Done 20000/46614 samples
✅ Done 40000/46614 samples
✅ Done 46614/46614 sample

In [39]:

# In kết quả tổng hợp sau khi chạy tất cả folds
print("\n📊 Kết quả tổng hợp:")
for i in range(len(full_results["fold"])):
    print(f"Fold {full_results['fold'][i]}:")
    print(f"  ➤ MAP: {full_results['MAP'][i]:.4f}")
    print(f"  ➤ MRR: {full_results['MRR'][i]:.4f}")
    print(f"  ➤ Top1: {full_results['Top1'][i]:.4f}")
    print(f"  ➤ Top2: {full_results['Top2'][i]:.4f}")
    print(f"  ➤ Top3: {full_results['Top3'][i]:.4f}")
    print(f"  ➤ Top4: {full_results['Top4'][i]:.4f}")
    print(f"  ➤ Top5: {full_results['Top5'][i]:.4f}")
    print(f"  ➤ Top10: {full_results['Top10'][i]:.4f}")
    print(f"  ➤ Top15: {full_results['Top15'][i]:.4f}")

# Tính trung bình cho tất cả các chỉ số
mean_map = np.mean(full_results["MAP"])
mean_mrr = np.mean(full_results["MRR"])
mean_top1 = np.mean(full_results["Top1"])
mean_top2 = np.mean(full_results["Top2"])
mean_top3 = np.mean(full_results["Top3"])
mean_top4 = np.mean(full_results["Top4"])
mean_top5 = np.mean(full_results["Top5"])
mean_top10 = np.mean(full_results["Top10"])
mean_top15 = np.mean(full_results["Top15"])

# In kết quả trung bình
print("\n📊 Kết quả trung bình trên toàn bộ k-folds:")
print(f"  ➤ MAP: {mean_map:.4f}")
print(f"  ➤ MRR: {mean_mrr:.4f}")
print(f"  ➤ Top1: {mean_top1:.4f}")
print(f"  ➤ Top2: {mean_top2:.4f}")
print(f"  ➤ Top3: {mean_top3:.4f}")
print(f"  ➤ Top4: {mean_top4:.4f}")
print(f"  ➤ Top5: {mean_top5:.4f}")
print(f"  ➤ Top10: {mean_top10:.4f}")
print(f"  ➤ Top15: {mean_top15:.4f}")




📊 Kết quả tổng hợp:


NameError: name 'full_results' is not defined