## 1. Tiền xử lí dữ liệu

### 1.1. Load dữ liệu

In [2]:
import pickle
import numpy as np
import pandas as pd
from google.colab import drive
import os

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### 1.2. Tiền xử lí dữ liệu

In [3]:
!pip install nltk




In [4]:
import nltk
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [5]:
# English stop words
stop_words = set(
    ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your',
     'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her',
     'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs',
     'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those',
     'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
     'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if',
     'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
     'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above',
     'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
     'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
     'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some',
     'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
     's', 't', 'can', 'will', 'just', 'don', 'should', 'now', 'd', 'll', 'm', 'o',
     're', 've', 'y', 'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven',
     'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn', 'weren', 'won',
     'wouldn', 'b', 'c', 'e', 'f', 'g', 'h', 'j', 'k', 'l', 'n', 'p', 'q', 'u', 'v',
     'w', 'x', 'z', 'us'])

# Java language keywords
java_keywords = set(
    ['abstract', 'assert', 'boolean', 'break', 'byte', 'case',
     'catch', 'char', 'class', 'const', 'continue', 'default', 'do', 'double',
     'else', 'enum', 'extends', 'false', 'final', 'finally', 'float', 'for', 'goto',
     'if', 'implements', 'import', 'instanceof', 'int', 'interface', 'long',
     'native', 'new', 'null', 'package', 'private', 'protected', 'public', 'return',
     'short', 'static', 'strictfp', 'super', 'switch', 'synchronized', 'this',
     'throw', 'throws', 'transient', 'true', 'try', 'void', 'volatile', 'while'])

from collections import namedtuple
from pathlib import Path

# Dataset root directory (điều chỉnh đường dẫn nếu cần)
_DATASET_ROOT = Path('/content/drive/MyDrive/Colab Notebooks/NLP/Task bug localization/')

Dataset = namedtuple('Dataset', ['name', 'src', 'bug_repo', 'repo_url', 'features'])

# Các dataset được định nghĩa
aspectj = Dataset(
    'aspectj',
    _DATASET_ROOT / 'source files/org.aspectj',
    _DATASET_ROOT / 'bug reports/AspectJ.txt',
    "https://github.com/eclipse/org.aspectj/tree/bug433351.git",
    _DATASET_ROOT / 'bug reports/AspectJ.xlsx'
)

eclipse = Dataset(
    'eclipse',
    _DATASET_ROOT / 'source files/eclipse.platform.ui-johna-402445',
    _DATASET_ROOT / 'bug reports/Eclipse_Platform_UI.txt',
    "https://github.com/eclipse/eclipse.platform.ui.git",
    _DATASET_ROOT / 'bug reports/Eclipse_Platform_UI.xlsx'
)

swt = Dataset(
    'swt',
    _DATASET_ROOT / 'source files/eclipse.platform.swt-xulrunner-31',
    _DATASET_ROOT / 'bug reports/SWT.txt',
    "https://github.com/eclipse/eclipse.platform.swt.git",
    _DATASET_ROOT / 'bug reports/SWT.xlsx'
)

tomcat = Dataset(
    'tomcat',
    _DATASET_ROOT / 'source files/tomcat-7.0.51',
    _DATASET_ROOT / 'bug reports/Tomcat.txt',
    "https://github.com/apache/tomcat.git",
    _DATASET_ROOT / 'bug reports/Tomcat.xlsx'
)

birt = Dataset(
    'birt',
    _DATASET_ROOT / 'source files/birt-20140211-1400',
    _DATASET_ROOT / 'bug reports/Birt.txt',
    "https://github.com/apache/birt.git",
    _DATASET_ROOT / 'bug reports/Birt.xlsx'
)


### Current dataset in use. (change this name to change the dataset)
DATASET = tomcat

class BugReport:
    """Class representing each bug report"""
    __slots__ = ['summary', 'description', 'fixed_files', 'report_time', 'pos_tagged_summary', 'pos_tagged_description','stack_traces','stack_traces_remove']

    def __init__(self, summary, description, fixed_files, report_time):
        self.summary = summary
        self.description = description
        self.fixed_files = fixed_files
        self.report_time = report_time
        self.pos_tagged_summary = None
        self.pos_tagged_description = None
        self.stack_traces = None
        self.stack_traces_remove = None

class SourceFile:
    """Class representing each source file"""
    __slots__ = ['all_content', 'comments', 'class_names', 'attributes', 'method_names', 'variables', 'file_name',
                 'pos_tagged_comments', 'exact_file_name', 'package_name']

    def __init__(self, all_content, comments, class_names, attributes, method_names, variables, file_name,
                 package_name):
        self.all_content = all_content
        self.comments = comments
        self.class_names = class_names
        self.attributes = attributes
        self.method_names = method_names
        self.variables = variables
        self.file_name = file_name
        self.exact_file_name = file_name[0]
        self.package_name = package_name
        self.pos_tagged_comments = None


class Parser:
    """Class containing different parsers"""
    __slots__ = ['name', 'src', 'bug_repo']

    def __init__(self, pro):
        self.name = pro.name
        self.src = pro.src
        self.bug_repo = pro.bug_repo

    def report_parser(self):
        reader = csv.DictReader(open(self.bug_repo, "r"), delimiter="\t")
        bug_reports = OrderedDict()
        # raw_texts = []
        # fixed_files = []
        for line in reader:
            # line["raw_text"] = line["summary"] + ' ' + line["description"]
            line["report_time"] = datetime.strptime(line["report_time"], "%Y-%m-%d %H:%M:%S")
            temp = line["files"].strip().split(".java")
            length = len(temp)
            x = []
            for i, f in enumerate(temp):
                if i == (length - 1):
                    x.append(os.path.normpath(f))
                    continue
                x.append(os.path.normpath(f + ".java"))
            line["files"] = x
            bug_reports[line["bug_id"]] = BugReport(line["summary"], line["description"], line["files"],
                                                    line["report_time"])
        # bug_reports = tsv2dict(self.bug_repo)

        return bug_reports

    def src_parser(self):
        """Parse source code directory of a program and colect its java files"""

        # Gettting the list of source files recursively from the source directory
        src_addresses = glob.glob(str(self.src) + '/**/*.java', recursive=True)
        print(src_addresses)
        # Creating a java lexer instance for pygments.lex() method
        java_lexer = JavaLexer()
        src_files = OrderedDict()
        # src_files = dict()
        # Looping to parse each source file
        for src_file in src_addresses:
            with open(src_file, encoding='latin-1') as file:
                src = file.read()

            # Placeholder for different parts of a source file
            comments = ''
            class_names = []
            attributes = []
            method_names = []
            variables = []

            # Source parsing
            parse_tree = None
            try:
                parse_tree = javalang.parse.parse(src)
                for path, node in parse_tree.filter(javalang.tree.VariableDeclarator):
                    if isinstance(path[-2], javalang.tree.FieldDeclaration):
                        attributes.append(node.name)
                    elif isinstance(path[-2], javalang.tree.VariableDeclaration):
                        variables.append(node.name)
            except:
                pass

            # Triming the source file
            ind = False
            if parse_tree:
                if parse_tree.imports:
                    last_imp_path = parse_tree.imports[-1].path
                    src = src[src.index(last_imp_path) + len(last_imp_path) + 1:]
                elif parse_tree.package:
                    package_name = parse_tree.package.name
                    src = src[src.index(package_name) + len(package_name) + 1:]
                else:  # no import and no package declaration
                    ind = True
            # javalang can't parse the source file
            else:
                ind = True

            # Lexically tokenize the source file
            lexed_src = pygments.lex(src, java_lexer)

            for i, token in enumerate(lexed_src):
                if token[0] in Token.Comment:
                    if ind and i == 0 and token[0] is Token.Comment.Multiline:
                        src = src[src.index(token[1]) + len(token[1]):]
                        continue
                    comments = comments + token[1]
                elif token[0] is Token.Name.Class:
                    class_names.append(token[1])
                elif token[0] is Token.Name.Function:
                    method_names.append(token[1])

            # get the package declaration if exists
            if parse_tree and parse_tree.package:
                package_name = parse_tree.package.name
            else:
                package_name = None

            if self.name == 'aspectj' or 'tomcat' or 'eclipse' or 'swt':
                src_files[os.path.relpath(src_file, start=self.src)] = SourceFile(src, comments, class_names,
                                                                                  attributes, method_names, variables, [
                                                                                      os.path.basename(src_file).split(
                                                                                          '.')[0]], package_name)
            else:
                # If source files has package declaration
                if package_name:
                    src_id = (package_name + '.' + os.path.basename(src_file))
                else:
                    src_id = os.path.basename(src_file)
                src_files[src_id] = SourceFile(src, comments, class_names, attributes, method_names, variables,
                                               [os.path.basename(src_file).split('.')[0]], package_name)
            # print(src_files)
            # print("===========")
        return src_files


class ReportPreprocessing:
    """Class preprocess bug reports"""
    __slots__ = ['bug_reports']

    def __init__(self, bug_reports):
        self.bug_reports = bug_reports

    def extract_stack_traces(self):
        """Extract stack traces from bug reports"""
        pattern = re.compile(r' at (.*?)\((.*?)\)')
        signs = ['.java', 'Unknown Source', 'Native Method']
        for report in self.bug_reports.values():
            st_canid = re.findall(pattern, report.description)
            st = [x for x in st_canid if any(s in x[1] for s in signs)]
            report.stack_traces = st

    def extract_stack_traces_remove(self):
        pattern = re.compile(r' at (.*?)\((.*?)\)')
        signs = ['.java', 'Unknown Source', 'Native Method']
        for report in self.bug_reports.values():
            st_canid = re.findall(pattern, report.description)
            st = [x for x in st_canid if any(s in x[1] for s in signs)]
            at = []
            for x in st:
                if (x[1] == 'Unknown Source'):
                    temp = 'Unknown'
                    y = x[0]+ '(' + temp
                else:
                    y = x[0] + '(' + x[1] + ')'
                at.append(y)
            report.stack_traces_remove = at

    def pos_tagging(self):
        """Extracing specific pos tags from bug reports raw_text"""
        for report in self.bug_reports.values():
            # Tokenizing using word_tokeize for more accurate pos-tagging
            sum_tok = nltk.word_tokenize(report.summary)
            desc_tok = nltk.word_tokenize(report.description)
            sum_pos = nltk.pos_tag(sum_tok)
            desc_pos = nltk.pos_tag(desc_tok)
            report.pos_tagged_summary = [token for token, pos in sum_pos if 'NN' in pos or 'VB' in pos]
            report.pos_tagged_description = [token for token, pos in desc_pos if 'NN' in pos or 'VB' in pos]

    def tokenize(self):
        """Tokenize bug report intro tokens"""
        for report in self.bug_reports.values():
            report.summary = nltk.wordpunct_tokenize(report.summary)
            report.description = nltk.wordpunct_tokenize(report.description)

    def _split_camelcase(self, tokens):
        # copy tokens
        returning_tokens = tokens[:]
        for token in tokens:
            split_tokens = re.split(fr'[{string.punctuation}]+', token)
            # if token is split into some other tokens
            if len(split_tokens) > 1:
                returning_tokens.remove(token)
                # camel case detection for new tokens
                for st in split_tokens:
                    camel_split = inflection.underscore(st).split('_')
                    if len(camel_split) > 1:
                        returning_tokens.append(st)
                        returning_tokens = returning_tokens + camel_split
                    else:
                        returning_tokens.append(st)
            else:
                camel_split = inflection.underscore(token).split('_')
                if len(camel_split) > 1:
                    returning_tokens = returning_tokens + camel_split
        return returning_tokens

    def split_camelcase(self):
        """Split camelcase indentifiers"""
        for report in self.bug_reports.values():
            report.summary = self._split_camelcase(report.summary)
            report.description = self._split_camelcase(report.description)
            report.pos_tagged_summary = self._split_camelcase(report.pos_tagged_summary)
            report.pos_tagged_description = self._split_camelcase(report.pos_tagged_description)

    def normalize(self):
        """remove punctuation, numbers and lowecase conversion"""
        # build a translate table for punctuation and number removal
        punctnum_table = str.maketrans({c: None for c in string.punctuation + string.digits})

        for report in self.bug_reports.values():
            summary_punctnum_rem = [token.translate(punctnum_table) for token in report.summary]
            desc_punctnum_rem = [token.translate(punctnum_table) for token in report.description]
            pos_sum_punctnum_rem = [token.translate(punctnum_table) for token in report.pos_tagged_summary]
            pos_desc_punctnum_rem = [token.translate(punctnum_table) for token in report.pos_tagged_description]
            report.summary = [token.lower() for token in summary_punctnum_rem if token]
            report.description = [token.lower() for token in desc_punctnum_rem if token]
            report.pos_tagged_summary = [token.lower() for token in pos_sum_punctnum_rem if token]
            report.pos_tagged_description = [token.lower() for token in pos_desc_punctnum_rem if token]

    def remove_stopwords(self):
        """removing stop word from tokens"""
        for report in self.bug_reports.values():
            report.summary = [token for token in report.summary if token not in stop_words]
            report.description = [token for token in report.description if token not in stop_words]
            report.pos_tagged_summary = [token for token in report.pos_tagged_summary if token not in stop_words]
            report.pos_tagged_description = [token for token in report.pos_tagged_description if token not in stop_words]

    def remove_java_keywords(self):
        """removing java language keywords from tokens"""
        for report in self.bug_reports.values():
            report.summary = [token for token in report.summary if token not in java_keywords]
            report.description = [token for token in report.description if token not in java_keywords]
            report.pos_tagged_summary = [token for token in report.pos_tagged_summary if token not in java_keywords]
            report.pos_tagged_description = [token for token in report.pos_tagged_description if token not in java_keywords]

    def stem(self):
        # stemming tokens
        stemmer = PorterStemmer()
        for report in self.bug_reports.values():
            report.summary = dict(
                zip(['stemmed', 'unstemmed'], [[stemmer.stem(token) for token in report.summary], report.summary]))
            report.description = dict(
                zip(['stemmed', 'unstemmed'], [[stemmer.stem(token) for token in report.description], report.description]))
            report.pos_tagged_summary = dict(
                zip(['stemmed', 'unstemmed'], [[stemmer.stem(token) for token in report.pos_tagged_summary], report.pos_tagged_summary]))
            report.pos_tagged_description = dict(
                zip(['stemmed', 'unstemmed'], [[stemmer.stem(token) for token in report.pos_tagged_description], report.pos_tagged_description]))

    def preprocess(self):
        self.extract_stack_traces()
        self.extract_stack_traces_remove()
        self.pos_tagging()
        self.tokenize()
        self.split_camelcase()
        self.normalize()
        self.remove_stopwords()
        self.remove_java_keywords()
        self.stem()

class SrcPreprocessing:
    """class to preprocess source code"""
    __slots__ = ['src_files']

    def __init__(self, src_files):
        self.src_files = src_files

    def pos_tagging(self):
        """Extracing specific pos tags from comments"""
        for src in self.src_files.values():
            # tokenize using word_tokenize
            comments_tok = nltk.word_tokenize(src.comments)
            comments_pos = nltk.pos_tag(comments_tok)
            src.pos_tagged_comments = [token for token, pos in comments_pos if 'NN' in pos or 'VB' in pos]

    def tokenize(self):
        """tokenize source code to tokens"""
        for src in self.src_files.values():
            src.all_content = nltk.wordpunct_tokenize(src.all_content)
            src.comments = nltk.wordpunct_tokenize(src.comments)

    def _split_camelcase(self, tokens):
        # copy token
        returning_tokens = tokens[:]
        for token in tokens:
            split_tokens = re.split(fr'[{string.punctuation}]+', token)
            # if token is split into some other tokens
            if len(split_tokens) > 1:
                returning_tokens.remove(token)
                # camelcase defect for new tokens
                for st in split_tokens:
                    camel_split = inflection.underscore(st).split('_')
                    if len(camel_split) > 1:
                        returning_tokens.append(st)
                        returning_tokens = returning_tokens + camel_split
                    else:
                        returning_tokens.append(st)
            else:
                camel_split = inflection.underscore(token).split('_')
                if len(camel_split) > 1:
                    returning_tokens = returning_tokens + camel_split
        return returning_tokens

    def split_camelcase(self):
        # Split camelcase indenti
        for src in self.src_files.values():
            src.all_content = self._split_camelcase(src.all_content)
            src.comments = self._split_camelcase(src.comments)
            src.class_names = self._split_camelcase(src.class_names)
            src.attributes = self._split_camelcase(src.attributes)
            src.method_names = self._split_camelcase(src.method_names)
            src.variables = self._split_camelcase(src.variables)
            src.pos_tagged_comments = self._split_camelcase(src.pos_tagged_comments)

    def normalize(self):
        "remove punctuation, number and lowercase conversion"
        # build a translate table for punctuation and number
        punctnum_table = str.maketrans({c: None for c in string.punctuation + string.digits})
        for src in self.src_files.values():
            content_punctnum_rem = [token.translate(punctnum_table) for token in src.all_content]
            comments_punctnum_rem = [token.translate(punctnum_table) for token in src.comments]
            classnames_punctnum_rem = [token.translate(punctnum_table) for token in src.class_names]
            attributes_punctnum_rem = [token.translate(punctnum_table) for token in src.attributes]
            methodnames_punctnum_rem = [token.translate(punctnum_table) for token in src.method_names]
            variables_punctnum_rem = [token.translate(punctnum_table) for token in src.variables]
            filename_punctnum_rem = [token.translate(punctnum_table) for token in src.file_name]
            pos_comments_punctnum_rem = [token.translate(punctnum_table) for token in src.pos_tagged_comments]

            src.all_content = [token.lower() for token in content_punctnum_rem if token]
            src.comments = [token.lower() for token in comments_punctnum_rem if token]
            src.class_names = [token.lower() for token in classnames_punctnum_rem if token]
            src.attributes = [token.lower() for token in attributes_punctnum_rem if token]
            src.method_names = [token.lower() for token in methodnames_punctnum_rem if token]
            src.variables = [token.lower() for token in variables_punctnum_rem if token]
            src.file_name = [token.lower() for token in filename_punctnum_rem if token]
            src.pos_tagged_comments = [token.lower() for token in pos_comments_punctnum_rem if token]

    def remove_stopwords(self):
        for src in self.src_files.values():
            src.all_content = [token for token in src.all_content if token not in stop_words]
            src.comments = [token for token in src.comments if token not in stop_words]
            src.class_names = [token for token in src.class_names if token not in stop_words]
            src.attributes = [token for token in src.attributes if token not in stop_words]
            src.method_names = [token for token in src.method_names if token not in stop_words]
            src.variables = [token for token in src.variables if token not in stop_words]
            src.file_name = [token for token in src.file_name if token not in stop_words]
            src.pos_tagged_comments = [token for token in src.pos_tagged_comments if token not in stop_words]

    def remove_javakeywords(self):
        for src in self.src_files.values():
            src.all_content = [token for token in src.all_content if token not in java_keywords]
            src.comments = [token for token in src.comments if token not in java_keywords]
            src.class_names = [token for token in src.class_names if token not in java_keywords]
            src.attributes = [token for token in src.attributes if token not in java_keywords]
            src.method_names = [token for token in src.method_names if token not in java_keywords]
            src.variables = [token for token in src.variables if token not in java_keywords]
            src.file_name = [token for token in src.file_name if token not in java_keywords]
            src.pos_tagged_comments = [token for token in src.pos_tagged_comments if token not in java_keywords]

    def stem(self):
        # stemming tokens
        stemmer = PorterStemmer()
        for src in self.src_files.values():
            src.all_content = dict(zip(['stemmed', 'unstemmed'], [[stemmer.stem(token) for token in src.all_content], src.all_content]))
            src.comments = dict(zip(['stemmed', 'unstemmed'], [[stemmer.stem(token) for token in src.comments], src.comments]))
            src.class_names = dict(zip(['stemmed', 'unstemmed'], [[stemmer.stem(token) for token in src.class_names], src.class_names]))
            src.attributes = dict(zip(['stemmed', 'unstemmed'], [[stemmer.stem(token) for token in src.attributes], src.attributes]))
            src.method_names = dict(zip(['stemmed', 'unstemmed'], [[stemmer.stem(token) for token in src.method_names], src.method_names]))
            src.variables = dict(zip(['stemmed', 'unstemmed'], [[stemmer.stem(token) for token in src.variables], src.variables]))
            src.file_name = dict(zip(['stemmed', 'unstemmed'], [[stemmer.stem(token) for token in src.file_name], src.file_name]))
            src.pos_tagged_comments = dict(zip(['stemmed', 'unstemmed'], [[stemmer.stem(token) for token in src.pos_tagged_comments], src.pos_tagged_comments]))


    def preprocess(self):
        self.pos_tagging()
        self.tokenize()
        self.split_camelcase()
        self.normalize()
        self.remove_stopwords()
        self.remove_javakeywords()
        self.stem()

In [None]:
!pip install inflection
import inflection




In [None]:
# Tạo đối tượng `Parser` với bộ dữ liệu bạn muốn xử lý (Ví dụ: `DATASET` là 'tomcat')
parser = Parser(DATASET)

# Tiến hành xử lý báo cáo lỗi
report_prep = ReportPreprocessing(parser.report_parser())
report_prep.preprocess()

# Kiểm tra báo cáo lỗi đã được xử lý
processed_reports = report_prep.bug_reports
print(processed_reports)


NameError: name 'csv' is not defined

In [None]:
# Lấy một bug report cụ thể, ví dụ bug report đầu tiên
first_bug_report = list(processed_reports.values())[0]

# Lấy các thông tin từ bug report
summary = first_bug_report.summary  # Lấy mô tả lỗi
report_time= first_bug_report.report_time  # Lấy thời gian báo cáo lỗi

# In ra các thông tin này
print("Summary:", summary)
print("Report Time:", report_time)


In [None]:
# Danh sách các datasets mà bạn muốn xử lý
datasets = [aspectj, eclipse, swt, tomcat, birt]  # Thêm các dataset khác nếu cần

# Khởi tạo một danh sách để lưu trữ các báo cáo lỗi đã xử lý
all_processed_reports = {}

# Lặp qua tất cả các dataset và tiến hành xử lý báo cáo lỗi
for dataset in datasets:
    # Tạo đối tượng Parser cho mỗi dataset
    parser = Parser(dataset)

    # Tiến hành xử lý báo cáo lỗi
    report_prep = ReportPreprocessing(parser.report_parser())
    report_prep.preprocess()

    # Lưu báo cáo lỗi đã xử lý vào từ điển, với key là tên dataset
    all_processed_reports[dataset] = report_prep.bug_reports

# Kiểm tra kết quả đã xử lý của tất cả các datasets
for dataset, reports in all_processed_reports.items():
    print(f"Processed reports for dataset: {dataset}")
    print(reports)  # In hoặc xử lý thêm báo cáo lỗi


Processed reports for dataset: Dataset(name='aspectj', src=PosixPath('/content/drive/MyDrive/Colab Notebooks/NLP/Task bug localization/source files/org.aspectj-bug433351'), bug_repo=PosixPath('/content/drive/MyDrive/Colab Notebooks/NLP/Task bug localization/bug reports/AspectJ.txt'), repo_url='https://github.com/eclipse/org.aspectj/tree/bug433351.git', features=PosixPath('/content/drive/MyDrive/Colab Notebooks/NLP/Task bug localization/bug reports/AspectJ.xlsx'))
OrderedDict([('423257', <__main__.BugReport object at 0x7b8746896260>), ('420210', <__main__.BugReport object at 0x7b8746896320>), ('419279', <__main__.BugReport object at 0x7b8746896bc0>), ('415266', <__main__.BugReport object at 0x7b8746896a40>), ('418129', <__main__.BugReport object at 0x7b8746896440>), ('368046', <__main__.BugReport object at 0x7b87468945e0>), ('413378', <__main__.BugReport object at 0x7b8746895c00>), ('407017', <__main__.BugReport object at 0x7b8746895d20>), ('408721', <__main__.BugReport object at 0x7b87

In [None]:
import glob
from pygments.lexers import JavaLexer
import pygments
from pygments.token import Token


# Giả sử bạn đã có đối tượng `Parser` cho dataset 'tomcat'
parser = Parser(tomcat)  # Chỉ định dataset 'tomcat'

# Tiến hành xử lý mã nguồn Java cho dataset 'tomcat'
src_prep = SrcPreprocessing(parser.src_parser())
src_prep.preprocess()

# Kiểm tra mã nguồn Java đã được xử lý
src_files = src_prep.src_files
print(src_files)




KeyboardInterrupt: 

In [6]:
import nltk
nltk.download('punkt_tab')
import pickle
from google.colab import drive
import csv
from collections import OrderedDict
from datetime import datetime
import re
import string
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [7]:
import pickle
from google.colab import drive
import csv
from collections import OrderedDict
from datetime import datetime
import re
import string
from nltk.stem.porter import PorterStemmer

# Gắn kết Google Drive
drive.mount('/content/drive')

# Danh sách các datasets mà bạn muốn xử lý
datasets = [aspectj, eclipse, swt, tomcat, birt]  # Thêm các dataset khác nếu cần

# Lặp qua tất cả các dataset và tiến hành xử lý báo cáo lỗi
for dataset in datasets:
    # Tạo đối tượng Parser cho mỗi dataset
    parser = Parser(dataset)

    # Tiến hành xử lý báo cáo lỗi
    report_prep = ReportPreprocessing(parser.report_parser())
    report_prep.preprocess()

    # Lưu báo cáo lỗi đã xử lý vào từ điển, với key là tên dataset
    dataset_name = dataset.name  # Lấy tên dataset (aspectj, eclipse, etc.)

    # Đường dẫn đến Google Drive để lưu file cho mỗi dataset
    file_path = f'/content/drive/MyDrive/Colab Notebooks/NLP/Task bug localization/{dataset_name}_reports_processed.pkl'

    # Lưu kết quả vào file pickle trong Google Drive cho từng dataset
    with open(file_path, "wb") as f:
        pickle.dump(report_prep.bug_reports, f)

    # Kiểm tra kết quả đã xử lý của dataset
    print(f"Processed reports for dataset: {dataset_name}")
    print(report_prep.bug_reports)  # In hoặc xử lý thêm báo cáo lỗi

# ---------------------------
# Tải lại kết quả đã lưu từ file pickle trong Google Drive (khi cần sử dụng lại)
# ---------------------------
for dataset in datasets:
    dataset_name = dataset.name
    file_path = f'/content/drive/MyDrive/Colab Notebooks/NLP/Task bug localization/{dataset_name}_reports_processed.pkl'

    # Tải lại dữ liệu đã xử lý từ file pickle
    with open(file_path, "rb") as f:
        processed_reports = pickle.load(f)

    # Kiểm tra kết quả đã tải lại
    print(f"Loaded processed reports for dataset: {dataset_name}")
    print(processed_reports)  # In hoặc xử lý thêm báo cáo lỗi


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


KeyboardInterrupt: 

In [None]:
import glob
import pickle
from pygments.lexers import JavaLexer
import pygments
from pygments.token import Token
from google.colab import drive

# Gắn kết Google Drive
drive.mount('/content/drive')

# Danh sách các datasets mà bạn muốn xử lý
datasets = [birt]  # Thêm các dataset khác nếu cần

# Lặp qua tất cả các dataset và tiến hành xử lý mã nguồn
for dataset in datasets:
    dataset_name = dataset.name  # Lấy tên của dataset (aspectj, eclipse, etc.)

    # Tạo đối tượng Parser cho mỗi dataset
    parser = Parser(dataset)

    # Tiến hành xử lý mã nguồn Java cho dataset
    src_prep = SrcPreprocessing(parser.src_parser())
    src_prep.preprocess()

    # Kiểm tra mã nguồn Java đã được xử lý
    src_files = src_prep.src_files
    print(f"Processed source files for {dataset_name}")

    # Đường dẫn lưu file pickle vào Google Drive
    file_path = f'/content/drive/MyDrive/Colab Notebooks/NLP/Task bug localization/{dataset_name}_src_processed.pkl'

    # Lưu kết quả đã xử lý vào Google Drive
    with open(file_path, "wb") as f:
        pickle.dump(src_files, f)

    print(f"Processed source files for {dataset_name} dataset saved at {file_path}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Processed source files for birt
Processed source files for birt dataset saved at /content/drive/MyDrive/Colab Notebooks/NLP/Task bug localization/birt_src_processed.pkl


In [None]:
import glob
import pickle
from pygments.lexers import JavaLexer
import pygments
from pygments.token import Token
from google.colab import drive

# Gắn kết Google Drive
drive.mount('/content/drive')

# Danh sách các datasets mà bạn muốn xử lý
datasets = [swt]  # Thêm các dataset khác nếu cần

# Lặp qua tất cả các dataset và tiến hành xử lý mã nguồn
for dataset in datasets:
    dataset_name = dataset.name  # Lấy tên của dataset (aspectj, eclipse, etc.)

    # Tạo đối tượng Parser cho mỗi dataset
    parser = Parser(dataset)

    # Tiến hành xử lý mã nguồn Java cho dataset
    src_prep = SrcPreprocessing(parser.src_parser())
    src_prep.preprocess()

    # Kiểm tra mã nguồn Java đã được xử lý
    src_files = src_prep.src_files
    print(f"Processed source files for {dataset_name}")

    # Đường dẫn lưu file pickle vào Google Drive
    file_path = f'/content/drive/MyDrive/Colab Notebooks/NLP/Task bug localization/{dataset_name}_src_processed.pkl'

    # Lưu kết quả đã xử lý vào Google Drive
    with open(file_path, "wb") as f:
        pickle.dump(src_files, f)

    print(f"Processed source files for {dataset_name} dataset saved at {file_path}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
['/content/drive/MyDrive/Colab Notebooks/NLP/Task bug localization/source files/eclipse.platform.swt-xulrunner-31/tests/org.eclipse.swt.tests/JUnit Tests/org/eclipse/swt/tests/junit/Test_org_eclipse_swt_custom_TableTreeItem.java', '/content/drive/MyDrive/Colab Notebooks/NLP/Task bug localization/source files/eclipse.platform.swt-xulrunner-31/tests/org.eclipse.swt.tests/JUnit Tests/org/eclipse/swt/tests/junit/Test_org_eclipse_swt_widgets_Combo.java', '/content/drive/MyDrive/Colab Notebooks/NLP/Task bug localization/source files/eclipse.platform.swt-xulrunner-31/tests/org.eclipse.swt.tests/JUnit Tests/org/eclipse/swt/tests/junit/Test_org_eclipse_swt_events_PaintEvent.java', '/content/drive/MyDrive/Colab Notebooks/NLP/Task bug localization/source files/eclipse.platform.swt-xulrunner-31/tests/org.eclipse.swt.tests/JUnit Tests/org/eclipse/swt/tests/junit/AllNonBro

In [None]:
import glob
import pickle
from pygments.lexers import JavaLexer
import pygments
from pygments.token import Token
from google.colab import drive

# Gắn kết Google Drive
drive.mount('/content/drive')

# Danh sách các datasets mà bạn muốn xử lý
datasets = [eclipse]  # Thêm các dataset khác nếu cần

# Lặp qua tất cả các dataset và tiến hành xử lý mã nguồn
for dataset in datasets:
    dataset_name = dataset.name  # Lấy tên của dataset (aspectj, eclipse, etc.)

    # Tạo đối tượng Parser cho mỗi dataset
    parser = Parser(dataset)

    # Tiến hành xử lý mã nguồn Java cho dataset
    src_prep = SrcPreprocessing(parser.src_parser())
    src_prep.preprocess()

    # Kiểm tra mã nguồn Java đã được xử lý
    src_files = src_prep.src_files
    print(f"Processed source files for {dataset_name}")

    # Đường dẫn lưu file pickle vào Google Drive
    file_path = f'/content/drive/MyDrive/Colab Notebooks/NLP/Task bug localization/{dataset_name}_src_processed.pkl'

    # Lưu kết quả đã xử lý vào Google Drive
    with open(file_path, "wb") as f:
        pickle.dump(src_files, f)

    print(f"Processed source files for {dataset_name} dataset saved at {file_path}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Processed source files for eclipse
Processed source files for eclipse dataset saved at /content/drive/MyDrive/Colab Notebooks/NLP/Task bug localization/eclipse_src_processed.pkl


In [None]:
import pickle
from google.colab import drive

# Gắn kết Google Drive
drive.mount('/content/drive')

# Đọc file pickle đã lưu từ Google Drive
dataset_name = 'birt'  # Ví dụ: thay 'tomcat' bằng tên dataset bạn muốn kiểm tra
file_path = f'/content/drive/MyDrive/Colab Notebooks/NLP/Task bug localization/{dataset_name}_src_processed.pkl'

# Mở và tải dữ liệu từ file pickle
with open(file_path, "rb") as f:
    src_files = pickle.load(f)

# Kiểm tra số lượng file đã xử lý trong src_files
print(f"Number of processed Java files in '{dataset_name}':", len(src_files))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Number of processed Java files in 'birt': 0


In [None]:
import glob
from google.colab import drive

# Gắn kết Google Drive
drive.mount('/content/drive')

# Đường dẫn đến thư mục chứa các file (thay [tên file] bằng tên thư mục hoặc đường dẫn thực tế)
folder_path = '/content/drive/MyDrive/Colab Notebooks/Data/org.aspectj/**/*.java'

# Sử dụng glob để tìm tất cả các file .java trong thư mục
java_files = glob.glob(folder_path, recursive=True)

# Đếm số lượng file .java
java_files_count = len(java_files)

# In kết quả
print(f"Number of .java files in the directory: {java_files_count}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Number of .java files in the directory: 0
