In [8]:
import re

def remove_comments(code):
    """Get rid of comments."""
    pattern = r"""
        ('(?:\\.|[^\\'])*'          # 단일 따옴표 문자열
        | "(?:\\.|[^\\"])*"         # 이중 따옴표 문자열
        | '''[\s\S]*?'''            # 삼중 따옴표 문자열 (여러 줄 문자열)
        | \"\"\"[\s\S]*?\"\"\"      # 삼중 따옴표 문자열
        | \#.*?$                    # 한 줄 주석
        )"""
    regex = re.compile(pattern, re.VERBOSE | re.MULTILINE)
    def replacer(match):
        s = match.group(0)
        if s.startswith('#'):
            return ''  # 주석은 제거
        else:
            return s  # 문자열은 그대로 유지
    code_no_comments = regex.sub(replacer, code)
    return code_no_comments


In [3]:
import random

def augment_code(code):
    # adding comments
    augmented_codes = []
    comments = ["# TODO: Refactor this", "# This is a placeholder", "# Added for augmentation"]
    for comment in comments:
        augmented_code = f"{comment}\n{code}"
        augmented_codes.append(augmented_code)
    

    code_lines = code.split('\n')
    if len(code_lines) > 1:
        # adding spaces in random spot
        idx = random.randint(0, len(code_lines)-1)
        code_lines.insert(idx, '')
        augmented_code = '\n'.join(code_lines)
        augmented_codes.append(augmented_code)
    
    return augmented_codes


In [2]:
def getDataPoints(path, augment=False):
    """Collects content of all .py files within the given directory."""
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        for dataPointName in files:
            if dataPointName.endswith(".py"):
                file_path = os.path.join(root, dataPointName)
                with open(file_path, encoding="utf-8") as fileIn:
                    dp = fileIn.read().strip()
                    if dp:  # Ensure the document is not empty
                        dataPointsList.append(dp)
                        if augment:
                            # apply data augmentation
                            augmented_codes = augment_code(dp)
                            dataPointsList.extend(augmented_codes)
                    else:
                        print(f"Empty file: {file_path}")
    
    if len(dataPointsList) == 0:
        print(f"No valid documents found in directory: {path}")
    
    return dataPointsList


In [16]:
import os
import time
import zipfile
import numpy as np
import shutil


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import make_scorer, precision_score, recall_score, accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV

###############################################################################
# Utility functions

def augment_code(code):
    """코드에 변형을 주어 데이터 증강을 수행합니다."""
    augmented_codes = []

    # 예시 1: 주석 추가
    comments = ["# TODO: Refactor this", "# Added for augmentation", "# Generated comment"]
    for comment in comments:
        augmented_code = f"{comment}\n{code}"
        augmented_codes.append(augmented_code)

    # 예시 2: 공백 줄 추가
    code_lines = code.split('\n')
    if len(code_lines) > 1:
        idx = random.randint(0, len(code_lines) - 1)
        code_lines.insert(idx, '')  # 빈 줄 추가
        augmented_code = '\n'.join(code_lines)
        augmented_codes.append(augmented_code)

    # 예시 3: 변수명 변경 (간단한 예)
    if 'temp' in code:
        augmented_code = code.replace('temp', 'tmp')
        augmented_codes.append(augmented_code)

    return augmented_codes

def extract_zip(zip_file, extract_to):
    """지정된 디렉토리에 zip 파일을 해제합니다."""
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

def getDataPoints(path, augment=False):
    """주어진 디렉토리 내의 모든 .py 파일의 내용을 수집합니다."""
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        for dataPointName in files:
            if dataPointName.endswith(".py"):
                file_path = os.path.join(root, dataPointName)
                with open(file_path, encoding="utf-8") as fileIn:
                    dp = fileIn.read().strip()
                    if dp:  # Ensure the document is not empty
                        dataPointsList.append(dp)
                        if augment:
                            # 데이터 증강 적용
                            augmented_codes = augment_code(dp)
                            dataPointsList.extend(augmented_codes)
                    else:
                        print(f"Empty file: {file_path}")

    if len(dataPointsList) == 0:
        print(f"No valid documents found in directory: {path}")

    return dataPointsList

def flastVectorization(dataPoints):
    """데이터 포인트를 벡터화합니다."""
    countVec = CountVectorizer(stop_words=None)
    Z = countVec.fit_transform(dataPoints)
    return Z

###############################################################################
# KNN 모델과 GridSearchCV를 사용한 분석 함수

def flastKNNWithGridSearchCV(outDir, flakyZip, nonFlakyZip, extractDir, n_splits, combination_label):
    v0 = time.perf_counter()

    # 압축 파일 해제
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)
    if os.path.exists(flakyDir):
        shutil.rmtree(flakyDir)
    if os.path.exists(nonFlakyDir):
        shutil.rmtree(nonFlakyDir)
    

    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    # 데이터 로딩 및 증강 적용
    dataPointsFlaky = getDataPoints(flakyDir, augment=True)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir, augment=False)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of flaky documents: {len(dataPointsFlaky)}")
    print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
    print(f"Total number of documents: {len(dataPoints)}")

    if len(dataPoints) == 0:
        raise ValueError("No documents available for vectorization. Please check the input directories.")

    # 벡터화
    Z = flastVectorization(dataPoints)
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    vecTime = time.perf_counter() - v0

    # KNN 모델 정의
    knn = KNeighborsClassifier()

    # 하이퍼파라미터 그리드 정의
    param_grid = {
        'n_neighbors': [3, 5, 7, 9, 11, 15, 20],
        'metric': ['cosine', 'euclidean'],
        'weights': ['uniform', 'distance'],
    }

    # 스코어링 메트릭 정의
    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),
        'recall': make_scorer(recall_score),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score)
    }

    # 교차 검증 설정
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # GridSearchCV 실행
    grid_search = GridSearchCV(knn, param_grid, cv=skf, scoring=scoring, refit='f1', verbose=1, return_train_score=True)

    # 모델 학습
    grid_search.fit(Z, dataLabelsList)

    # 최적 파라미터 및 점수 출력
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print(f"Best Parameters: {best_params}")
    print(f"Best F1 Score: {best_score}")

    # 결과 저장
    outFile = f"{combination_label}-params-knn-{n_splits}-folds.csv"
    with open(os.path.join(outDir, outFile), "w") as fo:
        fo.write("n_neighbors,metric,weights,accuracy,precision,recall,f1,preparationTime\n")
        for idx, param in enumerate(grid_search.cv_results_['params']):
            accuracy = grid_search.cv_results_['mean_test_accuracy'][idx]
            precision = grid_search.cv_results_['mean_test_precision'][idx]
            recall = grid_search.cv_results_['mean_test_recall'][idx]
            f1 = grid_search.cv_results_['mean_test_f1'][idx]
            preparationTime = vecTime / len(dataPoints)
            fo.write(f"{param['n_neighbors']},{param['metric']},{param['weights']},{accuracy},{precision},{recall},{f1},{preparationTime}\n")

    print(f"KNN analysis completed for {n_splits}-folds. Results saved to: {outFile}")
    return best_params, best_score

if __name__ == "__main__":
    flakyZip = "compressedDataset/cleaned_flaky_files.zip"
    nonFlakyZip = "compressedDataset/reduced_nonflaky_files.zip"
    nonFlakyUnbalance = "compressedDataset/all_nonflaky_files.zip"
    
    extractDir = "extracted"
    outDir = "results/resut_FlastKNN_augment"
    os.makedirs(outDir, exist_ok=True)
    os.makedirs(extractDir, exist_ok=True)

    # Perform KNN analysis for 5 folds and 3 folds
    print("Starting 5-fold analysis...")
    best_params_5folds, best_score_5folds = flastKNNWithGridSearchCV(outDir, flakyZip, nonFlakyZip, extractDir,5, "")
    
   
    
    print ("Starting 5-fold analysis with unbalanced data...")
    best_params_5folds_imbalance,best_score_5folds_imbalance = flastKNNWithGridSearchCV(outDir, flakyZip,nonFlakyUnbalance, extractDir, 5,"imbalance")
  
    
    print("Starting 3-fold analysis...")
    best_params_3folds, best_score_3folds = flastKNNWithGridSearchCV(outDir, flakyZip, nonFlakyZip, extractDir, 3,"")

    
    print ("Starting 3-fold analysis with unbalanced data...")
    best_params_3folds_imbalance,best_score_3folds_imbalance = flastKNNWithGridSearchCV(outDir, flakyZip,nonFlakyUnbalance, extractDir, 3,"imbalance")
  
    print("Best results for 5-fold:")
    print(f"Best Parameters: {best_params_5folds}")
    print(f"Best f1 Score: {best_score_5folds}")

    
    print("Best results for imbalanced 5-fold:")
    print(f"Best Parameters: {best_params_5folds_imbalance}")
    print(f"Best f1 Score: {best_score_5folds_imbalance}")
    
    print("Best results for imbalanced 3-fold:")
    print(f"Best Parameters: {best_params_3folds}")
    print(f"Best f1 Score: {best_score_3folds}")
    
    print("Best results for 3-fold:")
    print(f"Best Parameters: {best_params_3folds_imbalance}")
    print(f"Best f1 Score: {best_score_3folds_imbalance}")

Starting 5-fold analysis...
Number of flaky documents: 245
Number of non-flaky documents: 47
Total number of documents: 292
Fitting 5 folds for each of 28 candidates, totalling 140 fits
Best Parameters: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
Best F1 Score: 0.9744077208406164
KNN analysis completed for 5-folds. Results saved to: -params-knn-5-folds.csv
Starting 5-fold analysis with unbalanced data...
Number of flaky documents: 245
Number of non-flaky documents: 254
Total number of documents: 499
Fitting 5 folds for each of 28 candidates, totalling 140 fits
Best Parameters: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
Best F1 Score: 0.9684705882352942
KNN analysis completed for 5-folds. Results saved to: imbalance-params-knn-5-folds.csv
Starting 3-fold analysis...
Number of flaky documents: 245
Number of non-flaky documents: 47
Total number of documents: 292
Fitting 3 folds for each of 28 candidates, totalling 84 fits
Best Parameters: {'metric

In [4]:
import os
import time
import zipfile
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    make_scorer, precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef
)
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_predict
from sklearn.random_projection import SparseRandomProjection

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Extracts a zip file to the specified directory."""
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

def getDataPoints(path):
    """Collects content of all .py files within the given directory."""
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        for dataPointName in files:
            if dataPointName.endswith(".py"):  # Only consider Python files
                file_path = os.path.join(root, dataPointName)
                with open(file_path, encoding="utf-8") as fileIn:
                    dp = fileIn.read().strip()
                    if dp:  # Ensure the document is not empty
                        dataPointsList.append(dp)
                    else:
                        print(f"Empty file: {file_path}")
    
    if len(dataPointsList) == 0:
        print(f"No valid documents found in directory: {path}")
    
    return dataPointsList

def flastVectorization(dataPoints, dim=0, eps=0.3):
    """Performs vectorization using CountVectorizer with optional dimensionality reduction."""
    countVec = CountVectorizer(stop_words=None)  # No stop word removal
    Z_full = countVec.fit_transform(dataPoints)
    if eps == 0:
        Z = Z_full
    else:
        if dim <= 0:
            from sklearn.random_projection import johnson_lindenstrauss_min_dim
            dim = johnson_lindenstrauss_min_dim(Z_full.shape[0], eps=eps)
        srp = SparseRandomProjection(n_components=dim)
        Z = srp.fit_transform(Z_full)
    return Z

###############################################################################
# Decision Tree with GridSearchCV

def flastThreshold(outDir, flakyZip, nonFlakyZip, extractDir, n_splits, dim, eps, combination_label):
    v0 = time.perf_counter()

    # Extract the zip files
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)
    
    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of flaky documents: {len(dataPointsFlaky)}")
    print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
    print(f"Total number of documents: {len(dataPoints)}")
    
    if len(dataPoints) == 0:
        raise ValueError("No documents available for vectorization. Please check the input directories.")

    # Vectorization
    Z = flastVectorization(dataPoints, dim=dim, eps=eps)
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    vecTime = time.perf_counter() - v0

    # Define Decision Tree model
    dt_model = DecisionTreeClassifier()

    # Define parameter grid for hyperparameter tuning
    param_grid = {
        'criterion': ['gini', 'entropy'],  # Function to measure the quality of a split
        'max_depth': [None, 10, 30, 50, 100, 300, 500],  # Maximum depth of each tree
        'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
        'min_samples_leaf': [1, 2, 5, 10],  # Minimum number of samples required to be at a leaf node
        'max_features': [None, 'sqrt', 'log2'],  # Number of features to consider when looking for the best split
    }

    # Custom scoring function (F1 score) for GridSearchCV
    scoring = make_scorer(f1_score, zero_division=1)

    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Perform GridSearchCV to find the best hyperparameters
    grid_search = GridSearchCV(dt_model, param_grid, cv=skf, scoring=scoring, refit=True, verbose=1)

    # Fit the GridSearchCV on training data
    grid_search.fit(Z, dataLabelsList)

    # Get the best estimator
    best_dt_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    print(f"Best Parameters: {best_params}")

    # Finding threshold
    thresholds = np.linspace(0.1, 0.9, 9)
    best_threshold = 0.5  
    best_f1 = 0.0

    for threshold in thresholds:
        y_pred_proba = cross_val_predict(best_dt_model, Z, dataLabelsList, cv=skf, method='predict_proba')
        y_pred = (y_pred_proba[:, 1] >= threshold).astype(int)
        f1 = f1_score(dataLabelsList, y_pred, zero_division=1)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold

    print(f"Best Threshold: {best_threshold}")
    print(f"Best F1 Score with Threshold: {best_f1}")

    # Calculate other metrics using the best threshold
    y_pred_proba = cross_val_predict(best_dt_model, Z, dataLabelsList, cv=skf, method='predict_proba')
    y_pred = (y_pred_proba[:, 1] >= best_threshold).astype(int)
    
    
    
    accuracy = accuracy_score(dataLabelsList, y_pred)
    precision = precision_score(dataLabelsList, y_pred, zero_division=1)
    recall = recall_score(dataLabelsList, y_pred, zero_division=1)
    mcc = matthews_corrcoef(dataLabelsList, y_pred)
    preparationTime = vecTime / len(dataPoints)

    # Save the results
    outFile = f"{combination_label}-params-dt-{n_splits}-folds.csv"
    with open(os.path.join(outDir, outFile), "w") as fo:
        # Write the header
        fo.write("criterion,max_depth,min_samples_split,min_samples_leaf,max_features,threshold,accuracy,precision,recall,f1,mcc,preparationTime\n")
        # Write the data row
        fo.write(f"{best_params['criterion']},{best_params['max_depth']},{best_params['min_samples_split']},"
                 f"{best_params['min_samples_leaf']},{best_params['max_features']},{best_threshold},"
                 f"{accuracy},{precision},{recall},{best_f1},{mcc},{preparationTime}\n")

    print(f"Decision Tree analysis completed for {n_splits}-folds. Results saved to: {outFile}")
    return best_params, best_f1

if __name__ == "__main__":
    # Parameters setup for the first combination
    flakyZip = "compressedDataset/flaky_files.zip"
    nonFlakyZip = "compressedDataset/reduced_nonflaky_files.zip"
    largerNonFlakyZip = "compressedDataset/all_nonflaky_files.zip"

    # Create separate result directories for equal and larger non-flaky combinations
    outDirEqual = "results/equal_flaky_nonflaky/"
    outDirLarger = "results/larger_nonflaky/"
    os.makedirs(outDirEqual, exist_ok=True)
    os.makedirs(outDirLarger, exist_ok=True)

    # Create separate extract directories for each combination to avoid file confusion
    extractDirEqual = "extracted/equal_flaky_nonflaky/"
    extractDirLarger = "extracted/larger_nonflaky/"
    os.makedirs(extractDirEqual, exist_ok=True)
    os.makedirs(extractDirLarger, exist_ok=True)

    # Perform Decision Tree analysis for the first combination (flaky vs smaller non-flaky)
    print("Starting Decision Tree analysis for flaky vs smaller non-flaky files (47 each)...")
    best_params_5folds_1, best_score_5folds_1 = flastDTWithGridSearchCV(
        outDirEqual, flakyZip, nonFlakyZip, extractDirEqual, 5, dim=100, eps=0.3, combination_label="equal")
    best_params_3folds_1, best_score_3folds_1 = flastDTWithGridSearchCV(
        outDirEqual, flakyZip, nonFlakyZip, extractDirEqual, 3, dim=100, eps=0.3, combination_label="equal")

    print("Best results for 5-fold on equal combination:")
    print(f"Best Parameters: {best_params_5folds_1}")
    print(f"Best F1 Score: {best_score_5folds_1}")

    print("Best results for 3-fold on equal combination:")
    print(f"Best Parameters: {best_params_3folds_1}")
    print(f"Best F1 Score: {best_score_3folds_1}")

    # Perform Decision Tree analysis for the second combination (flaky vs larger non-flaky)
    print("Starting Decision Tree analysis for flaky vs larger non-flaky files...")
    best_params_5folds_2, best_score_5folds_2 = flastDTWithGridSearchCV(
        outDirLarger, flakyZip, largerNonFlakyZip, extractDirLarger, 5, dim=100, eps=0.3, combination_label="larger")
    best_params_3folds_2, best_score_3folds_2 = flastDTWithGridSearchCV(
        outDirLarger, flakyZip, largerNonFlakyZip, extractDirLarger, 3, dim=100, eps=0.3, combination_label="larger")

    print("Best results for 5-fold on larger non-flaky combination:")
    print(f"Best Parameters: {best_params_5folds_2}")
    print(f"Best F1 Score: {best_score_5folds_2}")

    print("Best results for 3-fold on larger non-flaky combination:")
    print(f"Best Parameters: {best_params_3folds_2}")
    print(f"Best F1 Score: {best_score_3folds_2}")



Starting Decision Tree analysis for flaky vs smaller non-flaky files (47 each)...
Number of flaky documents: 47
Number of non-flaky documents: 47
Total number of documents: 94
Fitting 5 folds for each of 504 candidates, totalling 2520 fits
Best Parameters: {'criterion': 'entropy', 'max_depth': 100, 'max_features': 'log2', 'min_samples_leaf': 10, 'min_samples_split': 2}
Best Threshold: 0.2
Best F1 Score with Threshold: 0.6771653543307087
Decision Tree analysis completed for 5-folds. Results saved to: equal-params-dt-5-folds.csv
Number of flaky documents: 47
Number of non-flaky documents: 47
Total number of documents: 94
Fitting 3 folds for each of 504 candidates, totalling 1512 fits
Best Parameters: {'criterion': 'entropy', 'max_depth': 100, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10}
Best Threshold: 0.1
Best F1 Score with Threshold: 0.6666666666666666
Decision Tree analysis completed for 3-folds. Results saved to: equal-params-dt-3-folds.csv
Best results for

In [8]:
import os
import time
import zipfile
import numpy as np
import xgboost as xgb

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import make_scorer, precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_predict
from sklearn.random_projection import SparseRandomProjection

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Extracts a zip file to the specified directory."""
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

def getDataPoints(path):
    """Collects content of all .py files within the given directory."""
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        for dataPointName in files:
            if dataPointName.endswith(".py"):  # Only consider Python files
                file_path = os.path.join(root, dataPointName)
                with open(file_path, encoding="utf-8") as fileIn:
                    dp = fileIn.read().strip()
                    if dp:  # Ensure the document is not empty
                        dataPointsList.append(dp)
                    else:
                        print(f"Empty file: {file_path}")
    
    if len(dataPointsList) == 0:
        print(f"No valid documents found in directory: {path}")
    
    return dataPointsList

def flastVectorization(dataPoints, dim=0, eps=0.3):
    """Performs vectorization using CountVectorizer with optional dimensionality reduction."""
    countVec = CountVectorizer(stop_words=None)  # No stop word removal
    Z_full = countVec.fit_transform(dataPoints)
    if eps == 0:
        Z = Z_full
    else:
        if dim <= 0:
            from sklearn.random_projection import johnson_lindenstrauss_min_dim
            dim = johnson_lindenstrauss_min_dim(Z_full.shape[0], eps=eps)
        srp = SparseRandomProjection(n_components=dim)
        Z = srp.fit_transform(Z_full)
    return Z

###############################################################################
# XGBoost with GridSearchCV and Threshold Adjustment
def flastXGBWithGridSearchCV(outDir, flakyZip, nonFlakyZip, extractDir, n_splits, dim, eps, combination_label):
    v0 = time.perf_counter()

    # Extract the zip files
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    os.makedirs(flakyDir, exist_ok=True)
    os.makedirs(nonFlakyDir, exist_ok=True)
    
    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    dataPointsFlaky = getDataPoints(flakyDir)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of flaky documents: {len(dataPointsFlaky)}")
    print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
    print(f"Total number of documents: {len(dataPoints)}")
    
    if len(dataPoints) == 0:
        raise ValueError("No documents available for vectorization. Please check the input directories.")

    # Vectorization
    Z = flastVectorization(dataPoints, dim=dim, eps=eps)
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    vecTime = time.perf_counter() - v0

    # Define XGBoost model without 'use_label_encoder'
    xgb_model = xgb.XGBClassifier(eval_metric="logloss")

    # Define parameter grid for hyperparameter tuning
    param_grid = {
        'learning_rate': [0.01, 0.1, 0.3, 0.5],  # Learning rate
        'max_depth': [3, 5, 7, 10],  # Tree depth
        'n_estimators': [50, 100, 200, 300],  # Number of boosting rounds
    }

    # Custom scoring function (F1 score) for GridSearchCV
    scoring = make_scorer(f1_score, zero_division=1)

    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Perform GridSearchCV to find the best hyperparameters
    grid_search = GridSearchCV(xgb_model, param_grid, cv=skf, scoring=scoring, refit=True, verbose=1)

    # Fit the GridSearchCV on training data
    grid_search.fit(Z, dataLabelsList)

    # Get the best estimator
    best_xgb_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    print(f"Best Parameters: {best_params}")

    # Find the optimal threshold using cross-validation
    thresholds = np.linspace(0.1, 0.9, 9)
    best_threshold = 0.5  # Default threshold
    best_f1 = 0.0

    # Get cross-validated predicted probabilities
    y_pred_proba = cross_val_predict(best_xgb_model, Z, dataLabelsList, cv=skf, method='predict_proba')

    for threshold in thresholds:
        y_pred = (y_pred_proba[:, 1] >= threshold).astype(int)
        f1 = f1_score(dataLabelsList, y_pred, zero_division=1)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold

    print(f"Best Threshold: {best_threshold}")
    print(f"Best F1 Score with Threshold: {best_f1}")

    # Calculate other metrics using the best threshold
    y_pred = (y_pred_proba[:, 1] >= best_threshold).astype(int)
    accuracy = accuracy_score(dataLabelsList, y_pred)
    precision = precision_score(dataLabelsList, y_pred, zero_division=1)
    recall = recall_score(dataLabelsList, y_pred, zero_division=1)
    mcc = matthews_corrcoef(dataLabelsList, y_pred)
    preparationTime = vecTime / len(dataPoints)

    # Save the results
    outFile = f"{combination_label}-params-xgb-{n_splits}-folds.csv"
    with open(os.path.join(outDir, outFile), "w") as fo:
        # Write the header
        fo.write("learning_rate,max_depth,n_estimators,threshold,accuracy,precision,recall,f1,mcc,preparationTime\n")
        # Write the data row
        fo.write(f"{best_params['learning_rate']},{best_params['max_depth']},{best_params['n_estimators']},"
                 f"{best_threshold},{accuracy},{precision},{recall},{best_f1},{mcc},{preparationTime}\n")
    print(f"XGBoost analysis completed for {n_splits}-folds. Results saved to: {outFile}")
    return best_params, best_f1

if __name__ == "__main__":
    # Parameters setup for the first combination
    flakyZip = "compressedDataset/flaky_files.zip"
    nonFlakyZip = "compressedDataset/reduced_nonflaky_files.zip"
    largerNonFlakyZip = "compressedDataset/all_nonflaky_files.zip"

    # Create separate result directories for equal and larger non-flaky combinations
    outDirEqual = "results/equal_flaky_nonflaky/"
    outDirLarger = "results/larger_nonflaky/"
    os.makedirs(outDirEqual, exist_ok=True)
    os.makedirs(outDirLarger, exist_ok=True)

    # Create separate extract directories for each combination to avoid file confusion
    extractDirEqual = "extracted/equal_flaky_nonflaky/"
    extractDirLarger = "extracted/larger_nonflaky/"
    os.makedirs(extractDirEqual, exist_ok=True)
    os.makedirs(extractDirLarger, exist_ok=True)

    # Perform XGBoost analysis for the first combination (flaky vs smaller non-flaky)
    print("Starting XGBoost analysis for flaky vs smaller non-flaky files (47 each)...")
    best_params_5folds_1, best_score_5folds_1 = flastXGBWithGridSearchCV(
        outDirEqual, flakyZip, nonFlakyZip, extractDirEqual, 5, dim=100, eps=0.3, combination_label="equal")
    best_params_3folds_1, best_score_3folds_1 = flastXGBWithGridSearchCV(
        outDirEqual, flakyZip, nonFlakyZip, extractDirEqual, 3, dim=100, eps=0.3, combination_label="equal")

    print("Best results for 5-fold on equal combination:")
    print(f"Best Parameters: {best_params_5folds_1}")
    print(f"Best F1 Score: {best_score_5folds_1}")

    print("Best results for 3-fold on equal combination:")
    print(f"Best Parameters: {best_params_3folds_1}")
    print(f"Best F1 Score: {best_score_3folds_1}")

    # Perform XGBoost analysis for the second combination (flaky vs larger non-flaky)
    print("Starting XGBoost analysis for flaky vs larger non-flaky files...")
    best_params_5folds_2, best_score_5folds_2 = flastXGBWithGridSearchCV(
        outDirLarger, flakyZip, largerNonFlakyZip, extractDirLarger, 5, dim=100, eps=0.3, combination_label="larger")
    best_params_3folds_2, best_score_3folds_2 = flastXGBWithGridSearchCV(
        outDirLarger, flakyZip, largerNonFlakyZip, extractDirLarger, 3, dim=100, eps=0.3, combination_label="larger")

    print("Best results for 5-fold on larger non-flaky combination:")
    print(f"Best Parameters: {best_params_5folds_2}")
    print(f"Best F1 Score: {best_score_5folds_2}")

    print("Best results for 3-fold on larger non-flaky combination:")
    print(f"Best Parameters: {best_params_3folds_2}")
    print(f"Best F1 Score: {best_score_3folds_2}")



Starting XGBoost analysis for flaky vs smaller non-flaky files (47 each)...
Number of flaky documents: 47
Number of non-flaky documents: 47
Total number of documents: 94
Fitting 5 folds for each of 64 candidates, totalling 320 fits
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
Best Threshold: 0.5
Best F1 Score with Threshold: 0.7741935483870968
XGBoost analysis completed for 5-folds. Results saved to: equal-params-xgb-5-folds.csv
Number of flaky documents: 47
Number of non-flaky documents: 47
Total number of documents: 94
Fitting 3 folds for each of 64 candidates, totalling 192 fits
Best Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
Best Threshold: 0.30000000000000004
Best F1 Score with Threshold: 0.7524752475247525
XGBoost analysis completed for 3-folds. Results saved to: equal-params-xgb-3-folds.csv
Best results for 5-fold on equal combination:
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
Best F1 S