In [8]:
import re

def remove_comments(code):
    """Strip ``#`` comments from Python source text, leaving string literals intact.

    The text is scanned left to right with one regular expression whose
    alternatives recognise string literals and comments. A ``#`` that occurs
    inside a string literal is consumed as part of that literal and is
    therefore never treated as a comment.

    Triple-quoted literals are tried before one-line literals. Otherwise
    ``'''...'''`` would be tokenised as an empty string, a one-line string and
    another empty string, and a lone quote inside the triple-quoted text would
    desynchronise the scan so that a following ``#`` was wrongly removed.

    Args:
        code: Python source code as a single string.

    Returns:
        The source with each comment removed from its ``#`` up to (not
        including) the end of the line. String literals, including
        docstrings, are returned unchanged.
    """
    pattern = r"""
          (?P<string>
              '''(?:\\[\s\S]|[^\\])*?'''           # triple single-quoted literal (may span lines)
            | \"\"\"(?:\\[\s\S]|[^\\])*?\"\"\"     # triple double-quoted literal (may span lines)
            | '(?:\\[\s\S]|[^\\'\n])*'             # one-line single-quoted literal
            | "(?:\\[\s\S]|[^\\"\n])*"             # one-line double-quoted literal
          )
        | (?P<comment>\#.*?$)                      # comment running to the end of the line
        """
    regex = re.compile(pattern, re.VERBOSE | re.MULTILINE)

    def replacer(match):
        # Comments are dropped; string literals are copied through verbatim.
        if match.group('comment') is not None:
            return ''
        return match.group(0)

    return regex.sub(replacer, code)


In [3]:
import random

def augment_code(code):
    """Build simple textual variants of ``code`` for data augmentation.

    Args:
        code: Source text of one document.

    Returns:
        A list of variants: one copy of ``code`` per marker comment with that
        comment prepended on its own line and, when ``code`` has more than one
        line, one extra copy with an empty line inserted before a randomly
        chosen existing line.
    """
    # Variants with a marker comment placed in front of the code.
    headers = ["# TODO: Refactor this", "# This is a placeholder", "# Added for augmentation"]
    variants = [f"{header}\n{code}" for header in headers]

    lines = code.split('\n')
    if len(lines) <= 1:
        # A single-line document gets no blank-line variant.
        return variants

    # Variant with one empty line inserted at a random position.
    slot = random.randint(0, len(lines) - 1)
    lines.insert(slot, '')
    variants.append('\n'.join(lines))
    return variants


In [2]:
def getDataPoints(path, augment=False):
    """Collect the stripped text of every non-empty ``.py`` file under ``path``.

    The directory tree is walked in sorted order (directories and file names),
    so the returned list is ordered the same way on every machine instead of
    following the filesystem's arbitrary listing order.

    Args:
        path: Root directory to search recursively.
        augment: When true, each collected document is immediately followed
            by the variants returned by ``augment_code`` for it.

    Returns:
        A list of document strings. The list is empty when ``path`` does not
        exist or contains no non-empty ``.py`` file; a message is printed in
        both cases.
    """
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        # Sorting in place also fixes the order in which os.walk descends.
        dirs.sort()
        for dataPointName in sorted(files):
            if not dataPointName.endswith(".py"):
                continue
            file_path = os.path.join(root, dataPointName)
            with open(file_path, encoding="utf-8") as fileIn:
                dp = fileIn.read().strip()
            if not dp:
                # Whitespace-only files carry no tokens and are skipped.
                print(f"Empty file: {file_path}")
                continue
            dataPointsList.append(dp)
            if augment:
                dataPointsList.extend(augment_code(dp))

    if len(dataPointsList) == 0:
        print(f"No valid documents found in directory: {path}")

    return dataPointsList


In [16]:
import os
import random
import re
import shutil
import time
import zipfile

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import make_scorer, precision_score, recall_score, accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

###############################################################################
# Utility functions

def augment_code(code):
    """Build simple textual variants of ``code`` for data augmentation.

    Three kinds of variant are produced:

    1. One copy per marker comment, with the comment prepended on its own line.
    2. If ``code`` has more than one line, one copy with an empty line
       inserted before a randomly chosen existing line.
    3. If the identifier ``temp`` occurs as a whole word, one copy with every
       such occurrence renamed to ``tmp``. Only whole words are renamed so
       that names merely containing the letters (``tempfile``, ``template``,
       ``attempt``) are left alone.

    Args:
        code: Source text of one document.

    Returns:
        A list with the variants in the order described above.
    """
    # Variant 1: prepend a marker comment.
    comments = ["# TODO: Refactor this", "# Added for augmentation", "# Generated comment"]
    augmented_codes = [f"{comment}\n{code}" for comment in comments]

    # Variant 2: insert a blank line at a random position.
    code_lines = code.split('\n')
    if len(code_lines) > 1:
        idx = random.randint(0, len(code_lines) - 1)
        code_lines.insert(idx, '')
        augmented_codes.append('\n'.join(code_lines))

    # Variant 3: rename the identifier `temp` (whole word only).
    renamed, replacements = re.subn(r'\btemp\b', 'tmp', code)
    if replacements:
        augmented_codes.append(renamed)

    return augmented_codes

def extract_zip(zip_file, extract_to):
    """Unpack every member of the archive ``zip_file`` into the directory ``extract_to``."""
    with zipfile.ZipFile(zip_file, mode='r') as archive:
        archive.extractall(path=extract_to)

def getDataPoints(path, augment=False):
    """Collect the contents of all ``.py`` files found under a directory.

    Files are visited in sorted order, directory by directory, so the result
    does not depend on the order in which the filesystem happens to list
    entries. Each file is read as UTF-8 and stripped of leading and trailing
    whitespace; files that are empty after stripping are reported and skipped.

    Args:
        path: Root directory to search recursively.
        augment: When true, the variants returned by ``augment_code`` are
            appended right after each original document.

    Returns:
        A list of document strings (originals and, optionally, their
        variants). An empty list is returned, with a printed message, when
        ``path`` is missing or holds no usable ``.py`` file.
    """
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        # In top-down mode os.walk honours in-place edits of `dirs`, so this
        # makes the descent order deterministic as well.
        dirs.sort()
        python_files = sorted(name for name in files if name.endswith(".py"))
        for dataPointName in python_files:
            file_path = os.path.join(root, dataPointName)
            with open(file_path, encoding="utf-8") as fileIn:
                dp = fileIn.read().strip()
            if dp:
                dataPointsList.append(dp)
                if augment:
                    # Variants follow the document they were derived from.
                    dataPointsList.extend(augment_code(dp))
            else:
                print(f"Empty file: {file_path}")

    if not dataPointsList:
        print(f"No valid documents found in directory: {path}")

    return dataPointsList

def flastVectorization(dataPoints):
    """Turn the documents into a token-count matrix.

    A new ``CountVectorizer`` (no stop-word filtering) is fitted on
    ``dataPoints`` and the transformed matrix is returned; the fitted
    vectorizer itself is discarded.
    """
    vectorizer = CountVectorizer(stop_words=None)
    return vectorizer.fit_transform(dataPoints)

###############################################################################
# KNN 모델과 GridSearchCV를 사용한 분석 함수

def flastKNNWithGridSearchCV(outDir, flakyZip, nonFlakyZip, extractDir, n_splits, combination_label):
    """Grid-search a KNN classifier on flaky vs. non-flaky documents.

    Steps: reset and fill the ``flaky`` / ``nonFlaky`` sub-directories of
    ``extractDir`` from the two archives, load the ``.py`` documents (the
    flaky ones with augmentation), vectorize them, run ``GridSearchCV`` with
    stratified, shuffled k-fold cross-validation, and write one CSV row of
    mean test metrics per parameter combination.

    NOTE(review): flaky documents are augmented before the cross-validation
    split and non-flaky documents are not. Variants of one file can therefore
    land in both the training and the validation part of a fold, and the
    injected marker comments occur only in the positive class. The reported
    scores may be optimistic - confirm that this is intended.

    Args:
        outDir: Existing directory that receives the result CSV.
        flakyZip: Path to the archive with the flaky documents (label 1).
        nonFlakyZip: Path to the archive with the non-flaky documents (label 0).
        extractDir: Working directory for extraction; its ``flaky`` and
            ``nonFlaky`` sub-directories are deleted and recreated.
        n_splits: Number of cross-validation folds.
        combination_label: Prefix of the CSV file name
            (``{combination_label}-params-knn-{n_splits}-folds.csv``).

    Returns:
        A ``(best_params, best_score)`` tuple taken from the fitted grid
        search, which is refitted on F1.

    Raises:
        ValueError: If no document could be loaded from either directory.
    """
    v0 = time.perf_counter()

    # Start from clean extraction directories: remove what a previous run left
    # behind first, then create the empty targets.
    flakyDir = os.path.join(extractDir, 'flaky')
    nonFlakyDir = os.path.join(extractDir, 'nonFlaky')
    for targetDir in (flakyDir, nonFlakyDir):
        if os.path.exists(targetDir):
            shutil.rmtree(targetDir)
        os.makedirs(targetDir)

    extract_zip(flakyZip, flakyDir)
    extract_zip(nonFlakyZip, nonFlakyDir)

    # Load the documents; only the flaky class is augmented.
    dataPointsFlaky = getDataPoints(flakyDir, augment=True)
    dataPointsNonFlaky = getDataPoints(nonFlakyDir, augment=False)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky

    print(f"Number of flaky documents: {len(dataPointsFlaky)}")
    print(f"Number of non-flaky documents: {len(dataPointsNonFlaky)}")
    print(f"Total number of documents: {len(dataPoints)}")

    if len(dataPoints) == 0:
        raise ValueError("No documents available for vectorization. Please check the input directories.")

    # Vectorize; labels follow the concatenation order (flaky first).
    Z = flastVectorization(dataPoints)
    dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))
    # Covers extraction, loading, augmentation and vectorization.
    vecTime = time.perf_counter() - v0

    knn = KNeighborsClassifier()

    # Hyper-parameter grid: 7 * 2 * 2 = 28 candidates.
    param_grid = {
        'n_neighbors': [3, 5, 7, 9, 11, 15, 20],
        'metric': ['cosine', 'euclidean'],
        'weights': ['uniform', 'distance'],
    }

    # Metrics recorded for every candidate.
    scoring = {
        'precision': make_scorer(precision_score, zero_division=1),
        'recall': make_scorer(recall_score),
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score)
    }

    # Fixed seed so the fold assignment is repeatable for a given document order.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    grid_search = GridSearchCV(knn, param_grid, cv=skf, scoring=scoring, refit='f1', verbose=1, return_train_score=True)
    grid_search.fit(Z, dataLabelsList)

    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print(f"Best Parameters: {best_params}")
    print(f"Best F1 Score: {best_score}")

    # Write one row of mean test metrics per parameter combination.
    outFile = f"{combination_label}-params-knn-{n_splits}-folds.csv"
    cv_results = grid_search.cv_results_
    # Average preparation time per document; identical for every row.
    preparationTime = vecTime / len(dataPoints)
    with open(os.path.join(outDir, outFile), "w") as fo:
        fo.write("n_neighbors,metric,weights,accuracy,precision,recall,f1,preparationTime\n")
        for idx, param in enumerate(cv_results['params']):
            accuracy = cv_results['mean_test_accuracy'][idx]
            precision = cv_results['mean_test_precision'][idx]
            recall = cv_results['mean_test_recall'][idx]
            f1 = cv_results['mean_test_f1'][idx]
            fo.write(f"{param['n_neighbors']},{param['metric']},{param['weights']},{accuracy},{precision},{recall},{f1},{preparationTime}\n")

    print(f"KNN analysis completed for {n_splits}-folds. Results saved to: {outFile}")
    return best_params, best_score

if __name__ == "__main__":
    # Input archives: one flaky set and two alternative non-flaky sets.
    flakyZip = "compressedDataset/cleaned_flaky_files.zip"
    nonFlakyZip = "compressedDataset/reduced_nonflaky_files.zip"
    nonFlakyUnbalance = "compressedDataset/all_nonflaky_files.zip"

    extractDir = "extracted"
    # NOTE(review): "resut" looks like a typo for "result"; the path is kept
    # unchanged so previously written results stay where they are.
    outDir = "results/resut_FlastKNN_augment"
    os.makedirs(outDir, exist_ok=True)
    os.makedirs(extractDir, exist_ok=True)

    # Run the grid search with 5 and with 3 folds, each on both non-flaky sets.
    print("Starting 5-fold analysis...")
    best_params_5folds, best_score_5folds = flastKNNWithGridSearchCV(outDir, flakyZip, nonFlakyZip, extractDir, 5, "")

    print ("Starting 5-fold analysis with unbalanced data...")
    best_params_5folds_imbalance, best_score_5folds_imbalance = flastKNNWithGridSearchCV(outDir, flakyZip, nonFlakyUnbalance, extractDir, 5, "imbalance")

    print("Starting 3-fold analysis...")
    best_params_3folds, best_score_3folds = flastKNNWithGridSearchCV(outDir, flakyZip, nonFlakyZip, extractDir, 3, "")

    print ("Starting 3-fold analysis with unbalanced data...")
    best_params_3folds_imbalance, best_score_3folds_imbalance = flastKNNWithGridSearchCV(outDir, flakyZip, nonFlakyUnbalance, extractDir, 3, "imbalance")

    # Summary: each heading is paired with the results of the run it names.
    print("Best results for 5-fold:")
    print(f"Best Parameters: {best_params_5folds}")
    print(f"Best f1 Score: {best_score_5folds}")

    print("Best results for imbalanced 5-fold:")
    print(f"Best Parameters: {best_params_5folds_imbalance}")
    print(f"Best f1 Score: {best_score_5folds_imbalance}")

    print("Best results for 3-fold:")
    print(f"Best Parameters: {best_params_3folds}")
    print(f"Best f1 Score: {best_score_3folds}")

    print("Best results for imbalanced 3-fold:")
    print(f"Best Parameters: {best_params_3folds_imbalance}")
    print(f"Best f1 Score: {best_score_3folds_imbalance}")

Starting 5-fold analysis...
Number of flaky documents: 245
Number of non-flaky documents: 47
Total number of documents: 292
Fitting 5 folds for each of 28 candidates, totalling 140 fits
Best Parameters: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
Best F1 Score: 0.9744077208406164
KNN analysis completed for 5-folds. Results saved to: -params-knn-5-folds.csv
Starting 5-fold analysis with unbalanced data...
Number of flaky documents: 245
Number of non-flaky documents: 254
Total number of documents: 499
Fitting 5 folds for each of 28 candidates, totalling 140 fits
Best Parameters: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
Best F1 Score: 0.9684705882352942
KNN analysis completed for 5-folds. Results saved to: imbalance-params-knn-5-folds.csv
Starting 3-fold analysis...
Number of flaky documents: 245
Number of non-flaky documents: 47
Total number of documents: 292
Fitting 3 folds for each of 28 candidates, totalling 84 fits
Best Parameters: {'metric

In [1]:
import sklearn
import imblearn
# Show which library versions this notebook was executed with.
for _label, _module in (("scikit-learn 버전:", sklearn), ("imbalanced-learn 버전:", imblearn)):
    print(_label, _module.__version__)

scikit-learn 버전: 1.5.2
imbalanced-learn 버전: 0.12.3
