In [8]:
import os
import time
import zipfile
import numpy as np
import pandas as pd  # Import pandas for DataFrame

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    precision_score, recall_score, accuracy_score, f1_score
)
from sklearn.model_selection import StratifiedKFold

###############################################################################
# Utility functions

def extract_zip(zip_file, extract_to):
    """Unpack every member of the archive ``zip_file`` into ``extract_to``.

    Args:
        zip_file: Path (or file-like object) of the zip archive to read.
        extract_to: Destination directory that receives the archive members.
    """
    archive = zipfile.ZipFile(zip_file, mode='r')
    # The context manager closes the archive even if extraction fails.
    with archive:
        archive.extractall(path=extract_to)

def getDataPoints(path):
    """Collect the stripped text of every non-empty ``.py`` file under ``path``.

    The tree is walked recursively in sorted order (directories and file
    names), so the returned list has the same order on every run and on every
    filesystem. This matters to callers that index documents positionally,
    e.g. a seeded cross-validation split over the vectorized documents.

    Args:
        path: Root directory to search.

    Returns:
        A list of file contents (leading/trailing whitespace removed). Files
        that are empty after stripping are skipped and reported on stdout.
        An empty list is returned (with a message) when ``path`` does not
        exist or contains no usable ``.py`` file.
    """
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        # os.walk yields entries in arbitrary filesystem order; sorting `dirs`
        # in place makes the (top-down) descent order deterministic as well.
        dirs.sort()
        for dataPointName in sorted(files):
            if not dataPointName.endswith(".py"):  # Only consider Python files
                continue
            file_path = os.path.join(root, dataPointName)
            with open(file_path, encoding="utf-8") as fileIn:
                dp = fileIn.read().strip()
            if dp:  # Ensure the document is not empty
                dataPointsList.append(dp)
            else:
                print(f"Empty file: {file_path}")

    if len(dataPointsList) == 0:
        print(f"No valid documents found in directory: {path}")

    return dataPointsList

def flastVectorization(dataPoints):
    """Turn raw documents into a token-count matrix.

    A fresh ``CountVectorizer`` (no stop-word filtering) is fitted on
    ``dataPoints`` and the result of its ``fit_transform`` call is returned.
    The fitted vectorizer itself is discarded, so its vocabulary is not
    available to the caller.
    """
    vectorizer = CountVectorizer(stop_words=None)  # keep every token
    return vectorizer.fit_transform(dataPoints)

###############################################################################
# Data Extraction and Vectorization

# Input archives
flakyZip = "compressedDataset/flaky_files.zip"
nonFlakyZip = "compressedDataset/reduced_nonflaky_files.zip"
largerNonFlakyZip = "compressedDataset/all_nonflaky_files.zip"

# Result and extraction roots: one of each per experiment
outDirEqual = "results/equal_flaky_nonflaky/"
outDirLarger = "results/larger_nonflaky/"
extractDirEqual = "extracted/equal_flaky_nonflaky/"
extractDirLarger = "extracted/larger_nonflaky/"
for _folder in (outDirEqual, outDirLarger, extractDirEqual, extractDirLarger):
    os.makedirs(_folder, exist_ok=True)


def _unpack_archives(archive_targets):
    """Create every target directory, then extract each archive into its target.

    ``archive_targets`` is a sequence of ``(zip_path, target_dir)`` pairs.
    """
    for _, target in archive_targets:
        os.makedirs(target, exist_ok=True)
    for archive, target in archive_targets:
        extract_zip(archive, target)


# ---- Equal combination: flaky vs. reduced non-flaky set --------------------
flakyDirEqual = os.path.join(extractDirEqual, 'flaky')
nonFlakyDirEqual = os.path.join(extractDirEqual, 'nonFlaky')
_unpack_archives([(flakyZip, flakyDirEqual), (nonFlakyZip, nonFlakyDirEqual)])

dataPointsFlakyEqual = getDataPoints(flakyDirEqual)
dataPointsNonFlakyEqual = getDataPoints(nonFlakyDirEqual)
dataPointsEqual = [*dataPointsFlakyEqual, *dataPointsNonFlakyEqual]

# Report corpus sizes for the equal combination
print(f"Number of flaky documents (equal combination): {len(dataPointsFlakyEqual)}")
print(f"Number of non-flaky documents (equal combination): {len(dataPointsNonFlakyEqual)}")
print(f"Total number of documents (equal combination): {len(dataPointsEqual)}")

# Labels follow the concatenation order above: flaky (1) first, then non-flaky (0)
dataLabelsListEqual = np.array(
    len(dataPointsFlakyEqual) * [1] + len(dataPointsNonFlakyEqual) * [0]
)

# Vectorized once and reused by the model cells
Z_equal = flastVectorization(dataPointsEqual)

# ---- Larger combination: flaky vs. full non-flaky set ----------------------
flakyDirLarger = os.path.join(extractDirLarger, 'flaky')
nonFlakyDirLarger = os.path.join(extractDirLarger, 'nonFlaky')
_unpack_archives([(flakyZip, flakyDirLarger), (largerNonFlakyZip, nonFlakyDirLarger)])

dataPointsFlakyLarger = getDataPoints(flakyDirLarger)
dataPointsNonFlakyLarger = getDataPoints(nonFlakyDirLarger)
dataPointsLarger = [*dataPointsFlakyLarger, *dataPointsNonFlakyLarger]

# Report corpus sizes for the larger combination
print(f"Number of flaky documents (larger combination): {len(dataPointsFlakyLarger)}")
print(f"Number of non-flaky documents (larger combination): {len(dataPointsNonFlakyLarger)}")
print(f"Total number of documents (larger combination): {len(dataPointsLarger)}")

# Labels follow the concatenation order above: flaky (1) first, then non-flaky (0)
dataLabelsListLarger = np.array(
    len(dataPointsFlakyLarger) * [1] + len(dataPointsNonFlakyLarger) * [0]
)

# Vectorized once and reused by the model cells
Z_larger = flastVectorization(dataPointsLarger)


Number of flaky documents (equal combination): 45
Number of non-flaky documents (equal combination): 45
Total number of documents (equal combination): 90
Number of flaky documents (larger combination): 45
Number of non-flaky documents (larger combination): 254
Total number of documents (larger combination): 299


## KNN

In [11]:
import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    precision_score, recall_score, accuracy_score, f1_score
)
from sklearn.model_selection import StratifiedKFold
from itertools import product

def runKNNThreshold(Z, dataLabelsList, outDir, n_splits, combination_label, param_grid):
    """Grid-evaluate KNN with stratified K-fold CV over a sweep of decision thresholds.

    For every hyperparameter combination in ``param_grid`` a
    ``KNeighborsClassifier`` is fitted on each training fold. The predicted
    probability of class ``1`` on the matching test fold is then thresholded
    at 0.1, 0.2, ..., 0.9, and accuracy, precision, recall and F1 are recorded
    for every (combination, fold, threshold) triple. All rows are written to
    ``<outDir>/<combination_label>-params-knn-<n_splits>-folds-Threshold.csv``.

    Args:
        Z: Feature matrix whose rows can be selected with integer index arrays.
        dataLabelsList: NumPy array of 0/1 labels aligned with the rows of
            ``Z``; ``1`` is treated as the positive class.
        outDir: Directory that receives the CSV file.
        n_splits: Number of stratified folds.
        combination_label: Prefix used in the CSV file name.
        param_grid: Dict mapping ``'n_neighbors'``, ``'weights'``,
            ``'algorithm'``, ``'leaf_size'`` and ``'p'`` to lists of candidate
            values; the Cartesian product of the lists is evaluated.

    Returns:
        ``(None, df_results)`` where ``df_results`` is the per-row metrics
        DataFrame, or ``(None, None)`` when every fold was skipped.
    """
    # NOTE(review): scikit-learn documents that fitting on sparse input
    # overrides `algorithm` and uses brute force. If Z is sparse, the
    # 'algorithm' and 'leaf_size' grid axes presumably produce identical
    # rows -- confirm before interpreting them as distinct configurations.

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # The split depends only on the labels and the fixed seed, so it is the
    # same for every hyperparameter combination: compute it once.
    folds = list(skf.split(Z, dataLabelsList))

    # Round to one decimal: raw linspace yields 0.30000000000000004 and
    # 0.7000000000000001, which would wrongly reject probabilities that are
    # exactly 0.3 / 0.7 (e.g. 6/20 or 14/20 neighbour votes) under `>=`.
    thresholds = np.round(np.linspace(0.1, 0.9, 9), 1)
    metrics_per_combination = []

    # Manually iterate over all combinations of hyperparameters
    for params in product(*param_grid.values()):
        param_dict = dict(zip(param_grid.keys(), params))
        print(f"Training with parameters: {param_dict}")

        knn_model = KNeighborsClassifier(
            n_neighbors=param_dict['n_neighbors'],
            weights=param_dict['weights'],
            algorithm=param_dict['algorithm'],
            leaf_size=param_dict['leaf_size'],
            p=param_dict['p'],
            n_jobs=-1
        )

        for fold, (train_index, test_index) in enumerate(folds):
            X_train, X_test = Z[train_index], Z[test_index]
            y_train, y_test = dataLabelsList[train_index], dataLabelsList[test_index]

            if y_train.sum() == 0 or y_test.sum() == 0:
                print(f"Skipping fold {fold+1} due to no positive samples in train or test set")
                continue

            knn_model.fit(X_train, y_train)

            # predict_proba columns follow `classes_`; look up the positive
            # class instead of assuming it is column 1 (a training fold that
            # contains only positives has a single column).
            positive_col = int(np.flatnonzero(knn_model.classes_ == 1)[0])
            positive_proba = knn_model.predict_proba(X_test)[:, positive_col]

            # Calculate metrics for each threshold
            for threshold in thresholds:
                y_pred = (positive_proba >= threshold).astype(int)

                f1 = f1_score(y_test, y_pred, zero_division=1)
                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred, zero_division=1)
                recall = recall_score(y_test, y_pred, zero_division=1)

                metrics_per_combination.append({
                    'n_neighbors': param_dict['n_neighbors'],
                    'weights': param_dict['weights'],
                    'algorithm': param_dict['algorithm'],
                    'leaf_size': param_dict['leaf_size'],
                    'p': param_dict['p'],
                    'fold': fold + 1,
                    'threshold': threshold,
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall,
                    'f1': f1
                })

    if len(metrics_per_combination) == 0:
        print("No valid folds. Exiting.")
        return None, None

    # Save the results for each combination, threshold, and fold
    df_results = pd.DataFrame(metrics_per_combination)
    outFile = f"{combination_label}-params-knn-{n_splits}-folds-Threshold.csv"
    df_results.to_csv(os.path.join(outDir, outFile), index=False)

    print(f"KNN analysis completed. Results saved to: {outFile}")
    return None, df_results

# Hyperparameter search space handed to runKNNThreshold
param_grid = dict(
    n_neighbors=[3, 5, 7, 9, 11, 15, 20],                 # neighbours consulted per query
    weights=['uniform', 'distance'],                      # vote weighting scheme
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],  # neighbour-search strategy
    leaf_size=[30, 50],                                   # leaf size for BallTree / KDTree
    p=[1, 2],                                             # Minkowski power (1 = L1, 2 = L2)
)

# --- Equal combination: flaky vs. reduced non-flaky set ---
print("\nStarting KNN Threshold analysis for flaky vs smaller non-flaky files (equal combination)...")
_, df_results_5folds_equal = runKNNThreshold(
    Z=Z_equal,
    dataLabelsList=dataLabelsListEqual,
    outDir=outDirEqual,
    n_splits=5,
    combination_label="equal",
    param_grid=param_grid,
)

print("Results for 5-fold on equal combination:")
print(df_results_5folds_equal)

# --- Larger combination: flaky vs. full non-flaky set ---
print("\nStarting KNN Threshold analysis for flaky vs larger non-flaky files (larger combination)...")
_, df_results_5folds_larger = runKNNThreshold(
    Z=Z_larger,
    dataLabelsList=dataLabelsListLarger,
    outDir=outDirLarger,
    n_splits=5,
    combination_label="larger",
    param_grid=param_grid,
)

print("Results for 5-fold on larger combination:")
print(df_results_5folds_larger)



Starting KNN Threshold analysis for flaky vs smaller non-flaky files (equal combination)...
Training with parameters: {'n_neighbors': 3, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 30, 'p': 1}
Training with parameters: {'n_neighbors': 3, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 3, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 3, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 3, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 3, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 3, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 3, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 3, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 3, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 3, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 3, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 3, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 30, 'p': 1}
Training with parameters: {'n_neighbors': 3, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 3, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 3, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 30, 'p': 1}
Training with parameters: {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 50, 'p': 1}
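Training with parameters: {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 1}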



Training with parameters: {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 30, 'p': 1}
Training with parameters: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 30, 'p': 1}
Training with parameters: {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 30, 'p': 1}
Training with parameters: {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 7, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 30, 'p': 1}
Training with parameters: {'n_neighbors': 7, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 7, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 50, 'p': 1}
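Training with parameters: {'n_neighbors': 7, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 7, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 1}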



Training with parameters: {'n_neighbors': 7, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 7, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 7, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 7, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 7, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 7, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 7, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 7, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 7, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 7, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 7, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 7, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 30, 'p': 1}
Training with parameters: {'n_neighbors': 7, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 7, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 7, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 7, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 7, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 7, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 7, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 7, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 7, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 7, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 7, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 7, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 7, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 7, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 7, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 9, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 30, 'p': 1}
Training with parameters: {'n_neighbors': 9, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 9, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 9, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 9, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 9, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 9, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 9, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 9, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 9, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 9, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 9, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 9, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 9, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 9, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 9, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 9, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 30, 'p': 1}
Training with parameters: {'n_neighbors': 9, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 9, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 9, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 9, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 9, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 9, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 9, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 9, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 9, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 9, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 9, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 9, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 9, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 9, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 9, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 11, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 30, 'p': 1}
Training with parameters: {'n_neighbors': 11, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 11, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 11, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 11, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 11, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 11, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 11, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 11, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 11, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 11, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 11, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 11, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 11, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 11, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 11, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 11, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 30, 'p': 1}
Training with parameters: {'n_neighbors': 11, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 11, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 11, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 11, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 11, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 11, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 11, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 11, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 11, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 11, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 11, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 11, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 11, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 11, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 11, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 15, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 30, 'p': 1}
Training with parameters: {'n_neighbors': 15, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 15, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 15, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 15, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 15, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 15, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 15, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 15, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 15, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 15, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 15, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 15, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 15, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 15, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 15, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 15, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 30, 'p': 1}
Training with parameters: {'n_neighbors': 15, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 15, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 15, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 15, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 15, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 15, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 15, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 15, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 15, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 15, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 15, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 15, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 15, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 15, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 15, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 20, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 30, 'p': 1}
Training with parameters: {'n_neighbors': 20, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 20, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 20, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 20, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 20, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 20, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 20, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 20, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 20, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 20, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 20, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 20, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 20, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 20, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 20, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 20, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 30, 'p': 1}
Training with parameters: {'n_neighbors': 20, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 20, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 20, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 20, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 20, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 20, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 20, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 20, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 20, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 20, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 20, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 20, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 20, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 20, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 20, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 50, 'p': 2}
KNN analysis completed. Results saved to: equal-params-knn-5-folds-Threshold.csv
Results for 5-fold on equal combination:
       n_neighbors   weights algorithm  leaf_size  p  fold  threshold  \
0                3   uniform      auto         30  1     1        0.1   
1                3   uniform      auto         30  1     1        0.2   
2                3   uniform      auto         30  1     1        0.3   
3                3   uniform      auto         30  1     1        0.4   
4                3   uniform      auto         30  1     1        0.5   
...            ...       ...       ...        ... ..   ...        ...   
10075           20 



Training with parameters: {'n_neighbors': 3, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 3, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 3, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 2}




Training with parameters: {'n_neighbors': 3, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 3, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 3, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 3, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 3, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 3, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 3, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 3, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 30, 'p': 1}
Training with parameters: {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 30, 'p': 1}
Training with parameters: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 5, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 30, 'p': 1}
Training with parameters: {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 7, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 30, 'p': 1}
Training with parameters: {'n_neighbors': 7, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 7, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 7, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 7, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 7, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 7, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 7, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 7, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 7, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 7, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 7, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 7, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 7, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 7, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 7, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 7, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 30, 'p': 1}
Training with parameters: {'n_neighbors': 7, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 7, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 7, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 7, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 7, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 7, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 7, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 7, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 7, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 7, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 7, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 7, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 7, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 7, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 7, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 9, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 30, 'p': 1}
Training with parameters: {'n_neighbors': 9, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 9, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 9, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 9, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 9, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 9, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 9, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 9, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 9, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 9, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 9, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 9, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 30, 'p': 1}
Training with parameters: {'n_neighbors': 9, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 9, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 9, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 9, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 30, 'p': 1}
Training with parameters: {'n_neighbors': 9, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 9, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 9, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 50, 'p': 2}



Training with parameters: {'n_neighbors': 9, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 9, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 9, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 9, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 9, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 9, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 9, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 9, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 9, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 9, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 9, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 11, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 30, 'p': 1}
Training with parameters: {'n_neighbors': 11, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 11, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 11, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 11, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 11, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 11, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 11, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 11, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 11, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 11, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 11, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 11, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 11, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 11, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 11, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 11, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 30, 'p': 1}
Training with parameters: {'n_neighbors': 11, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 11, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 11, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 11, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 11, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 11, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 11, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 11, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 11, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 11, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 11, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 11, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 11, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 11, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 11, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 15, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 30, 'p': 1}
Training with parameters: {'n_neighbors': 15, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 15, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 15, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 15, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 15, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 15, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 15, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 15, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 15, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 2}




Training with parameters: {'n_neighbors': 15, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 15, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 15, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 15, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 15, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 15, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 15, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 30, 'p': 1}
Training with parameters: {'n_neighbors': 15, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 15, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 15, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 15, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 15, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 15, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 15, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 15, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 15, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 15, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 15, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 2}




Training with parameters: {'n_neighbors': 15, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 30, 'p': 1}
Training with parameters: {'n_neighbors': 15, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 15, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 15, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 20, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 30, 'p': 1}
Training with parameters: {'n_neighbors': 20, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 20, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 20, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 20, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 1}



Training with parameters: {'n_neighbors': 20, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 20, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 20, 'weights': 'uniform', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 20, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 20, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 20, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 20, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 20, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 20, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 20, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 20, 'weights': 'uniform', 'algorithm': 'brute', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 20, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 30, 'p': 1}
Training with parameters: {'n_neighbors': 20, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 20, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 20, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 20, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 20, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 20, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 20, 'weights': 'distance', 'algorithm': 'ball_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 20, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 20, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 20, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 1}




Training with parameters: {'n_neighbors': 20, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 50, 'p': 2}
Training with parameters: {'n_neighbors': 20, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 30, 'p': 1}




Training with parameters: {'n_neighbors': 20, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 30, 'p': 2}
Training with parameters: {'n_neighbors': 20, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 50, 'p': 1}
Training with parameters: {'n_neighbors': 20, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 50, 'p': 2}
KNN analysis completed. Results saved to: larger-params-knn-5-folds-Threshold.csv
Results for 5-fold on larger combination:
       n_neighbors   weights algorithm  leaf_size  p  fold  threshold  \
0                3   uniform      auto         30  1     1        0.1   
1                3   uniform      auto         30  1     1        0.2   
2                3   uniform      auto         30  1     1        0.3   
3                3   uniform      auto         30  1     1        0.4   
4                3   uniform      auto         30  1     1        0.5   
...            ...       ...       ...        ... ..   ...        ...   
10075           2

## SVM

In [12]:
import numpy as np
import pandas as pd

from sklearn.svm import SVC
from sklearn.metrics import (
    precision_score, recall_score, accuracy_score, f1_score
)
from sklearn.model_selection import StratifiedKFold
from itertools import product

def runSVMThreshold(Z, dataLabelsList, outDir, n_splits, combination_label, param_grid):
    """Grid-search an SVM with stratified k-fold CV, scored at several probability thresholds.

    For every combination of values in ``param_grid`` an ``SVC`` with
    probability estimates enabled is fitted on each training fold. Column 1 of
    ``predict_proba`` is binarised at nine thresholds (0.1 to 0.9) and
    accuracy, precision, recall and F1 are recorded for every
    (combination, fold, threshold) triple. All rows are written to
    ``<combination_label>-params-svm-<n_splits>-folds-Threshold.csv`` in
    ``outDir``.

    Parameters
    ----------
    Z : matrix
        Feature matrix; must support row selection with an integer index
        array (``Z[train_index]``).
    dataLabelsList : array-like
        One label per row of ``Z``. Presumably binary 0/1 with 1 as the
        positive class -- confirm against the cell that builds the labels.
    outDir : str
        Directory that receives the CSV file; created if it does not exist.
    n_splits : int
        Number of stratified folds.
    combination_label : str
        Prefix of the output file name (e.g. ``"equal"`` or ``"larger"``).
    param_grid : dict
        Maps ``'C'`` and ``'kernel'`` to iterables of candidate values.

    Returns
    -------
    tuple
        ``(None, df_results)`` where ``df_results`` is the per-row metrics
        DataFrame, or ``(None, None)`` when no fold produced any metrics.
    """
    # Fold indices are NumPy integer arrays, which cannot index a plain list;
    # coercing here lets callers pass either a list or an array.
    labels = np.asarray(dataLabelsList)

    # Create the output directory before the (potentially long) search so a
    # missing folder cannot make the final CSV write fail and lose every result.
    if outDir:
        os.makedirs(outDir, exist_ok=True)

    # The splitter is seeded, so every hyperparameter combination sees the
    # same folds; compute them once instead of once per combination.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    folds = list(skf.split(Z, labels))

    thresholds = np.linspace(0.1, 0.9, 9)
    metrics_per_combination = []

    # Manually iterate over all combinations of hyperparameters
    for params in product(*param_grid.values()):
        param_dict = dict(zip(param_grid.keys(), params))
        print(f"Training with parameters: {param_dict}")

        svm_model = SVC(
            C=param_dict['C'],
            kernel=param_dict['kernel'],
            probability=True,  # required for predict_proba
            random_state=42
        )

        for fold, (train_index, test_index) in enumerate(folds, start=1):
            X_train, X_test = Z[train_index], Z[test_index]
            y_train, y_test = labels[train_index], labels[test_index]

            if y_train.sum() == 0 or y_test.sum() == 0:
                print(f"Skipping fold {fold} due to no positive samples in train or test set")
                continue

            # Best effort: a combination that cannot be fitted or scored on a
            # fold is reported and skipped so the rest of the grid still runs.
            try:
                svm_model.fit(X_train, y_train)
            except Exception as e:
                print(f"Failed to train SVM with parameters {param_dict} on fold {fold}: {e}")
                continue

            try:
                y_pred_proba = svm_model.predict_proba(X_test)
            except Exception as e:
                print(f"Failed to predict probabilities with SVM on fold {fold}: {e}")
                continue

            # Column 1 is taken as the positive-class probability.
            positive_proba = y_pred_proba[:, 1]

            for threshold in thresholds:
                y_pred = (positive_proba >= threshold).astype(int)

                # zero_division=1: a metric whose denominator is zero (e.g.
                # precision with no positive predictions) is reported as 1.
                f1 = f1_score(y_test, y_pred, zero_division=1)
                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred, zero_division=1)
                recall = recall_score(y_test, y_pred, zero_division=1)

                metrics_per_combination.append({
                    'C': param_dict['C'],
                    'kernel': param_dict['kernel'],
                    'fold': fold,
                    'threshold': threshold,
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall,
                    'f1': f1
                })

    if not metrics_per_combination:
        print("No valid folds. Exiting.")
        return None, None

    # One row per (combination, fold, threshold)
    df_results = pd.DataFrame(metrics_per_combination)
    outFile = f"{combination_label}-params-svm-{n_splits}-folds-Threshold.csv"
    df_results.to_csv(os.path.join(outDir, outFile), index=False)

    print(f"SVM analysis completed. Results saved to: {outFile}")
    return None, df_results


# Hyperparameter grid explored for the SVM model
param_grid = dict(
    C=[0.01, 0.1, 1.0, 10.0, 100.0],              # regularization parameter
    kernel=['linear', 'rbf', 'poly', 'sigmoid'],  # kernel types
)

# Z_equal, dataLabelsListEqual, Z_larger and dataLabelsListLarger are assumed
# to be defined already by the first cell.

# --- Equal combination: flaky vs smaller non-flaky files ---
print("\nStarting SVM Threshold analysis for flaky vs smaller non-flaky files (equal combination)...")
_, df_results_5folds_equal = runSVMThreshold(
    Z=Z_equal,
    dataLabelsList=dataLabelsListEqual,
    outDir=outDirEqual,
    n_splits=5,
    combination_label="equal",
    param_grid=param_grid,
)

print("Results for 5-fold on equal combination:")
print(df_results_5folds_equal)

# --- Larger combination: flaky vs larger non-flaky files ---
print("\nStarting SVM Threshold analysis for flaky vs larger non-flaky files (larger combination)...")
_, df_results_5folds_larger = runSVMThreshold(
    Z=Z_larger,
    dataLabelsList=dataLabelsListLarger,
    outDir=outDirLarger,
    n_splits=5,
    combination_label="larger",
    param_grid=param_grid,
)

print("Results for 5-fold on larger combination:")
print(df_results_5folds_larger)



Starting SVM Threshold analysis for flaky vs smaller non-flaky files (equal combination)...
Training with parameters: {'C': 0.01, 'kernel': 'linear'}
Training with parameters: {'C': 0.01, 'kernel': 'rbf'}
Training with parameters: {'C': 0.01, 'kernel': 'poly'}
Training with parameters: {'C': 0.01, 'kernel': 'sigmoid'}
Training with parameters: {'C': 0.1, 'kernel': 'linear'}
Training with parameters: {'C': 0.1, 'kernel': 'rbf'}
Training with parameters: {'C': 0.1, 'kernel': 'poly'}
Training with parameters: {'C': 0.1, 'kernel': 'sigmoid'}
Training with parameters: {'C': 1.0, 'kernel': 'linear'}
Training with parameters: {'C': 1.0, 'kernel': 'rbf'}
Training with parameters: {'C': 1.0, 'kernel': 'poly'}
Training with parameters: {'C': 1.0, 'kernel': 'sigmoid'}
Training with parameters: {'C': 10.0, 'kernel': 'linear'}
Training with parameters: {'C': 10.0, 'kernel': 'rbf'}
Training with parameters: {'C': 10.0, 'kernel': 'poly'}
Training with parameters: {'C': 10.0, 'kernel': 'sigmoid'}
Tra

## Naive Bayes

In [13]:
import numpy as np
import pandas as pd

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (
    precision_score, recall_score, accuracy_score, f1_score
)
from sklearn.model_selection import StratifiedKFold
from itertools import product

def runNBThreshold(Z, dataLabelsList, outDir, n_splits, combination_label, param_grid):
    """Grid-search Multinomial Naive Bayes with stratified k-fold CV, scored at several thresholds.

    For every combination of values in ``param_grid`` a ``MultinomialNB`` is
    fitted on each training fold. Column 1 of ``predict_proba`` is binarised
    at nine thresholds (0.1 to 0.9) and accuracy, precision, recall and F1 are
    recorded for every (combination, fold, threshold) triple. All rows are
    written to ``<combination_label>-params-nb-<n_splits>-folds-Threshold.csv``
    in ``outDir``.

    Parameters
    ----------
    Z : matrix
        Feature matrix; must support row selection with an integer index
        array (``Z[train_index]``).
    dataLabelsList : array-like
        One label per row of ``Z``. Presumably binary 0/1 with 1 as the
        positive class -- confirm against the cell that builds the labels.
    outDir : str
        Directory that receives the CSV file; created if it does not exist.
    n_splits : int
        Number of stratified folds.
    combination_label : str
        Prefix of the output file name (e.g. ``"equal"`` or ``"larger"``).
    param_grid : dict
        Maps ``'alpha'`` to an iterable of candidate smoothing values.

    Returns
    -------
    tuple
        ``(None, df_results)`` where ``df_results`` is the per-row metrics
        DataFrame, or ``(None, None)`` when no fold produced any metrics.
    """
    # Fold indices are NumPy integer arrays, which cannot index a plain list;
    # coercing here lets callers pass either a list or an array.
    labels = np.asarray(dataLabelsList)

    # Create the output directory before the search so a missing folder
    # cannot make the final CSV write fail and lose every result.
    if outDir:
        os.makedirs(outDir, exist_ok=True)

    # The splitter is seeded, so every hyperparameter combination sees the
    # same folds; compute them once instead of once per combination.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    folds = list(skf.split(Z, labels))

    thresholds = np.linspace(0.1, 0.9, 9)
    metrics_per_combination = []

    # Manually iterate over all combinations of hyperparameters
    for params in product(*param_grid.values()):
        param_dict = dict(zip(param_grid.keys(), params))
        print(f"Training with parameters: {param_dict}")

        nb_model = MultinomialNB(
            alpha=param_dict['alpha']
        )

        for fold, (train_index, test_index) in enumerate(folds, start=1):
            X_train, X_test = Z[train_index], Z[test_index]
            y_train, y_test = labels[train_index], labels[test_index]

            if y_train.sum() == 0 or y_test.sum() == 0:
                print(f"Skipping fold {fold} due to no positive samples in train or test set")
                continue

            nb_model.fit(X_train, y_train)

            # Column 1 is taken as the positive-class probability.
            positive_proba = nb_model.predict_proba(X_test)[:, 1]

            for threshold in thresholds:
                y_pred = (positive_proba >= threshold).astype(int)

                # zero_division=1: a metric whose denominator is zero (e.g.
                # precision with no positive predictions) is reported as 1.
                f1 = f1_score(y_test, y_pred, zero_division=1)
                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred, zero_division=1)
                recall = recall_score(y_test, y_pred, zero_division=1)

                metrics_per_combination.append({
                    'alpha': param_dict['alpha'],
                    'fold': fold,
                    'threshold': threshold,
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall,
                    'f1': f1
                })

    if not metrics_per_combination:
        print("No valid folds. Exiting.")
        return None, None

    # One row per (combination, fold, threshold)
    df_results = pd.DataFrame(metrics_per_combination)
    outFile = f"{combination_label}-params-nb-{n_splits}-folds-Threshold.csv"
    df_results.to_csv(os.path.join(outDir, outFile), index=False)

    print(f"Naive Bayes analysis completed. Results saved to: {outFile}")
    return None, df_results

# Hyperparameter grid explored for the Naive Bayes model
param_grid = dict(
    alpha=[0.001, 0.01, 0.1, 1.0, 10.0],
)

# --- Equal combination: flaky vs smaller non-flaky files ---
print("\nStarting Naive Bayes Threshold analysis for flaky vs smaller non-flaky files (equal combination)...")
_, df_results_5folds_equal = runNBThreshold(
    Z=Z_equal,
    dataLabelsList=dataLabelsListEqual,
    outDir=outDirEqual,
    n_splits=5,
    combination_label="equal",
    param_grid=param_grid,
)

print("Results for 5-fold on equal combination:")
print(df_results_5folds_equal)

# --- Larger combination: flaky vs larger non-flaky files ---
print("\nStarting Naive Bayes Threshold analysis for flaky vs larger non-flaky files (larger combination)...")
_, df_results_5folds_larger = runNBThreshold(
    Z=Z_larger,
    dataLabelsList=dataLabelsListLarger,
    outDir=outDirLarger,
    n_splits=5,
    combination_label="larger",
    param_grid=param_grid,
)

print("Results for 5-fold on larger combination:")
print(df_results_5folds_larger)



Starting Naive Bayes Threshold analysis for flaky vs smaller non-flaky files (equal combination)...
Training with parameters: {'alpha': 0.001}
Training with parameters: {'alpha': 0.01}
Training with parameters: {'alpha': 0.1}
Training with parameters: {'alpha': 1.0}
Training with parameters: {'alpha': 10.0}
Naive Bayes analysis completed. Results saved to: equal-params-nb-5-folds-Threshold.csv
Results for 5-fold on equal combination:
      alpha  fold  threshold  accuracy  precision    recall        f1
0     0.001     1        0.1  0.722222   1.000000  0.444444  0.615385
1     0.001     1        0.2  0.722222   1.000000  0.444444  0.615385
2     0.001     1        0.3  0.722222   1.000000  0.444444  0.615385
3     0.001     1        0.4  0.722222   1.000000  0.444444  0.615385
4     0.001     1        0.5  0.722222   1.000000  0.444444  0.615385
..      ...   ...        ...       ...        ...       ...       ...
220  10.000     5        0.5  0.777778   0.777778  0.777778  0.777778
2

## Random Forest

In [15]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    precision_score, recall_score, accuracy_score, f1_score
)
from sklearn.model_selection import StratifiedKFold
from itertools import product

def runRandomForestThreshold(Z, dataLabelsList, outDir, n_splits, combination_label, param_grid):
    """Grid-search a Random Forest with stratified k-fold CV, scored at several thresholds.

    For every combination of values in ``param_grid`` a
    ``RandomForestClassifier`` is fitted on each training fold. Column 1 of
    ``predict_proba`` is binarised at nine thresholds (0.1 to 0.9) and
    accuracy, precision, recall and F1 are recorded for every
    (combination, fold, threshold) triple. All rows are written to
    ``<combination_label>-params-rf-<n_splits>-folds-Threshold.csv`` in
    ``outDir``.

    Parameters
    ----------
    Z : matrix
        Feature matrix; must support row selection with an integer index
        array (``Z[train_index]``).
    dataLabelsList : array-like
        One label per row of ``Z``. Presumably binary 0/1 with 1 as the
        positive class -- confirm against the cell that builds the labels.
    outDir : str
        Directory that receives the CSV file; created if it does not exist.
    n_splits : int
        Number of stratified folds.
    combination_label : str
        Prefix of the output file name (e.g. ``"equal"`` or ``"larger"``).
    param_grid : dict
        Maps ``'n_estimators'``, ``'criterion'``, ``'max_depth'``,
        ``'min_samples_split'`` and ``'min_samples_leaf'`` to iterables of
        candidate values.

    Returns
    -------
    tuple
        ``(None, df_results)`` where ``df_results`` is the per-row metrics
        DataFrame, or ``(None, None)`` when no fold produced any metrics.
    """
    # Fold indices are NumPy integer arrays, which cannot index a plain list;
    # coercing here lets callers pass either a list or an array.
    labels = np.asarray(dataLabelsList)

    # Create the output directory before the (potentially long) search so a
    # missing folder cannot make the final CSV write fail and lose every result.
    if outDir:
        os.makedirs(outDir, exist_ok=True)

    # The splitter is seeded, so every hyperparameter combination sees the
    # same folds; compute them once instead of once per combination.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    folds = list(skf.split(Z, labels))

    thresholds = np.linspace(0.1, 0.9, 9)
    metrics_per_combination = []

    # Manually iterate over all combinations of hyperparameters
    for params in product(*param_grid.values()):
        param_dict = dict(zip(param_grid.keys(), params))
        print(f"Training with parameters: {param_dict}")

        rf_model = RandomForestClassifier(
            n_estimators=param_dict['n_estimators'],
            criterion=param_dict['criterion'],
            max_depth=param_dict['max_depth'],
            min_samples_split=param_dict['min_samples_split'],
            min_samples_leaf=param_dict['min_samples_leaf'],
            random_state=42,
            n_jobs=-1
        )

        for fold, (train_index, test_index) in enumerate(folds, start=1):
            X_train, X_test = Z[train_index], Z[test_index]
            y_train, y_test = labels[train_index], labels[test_index]

            if y_train.sum() == 0 or y_test.sum() == 0:
                print(f"Skipping fold {fold} due to no positive samples in train or test set")
                continue

            rf_model.fit(X_train, y_train)

            # Column 1 is taken as the positive-class probability.
            positive_proba = rf_model.predict_proba(X_test)[:, 1]

            for threshold in thresholds:
                y_pred = (positive_proba >= threshold).astype(int)

                # zero_division=1: a metric whose denominator is zero (e.g.
                # precision with no positive predictions) is reported as 1.
                f1 = f1_score(y_test, y_pred, zero_division=1)
                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred, zero_division=1)
                recall = recall_score(y_test, y_pred, zero_division=1)

                metrics_per_combination.append({
                    'n_estimators': param_dict['n_estimators'],
                    'criterion': param_dict['criterion'],
                    'max_depth': param_dict['max_depth'],
                    'min_samples_split': param_dict['min_samples_split'],
                    'min_samples_leaf': param_dict['min_samples_leaf'],
                    'fold': fold,
                    'threshold': threshold,
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall,
                    'f1': f1
                })

    if not metrics_per_combination:
        print("No valid folds. Exiting.")
        return None, None

    # One row per (combination, fold, threshold)
    df_results = pd.DataFrame(metrics_per_combination)
    outFile = f"{combination_label}-params-rf-{n_splits}-folds-Threshold.csv"
    df_results.to_csv(os.path.join(outDir, outFile), index=False)

    print(f"Random Forest analysis completed. Results saved to: {outFile}")
    return None, df_results

# Hyperparameter grid explored for the Random Forest model
param_grid = dict(
    n_estimators=[10, 50, 100, 300, 500],    # number of trees
    max_depth=[10, 30, 50, 100, 300, 500],   # maximum depth of each tree
    min_samples_split=[2, 5],                # minimum samples required to split an internal node
    min_samples_leaf=[1, 2],                 # minimum samples required at a leaf node
    criterion=['gini', 'entropy'],           # function measuring the quality of a split
)

# Z_equal, dataLabelsListEqual, Z_larger and dataLabelsListLarger are assumed
# to be defined already by the first cell.

# --- Equal combination: flaky vs smaller non-flaky files ---
print("\nStarting Random Forest Threshold analysis for flaky vs smaller non-flaky files (equal combination)...")
_, df_results_5folds_equal = runRandomForestThreshold(
    Z=Z_equal,
    dataLabelsList=dataLabelsListEqual,
    outDir=outDirEqual,
    n_splits=5,
    combination_label="equal",
    param_grid=param_grid,
)

print("Results for 5-fold on equal combination:")
print(df_results_5folds_equal)

# --- Larger combination: flaky vs larger non-flaky files ---
print("\nStarting Random Forest Threshold analysis for flaky vs larger non-flaky files (larger combination)...")
_, df_results_5folds_larger = runRandomForestThreshold(
    Z=Z_larger,
    dataLabelsList=dataLabelsListLarger,
    outDir=outDirLarger,
    n_splits=5,
    combination_label="larger",
    param_grid=param_grid,
)

print("Results for 5-fold on larger combination:")
print(df_results_5folds_larger)



Starting Random Forest Threshold analysis for flaky vs smaller non-flaky files (equal combination)...
Training with parameters: {'n_estimators': 10, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1, 'criterion': 'gini'}
Training with parameters: {'n_estimators': 10, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1, 'criterion': 'entropy'}
Training with parameters: {'n_estimators': 10, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 2, 'criterion': 'gini'}
Training with parameters: {'n_estimators': 10, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 2, 'criterion': 'entropy'}
Training with parameters: {'n_estimators': 10, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 1, 'criterion': 'gini'}
Training with parameters: {'n_estimators': 10, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 1, 'criterion': 'entropy'}
Training with parameters: {'n_estimators': 10, 'max_depth': 10, 'min_samples_split': 5, 'min_sam

Training with parameters: {'n_estimators': 50, 'max_depth': 30, 'min_samples_split': 5, 'min_samples_leaf': 1, 'criterion': 'entropy'}
Training with parameters: {'n_estimators': 50, 'max_depth': 30, 'min_samples_split': 5, 'min_samples_leaf': 2, 'criterion': 'gini'}
Training with parameters: {'n_estimators': 50, 'max_depth': 30, 'min_samples_split': 5, 'min_samples_leaf': 2, 'criterion': 'entropy'}
Training with parameters: {'n_estimators': 50, 'max_depth': 50, 'min_samples_split': 2, 'min_samples_leaf': 1, 'criterion': 'gini'}
Training with parameters: {'n_estimators': 50, 'max_depth': 50, 'min_samples_split': 2, 'min_samples_leaf': 1, 'criterion': 'entropy'}
Training with parameters: {'n_estimators': 50, 'max_depth': 50, 'min_samples_split': 2, 'min_samples_leaf': 2, 'criterion': 'gini'}
Training with parameters: {'n_estimators': 50, 'max_depth': 50, 'min_samples_split': 2, 'min_samples_leaf': 2, 'criterion': 'entropy'}
Training with parameters: {'n_estimators': 50, 'max_depth': 50, 

Training with parameters: {'n_estimators': 100, 'max_depth': 100, 'min_samples_split': 2, 'min_samples_leaf': 2, 'criterion': 'gini'}
Training with parameters: {'n_estimators': 100, 'max_depth': 100, 'min_samples_split': 2, 'min_samples_leaf': 2, 'criterion': 'entropy'}
Training with parameters: {'n_estimators': 100, 'max_depth': 100, 'min_samples_split': 5, 'min_samples_leaf': 1, 'criterion': 'gini'}
Training with parameters: {'n_estimators': 100, 'max_depth': 100, 'min_samples_split': 5, 'min_samples_leaf': 1, 'criterion': 'entropy'}
Training with parameters: {'n_estimators': 100, 'max_depth': 100, 'min_samples_split': 5, 'min_samples_leaf': 2, 'criterion': 'gini'}
Training with parameters: {'n_estimators': 100, 'max_depth': 100, 'min_samples_split': 5, 'min_samples_leaf': 2, 'criterion': 'entropy'}
Training with parameters: {'n_estimators': 100, 'max_depth': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'criterion': 'gini'}
Training with parameters: {'n_estimators': 100, 'max_

Training with parameters: {'n_estimators': 300, 'max_depth': 300, 'min_samples_split': 5, 'min_samples_leaf': 2, 'criterion': 'entropy'}
Training with parameters: {'n_estimators': 300, 'max_depth': 500, 'min_samples_split': 2, 'min_samples_leaf': 1, 'criterion': 'gini'}
Training with parameters: {'n_estimators': 300, 'max_depth': 500, 'min_samples_split': 2, 'min_samples_leaf': 1, 'criterion': 'entropy'}
Training with parameters: {'n_estimators': 300, 'max_depth': 500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'criterion': 'gini'}
Training with parameters: {'n_estimators': 300, 'max_depth': 500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'criterion': 'entropy'}
Training with parameters: {'n_estimators': 300, 'max_depth': 500, 'min_samples_split': 5, 'min_samples_leaf': 1, 'criterion': 'gini'}
Training with parameters: {'n_estimators': 300, 'max_depth': 500, 'min_samples_split': 5, 'min_samples_leaf': 1, 'criterion': 'entropy'}
Training with parameters: {'n_estimators': 300, 'm

Training with parameters: {'n_estimators': 10, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 2, 'criterion': 'gini'}
Training with parameters: {'n_estimators': 10, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 2, 'criterion': 'entropy'}
Training with parameters: {'n_estimators': 10, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 1, 'criterion': 'gini'}
Training with parameters: {'n_estimators': 10, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 1, 'criterion': 'entropy'}
Training with parameters: {'n_estimators': 10, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 2, 'criterion': 'gini'}
Training with parameters: {'n_estimators': 10, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 2, 'criterion': 'entropy'}
Training with parameters: {'n_estimators': 10, 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 1, 'criterion': 'gini'}
Training with parameters: {'n_estimators': 10, 'max_depth': 30, 'mi

Training with parameters: {'n_estimators': 50, 'max_depth': 50, 'min_samples_split': 2, 'min_samples_leaf': 1, 'criterion': 'gini'}
Training with parameters: {'n_estimators': 50, 'max_depth': 50, 'min_samples_split': 2, 'min_samples_leaf': 1, 'criterion': 'entropy'}
Training with parameters: {'n_estimators': 50, 'max_depth': 50, 'min_samples_split': 2, 'min_samples_leaf': 2, 'criterion': 'gini'}
Training with parameters: {'n_estimators': 50, 'max_depth': 50, 'min_samples_split': 2, 'min_samples_leaf': 2, 'criterion': 'entropy'}
Training with parameters: {'n_estimators': 50, 'max_depth': 50, 'min_samples_split': 5, 'min_samples_leaf': 1, 'criterion': 'gini'}
Training with parameters: {'n_estimators': 50, 'max_depth': 50, 'min_samples_split': 5, 'min_samples_leaf': 1, 'criterion': 'entropy'}
Training with parameters: {'n_estimators': 50, 'max_depth': 50, 'min_samples_split': 5, 'min_samples_leaf': 2, 'criterion': 'gini'}
Training with parameters: {'n_estimators': 50, 'max_depth': 50, 'mi

Training with parameters: {'n_estimators': 100, 'max_depth': 100, 'min_samples_split': 5, 'min_samples_leaf': 1, 'criterion': 'entropy'}
Training with parameters: {'n_estimators': 100, 'max_depth': 100, 'min_samples_split': 5, 'min_samples_leaf': 2, 'criterion': 'gini'}
Training with parameters: {'n_estimators': 100, 'max_depth': 100, 'min_samples_split': 5, 'min_samples_leaf': 2, 'criterion': 'entropy'}
Training with parameters: {'n_estimators': 100, 'max_depth': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'criterion': 'gini'}
Training with parameters: {'n_estimators': 100, 'max_depth': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'criterion': 'entropy'}
Training with parameters: {'n_estimators': 100, 'max_depth': 300, 'min_samples_split': 2, 'min_samples_leaf': 2, 'criterion': 'gini'}
Training with parameters: {'n_estimators': 100, 'max_depth': 300, 'min_samples_split': 2, 'min_samples_leaf': 2, 'criterion': 'entropy'}
Training with parameters: {'n_estimators': 100, 'm

Training with parameters: {'n_estimators': 300, 'max_depth': 500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'criterion': 'gini'}
Training with parameters: {'n_estimators': 300, 'max_depth': 500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'criterion': 'entropy'}
Training with parameters: {'n_estimators': 300, 'max_depth': 500, 'min_samples_split': 5, 'min_samples_leaf': 1, 'criterion': 'gini'}
Training with parameters: {'n_estimators': 300, 'max_depth': 500, 'min_samples_split': 5, 'min_samples_leaf': 1, 'criterion': 'entropy'}
Training with parameters: {'n_estimators': 300, 'max_depth': 500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'criterion': 'gini'}
Training with parameters: {'n_estimators': 300, 'max_depth': 500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'criterion': 'entropy'}
Training with parameters: {'n_estimators': 500, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1, 'criterion': 'gini'}
Training with parameters: {'n_estimators': 500, 'max_d

## Decision Tree

In [14]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    precision_score, recall_score, accuracy_score, f1_score
)
from sklearn.model_selection import StratifiedKFold
from itertools import product

def runDecisionTreeThreshold(Z, dataLabelsList, outDir, n_splits, combination_label, param_grid):
    """Grid-search a Decision Tree with stratified k-fold CV, scored at several thresholds.

    For every combination of values in ``param_grid`` a
    ``DecisionTreeClassifier`` is fitted on each training fold. Column 1 of
    ``predict_proba`` is binarised at nine thresholds (0.1 to 0.9) and
    accuracy, precision, recall and F1 are recorded for every
    (combination, fold, threshold) triple. All rows are written to
    ``<combination_label>-params-dt-<n_splits>-folds-Threshold.csv`` in
    ``outDir``.

    Parameters
    ----------
    Z : matrix
        Feature matrix; must support row selection with an integer index
        array (``Z[train_index]``).
    dataLabelsList : array-like
        One label per row of ``Z``. Presumably binary 0/1 with 1 as the
        positive class -- confirm against the cell that builds the labels.
    outDir : str
        Directory that receives the CSV file; created if it does not exist.
    n_splits : int
        Number of stratified folds.
    combination_label : str
        Prefix of the output file name (e.g. ``"equal"`` or ``"larger"``).
    param_grid : dict
        Maps ``'criterion'``, ``'max_depth'``, ``'min_samples_split'``,
        ``'min_samples_leaf'`` and ``'max_features'`` to iterables of
        candidate values.

    Returns
    -------
    tuple
        ``(None, df_results)`` where ``df_results`` is the per-row metrics
        DataFrame, or ``(None, None)`` when no fold produced any metrics.
    """
    # Fold indices are NumPy integer arrays, which cannot index a plain list;
    # coercing here lets callers pass either a list or an array.
    labels = np.asarray(dataLabelsList)

    # Create the output directory before the (potentially long) search so a
    # missing folder cannot make the final CSV write fail and lose every result.
    if outDir:
        os.makedirs(outDir, exist_ok=True)

    # The splitter is seeded, so every hyperparameter combination sees the
    # same folds; compute them once instead of once per combination.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    folds = list(skf.split(Z, labels))

    thresholds = np.linspace(0.1, 0.9, 9)
    metrics_per_combination = []

    # Manually iterate over all combinations of hyperparameters
    for params in product(*param_grid.values()):
        param_dict = dict(zip(param_grid.keys(), params))
        print(f"Training with parameters: {param_dict}")

        dt_model = DecisionTreeClassifier(
            criterion=param_dict['criterion'],
            max_depth=param_dict['max_depth'],
            min_samples_split=param_dict['min_samples_split'],
            min_samples_leaf=param_dict['min_samples_leaf'],
            max_features=param_dict['max_features'],
            random_state=42
        )

        for fold, (train_index, test_index) in enumerate(folds, start=1):
            X_train, X_test = Z[train_index], Z[test_index]
            y_train, y_test = labels[train_index], labels[test_index]

            if y_train.sum() == 0 or y_test.sum() == 0:
                print(f"Skipping fold {fold} due to no positive samples in train or test set")
                continue

            dt_model.fit(X_train, y_train)

            # Column 1 is taken as the positive-class probability.
            positive_proba = dt_model.predict_proba(X_test)[:, 1]

            for threshold in thresholds:
                y_pred = (positive_proba >= threshold).astype(int)

                # zero_division=1: a metric whose denominator is zero (e.g.
                # precision with no positive predictions) is reported as 1.
                f1 = f1_score(y_test, y_pred, zero_division=1)
                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred, zero_division=1)
                recall = recall_score(y_test, y_pred, zero_division=1)

                metrics_per_combination.append({
                    'criterion': param_dict['criterion'],
                    'max_depth': param_dict['max_depth'],
                    'min_samples_split': param_dict['min_samples_split'],
                    'min_samples_leaf': param_dict['min_samples_leaf'],
                    'max_features': param_dict['max_features'],
                    'fold': fold,
                    'threshold': threshold,
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall,
                    'f1': f1
                })

    if not metrics_per_combination:
        print("No valid folds. Exiting.")
        return None, None

    # One row per (combination, fold, threshold)
    df_results = pd.DataFrame(metrics_per_combination)
    outFile = f"{combination_label}-params-dt-{n_splits}-folds-Threshold.csv"
    df_results.to_csv(os.path.join(outDir, outFile), index=False)

    print(f"Decision Tree analysis completed. Results saved to: {outFile}")
    return None, df_results

# Hyperparameter grid explored for the Decision Tree model
param_grid = dict(
    criterion=['gini', 'entropy'],                  # function measuring the quality of a split
    max_depth=[None, 10, 30, 50, 100, 300, 500],    # maximum depth of the tree
    min_samples_split=[2, 5, 10],                   # minimum samples required to split an internal node
    min_samples_leaf=[1, 2, 5, 10],                 # minimum samples required at a leaf node
    max_features=[None, 'sqrt', 'log2'],            # features considered when looking for the best split
)

# Z_equal, dataLabelsListEqual, Z_larger and dataLabelsListLarger are assumed
# to be defined already by the first cell.

# --- Equal combination: flaky vs smaller non-flaky files ---
print("\nStarting Decision Tree Threshold analysis for flaky vs smaller non-flaky files (equal combination)...")
_, df_results_5folds_equal = runDecisionTreeThreshold(
    Z=Z_equal,
    dataLabelsList=dataLabelsListEqual,
    outDir=outDirEqual,
    n_splits=5,
    combination_label="equal",
    param_grid=param_grid,
)

print("Results for 5-fold on equal combination:")
print(df_results_5folds_equal)

# --- Larger combination: flaky vs larger non-flaky files ---
print("\nStarting Decision Tree Threshold analysis for flaky vs larger non-flaky files (larger combination)...")
_, df_results_5folds_larger = runDecisionTreeThreshold(
    Z=Z_larger,
    dataLabelsList=dataLabelsListLarger,
    outDir=outDirLarger,
    n_splits=5,
    combination_label="larger",
    param_grid=param_grid,
)

print("Results for 5-fold on larger combination:")
print(df_results_5folds_larger)



Starting Decision Tree Threshold analysis for flaky vs smaller non-flaky files (equal combination)...
Training with parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None}
Training with parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2'}
Training with parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': None}
Training with parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2'}
Training with parameters: {'criterion': 'gini', 'max_depth': None, 'min_s

Training with parameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'log2'}
Training with parameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': None}
Training with parameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'log2'}
Training with parameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': None}
Training with parameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'gini

Training with parameters: {'criterion': 'gini', 'max_depth': 50, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': None}
Training with parameters: {'criterion': 'gini', 'max_depth': 50, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'gini', 'max_depth': 50, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'log2'}
Training with parameters: {'criterion': 'gini', 'max_depth': 50, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': None}
Training with parameters: {'criterion': 'gini', 'max_depth': 50, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'gini', 'max_depth': 50, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'log2'}
Training with parameters: {'criterion': 'gini', 'max_depth': 50, 'min_samples_split': 5, 'min_samples_leaf': 10, 'max_features': None}
Training with parameters: {'criterion': 'gini', 'max_

Training with parameters: {'criterion': 'gini', 'max_depth': 300, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'gini', 'max_depth': 300, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2'}
Training with parameters: {'criterion': 'gini', 'max_depth': 300, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': None}
Training with parameters: {'criterion': 'gini', 'max_depth': 300, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'gini', 'max_depth': 300, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'log2'}
Training with parameters: {'criterion': 'gini', 'max_depth': 300, 'min_samples_split': 2, 'min_samples_leaf': 10, 'max_features': None}
Training with parameters: {'criterion': 'gini', 'max_depth': 300, 'min_samples_split': 2, 'min_samples_leaf': 10, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'gi

Training with parameters: {'criterion': 'gini', 'max_depth': 500, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'gini', 'max_depth': 500, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'log2'}
Training with parameters: {'criterion': 'gini', 'max_depth': 500, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': None}
Training with parameters: {'criterion': 'gini', 'max_depth': 500, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'gini', 'max_depth': 500, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': 'log2'}
Training with parameters: {'criterion': 'gini', 'max_depth': 500, 'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': None}
Training with parameters: {'criterion': 'gini', 'max_depth': 500, 'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': 'sqrt'}
Training with parameters: {'criterio

Training with parameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'log2'}
Training with parameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': None}
Training with parameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'log2'}
Training with parameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 10, 'max_features': None}
Training with parameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 10, 'max_features': 'sqrt'}
Training with parameters: {'c

Training with parameters: {'criterion': 'entropy', 'max_depth': 50, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'entropy', 'max_depth': 50, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2'}
Training with parameters: {'criterion': 'entropy', 'max_depth': 50, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': None}
Training with parameters: {'criterion': 'entropy', 'max_depth': 50, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'entropy', 'max_depth': 50, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'log2'}
Training with parameters: {'criterion': 'entropy', 'max_depth': 50, 'min_samples_split': 2, 'min_samples_leaf': 10, 'max_features': None}
Training with parameters: {'criterion': 'entropy', 'max_depth': 50, 'min_samples_split': 2, 'min_samples_leaf': 10, 'max_features': 'sqrt'}
Training with parameters: {'c

Training with parameters: {'criterion': 'entropy', 'max_depth': 100, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'entropy', 'max_depth': 100, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'log2'}
Training with parameters: {'criterion': 'entropy', 'max_depth': 100, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': None}
Training with parameters: {'criterion': 'entropy', 'max_depth': 100, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'entropy', 'max_depth': 100, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': 'log2'}
Training with parameters: {'criterion': 'entropy', 'max_depth': 100, 'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': None}
Training with parameters: {'criterion': 'entropy', 'max_depth': 100, 'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': 'sqrt'}
Training with p

Training with parameters: {'criterion': 'entropy', 'max_depth': 500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'entropy', 'max_depth': 500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'log2'}
Training with parameters: {'criterion': 'entropy', 'max_depth': 500, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': None}
Training with parameters: {'criterion': 'entropy', 'max_depth': 500, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'entropy', 'max_depth': 500, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'log2'}
Training with parameters: {'criterion': 'entropy', 'max_depth': 500, 'min_samples_split': 5, 'min_samples_leaf': 10, 'max_features': None}
Training with parameters: {'criterion': 'entropy', 'max_depth': 500, 'min_samples_split': 5, 'min_samples_leaf': 10, 'max_features': 'sqrt'}
Training with paramete

Training with parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'log2'}
Training with parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': None}
Training with parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'log2'}
Training with parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': None}
Training with parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': 'sqrt'}
Training with parameters: {'cri

Training with parameters: {'criterion': 'gini', 'max_depth': 30, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'gini', 'max_depth': 30, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'log2'}
Training with parameters: {'criterion': 'gini', 'max_depth': 30, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': None}
Training with parameters: {'criterion': 'gini', 'max_depth': 30, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'gini', 'max_depth': 30, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'log2'}
Training with parameters: {'criterion': 'gini', 'max_depth': 30, 'min_samples_split': 5, 'min_samples_leaf': 10, 'max_features': None}
Training with parameters: {'criterion': 'gini', 'max_depth': 30, 'min_samples_split': 5, 'min_samples_leaf': 10, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'gini', 'm

Training with parameters: {'criterion': 'gini', 'max_depth': 100, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'gini', 'max_depth': 100, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'log2'}
Training with parameters: {'criterion': 'gini', 'max_depth': 100, 'min_samples_split': 2, 'min_samples_leaf': 10, 'max_features': None}
Training with parameters: {'criterion': 'gini', 'max_depth': 100, 'min_samples_split': 2, 'min_samples_leaf': 10, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'gini', 'max_depth': 100, 'min_samples_split': 2, 'min_samples_leaf': 10, 'max_features': 'log2'}
Training with parameters: {'criterion': 'gini', 'max_depth': 100, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': None}
Training with parameters: {'criterion': 'gini', 'max_depth': 100, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'g

Training with parameters: {'criterion': 'gini', 'max_depth': 300, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'gini', 'max_depth': 300, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': 'log2'}
Training with parameters: {'criterion': 'gini', 'max_depth': 300, 'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': None}
Training with parameters: {'criterion': 'gini', 'max_depth': 300, 'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'gini', 'max_depth': 300, 'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': 'log2'}
Training with parameters: {'criterion': 'gini', 'max_depth': 500, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None}
Training with parameters: {'criterion': 'gini', 'max_depth': 500, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt'}
Training with parameters: {'criterion

Training with parameters: {'criterion': 'entropy', 'max_depth': None, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'entropy', 'max_depth': None, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'log2'}
Training with parameters: {'criterion': 'entropy', 'max_depth': None, 'min_samples_split': 5, 'min_samples_leaf': 10, 'max_features': None}
Training with parameters: {'criterion': 'entropy', 'max_depth': None, 'min_samples_split': 5, 'min_samples_leaf': 10, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'entropy', 'max_depth': None, 'min_samples_split': 5, 'min_samples_leaf': 10, 'max_features': 'log2'}
Training with parameters: {'criterion': 'entropy', 'max_depth': None, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': None}
Training with parameters: {'criterion': 'entropy', 'max_depth': None, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt'}
Training wit

Training with parameters: {'criterion': 'entropy', 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'entropy', 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'log2'}
Training with parameters: {'criterion': 'entropy', 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 10, 'max_features': None}
Training with parameters: {'criterion': 'entropy', 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 10, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'entropy', 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 10, 'max_features': 'log2'}
Training with parameters: {'criterion': 'entropy', 'max_depth': 30, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': None}
Training with parameters: {'criterion': 'entropy', 'max_depth': 30, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt'}
Training with parameters: {'

Training with parameters: {'criterion': 'entropy', 'max_depth': 50, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'entropy', 'max_depth': 50, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': 'log2'}
Training with parameters: {'criterion': 'entropy', 'max_depth': 50, 'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': None}
Training with parameters: {'criterion': 'entropy', 'max_depth': 50, 'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'entropy', 'max_depth': 50, 'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': 'log2'}
Training with parameters: {'criterion': 'entropy', 'max_depth': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None}
Training with parameters: {'criterion': 'entropy', 'max_depth': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt'}
Training with paramet

Training with parameters: {'criterion': 'entropy', 'max_depth': 300, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'entropy', 'max_depth': 300, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'log2'}
Training with parameters: {'criterion': 'entropy', 'max_depth': 300, 'min_samples_split': 5, 'min_samples_leaf': 10, 'max_features': None}
Training with parameters: {'criterion': 'entropy', 'max_depth': 300, 'min_samples_split': 5, 'min_samples_leaf': 10, 'max_features': 'sqrt'}
Training with parameters: {'criterion': 'entropy', 'max_depth': 300, 'min_samples_split': 5, 'min_samples_leaf': 10, 'max_features': 'log2'}
Training with parameters: {'criterion': 'entropy', 'max_depth': 300, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': None}
Training with parameters: {'criterion': 'entropy', 'max_depth': 300, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt'}
Training with param

## Printing results

In [16]:
import os
import pandas as pd

# Function to extract the best results from the CSV files of each model
def extract_best_results(model_name, combination, fold_label, csv_file):
    """
    Extracts the best (highest-F1) result row from the CSV file for a model.

    Parameters:
    - model_name: The name of the model (e.g., "KNN", "SVM")
    - combination: The combination of flaky and non-flaky files (e.g., "equal", "larger")
    - fold_label: Number of folds (e.g., "5-fold" or "3-fold")
    - csv_file: The path to the CSV file containing the model's results

    Returns:
    A dictionary with the keys 'Model', 'Fold', 'Accuracy', 'Precision',
    'Recall', 'F1', 'MCC' and 'Parameters' describing the row with the highest
    F1 score, or None (after printing a message) when the file is missing,
    contains no rows, or contains no non-NaN F1 value. When several rows share
    the highest F1 score, the first one in file order is returned.

    Raises:
    KeyError if the CSV lacks one of the columns read below
    ('f1', 'accuracy', 'precision', 'recall', 'threshold', 'fold').
    """
    if not os.path.exists(csv_file):
        print(f"CSV file for {model_name} ({combination}, {fold_label}) does not exist: {csv_file}")
        return None

    # Read the CSV file. A zero-byte file makes read_csv raise instead of
    # returning an empty frame, so route it to the same "empty" branch.
    try:
        df = pd.read_csv(csv_file)
    except pd.errors.EmptyDataError:
        df = pd.DataFrame()

    if df.empty:
        print(f"CSV file for {model_name} ({combination}, {fold_label}) is empty: {csv_file}")
        return None

    # Identify the metric columns
    metric_columns = ['accuracy', 'precision', 'recall', 'f1', 'mcc']
    # Identify parameter columns (exclude known metric columns and 'fold' and 'threshold')
    parameter_columns = [col for col in df.columns if col not in metric_columns + ['fold', 'threshold']]

    # idxmax skips NaN scores; with nothing but NaN there is no best row to report.
    if not df['f1'].notna().any():
        print(f"CSV file for {model_name} ({combination}, {fold_label}) has no valid F1 scores: {csv_file}")
        return None

    # Pick the overall best row directly. Grouping by the parameter columns
    # first is unnecessary (the max of per-group maxima is the global max) and
    # harmful: groupby drops every row whose key contains NaN, which is how an
    # empty/None hyperparameter cell is read back from CSV.
    best_row = df.loc[df['f1'].idxmax()]

    # Extract metrics
    accuracy = best_row['accuracy']
    precision = best_row['precision']
    recall = best_row['recall']
    f1 = best_row['f1']
    mcc = best_row.get('mcc', None)  # Get MCC if available

    # Extract parameters
    parameters = {col: best_row[col] for col in parameter_columns}
    parameters['threshold'] = best_row['threshold']
    parameters['fold'] = int(best_row['fold'])

    # Create a combined model name
    combined_model_name = f"{combination} {model_name}"

    # Collect the best results into a dictionary
    best_results = {
        'Model': combined_model_name,
        'Fold': fold_label,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1': f1,
        'MCC': mcc,
        'Parameters': parameters
    }

    return best_results

# Function to gather and print/save the best results from a single combination
def gather_best_results_for_combination(models_results_dir, output_file_prefix, combination):
    """
    Collects the best result of every model / fold-count pair for one combination
    and writes one summary CSV per result-file variant, then prints the summaries.

    Parameters:
    - models_results_dir: Directory holding the per-model result CSV files for this combination.
    - output_file_prefix: Prefix of the summary CSV files; "_wogridsearch.csv" and
      "_one_hyperparameter.csv" are appended to it.
    - combination: The combination name (e.g., "equal", "larger"); it is also the
      leading part of every input file name.

    Returns:
    None. Summary files are written relative to the current working directory
    unless the prefix contains a path; a variant with no results produces no file
    and no table.
    """
    # Model display name -> stem used inside its result file names
    model_file_stems = {
        'KNN': 'params-knn',
        'SVM': 'params-svm',
        'Naive Bayes': 'params-nb',
        'XGBoost': 'params-xgb',
        'Random Forest': 'params-rf',
        'Decision Tree': 'params-dt'
    }

    # Variant name -> tail of the input file name (note the differing separators)
    variant_tails = {
        'wogridsearch': '-wogridsearch.csv',
        'one_hyperparameter': '_one_hyperparameter.csv',
    }

    # Best-result dictionaries gathered per variant
    collected = {variant: [] for variant in variant_tails}

    # Walk every model for both 5-fold and 3-fold runs, looking up each variant in turn
    for model_name, stem in model_file_stems.items():
        for n_splits in (5, 3):
            fold_label = f"{n_splits}-fold"
            for variant, tail in variant_tails.items():
                csv_name = f"{combination}-{stem}-{n_splits}-folds-Threshold-allKfold{tail}"
                csv_path = os.path.join(models_results_dir, csv_name)
                outcome = extract_best_results(model_name, combination, fold_label, csv_path)
                if outcome:
                    collected[variant].append(outcome)

    # Persist every non-empty variant first ...
    summary_frames = {}
    for variant, rows in collected.items():
        if not rows:
            continue
        summary = pd.DataFrame(rows)
        destination = f"{output_file_prefix}_{variant}.csv"
        summary.to_csv(destination, index=False)
        print(f"Best results for {combination} combination ({variant}) saved to: {destination}")
        summary_frames[variant] = summary

    # ... and only afterwards print the tables, in the same variant order
    for variant, summary in summary_frames.items():
        print(f"\nBest Results from All Models for {combination} Combination ({variant}):")
        print(summary.to_string(index=False))

# Example usage
if __name__ == "__main__":
    # Input directories holding the per-model result CSVs of each combination
    equal_results_dir = 'results/equal_flaky_nonflaky/'
    larger_results_dir = 'results/larger_nonflaky/'

    # Output file prefixes for the per-combination summaries
    equal_output_prefix = "best_results_equal_combination"
    larger_output_prefix = "best_results_larger_combination"

    # Summarise the equal combination first, then the larger one
    for _results_dir, _output_prefix, _combination in (
        (equal_results_dir, equal_output_prefix, "equal"),
        (larger_results_dir, larger_output_prefix, "larger"),
    ):
        gather_best_results_for_combination(_results_dir, _output_prefix, _combination)



CSV file for KNN (equal, 3-fold) does not exist: results/equal_flaky_nonflaky/equal-params-knn-3-folds-Threshold-allKfold-wogridsearch.csv
CSV file for SVM (equal, 3-fold) does not exist: results/equal_flaky_nonflaky/equal-params-svm-3-folds-Threshold-allKfold-wogridsearch.csv
CSV file for Naive Bayes (equal, 5-fold) does not exist: results/equal_flaky_nonflaky/equal-params-nb-5-folds-Threshold-allKfold_one_hyperparameter.csv
CSV file for Naive Bayes (equal, 3-fold) does not exist: results/equal_flaky_nonflaky/equal-params-nb-3-folds-Threshold-allKfold-wogridsearch.csv
CSV file for Naive Bayes (equal, 3-fold) does not exist: results/equal_flaky_nonflaky/equal-params-nb-3-folds-Threshold-allKfold_one_hyperparameter.csv
CSV file for XGBoost (equal, 3-fold) does not exist: results/equal_flaky_nonflaky/equal-params-xgb-3-folds-Threshold-allKfold-wogridsearch.csv
CSV file for Random Forest (equal, 3-fold) does not exist: results/equal_flaky_nonflaky/equal-params-rf-3-folds-Threshold-allKfol