In [1]:
import os
import zipfile
import numpy as np
import pandas as pd
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
from itertools import product
import os
import time


# Helper: unpack a zip archive onto disk
def extract_zip(zip_file, extract_to):
    """Unpack every member of the archive ``zip_file`` into ``extract_to``.

    Args:
        zip_file: Path (or file-like object) of the zip archive to read.
        extract_to: Destination directory for the extracted members.
    """
    with zipfile.ZipFile(zip_file, mode='r') as archive:
        archive.extractall(path=extract_to)

# Function to read .py files from the given directory
def getDataPoints(path):
    """Collect the stripped text of every non-empty ``.py`` file under ``path``.

    The tree is walked recursively. Directories and files are visited in
    sorted (lexicographic) order so that the returned list has the same order
    on every machine and filesystem; ``os.walk`` itself makes no ordering
    guarantee, and downstream index-based cross-validation splits depend on
    the order of this list.

    Args:
        path: Root directory to scan.

    Returns:
        A list of file contents (``str``), whitespace-stripped, one entry per
        non-empty ``.py`` file. An empty list is returned (after printing a
        message) when ``path`` does not exist.

    Raises:
        UnicodeDecodeError: If a ``.py`` file is not valid UTF-8.
    """
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        # Sorting in place steers the (top-down) walk into sub-directories in
        # a deterministic order.
        dirs.sort()
        for dataPointName in sorted(files):
            if not dataPointName.endswith(".py"):
                continue
            file_path = os.path.join(root, dataPointName)
            with open(file_path, encoding="utf-8") as fileIn:
                dp = fileIn.read().strip()
            if dp:
                dataPointsList.append(dp)
            else:
                print(f"Empty file skipped: {file_path}")
    return dataPointsList
# Scorer callable returning the Matthews correlation coefficient
def mcc_scorer(estimator, X, y_true):
    """Score ``estimator`` on ``X`` by the MCC of its predictions against ``y_true``.

    Args:
        estimator: Fitted object exposing ``predict(X)``.
        X: Samples to predict on.
        y_true: Ground-truth labels aligned with ``X``.

    Returns:
        The value of ``matthews_corrcoef(y_true, estimator.predict(X))``.
    """
    predictions = estimator.predict(X)
    score = matthews_corrcoef(y_true, predictions)
    return score
# Extraction targets: one sub-directory per class under a common root
extractDir = "extracted_files"
flakyDir = os.path.join(extractDir, 'flaky')
nonFlakyDir = os.path.join(extractDir, 'nonFlaky')

# Zipped datasets (adjust these paths to your environment)
flakyZip = "Dataset/flaky_files.zip"
nonFlakyZip = "Dataset/nonflaky_files.zip"

# Both targets must exist before anything is unpacked
os.makedirs(flakyDir, exist_ok=True)
os.makedirs(nonFlakyDir, exist_ok=True)

# Unpack each archive into the directory of its class
extract_zip(zip_file=flakyZip, extract_to=flakyDir)
extract_zip(zip_file=nonFlakyZip, extract_to=nonFlakyDir)

# Read the Python sources of each class
dataPointsFlaky = getDataPoints(flakyDir)
dataPointsNonFlaky = getDataPoints(nonFlakyDir)

# Single corpus: flaky samples first, non-flaky samples after
dataPoints = [*dataPointsFlaky, *dataPointsNonFlaky]

# Labels aligned with the corpus: 1 = flaky, 0 = non-flaky
dataLabelsList = np.array([1] * len(dataPointsFlaky) + [0] * len(dataPointsNonFlaky))

# Quick summary of what was loaded
print(
    f"Number of flaky files: {len(dataPointsFlaky)}\n"
    f"Number of non-flaky files: {len(dataPointsNonFlaky)}\n"
    f"Total number of data points: {len(dataPoints)}"
)


Number of flaky files: 45
Number of non-flaky files: 243
Total number of data points: 288


## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

def runRFwithThreshold(dataPoints, dataLabelsList, outDir, n_splits):
    """Cross-validate a bag-of-words Random Forest and sweep the decision threshold.

    For every hyper-parameter combination in the grid below, a
    ``CountVectorizer`` + ``RandomForestClassifier`` pipeline is fitted on each
    training fold of a stratified, shuffled K-fold split (seed 42). The
    predicted probability of class 1 on the held-out fold is binarised at nine
    thresholds (0.1, 0.2, ..., 0.9); accuracy, precision, recall, F1 and MCC
    are computed per fold and averaged over the folds. All averaged rows are
    written to ``<outDir>/rf-threshold-results.csv`` and the row with the
    highest mean F1 is reported.

    Args:
        dataPoints: Position-indexable sequence of raw text documents.
        dataLabelsList: Binary 0/1 labels aligned with ``dataPoints``
            (predictions are produced as 0/1 integers).
        outDir: Directory receiving the CSV; created if it does not exist.
        n_splits: Number of cross-validation folds.

    Returns:
        Tuple ``(best_params, best_threshold, best_f1, final_mcc)`` taken from
        the (parameters, threshold) row with the highest mean F1; ties resolve
        to the first such row.

    Note:
        The threshold is selected on the same folds that produce the reported
        scores, so the returned F1/MCC are optimistic estimates.
    """
    # Hyper-parameter grid; keys use the pipeline's ``<step>__<param>`` naming
    param_grid = {
        'rf__n_estimators': [50],          # Number of trees in the forest
        'rf__max_depth': [10],             # Maximum depth of the tree
        'rf__min_samples_split': [5],      # Minimum number of samples required to split a node
        'rf__min_samples_leaf': [2],       # Minimum number of samples required at a leaf node
        'rf__criterion': ['entropy']       # Function to measure the quality of a split
    }
    param_keys = list(param_grid)

    # Decision thresholds to evaluate: 0.1 to 0.9
    thresholds = np.linspace(0.1, 0.9, 9)
    metric_names = ('accuracy', 'precision', 'recall', 'f1', 'mcc')

    # The splits depend only on the labels and the fixed seed, so they are
    # materialised once and shared by every parameter combination.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    folds = []
    for train_index, test_index in skf.split(dataPoints, dataLabelsList):
        folds.append((
            [dataPoints[i] for i in train_index],
            [dataLabelsList[i] for i in train_index],
            [dataPoints[i] for i in test_index],
            [dataLabelsList[i] for i in test_index],
        ))

    metrics_per_combination = []

    for params in product(*param_grid.values()):
        param_dict = dict(zip(param_keys, params))

        # One accumulator per threshold position (not per float value), each
        # mapping metric name -> list of per-fold scores.
        fold_scores = [{name: [] for name in metric_names} for _ in thresholds]

        for X_train, y_train, X_test, y_test in folds:
            # The vectorizer lives inside the pipeline so its vocabulary is
            # learned from the training fold only.
            pipeline = ImbPipeline([
                ('vectorizer', CountVectorizer(stop_words=None)),
                ('rf', RandomForestClassifier(random_state=42))
            ])
            pipeline.set_params(**param_dict)
            pipeline.fit(X_train, y_train)

            # Probability of class 1 for each held-out document
            y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

            for scores, threshold in zip(fold_scores, thresholds):
                y_pred_threshold = (y_pred_proba >= threshold).astype(int)

                # NOTE(review): zero_division=1 scores precision as 1.0 when a
                # threshold yields no positive predictions, which can inflate
                # the averaged precision at high thresholds - confirm intended.
                scores['accuracy'].append(accuracy_score(y_test, y_pred_threshold))
                scores['precision'].append(
                    precision_score(y_test, y_pred_threshold, zero_division=1))
                scores['recall'].append(
                    recall_score(y_test, y_pred_threshold, zero_division=1))
                scores['f1'].append(f1_score(y_test, y_pred_threshold, zero_division=1))
                scores['mcc'].append(matthews_corrcoef(y_test, y_pred_threshold))

        # One averaged row per threshold for this parameter combination
        for scores, threshold in zip(fold_scores, thresholds):
            metrics_per_combination.append({
                **param_dict,
                'threshold': threshold,
                **{name: np.mean(values) for name, values in scores.items()},
            })

    # Pick the (parameters, threshold) row with the best mean F1
    best_result = max(metrics_per_combination, key=lambda row: row['f1'])

    # Save every averaged row to CSV, creating the output directory if needed
    df_metrics = pd.DataFrame(metrics_per_combination)
    if outDir:
        os.makedirs(outDir, exist_ok=True)
    outFile_metrics = os.path.join(outDir, "rf-threshold-results.csv")
    df_metrics.to_csv(outFile_metrics, index=False)

    print(f"Results saved to: {outFile_metrics}")

    # Extract the best parameters, threshold, and metrics
    best_params = {key: best_result[key] for key in param_keys}
    best_threshold = best_result['threshold']
    best_f1 = best_result['f1']
    final_mcc = best_result['mcc']

    print("\nBest Parameters, Threshold, and Metrics:")
    print(f"Best Parameters: {best_params}")
    print(f"Best Threshold: {best_threshold}")
    print(f"Best F1 Score: {best_f1}")
    print(f"Final MCC: {final_mcc}")

    return best_params, best_threshold, best_f1, final_mcc

# 5-fold cross-validation run for the Random Forest pipeline
outDir = "results"
os.makedirs(outDir, exist_ok=True)

# Sweep the decision threshold for the Random Forest under 5-fold cross-validation
print("\nStarting Random Forest analysis with Threshold adjustment for 5-fold cross-validation...")
(best_params_5folds,
 best_threshold_5folds,
 best_f1_5folds,
 final_mcc_5folds) = runRFwithThreshold(dataPoints, dataLabelsList, outDir, n_splits=5)

# Report the winning configuration
print("\nBest results for Random Forest with Threshold adjustment 5-fold cross-validation:")
print(
    f"Best Parameters: {best_params_5folds}\n"
    f"Best Threshold: {best_threshold_5folds}\n"
    f"Best F1 Score: {best_f1_5folds}\n"
    f"Final MCC: {final_mcc_5folds}"
)



Starting Random Forest analysis with SMOTE and Threshold adjustment for 5-fold cross-validation...
Results saved to: Threshold-results-RF-new\rf-smote-threshold-results-5-folds.csv

Best Parameters, Threshold, and Metrics:
Best Parameters: {'rf__n_estimators': 50, 'rf__max_depth': 10, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 2, 'rf__criterion': 'entropy'}
Best Threshold: 0.5
Best F1 Score: 0.8661764705882353
Final MCC: 0.8487345249535231

Best results for Random Forest with SMOTE and Threshold adjustment 5-fold cross-validation:
Best Parameters: {'rf__n_estimators': 50, 'rf__max_depth': 10, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 2, 'rf__criterion': 'entropy'}
Best Threshold: 0.5
Best F1 Score: 0.8661764705882353
Final MCC: 0.8487345249535231


## Decision Tree

In [2]:
from sklearn.tree import DecisionTreeClassifier 
import time

def runDTwithThreshold(dataPoints, dataLabelsList, outDir, n_splits):
    """Cross-validate a bag-of-words Decision Tree and sweep the decision threshold.

    For every hyper-parameter combination in the grid below, a
    ``CountVectorizer`` + ``DecisionTreeClassifier`` pipeline is fitted on each
    training fold of a stratified, shuffled K-fold split (seed 42). The
    predicted probability of class 1 on the held-out fold is binarised at nine
    thresholds (0.1, 0.2, ..., 0.9); accuracy, precision, recall, F1 and MCC
    are computed per fold and averaged over the folds. All averaged rows are
    written to ``<outDir>/dt-threshold-results.csv`` and the row with the
    highest mean F1 is reported.

    Args:
        dataPoints: Position-indexable sequence of raw text documents.
        dataLabelsList: Binary 0/1 labels aligned with ``dataPoints``
            (predictions are produced as 0/1 integers).
        outDir: Directory receiving the CSV; created if it does not exist.
        n_splits: Number of cross-validation folds.

    Returns:
        Tuple ``(best_params, best_threshold, best_f1, final_mcc)`` taken from
        the (parameters, threshold) row with the highest mean F1; ties resolve
        to the first such row.

    Note:
        The threshold is selected on the same folds that produce the reported
        scores, so the returned F1/MCC are optimistic estimates.
    """
    # Hyper-parameter grid; keys use the pipeline's ``<step>__<param>`` naming
    param_grid = {
        'dt__max_depth': [10],
        'dt__min_samples_split': [5],
        'dt__min_samples_leaf': [2],
        'dt__criterion': ['gini'],
        'dt__max_features': [None]
    }
    param_keys = list(param_grid)

    # Decision thresholds to evaluate: 0.1 to 0.9
    thresholds = np.linspace(0.1, 0.9, 9)
    metric_names = ('accuracy', 'precision', 'recall', 'f1', 'mcc')

    # The splits depend only on the labels and the fixed seed, so they are
    # materialised once and shared by every parameter combination.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    folds = []
    for train_index, test_index in skf.split(dataPoints, dataLabelsList):
        folds.append((
            [dataPoints[i] for i in train_index],
            [dataLabelsList[i] for i in train_index],
            [dataPoints[i] for i in test_index],
            [dataLabelsList[i] for i in test_index],
        ))

    metrics_per_combination = []

    for params in product(*param_grid.values()):
        param_dict = dict(zip(param_keys, params))

        # One accumulator per threshold position (not per float value), each
        # mapping metric name -> list of per-fold scores.
        fold_scores = [{name: [] for name in metric_names} for _ in thresholds]

        for X_train, y_train, X_test, y_test in folds:
            # The vectorizer lives inside the pipeline so its vocabulary is
            # learned from the training fold only.
            pipeline = ImbPipeline([
                ('vectorizer', CountVectorizer(stop_words=None)),
                ('dt', DecisionTreeClassifier(random_state=42))
            ])
            pipeline.set_params(**param_dict)
            pipeline.fit(X_train, y_train)

            # Probability of class 1 for each held-out document
            y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

            for scores, threshold in zip(fold_scores, thresholds):
                y_pred_threshold = (y_pred_proba >= threshold).astype(int)

                # NOTE(review): zero_division=1 scores precision as 1.0 when a
                # threshold yields no positive predictions, which can inflate
                # the averaged precision at high thresholds - confirm intended.
                scores['accuracy'].append(accuracy_score(y_test, y_pred_threshold))
                scores['precision'].append(
                    precision_score(y_test, y_pred_threshold, zero_division=1))
                scores['recall'].append(
                    recall_score(y_test, y_pred_threshold, zero_division=1))
                scores['f1'].append(f1_score(y_test, y_pred_threshold, zero_division=1))
                scores['mcc'].append(matthews_corrcoef(y_test, y_pred_threshold))

        # One averaged row per threshold for this parameter combination
        for scores, threshold in zip(fold_scores, thresholds):
            metrics_per_combination.append({
                **param_dict,
                'threshold': threshold,
                **{name: np.mean(values) for name, values in scores.items()},
            })

    # Pick the (parameters, threshold) row with the best mean F1
    best_result = max(metrics_per_combination, key=lambda row: row['f1'])

    # Save every averaged row to CSV, creating the output directory if needed
    df_metrics = pd.DataFrame(metrics_per_combination)
    if outDir:
        os.makedirs(outDir, exist_ok=True)
    outFile_metrics = os.path.join(outDir, "dt-threshold-results.csv")
    df_metrics.to_csv(outFile_metrics, index=False)

    print(f"Results saved to: {outFile_metrics}")

    # Extract the best parameters, threshold, and metrics
    best_params = {key: best_result[key] for key in param_keys}
    best_threshold = best_result['threshold']
    best_f1 = best_result['f1']
    final_mcc = best_result['mcc']

    print("\nBest Parameters, Threshold, and Metrics:")
    print(f"Best Parameters: {best_params}")
    print(f"Best Threshold: {best_threshold}")
    print(f"Best F1 Score: {best_f1}")
    print(f"Final MCC: {final_mcc}")

    return best_params, best_threshold, best_f1, final_mcc
# Main Execution for 5-Fold Cross-Validation
outDir = "results"
os.makedirs(outDir, exist_ok=True)

# Run Decision Tree with Threshold adjustment using 5-fold cross-validation.
# runDTwithThreshold builds a CountVectorizer + DecisionTreeClassifier pipeline
# with no SMOTE step, so the status messages must not claim SMOTE was applied.
print("\nStarting Decision Tree analysis with Threshold adjustment for 5-fold cross-validation...")
best_params_5folds, best_threshold_5folds, best_f1_5folds, final_mcc_5folds = runDTwithThreshold(
    dataPoints, dataLabelsList, outDir, n_splits=5)

# Display results
print("\nBest results for Decision Tree with Threshold adjustment 5-fold cross-validation:")
print(f"Best Parameters: {best_params_5folds}")
print(f"Best Threshold: {best_threshold_5folds}")
print(f"Best F1 Score: {best_f1_5folds}")
print(f"Final MCC: {final_mcc_5folds}")



Starting Decision Tree analysis with SMOTE and Threshold adjustment for 5-fold cross-validation...
Results saved to: results\dt-threshold-results.csv

Best Parameters, Threshold, and Metrics:
Best Parameters: {'dt__max_depth': 10, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 2, 'dt__criterion': 'gini', 'dt__max_features': None}
Best Threshold: 0.1
Best F1 Score: 0.9013453419645371
Final MCC: 0.8924103960373342

Best results for Decision Tree with SMOTE and Threshold adjustment 5-fold cross-validation:
Best Parameters: {'dt__max_depth': 10, 'dt__min_samples_split': 5, 'dt__min_samples_leaf': 2, 'dt__criterion': 'gini', 'dt__max_features': None}
Best Threshold: 0.1
Best F1 Score: 0.9013453419645371
Final MCC: 0.8924103960373342
