In [1]:
# Cell 1: Preprocessing and Data Preparation

import os
import zipfile
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Function to extract zip files
def extract_zip(zip_file, extract_to):
    """Extracts a zip file to the specified directory."""
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

# Function to read .py files from the given directory
def getDataPoints(path):
    """Collects content of all .py files within the given directory."""
    dataPointsList = []
    if not os.path.exists(path):
        print(f"Directory does not exist: {path}")
        return dataPointsList

    for root, dirs, files in os.walk(path):
        for dataPointName in files:
            if dataPointName.endswith(".py"):
                file_path = os.path.join(root, dataPointName)
                with open(file_path, encoding="utf-8") as fileIn:
                    dp = fileIn.read().strip()
                    if dp:
                        dataPointsList.append(dp)
                    else:
                        print(f"Empty file skipped: {file_path}")
    return dataPointsList

# Function for vectorization using CountVectorizer
def flastVectorization(dataPoints):
    countVec = CountVectorizer(stop_words=None)
    return countVec.fit_transform(dataPoints)

# Paths to your zip files (you can modify these paths)
flakyZip = "/content/flaky_files (1).zip"
nonFlakyZip = "/content/all_nonflaky_files (1).zip"

# Create directories for extraction
extractDir = "extracted_files"
flakyDir = os.path.join(extractDir, 'flaky')
nonFlakyDir = os.path.join(extractDir, 'nonFlaky')

# Ensure directories exist
os.makedirs(flakyDir, exist_ok=True)
os.makedirs(nonFlakyDir, exist_ok=True)

# Extract the zip files
extract_zip(flakyZip, flakyDir)
extract_zip(nonFlakyZip, nonFlakyDir)

# Collect data points from flaky and non-flaky files
dataPointsFlaky = getDataPoints(flakyDir)
dataPointsNonFlaky = getDataPoints(nonFlakyDir)
dataPoints = dataPointsFlaky + dataPointsNonFlaky

# Create labels: 1 for flaky, 0 for non-flaky
dataLabelsList = np.array([1]*len(dataPointsFlaky) + [0]*len(dataPointsNonFlaky))

# Vectorize the data
Z = flastVectorization(dataPoints)

print(f"Data points shape: {Z.shape}")
print(f"Number of flaky files: {len(dataPointsFlaky)}")
print(f"Number of non-flaky files: {len(dataPointsNonFlaky)}")


Data points shape: (299, 11986)
Number of flaky files: 45
Number of non-flaky files: 254


Random Forest

In [2]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
from itertools import product

# Custom MCC scorer function
def mcc_scorer(estimator, X, y_true):
    y_pred = estimator.predict(X)
    return matthews_corrcoef(y_true, y_pred)

# Summarize and find best parameter combinations
def summarize_and_find_best(df_results):
    summary_df = df_results.groupby(['rf__n_estimators', 'rf__max_depth', 'rf__min_samples_split', 'rf__min_samples_leaf', 'rf__criterion', 'threshold']).agg({
        'accuracy': 'mean',
        'precision': 'mean',
        'recall': 'mean',
        'f1': 'mean',
        'mcc': 'mean'
    }).reset_index()

    # Finding the best parameter set based on the highest F1 score
    best_row = summary_df.loc[summary_df['f1'].idxmax()]

    return summary_df, best_row

# Combined SMOTE and Threshold-based Random Forest (No PCA)
def runRFWithSMOTEAndThreshold(X, y, outDir, n_splits, param_grid, thresholds):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    metrics_per_combination = []

    for params in product(*param_grid.values()):
        param_dict = dict(zip(param_grid.keys(), params))

        # Define pipeline with SMOTE and Random Forest
        pipeline = ImbPipeline([
            ('smote', SMOTE(random_state=42)),
            ('rf', RandomForestClassifier(
                n_estimators=param_dict['rf__n_estimators'],
                max_depth=param_dict['rf__max_depth'],
                min_samples_split=param_dict['rf__min_samples_split'],
                min_samples_leaf=param_dict['rf__min_samples_leaf'],
                criterion=param_dict['rf__criterion'],
                random_state=42,
                n_jobs=-1
            ))
        ])

        for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
            X_fold_train, X_fold_test = X[train_index].toarray(), X[test_index].toarray()
            y_fold_train, y_fold_test = y[train_index], y[test_index]

            # Train the pipeline
            pipeline.fit(X_fold_train, y_fold_train)

            # Predict probabilities
            y_pred_proba = pipeline.predict_proba(X_fold_test)[:, 1]  # Random Forest probability for positive class

            for threshold in thresholds:
                y_pred_threshold = (y_pred_proba >= threshold).astype(int)

                # Calculate metrics
                accuracy = accuracy_score(y_fold_test, y_pred_threshold)
                precision = precision_score(y_fold_test, y_pred_threshold, zero_division=1)
                recall = recall_score(y_fold_test, y_pred_threshold, zero_division=1)
                f1 = f1_score(y_fold_test, y_pred_threshold, zero_division=1)
                mcc = matthews_corrcoef(y_fold_test, y_pred_threshold)

                # Store results
                metrics_per_combination.append({
                    'rf__n_estimators': param_dict['rf__n_estimators'],
                    'rf__max_depth': param_dict['rf__max_depth'],
                    'rf__min_samples_split': param_dict['rf__min_samples_split'],
                    'rf__min_samples_leaf': param_dict['rf__min_samples_leaf'],
                    'rf__criterion': param_dict['rf__criterion'],
                    'fold': fold + 1,
                    'threshold': threshold,
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall,
                    'f1': f1,
                    'mcc': mcc
                })

    # Convert metrics to DataFrame and save results
    df_results = pd.DataFrame(metrics_per_combination)
    os.makedirs(outDir, exist_ok=True)
    outFile = os.path.join(outDir, "rf-smote-threshold-results.csv")
    df_results.to_csv(outFile, index=False)

    # Summarize results and return the best parameter set
    summary_df, best_params = summarize_and_find_best(df_results)

    # Save summarized results to a new CSV file
    summary_outFile = os.path.join(outDir, "rf-smote-threshold-summary.csv")
    summary_df.to_csv(summary_outFile, index=False)
    print(f"Summary of results saved to {summary_outFile}")

    return df_results, best_params

# Hyperparameters and thresholds
param_grid = {
    'rf__n_estimators': [10, 50, 100],                   # Number of trees in the forest
    'rf__max_depth': [10, 30, 50],                       # Maximum depth of the tree
    'rf__min_samples_split': [2, 5],                     # Minimum number of samples required to split a node
    'rf__min_samples_leaf': [1, 2],                      # Minimum number of samples required at a leaf node
    'rf__criterion': ['gini', 'entropy']                 # Function to measure the quality of a split
}

thresholds = np.linspace(0.1, 0.9, 9)  # Thresholds from 0.1 to 0.9

outDir = "results/"
df_results, best_params = runRFWithSMOTEAndThreshold(Z, dataLabelsList, outDir, n_splits=5, param_grid=param_grid, thresholds=thresholds)

print("\nRandom Forest with SMOTE and Threshold analysis completed.")
print("Best parameter set based on F1-score:")
print(best_params)


Summary of results saved to results/rf-smote-threshold-summary.csv

Random Forest with SMOTE and Threshold analysis completed.
Best parameter set based on F1-score:
rf__n_estimators               10
rf__max_depth                  10
rf__min_samples_split           2
rf__min_samples_leaf            1
rf__criterion             entropy
threshold                     0.6
accuracy                 0.956554
precision                0.877273
recall                   0.844444
f1                       0.849412
mcc                      0.831831
Name: 5, dtype: object


Decision Tree

In [4]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
from itertools import product

# Custom MCC scorer function
def mcc_scorer(estimator, X, y_true):
    y_pred = estimator.predict(X)
    return matthews_corrcoef(y_true, y_pred)

# Summarize and find best parameter combinations
def summarize_and_find_best(df_results):
    summary_df = df_results.groupby(['dt__max_depth', 'dt__min_samples_split', 'dt__min_samples_leaf', 'dt__criterion', 'threshold']).agg({
        'accuracy': 'mean',
        'precision': 'mean',
        'recall': 'mean',
        'f1': 'mean',
        'mcc': 'mean'
    }).reset_index()

    # Finding the best parameter set based on the highest F1 score
    best_row = summary_df.loc[summary_df['f1'].idxmax()]

    return summary_df, best_row

# Combined SMOTE and Threshold-based Decision Tree (No PCA)
def runDTWithSMOTEAndThreshold(X, y, outDir, n_splits, param_grid, thresholds):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    metrics_per_combination = []

    for params in product(*param_grid.values()):
        param_dict = dict(zip(param_grid.keys(), params))

        # Define pipeline with SMOTE and Decision Tree
        pipeline = ImbPipeline([
            ('smote', SMOTE(random_state=42)),
            ('dt', DecisionTreeClassifier(
                max_depth=param_dict['dt__max_depth'],
                min_samples_split=param_dict['dt__min_samples_split'],
                min_samples_leaf=param_dict['dt__min_samples_leaf'],
                criterion=param_dict['dt__criterion'],
                random_state=42
            ))
        ])

        for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
            X_fold_train, X_fold_test = X[train_index].toarray(), X[test_index].toarray()
            y_fold_train, y_fold_test = y[train_index], y[test_index]

            # Train the pipeline
            pipeline.fit(X_fold_train, y_fold_train)

            # Predict probabilities
            y_pred_proba = pipeline.predict_proba(X_fold_test)[:, 1]  # Decision Tree probability for positive class

            for threshold in thresholds:
                y_pred_threshold = (y_pred_proba >= threshold).astype(int)

                # Calculate metrics
                accuracy = accuracy_score(y_fold_test, y_pred_threshold)
                precision = precision_score(y_fold_test, y_pred_threshold, zero_division=1)
                recall = recall_score(y_fold_test, y_pred_threshold, zero_division=1)
                f1 = f1_score(y_fold_test, y_pred_threshold, zero_division=1)
                mcc = matthews_corrcoef(y_fold_test, y_pred_threshold)

                # Store results
                metrics_per_combination.append({
                    'dt__max_depth': param_dict['dt__max_depth'],
                    'dt__min_samples_split': param_dict['dt__min_samples_split'],
                    'dt__min_samples_leaf': param_dict['dt__min_samples_leaf'],
                    'dt__criterion': param_dict['dt__criterion'],
                    'fold': fold + 1,
                    'threshold': threshold,
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall,
                    'f1': f1,
                    'mcc': mcc
                })

    # Convert metrics to DataFrame and save results
    df_results = pd.DataFrame(metrics_per_combination)
    os.makedirs(outDir, exist_ok=True)
    outFile = os.path.join(outDir, "dt-smote-threshold-results.csv")
    df_results.to_csv(outFile, index=False)

    # Summarize results and return the best parameter set
    summary_df, best_params = summarize_and_find_best(df_results)

    # Save summarized results to a new CSV file
    summary_outFile = os.path.join(outDir, "dt-smote-threshold-summary.csv")
    summary_df.to_csv(summary_outFile, index=False)
    print(f"Summary of results saved to {summary_outFile}")

    return df_results, best_params

# Hyperparameters and thresholds
param_grid = {
    'dt__max_depth': [10, 20, 30],                  # Maximum depth of the decision tree
    'dt__min_samples_split': [2, 5],                # Minimum number of samples required to split a node
    'dt__min_samples_leaf': [1, 2],                 # Minimum number of samples required at a leaf node
    'dt__criterion': ['gini', 'entropy']            # Function to measure the quality of a split
}

thresholds = np.linspace(0.1, 0.9, 9)  # Thresholds from 0.1 to 0.9

outDir = "results/"
df_results, best_params = runDTWithSMOTEAndThreshold(Z, dataLabelsList, outDir, n_splits=5, param_grid=param_grid, thresholds=thresholds)

print("\nDecision Tree with SMOTE and Threshold analysis completed.")
print("Best parameter set based on F1-score:")
print(best_params)


Summary of results saved to results/dt-smote-threshold-summary.csv

Decision Tree with SMOTE and Threshold analysis completed.
Best parameter set based on F1-score:
dt__max_depth                  10
dt__min_samples_split           2
dt__min_samples_leaf            1
dt__criterion                gini
threshold                     0.1
accuracy                 0.963277
precision                    0.94
recall                   0.822222
f1                       0.872074
mcc                      0.856652
Name: 9, dtype: object
