In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the Excel file
file_path = r'D:\Z_E2E\CG\Defect_Tracer\defectTracer\Defect_pred_planner.xlsx'
sheets = pd.read_excel(file_path, sheet_name=None)

In [3]:
# Access key sheets
release1 = sheets['Release1']
test_execution_history_1 = sheets['Test Execution History_1']
rel1_defects = sheets['Rel1 Defects']

In [4]:
# Merge the sheets for Release 1
release1_merged = pd.merge(release1, test_execution_history_1, on='TC_ID', how='left')
release1_merged = pd.merge(release1_merged, rel1_defects[['TC_ID', 'Severity', 'Status', 'Environment']], on='TC_ID', how='left')


In [5]:
# Fill missing values for Severity and Execution Time
release1_merged['Severity'].fillna(0, inplace=True)  # Assuming 0 means no defect
release1_merged['Execution Time'].fillna(release1_merged['Execution Time'].mean(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  release1_merged['Severity'].fillna(0, inplace=True)  # Assuming 0 means no defect
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  release1_merged['Execution Time'].fillna(release1_merged['Execution Time'].mean(), inplace=True)
  release1_merged['Execution Time'].fillna(release1_m

In [6]:
# Create binary target variable for defect presence (1 if Severity > 0, else 0)
release1_merged['defect_present'] = release1_merged['Severity'].apply(lambda x: 1 if x > 0 else 0)


In [7]:
# One-hot encode the 'Result' and 'Environment_x' columns for model input
release1_encoded = pd.get_dummies(release1_merged, columns=['Result', 'Environment_x'], drop_first=True)


In [8]:
# Define the features (X) and target (y)
features = ['Coverage (%)', 'Execution Time', 'Result_Out of scope', 'Result_Passed', 'Environment_x_SIT', 'Environment_x_UAT']
X = release1_encoded[features]
y = release1_encoded['defect_present']


In [9]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [10]:
# Train the Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [11]:
# Make predictions and evaluate the model
y_pred = rf_model.predict(X_test)


In [12]:
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)


In [13]:
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_rep)

Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00         5

    accuracy                           1.00         7
   macro avg       1.00      1.00      1.00         7
weighted avg       1.00      1.00      1.00         7



In [14]:
# Perform cross-validation
from sklearn.model_selection import cross_val_score
cross_val_scores = cross_val_score(rf_model, X, y, cv=5)
cross_val_mean = cross_val_scores.mean()

In [15]:
print(f"Cross-Validation Scores: {cross_val_scores}")
print(f"Mean Cross-Validation Score: {cross_val_mean}")


Cross-Validation Scores: [1. 1. 1. 1. 1.]
Mean Cross-Validation Score: 1.0


In [16]:
# Perform feature importance analysis for the trained Random Forest model
importances = rf_model.feature_importances_
feature_names = X.columns

In [17]:
# Create a DataFrame for feature importances
feature_importances_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)


In [18]:
# Print feature importances
print("\nFeature Importances:")
print(feature_importances_df)


Feature Importances:
               Feature  Importance
3        Result_Passed    0.572844
1       Execution Time    0.266432
0         Coverage (%)    0.078619
2  Result_Out of scope    0.053340
5    Environment_x_UAT    0.018112
4    Environment_x_SIT    0.010654


In [19]:
# Optional: Hyperparameter Tuning using GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}


In [20]:
# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Perform the grid search to find the best parameters
grid_search.fit(X, y)

# Get the best parameters and the best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Cross-Validation Score:", best_score)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
Best Cross-Validation Score: 1.0
