# AI Bug Predictor - Machine Learning Model

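This notebook trains a logistic regression classifier that predicts whether a piece of source code contains a bug (`has_bug`) from static code metrics such as lines of code, cyclomatic complexity, and nesting depth.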

## 1. Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, classification_report,
    confusion_matrix, roc_auc_score, roc_curve
)
import pickle
import warnings
# NOTE(review): this suppresses every warning category for the rest of the
# session, so any warning raised by later cells will not be shown.
warnings.filterwarnings('ignore')

# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline

## 2. Load and Explore Dataset

In [None]:
# Load the dataset
df = pd.read_csv('../dataset/bug_dataset_50k.csv')

print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:")
print(df.head())

print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

print("\nBasic statistics:")
print(df.describe())

## 3. Data Preprocessing

In [None]:
# Report how many values are missing in each column
print("Missing values per column:")
print(df.isna().sum())

# Impute gaps in numeric columns with that column's median
df = df.fillna(value=df.median(numeric_only=True))

# Summarise how the target label is balanced
print("\nClass distribution:")
print(df['has_bug'].value_counts())
print(f"\nBug percentage: {df['has_bug'].mean():.2%}")

In [None]:
# Visualize feature distributions
# Code metrics whose distributions are compared between the two classes
numerical_features = [
    'loc', 'cyclomatic_complexity', 'halstead_volume',
    'num_functions', 'num_loops', 'num_conditionals',
    'num_try_except', 'num_null_checks', 'nested_depth'
]

# One panel per metric on a 3x3 grid, flattened for simple iteration
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(15, 12))
axes = axes.flatten()

# Pair each axis with its metric; histograms are split by the bug label
for ax, feature in zip(axes, numerical_features):
    sns.histplot(data=df, x=feature, hue='has_bug', bins=30, kde=True, ax=ax)
    ax.set(title=f'Distribution of {feature}', xlabel=feature)

plt.tight_layout()
plt.show()

In [None]:
# Correlation analysis
# Pairwise correlations between the numeric columns (target included)
correlation_matrix = df.corr(numeric_only=True)

# Annotated heatmap of the correlation matrix
plt.figure(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    square=True,
    cbar_kws={'shrink': 0.8},
)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

## 4. Feature Engineering

In [None]:
# Prepare features and target
# Only numeric/boolean columns are kept as features. The earlier cells already
# treat the frame as possibly containing non-numeric columns (numeric_only=True);
# without this restriction X.corr() raises on such columns in pandas >= 2.0.
y = df['has_bug']
X = df.drop(columns='has_bug').select_dtypes(include=['number', 'bool'])

# Feature selection - drop highly correlated features
correlation_threshold = 0.9
corr_matrix = X.corr().abs()
# Keep only the strict upper triangle so every feature pair is inspected once
# and a feature is never compared with itself.
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# A column is dropped when it correlates above the threshold with any column
# that precedes it in the frame.
to_drop = [
    column for column in upper_tri.columns
    if (upper_tri[column] > correlation_threshold).any()
]

print(f"Features to drop due to high correlation: {to_drop}")
X = X.drop(columns=to_drop)

# Split the data (stratified so both splits keep the same bug rate)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTraining set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"\nTraining bug percentage: {y_train.mean():.2%}")
print(f"Test bug percentage: {y_test.mean():.2%}")

In [None]:
# Scale features
# The scaler is fitted on the training split and then applied to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Feature importance (using Random Forest for initial assessment)
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1).fit(
    X_train_scaled, y_train
)

# One row per feature, most important first
feature_importance = pd.DataFrame(
    {'feature': X.columns, 'importance': rf.feature_importances_}
).sort_values('importance', ascending=False)

# Horizontal bar chart of the 15 highest-ranked features
plt.figure(figsize=(10, 8))
sns.barplot(data=feature_importance.head(15), x='importance', y='feature')
plt.title('Top 15 Feature Importances')
plt.xlabel('Importance')
plt.tight_layout()
plt.show()

## 5. Model Training and Evaluation

In [None]:
# Train Logistic Regression model
model = LogisticRegression(
    C=1.0,
    solver='lbfgs',
    class_weight='balanced',  # Handle class imbalance
    max_iter=1000,
    random_state=42,
)
model.fit(X_train_scaled, y_train)

# Make predictions: column 1 of predict_proba is kept as the positive-class
# probability; y_pred holds the hard class labels
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
y_pred = model.predict(X_test_scaled)

print("Model trained successfully!")
print(f"Training accuracy: {model.score(X_train_scaled, y_train):.4f}")
print(f"Test accuracy: {accuracy_score(y_test, y_pred):.4f}")

In [None]:
# Detailed evaluation
# Display names for the two classes, in label order (0, 1)
class_labels = ['No Bug', 'Bug']

print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=class_labels))

# Confusion Matrix (rows: true label, columns: predicted label)
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

In [None]:
# ROC Curve
# Area under the curve plus the curve's operating points, both computed from
# the predicted positive-class probabilities
roc_auc = roc_auc_score(y_test, y_pred_proba)
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

plt.figure(figsize=(8, 6))
# Model curve, followed by the diagonal drawn as the "Random" baseline
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.grid(True, alpha=0.3)
plt.show()

## 6. Model Analysis

In [None]:
# Feature coefficients analysis
# One row per feature, ordered from most positive to most negative coefficient
coefficients = pd.DataFrame(
    {'feature': X.columns, 'coefficient': model.coef_[0]}
).sort_values('coefficient', ascending=False)

print("Top 10 Positive Coefficients (increase bug probability):")
print(coefficients.head(10))
print("\nTop 10 Negative Coefficients (decrease bug probability):")
print(coefficients.tail(10))

# Visualize coefficients: red bars are positive, blue bars are zero or negative
colors = ['blue' if c <= 0 else 'red' for c in coefficients['coefficient']]

plt.figure(figsize=(12, 8))
plt.barh(coefficients['feature'], coefficients['coefficient'], color=colors)
plt.axvline(x=0, color='black', linestyle='-', alpha=0.3)
plt.title('Logistic Regression Coefficients')
plt.xlabel('Coefficient Value')
plt.tight_layout()
plt.show()

In [None]:
# Probability distribution analysis
plt.figure(figsize=(12, 5))

# Probability distribution for each class
plt.subplot(1, 2, 1)
sns.histplot(data=pd.DataFrame({
    'probability': y_pred_proba[y_test == 0],
    'class': 'No Bug'
}), x='probability', bins=30, kde=True, label='No Bug')
sns.histplot(data=pd.DataFrame({
    'probability': y_pred_proba[y_test == 1],
    'class': 'Bug'
}), x='probability', bins=30, kde=True, label='Bug')
plt.xlabel('Predicted Probability')
plt.ylabel('Frequency')
plt.title('Probability Distribution by Class')
plt.legend()

# Decision boundary analysis
plt.subplot(1, 2, 2)
# A dedicated name is used for the evenly spaced cut-offs so this cell does not
# overwrite the `thresholds` array produced by roc_curve in the ROC cell.
threshold_grid = np.linspace(0, 1, 100)
# Accuracy obtained when the predicted probability is cut at each grid value
accuracies = [
    accuracy_score(y_test, (y_pred_proba >= cutoff).astype(int))
    for cutoff in threshold_grid
]

plt.plot(threshold_grid, accuracies)
plt.xlabel('Threshold')
plt.ylabel('Accuracy')
plt.title('Accuracy vs. Decision Threshold')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 7. Model Persistence

In [None]:
# Save the model and scaler
# Bundle the fitted estimator, its scaler, the feature order and headline
# metrics into a single dictionary
model_data = dict(
    model=model,
    scaler=scaler,
    feature_names=X.columns.tolist(),
    accuracy=accuracy_score(y_test, y_pred),
    roc_auc=roc_auc,
    training_samples=len(X_train),
    test_samples=len(X_test),
)

# Serialise the bundle to model.pkl in the working directory
with open('model.pkl', 'wb') as f:
    pickle.dump(model_data, f)

# Echo what was stored
print(f"Model saved successfully as 'model.pkl'")
print(f"Model Accuracy: {model_data['accuracy']:.4f}")
print(f"ROC AUC Score: {model_data['roc_auc']:.4f}")
print(f"Features used: {len(model_data['feature_names'])}")
print(f"Training samples: {model_data['training_samples']}")
print(f"Test samples: {model_data['test_samples']}")

In [None]:
# Test the saved model
# NOTE: pickle.load can execute arbitrary code; only load files that were
# produced by this notebook or come from a trusted source.
with open('model.pkl', 'rb') as f:
    loaded_data = pickle.load(f)

loaded_model = loaded_data['model']
loaded_scaler = loaded_data['scaler']

# Test prediction on sample data
# Raw (unscaled) test rows go through the *loaded* scaler so the whole
# persisted pipeline is exercised, not only the classifier.
sample_features = loaded_scaler.transform(X_test.iloc[:5])
predictions = loaded_model.predict(sample_features)
probabilities = loaded_model.predict_proba(sample_features)[:, 1]

# The restored pipeline must reproduce what the in-memory model produced
assert np.allclose(probabilities, y_pred_proba[:5]), \
    "Reloaded model/scaler disagree with the in-memory pipeline"

print("Sample predictions:")
for i, (pred, prob) in enumerate(zip(predictions, probabilities)):
    print(f"Sample {i+1}: Predicted={'Bug' if pred == 1 else 'No Bug'} (Probability: {prob:.4f})")

## 8. Model Deployment Preparation

In [None]:
# Create a test function for the model
def predict_bug_probability(features_dict, model=None, scaler=None, feature_names=None):
    """
    Predict bug probability for given features

    Args:
        features_dict: Dictionary of feature names and values. Any feature
            missing from the dictionary is filled with a raw value of 0
            before scaling, so callers should supply every feature.
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
            Defaults to the notebook-level ``loaded_model``.
        scaler: Fitted transformer exposing ``transform``. Defaults to the
            notebook-level ``loaded_scaler``.
        feature_names: Feature names in the order the scaler/model expect.
            Defaults to ``loaded_data['feature_names']``.

    Returns:
        Dictionary with prediction results:
            has_bug (bool): predicted class label.
            probability (float): predicted probability of the positive class.
            severity (str): 'high' above 0.7, 'medium' above 0.4, else 'low'.
    """
    # Fall back to the objects restored from model.pkl when none are passed in
    if model is None:
        model = loaded_model
    if scaler is None:
        scaler = loaded_scaler
    if feature_names is None:
        feature_names = loaded_data['feature_names']
    feature_names = list(feature_names)

    # Build a single-row frame in the expected column order. The notebook's
    # scaler is fitted on a DataFrame, so passing named columns keeps the
    # input consistent with how it was fitted.
    row = [features_dict.get(name, 0) for name in feature_names]
    features_frame = pd.DataFrame([row], columns=feature_names)

    # Scale features
    features_scaled = scaler.transform(features_frame)

    # Predict
    probability = model.predict_proba(features_scaled)[0, 1]
    prediction = model.predict(features_scaled)[0]

    # Bucket the probability into a coarse severity label
    if probability > 0.7:
        severity = 'high'
    elif probability > 0.4:
        severity = 'medium'
    else:
        severity = 'low'

    return {
        'has_bug': bool(prediction),
        'probability': float(probability),
        'severity': severity
    }

# Test with sample features
# A seeded generator keeps the demo input, and therefore the printed
# prediction, identical across Restart & Run All executions.
sample_rng = np.random.default_rng(42)
# One value in [0, 10) per feature; arbitrary numbers used only to exercise
# the function, not realistic code metrics.
sample_features_dict = {
    feature: sample_rng.random() * 10
    for feature in loaded_data['feature_names']
}
result = predict_bug_probability(sample_features_dict)

print("Sample prediction:")
print(f"Features: {len(sample_features_dict)}")
print(f"Has Bug: {result['has_bug']}")
print(f"Probability: {result['probability']:.4f}")
print(f"Severity: {result['severity']}")

## 9. Summary and Conclusion

In [None]:
# Final report: dataset size, headline metrics and the most influential features
print("=" * 60)
print("AI BUG PREDICTOR - MODEL TRAINING SUMMARY")
print("=" * 60)
print()
print(f"Dataset Size: {len(df)} samples")
print(f"Bug Rate: {df['has_bug'].mean():.2%}")
print(f"Features Used: {len(loaded_data['feature_names'])}")
print(f"Model Type: Logistic Regression")
print(f"Test Accuracy: {loaded_data['accuracy']:.4f}")
print(f"ROC AUC Score: {loaded_data['roc_auc']:.4f}")
print()
print("Top 5 Important Features:")
# Rank by absolute coefficient: `coefficients` is sorted by signed value, so
# head(5) would list only the largest positive weights and miss features whose
# strongly negative weight matters just as much.
top_by_magnitude = (
    coefficients['coefficient'].abs().sort_values(ascending=False).head(5).index
)
for feature in coefficients.loc[top_by_magnitude, 'feature']:
    print(f"  - {feature}")
print()
print("Model Successfully Saved to: model.pkl")
print("=" * 60)