In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pickle
import time
import os
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Folder that receives the figures and pickles written further down;
# exist_ok=True makes re-running the cell harmless.
results_dir = 'model_results'
os.makedirs(results_dir, exist_ok=True)

# 1. Read the preprocessed CSV (scaled features, original-scale targets)
print("Loading preprocessed dataset...")
df = pd.read_csv('../../common_features_scaled_with_original_targets.csv')
print(f"Dataset shape: {df.shape}")

# 2. Report columns whose dtype is not numeric. This is informational only:
#    nothing is dropped or converted here.
non_numeric_cols = df.select_dtypes(exclude=np.number).columns.tolist()
if not non_numeric_cols:
    print("All columns are numeric - good to proceed.")
else:
    print(f"Warning: Found {len(non_numeric_cols)} non-numeric columns that need handling: {non_numeric_cols}")

# 3. Candidate target columns. All of them are excluded from the feature
#    matrix later, whichever one ends up being predicted.
target_variables = [
    'avg_resolution_hours',
    'median_resolution_hours',
    'min_resolution_hours',
    'max_resolution_hours',
    'resolution_hours_std',
    'total_resolution_hours',
]

# Keep only the candidates that are actually present, preserving list order.
available_targets = [name for name in target_variables if name in df.columns]
if len(available_targets) < len(target_variables):
    print(f"Note: Some target variables are not in the dataset. Available targets: {available_targets}")

# 4. Preferred target, falling back to the first available candidate.
primary_target = 'avg_resolution_hours'
if primary_target not in df.columns:
    print(f"Warning: Primary target '{primary_target}' not found in dataset!")
    if not available_targets:
        # Nothing to predict: abort rather than continue without a target.
        raise ValueError("No suitable target variable found in dataset!")
    primary_target = available_targets[0]
    print(f"Using '{primary_target}' as alternative target.")

print(f"Selected primary target variable: {primary_target}")

# 5. Detect features that could leak the target.
#
# The scan runs over df's non-target columns *before* X is built. The previous
# version read X.columns ahead of X's first assignment (NameError on a fresh
# kernel) and then rebuilt X from df afterwards, which silently put the
# leakage columns back into the feature matrix.

# Substrings matched case-insensitively against each column name.
leakage_terms = ['resolution_hours', 'resolution_time', 'hours_', 'time_spent']
# Additional stems matched case-sensitively (aggregates of the resolution time).
leakage_stems = ('avg_resolution', 'median_resolution', 'total_resolution')

# Target columns actually present in df; they are never features.
target_columns = [col for col in target_variables if col in df.columns]
candidate_features = [col for col in df.columns if col not in target_variables]

# A column is flagged when either rule matches; column order is preserved and
# each column is listed at most once.
leakage_features = [
    col for col in candidate_features
    if any(term in col.lower() for term in leakage_terms)
    or any(stem in col for stem in leakage_stems)
]

if leakage_features:
    print(f"Removing {len(leakage_features)} features to prevent data leakage:")
    for feature in leakage_features:
        print(f"  - {feature}")
else:
    print("No potential data leakage features detected.")

# 6. Separate features and target. X excludes every target column and every
#    flagged leakage column, so the shape printed here is the final one.
X = df.drop(columns=target_columns + leakage_features)
y = df[primary_target]

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

# 7. Data partitioning: 50% training, 25% validation, 25% testing.
#    Done exactly once, on the leakage-free X.
# First split: 50% training, 50% remaining
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.5, random_state=42)

# Second split: divide the remaining 50% into equal parts for validation and testing
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print("Data partitioning complete:")
print(f"  Training set: {X_train.shape[0]} samples ({X_train.shape[0]/len(X)*100:.1f}%)")
print(f"  Validation set: {X_val.shape[0]} samples ({X_val.shape[0]/len(X)*100:.1f}%)")
print(f"  Test set: {X_test.shape[0]} samples ({X_test.shape[0]/len(X)*100:.1f}%)")

# 8. Basic exploratory analysis of the target variable:
#    one histogram per split, side by side, saved to results_dir.
plt.figure(figsize=(12, 4))

histogram_panels = (
    (131, 'Training Target Distribution', y_train),
    (132, 'Validation Target Distribution', y_val),
    (133, 'Test Target Distribution', y_test),
)
for panel_index, (subplot_code, panel_title, panel_values) in enumerate(histogram_panels):
    plt.subplot(subplot_code)
    plt.hist(panel_values, bins=30, alpha=0.7)
    plt.title(panel_title)
    plt.xlabel(primary_target)
    if panel_index == 0:
        # Only the left-most panel carries the y-axis label.
        plt.ylabel('Frequency')

plt.tight_layout()
plt.savefig(f'{results_dir}/target_distributions.png')
# Close the figure after saving.
plt.close()

# Summary statistics of the target in each split.
print(f"\nTarget variable statistics:")
for split_label, split_target in (
    ('Training', y_train),
    ('Validation', y_val),
    ('Test', y_test),
):
    print(
        f"  {split_label}: mean={split_target.mean():.2f}, "
        f"median={split_target.median():.2f}, "
        f"min={split_target.min():.2f}, max={split_target.max():.2f}"
    )

# Persist the six split objects in a single pickle so later steps can reload
# exactly the same partition.
splits = dict(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    X_test=X_test,
    y_test=y_test,
)

splits_path = f'{results_dir}/data_splits.pkl'
with open(splits_path, 'wb') as handle:
    pickle.dump(splits, handle)

print("\nInitial setup complete. Ready for feature selection and model training.")

Loading preprocessed dataset...
Dataset shape: (711, 87)
Selected primary target variable: avg_resolution_hours
Removing 4 features to prevent data leakage:
  - remainder__priority_critical_type_bug_avg_resolution_hours
  - remainder__priority_blocker_type_bug_avg_resolution_hours
  - remainder__priority_high_type_bug_avg_resolution_hours
  - remainder__priority_low_type_bug_avg_resolution_hours
Features shape: (711, 77)
Target shape: (711,)
Data partitioning complete:
  Training set: 355 samples (49.9%)
  Validation set: 178 samples (25.0%)
  Test set: 178 samples (25.0%)

Target variable statistics:
  Training: mean=3924.86, median=2846.57, min=0.63, max=10937.65
  Validation: mean=3885.21, median=2713.98, min=1.37, max=10937.65
  Test: mean=3588.97, median=2499.94, min=0.14, max=10937.65

Initial setup complete. Ready for feature selection and model training.
