In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pickle
import os

# Folder that receives every artefact written by this cell.
results_dir = 'prepared_processed_data_2'
os.makedirs(results_dir, exist_ok=True)

# --- Step 1: read the raw feature table ---
print("Loading dataset...")
df = pd.read_csv('./processed_data/common_features.csv')

# --- Step 2: quick sanity numbers ---
total_missing = df.isna().sum().sum()
print(f"Dataset shape: {df.shape}")
print(f"Missing values: {total_missing}")

# --- Step 3: split columns into time-related and everything else ---
# A column counts as time-related when its lower-cased name contains any of
# these substrings.
time_related_keywords = [
    'time', 'hour', 'date', 'day', 'week', 'month', 'duration', 
    'resolved', 'skewness', 'kurtosis', 'p25', 'p75', 'p90', 'iqr',
    'volatility', 'velocity', 'rate', 'balance', 'weekend', 'stability',
    'resolution', 'activity', 'experience', 'predictability'
]

time_columns = []
non_time_columns = []
for column_name in df.columns:
    lowered_name = column_name.lower()
    if any(keyword in lowered_name for keyword in time_related_keywords):
        time_columns.append(column_name)
    else:
        non_time_columns.append(column_name)

print(f"\nRemoved {len(time_columns)} time-related columns")
print(f"Remaining {len(non_time_columns)} non-time-related columns")

# Working frame: only the columns that survived the keyword filter.
df_filtered = df[non_time_columns]

# --- Step 4: report dtypes and list the non-numeric columns ---
print("\n=== DATA TYPE ANALYSIS ===")
print(df_filtered.dtypes.value_counts())

non_numeric_columns = list(df_filtered.select_dtypes(exclude=np.number).columns)
print(f"\nNon-numeric columns ({len(non_numeric_columns)}):")
for column_name in non_numeric_columns:
    column_values = df_filtered[column_name]
    n_unique = column_values.nunique()
    print(f"  - {column_name}: {column_values.dtype}, {n_unique} unique values")
    # Only low-cardinality columns are worth listing in full.
    if n_unique < 10:
        print(f"    Values: {column_values.unique()}")

# --- Step 5: drop identifier-like columns, one-hot encode the rest ---
print("\n=== HANDLING NON-NUMERIC COLUMNS ===")
identifier_keywords = ('project', 'key', 'name', 'id', 'source', 'repository', 'file')

# Names containing an identifier keyword are dropped; every other
# non-numeric column is treated as categorical.
cols_to_drop = [
    column_name for column_name in non_numeric_columns
    if any(keyword in column_name.lower() for keyword in identifier_keywords)
]
cols_to_encode = [
    column_name for column_name in non_numeric_columns
    if column_name not in cols_to_drop
]

if cols_to_drop:
    print(f"Dropping identifier columns: {cols_to_drop}")
    df_filtered = df_filtered.drop(columns=cols_to_drop)

if cols_to_encode:
    print(f"Encoding categorical columns: {cols_to_encode}")
    # drop_first=True removes one dummy level per encoded column.
    df_filtered = pd.get_dummies(df_filtered, columns=cols_to_encode, drop_first=True)
    print(f"Expanded to {df_filtered.shape[1]} columns after encoding")

# 6. Function for interquartile-based imputation and outlier capping
def impute_and_cap_using_iqr(df, columns=None, fill_missing=True, cap_outliers=True, iqr_multiplier=1.5):
    """
    Impute missing values and cap outliers using the interquartile range method.

    Quartiles and the median are computed from the input frame; the input is
    never modified. A progress line is printed for each column that is filled
    or capped.

    Parameters:
    -----------
    df : DataFrame
        Input dataframe
    columns : list or None
        Columns to process. If None, all numeric columns are processed.
        Names that are not present in ``df`` are skipped.
    fill_missing : bool
        Whether to fill missing values with the column median
    cap_outliers : bool
        Whether to clip values to [q1 - k*IQR, q3 + k*IQR]
    iqr_multiplier : float
        Multiplier ``k`` applied to the IQR to define the outlier boundaries

    Returns:
    --------
    DataFrame
        Processed copy of ``df``
    """
    df_clean = df.copy()

    if columns is None:
        columns = df.select_dtypes(include=np.number).columns

    for col in columns:
        # Skip names that do not exist in the frame
        if col not in df.columns:
            continue

        source = df[col]

        # Quartiles come from the untouched input column
        q1 = source.quantile(0.25)
        q3 = source.quantile(0.75)
        iqr = q3 - q1

        lower_bound = q1 - iqr_multiplier * iqr
        upper_bound = q3 + iqr_multiplier * iqr

        # Fill missing values with the median if requested
        n_missing = source.isna().sum()
        if fill_missing and n_missing > 0:
            median_val = source.median()
            df_clean[col] = df_clean[col].fillna(median_val)
            print(f"Filled {n_missing} missing values in {col} with median: {median_val:.4f}")

        if not cap_outliers:
            continue

        # With a zero IQR both bounds equal q1, so clipping would overwrite
        # every differing value and turn the column into a constant (typical
        # for sparse columns where most rows share one value). Such columns
        # are left uncapped. A NaN IQR (all-missing column) is skipped too.
        if not iqr > 0:
            continue

        # Count outliers before capping
        n_lower_outliers = (df_clean[col] < lower_bound).sum()
        n_upper_outliers = (df_clean[col] > upper_bound).sum()

        if n_lower_outliers > 0 or n_upper_outliers > 0:
            df_clean[col] = df_clean[col].clip(lower=lower_bound, upper=upper_bound)
            print(f"Capped {n_lower_outliers} lower and {n_upper_outliers} upper outliers in {col}")

    return df_clean

# 7. Categorize features based on their type
print("\n=== CATEGORIZING FEATURES ===")

# Every column is assigned to at most ONE category: the first rule (in the
# order below) whose keyword appears in the lower-cased column name.
# Overlapping categories would hand the same column to several scalers, and a
# ColumnTransformer then emits one output column per transformer, duplicating
# the feature in the scaled dataset.
category_rules = [
    ('pct_features', ('pct',)),
    ('count_features', ('count',)),
    ('link_features', ('link',)),
    ('priority_features', ('priority',)),
    ('type_features', ('type',)),
    ('team_features', ('team', 'creator', 'developer')),
]

pct_features = []
count_features = []
general_features = []
link_features = []
priority_features = []
type_features = []
team_features = []

# Combine all feature categories (key order is kept for reporting)
feature_categories = {
    'pct_features': pct_features,
    'count_features': count_features,
    'general_features': general_features,
    'link_features': link_features,
    'priority_features': priority_features,
    'type_features': type_features,
    'team_features': team_features
}

for col in df_filtered.columns:
    col_lower = col.lower()
    for category_name, keywords in category_rules:
        if any(keyword in col_lower for keyword in keywords):
            feature_categories[category_name].append(col)
            break
    else:
        # No specific rule matched: underscore-named columns fall back to the
        # general bucket; anything else stays uncategorized.
        if '_' in col:
            general_features.append(col)

# Print feature categories
for category, features in feature_categories.items():
    print(f"{category}: {len(features)} features")
    if len(features) > 0:
        print(f"  Sample: {', '.join(features[:3])}{'...' if len(features) > 3 else ''}")

# Step 8: median-impute and IQR-cap every numeric column of the filtered frame.
print("\n=== APPLYING INTERQUARTILE-BASED IMPUTATION AND OUTLIER CAPPING ===")
numeric_columns = df_filtered.select_dtypes(include=np.number).columns.tolist()
df_clean = impute_and_cap_using_iqr(df_filtered, columns=numeric_columns, fill_missing=True, cap_outliers=True)

# Step 9: assemble the per-category scaling pipeline.
print("\n=== CREATING FEATURE PREPROCESSING PIPELINE ===")

pct_feats = feature_categories['pct_features']
count_feats = feature_categories['count_features']
general_feats = feature_categories['general_features']
link_feats = feature_categories['link_features']
priority_feats = feature_categories['priority_features']
type_feats = feature_categories['type_features']
team_feats = feature_categories['team_features']

# One row per category: (label used in warnings, transformer name,
# scaler class, column list). Row order fixes both the warning order and the
# transformer order.
scaling_plan = [
    ('pct', 'pct_minmax', MinMaxScaler, pct_feats),
    ('count', 'count_std', StandardScaler, count_feats),
    ('general', 'general_std', StandardScaler, general_feats),
    ('link', 'link_std', StandardScaler, link_feats),
    ('priority', 'priority_std', StandardScaler, priority_feats),
    ('type', 'type_std', StandardScaler, type_feats),
    ('team', 'team_robust', RobustScaler, team_feats),
]

# Report categories that ended up empty.
for plan_label, plan_step, plan_scaler, plan_columns in scaling_plan:
    if not plan_columns:
        print(f"Warning: No {plan_label} features available for scaling")

# Only non-empty categories get a transformer.
transformers = []
for plan_label, plan_step, plan_scaler, plan_columns in scaling_plan:
    if plan_columns:
        transformers.append((plan_step, plan_scaler(), plan_columns))

# Columns not named by any transformer are passed through unchanged.
feature_preprocessor = ColumnTransformer(
    transformers=transformers,
    remainder='passthrough'
)

# Step 10: fit the preprocessor and scale the cleaned frame.
print("Scaling features...")
if not df_clean.empty:
    scaled_features = feature_preprocessor.fit_transform(df_clean)

    # Step 11: wrap the scaled array in a frame with the transformer's
    # output names, keeping the original row index.
    feature_names = feature_preprocessor.get_feature_names_out()
    df_scaled_features = pd.DataFrame(scaled_features, columns=feature_names, index=df_clean.index)

    # Step 12: any NaN left in the scaled frame is reported and zero-filled.
    nan_count = df_scaled_features.isna().sum().sum()
    if nan_count > 0:
        print(f"Warning: {nan_count} NaN values introduced during scaling")
        df_scaled_features = df_scaled_features.fillna(0)
        print("NaN values have been replaced with 0")

    # Step 13: persist the cleaned data, the fitted preprocessor and the
    # scaled data.
    print("\n=== SAVING RESULTS ===")

    cleaned_csv_path = f'{results_dir}/common_features_iqr_cleaned_no_time.csv'
    df_clean.to_csv(cleaned_csv_path, index=False)
    print("1. Saved cleaned dataset (after IQR processing): common_features_iqr_cleaned_no_time.csv")

    preprocessor_path = f'{results_dir}/jira_feature_preprocessor_no_time.pkl'
    with open(preprocessor_path, 'wb') as pickle_handle:
        pickle.dump(feature_preprocessor, pickle_handle)
    print("2. Saved feature preprocessor: jira_feature_preprocessor_no_time.pkl")

    scaled_csv_path = f'{results_dir}/common_features_scaled_no_time.csv'
    df_scaled_features.to_csv(scaled_csv_path, index=False)
    print("3. Saved scaled features: common_features_scaled_no_time.csv")

    # Step 14: shape summary.
    print("\n=== SUMMARY ===")
    print(f"Original dataset shape: {df.shape}")
    print(f"Filtered dataset (no time metrics): {df_filtered.shape}")
    print(f"Final processed dataset shape: {df_scaled_features.shape}")
    print(f"Removed {len(time_columns)} time-related columns")
else:
    print("Error: No features available for scaling!")

print("\n=== PROCESSING COMPLETE ===")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pickle
import os

# Folder that receives every artefact written by this cell.
results_dir = 'prepared_processed_data_2'
os.makedirs(results_dir, exist_ok=True)

# Columns kept even though their names match the time-related keywords below.
target_variables = [
    'avg_resolution_hours', 
    'median_resolution_hours',
    'total_resolution_hours'
]

# --- Step 1: read the raw feature table ---
print("Loading dataset...")
df = pd.read_csv('./processed_data/common_features.csv')

# --- Step 2: quick sanity numbers ---
total_missing = df.isna().sum().sum()
print(f"Dataset shape: {df.shape}")
print(f"Missing values: {total_missing}")

# --- Step 3: split columns into time-related and everything else ---
time_related_keywords = [
    'time', 'hour', 'date', 'day', 'week', 'month', 'duration', 
    'resolved', 'skewness', 'kurtosis', 'p25', 'p75', 'p90', 'iqr',
    'volatility', 'velocity', 'rate', 'balance', 'weekend', 'stability',
    'resolution', 'activity', 'experience', 'predictability'
]

# A column is removed when its lower-cased name contains a keyword and it is
# not one of the target variables; every other column is kept.
time_columns = []
non_time_columns = []
for column_name in df.columns:
    lowered_name = column_name.lower()
    matches_keyword = any(keyword in lowered_name for keyword in time_related_keywords)
    if matches_keyword and column_name not in target_variables:
        time_columns.append(column_name)
    else:
        non_time_columns.append(column_name)

print(f"\nRemoved {len(time_columns)} time-related columns (preserving target variables)")
print(f"Remaining {len(non_time_columns)} columns including target variables")
print(f"Target variables preserved: {target_variables}")

# Working frame: surviving columns, targets included.
df_filtered = df[non_time_columns]

# --- Step 4: report dtypes and list the non-numeric columns ---
print("\n=== DATA TYPE ANALYSIS ===")
print(df_filtered.dtypes.value_counts())

non_numeric_columns = list(df_filtered.select_dtypes(exclude=np.number).columns)
print(f"\nNon-numeric columns ({len(non_numeric_columns)}):")
for column_name in non_numeric_columns:
    column_values = df_filtered[column_name]
    n_unique = column_values.nunique()
    print(f"  - {column_name}: {column_values.dtype}, {n_unique} unique values")
    # Only low-cardinality columns are worth listing in full.
    if n_unique < 10:
        print(f"    Values: {column_values.unique()}")

# --- Step 5: drop identifier-like columns, one-hot encode the rest ---
print("\n=== HANDLING NON-NUMERIC COLUMNS ===")
identifier_keywords = ('project', 'key', 'name', 'id', 'source', 'repository', 'file')

cols_to_drop = [
    column_name for column_name in non_numeric_columns
    if any(keyword in column_name.lower() for keyword in identifier_keywords)
]
cols_to_encode = [
    column_name for column_name in non_numeric_columns
    if column_name not in cols_to_drop
]

if cols_to_drop:
    print(f"Dropping identifier columns: {cols_to_drop}")
    df_filtered = df_filtered.drop(columns=cols_to_drop)

if cols_to_encode:
    print(f"Encoding categorical columns: {cols_to_encode}")
    # drop_first=True removes one dummy level per encoded column.
    df_filtered = pd.get_dummies(df_filtered, columns=cols_to_encode, drop_first=True)
    print(f"Expanded to {df_filtered.shape[1]} columns after encoding")

# 6. Function for interquartile-based imputation and outlier capping
def impute_and_cap_using_iqr(df, columns=None, fill_missing=True, cap_outliers=True, iqr_multiplier=1.5):
    """
    Impute missing values and cap outliers using the interquartile range method.

    Quartiles and the median are computed from the input frame; the input is
    never modified. A progress line is printed for each column that is filled
    or capped.

    Parameters:
    -----------
    df : DataFrame
        Input dataframe
    columns : list or None
        Columns to process. If None, all numeric columns are processed.
        Names that are not present in ``df`` are skipped.
    fill_missing : bool
        Whether to fill missing values with the column median
    cap_outliers : bool
        Whether to clip values to [q1 - k*IQR, q3 + k*IQR]
    iqr_multiplier : float
        Multiplier ``k`` applied to the IQR to define the outlier boundaries

    Returns:
    --------
    DataFrame
        Processed copy of ``df``
    """
    df_clean = df.copy()

    if columns is None:
        columns = df.select_dtypes(include=np.number).columns

    for col in columns:
        # Skip names that do not exist in the frame
        if col not in df.columns:
            continue

        source = df[col]

        # Quartiles come from the untouched input column
        q1 = source.quantile(0.25)
        q3 = source.quantile(0.75)
        iqr = q3 - q1

        lower_bound = q1 - iqr_multiplier * iqr
        upper_bound = q3 + iqr_multiplier * iqr

        # Fill missing values with the median if requested
        n_missing = source.isna().sum()
        if fill_missing and n_missing > 0:
            median_val = source.median()
            df_clean[col] = df_clean[col].fillna(median_val)
            print(f"Filled {n_missing} missing values in {col} with median: {median_val:.4f}")

        if not cap_outliers:
            continue

        # With a zero IQR both bounds equal q1, so clipping would overwrite
        # every differing value and turn the column into a constant (typical
        # for sparse columns where most rows share one value). Such columns
        # are left uncapped. A NaN IQR (all-missing column) is skipped too.
        if not iqr > 0:
            continue

        # Count outliers before capping
        n_lower_outliers = (df_clean[col] < lower_bound).sum()
        n_upper_outliers = (df_clean[col] > upper_bound).sum()

        if n_lower_outliers > 0 or n_upper_outliers > 0:
            df_clean[col] = df_clean[col].clip(lower=lower_bound, upper=upper_bound)
            print(f"Capped {n_lower_outliers} lower and {n_upper_outliers} upper outliers in {col}")

    return df_clean

# 7. Categorize features based on their type
print("\n=== CATEGORIZING FEATURES ===")

# Every column is assigned to at most ONE category. Target variables are
# claimed first; after that the first rule (in the order below) whose keyword
# appears in the lower-cased column name wins. Overlapping categories would
# hand the same column to several scalers, and a ColumnTransformer then emits
# one output column per transformer, duplicating the feature in the scaled
# dataset.
category_rules = [
    ('pct_features', ('pct',)),
    ('count_features', ('count',)),
    ('link_features', ('link',)),
    ('priority_features', ('priority',)),
    ('type_features', ('type',)),
    ('team_features', ('team', 'creator', 'developer')),
]

target_vars = []
pct_features = []
count_features = []
general_features = []
link_features = []
priority_features = []
type_features = []
team_features = []

# Combine all feature categories (key order is kept for reporting)
feature_categories = {
    'target_variables': target_vars,
    'pct_features': pct_features,
    'count_features': count_features,
    'general_features': general_features,
    'link_features': link_features,
    'priority_features': priority_features,
    'type_features': type_features,
    'team_features': team_features
}

for col in df_filtered.columns:
    # Separate target variables for special handling
    if col in target_variables:
        target_vars.append(col)
        continue

    col_lower = col.lower()
    for category_name, keywords in category_rules:
        if any(keyword in col_lower for keyword in keywords):
            feature_categories[category_name].append(col)
            break
    else:
        # No specific rule matched: underscore-named columns fall back to the
        # general bucket; anything else stays uncategorized.
        if '_' in col:
            general_features.append(col)

# Print feature categories
for category, features in feature_categories.items():
    print(f"{category}: {len(features)} features")
    if len(features) > 0:
        print(f"  Sample: {', '.join(features[:3])}{'...' if len(features) > 3 else ''}")

# 8. Apply interquartile-based imputation and outlier capping
print("\n=== APPLYING INTERQUARTILE-BASED IMPUTATION AND OUTLIER CAPPING ===")
numeric_columns = df_filtered.select_dtypes(include=np.number).columns.tolist()

# Get the actual feature lists based on what's available in the dataframe
target_vars = feature_categories['target_variables']
pct_feats = feature_categories['pct_features']
count_feats = feature_categories['count_features']
general_feats = feature_categories['general_features']
link_feats = feature_categories['link_features']
priority_feats = feature_categories['priority_features']
type_feats = feature_categories['type_features']
team_feats = feature_categories['team_features']

# Features are imputed AND capped. Target variables are only imputed: the
# cleaned file written from df_clean is later used to study extreme values of
# the resolution-hours targets, and capping them at 1.5*IQR here would make
# any stricter IQR-based outlier test on that file return nothing.
numeric_feature_columns = [col for col in numeric_columns if col not in target_vars]
numeric_target_columns = [col for col in numeric_columns if col in target_vars]

df_clean = impute_and_cap_using_iqr(
    df_filtered, columns=numeric_feature_columns, fill_missing=True, cap_outliers=True
)
if numeric_target_columns:
    df_clean = impute_and_cap_using_iqr(
        df_clean, columns=numeric_target_columns, fill_missing=True, cap_outliers=False
    )

# 9. Create the column transformer for feature scaling
print("\n=== CREATING FEATURE PREPROCESSING PIPELINE ===")

# One row per category: (label used in warnings, transformer name,
# scaler class, column list). Row order fixes both the warning order and the
# transformer order.
scaling_plan = [
    ('target', 'target_std', StandardScaler, target_vars),
    ('pct', 'pct_minmax', MinMaxScaler, pct_feats),
    ('count', 'count_std', StandardScaler, count_feats),
    ('general', 'general_std', StandardScaler, general_feats),
    ('link', 'link_std', StandardScaler, link_feats),
    ('priority', 'priority_std', StandardScaler, priority_feats),
    ('type', 'type_std', StandardScaler, type_feats),
    ('team', 'team_robust', RobustScaler, team_feats),
]

# Check if any feature categories are empty
for plan_label, plan_step, plan_scaler, plan_columns in scaling_plan:
    if not plan_columns:
        print(f"Warning: No {plan_label} features available for scaling")

# Create transformers only for non-empty feature lists
transformers = [
    (plan_step, plan_scaler(), plan_columns)
    for plan_label, plan_step, plan_scaler, plan_columns in scaling_plan
    if plan_columns
]

feature_preprocessor = ColumnTransformer(
    transformers=transformers,
    remainder='passthrough'  # Any columns not specified will pass through
)

# Step 10: fit the preprocessor and scale the cleaned frame.
print("Scaling features...")
if not df_clean.empty:
    scaled_features = feature_preprocessor.fit_transform(df_clean)

    # Step 11: wrap the scaled array in a frame with the transformer's
    # output names, keeping the original row index.
    feature_names = feature_preprocessor.get_feature_names_out()
    df_scaled_features = pd.DataFrame(scaled_features, columns=feature_names, index=df_clean.index)

    # Step 12: any NaN left in the scaled frame is reported and zero-filled.
    nan_count = df_scaled_features.isna().sum().sum()
    if nan_count > 0:
        print(f"Warning: {nan_count} NaN values introduced during scaling")
        df_scaled_features = df_scaled_features.fillna(0)
        print("NaN values have been replaced with 0")

    # Step 13: persist the cleaned data, the fitted preprocessor, the scaled
    # data and (when present) the unscaled targets.
    print("\n=== SAVING RESULTS ===")

    cleaned_csv_path = f'{results_dir}/common_features_iqr_cleaned_with_targets.csv'
    df_clean.to_csv(cleaned_csv_path, index=False)
    print("1. Saved cleaned dataset (after IQR processing): common_features_iqr_cleaned_with_targets.csv")

    preprocessor_path = f'{results_dir}/jira_feature_preprocessor_with_targets.pkl'
    with open(preprocessor_path, 'wb') as pickle_handle:
        pickle.dump(feature_preprocessor, pickle_handle)
    print("2. Saved feature preprocessor: jira_feature_preprocessor_with_targets.pkl")

    scaled_csv_path = f'{results_dir}/common_features_scaled_with_targets.csv'
    df_scaled_features.to_csv(scaled_csv_path, index=False)
    print("3. Saved scaled features: common_features_scaled_with_targets.csv")

    # Targets are taken from the cleaned (unscaled) frame.
    if target_vars:
        df_targets = df_clean[target_vars]
        targets_csv_path = f'{results_dir}/target_variables.csv'
        df_targets.to_csv(targets_csv_path, index=False)
        print("4. Saved target variables separately: target_variables.csv")

    # Step 14: shape summary.
    print("\n=== SUMMARY ===")
    print(f"Original dataset shape: {df.shape}")
    print(f"Filtered dataset (with target variables): {df_filtered.shape}")
    print(f"Final processed dataset shape: {df_scaled_features.shape}")
    print(f"Removed {len(time_columns)} time-related columns (preserved {len(target_vars)} target variables)")
    if target_vars:
        print(f"Target variables preserved: {', '.join(target_vars)}")
else:
    print("Error: No features available for scaling!")

print("\n=== PROCESSING COMPLETE ===")

In [11]:
import pandas as pd
import dtale

# CSV to inspect; point this at a different file to browse another dataset.
csv_file_path = "./prepared_processed_data_2/common_features_iqr_cleaned_with_targets.csv"

# Load the file into memory.
df = pd.read_csv(csv_file_path)

# Launch D-Tale on the frame with cell editing disabled, then open the
# session in a browser tab.
dtale_options = {'ignore_duplicate': True, 'allow_cell_edits': False}
d = dtale.show(df, **dtale_options)
d.open_browser()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import RobustScaler
from sklearn.cluster import KMeans

# Read the cleaned dataset written by the preprocessing cell.
print("Loading dataset...")
df = pd.read_csv('./prepared_processed_data_2/common_features_iqr_cleaned_with_targets.csv')
print(f"Dataset shape: {df.shape}")

# Target columns analysed throughout this cell.
resolution_hours_features = [
    'avg_resolution_hours', 
    'median_resolution_hours', 
    'total_resolution_hours'
]

# --- Part 1: four-panel view of total_resolution_hours ---
total_hours = df['total_resolution_hours']
plt.figure(figsize=(14, 8))

# Panel 1: raw histogram.
plt.subplot(2, 2, 1)
sns.histplot(total_hours, bins=50)
plt.title('Distribution of Total Resolution Hours')
plt.xlabel('Hours')
plt.ylabel('Frequency')

# Panel 2: raw boxplot.
plt.subplot(2, 2, 2)
sns.boxplot(y=total_hours)
plt.title('Boxplot of Total Resolution Hours')
plt.ylabel('Hours')

# Panel 3: histogram of log(1 + hours).
plt.subplot(2, 2, 3)
sns.histplot(np.log1p(total_hours), bins=50)
plt.title('Log Distribution of Total Resolution Hours')
plt.xlabel('Log(Hours+1)')
plt.ylabel('Frequency')

# Panel 4: boxplot with values clipped at twice the upper whisker
# (q3 + 1.5 * IQR) so the box itself stays readable.
plt.subplot(2, 2, 4)
q1, q3 = np.percentile(total_hours, [25, 75])
iqr = q3 - q1
upper_whisker = q3 + 1.5 * iqr
print(f"Upper whisker boundary: {upper_whisker:.2f} hours")
sns.boxplot(y=total_hours.clip(upper=upper_whisker*2))
plt.title('Zoomed Boxplot (Up to 2x Upper Whisker)')
plt.ylabel('Hours')

plt.tight_layout()
plt.savefig('total_resolution_hours_distribution.png')
plt.close()

# 2. Advanced outlier identification approaches

# Option 1: Project Size Segmentation
print("\n--- Project Size Segmentation ---")
# Define project size categories based on total_issues.
# pd.cut uses right-closed intervals: (0, 10], (10, 50], ..., (500, inf].
size_bins = [0, 10, 50, 100, 500, float('inf')]
size_labels = ['Very Small', 'Small', 'Medium', 'Large', 'Very Large']
df['project_size'] = pd.cut(df['total_issues'], bins=size_bins, labels=size_labels)

# Rows whose total_issues falls outside every bin (<= 0 or missing) get no
# label and drop out of the grouped statistics below; say so explicitly
# instead of losing them silently.
n_unassigned = int(df['project_size'].isna().sum())
if n_unassigned:
    print(f"Warning: {n_unassigned} rows have total_issues <= 0 or missing and received no project size")

# Calculate statistics by project size.
# observed=False is spelled out so every size label appears in the table
# (even empty ones) and the result does not depend on the pandas default for
# categorical groupers.
size_stats = df.groupby('project_size', observed=False)[resolution_hours_features].agg(['mean', 'median', 'std', 'count'])
print("Resolution hours statistics by project size:")
print(size_stats)

# Visualize resolution hours by project size
plt.figure(figsize=(14, 8))
sns.boxplot(x='project_size', y='total_resolution_hours', data=df)
plt.title('Total Resolution Hours by Project Size')
plt.ylabel('Hours')
plt.yscale('log')  # Log scale for better visualization
plt.savefig('resolution_hours_by_project_size.png')
plt.close()

# --- Option 2: segment projects with KMeans ---
print("\n--- Clustering-based Segmentation ---")

# Two clustering inputs; rows missing either one are excluded.
clustering_features = ['total_issues', 'total_resolution_hours']
df_cluster = df[clustering_features].dropna()

# Model input: resolution hours go through log1p, then both columns are
# rescaled with RobustScaler.
df_cluster_scaled = df_cluster.assign(
    total_resolution_hours=np.log1p(df_cluster['total_resolution_hours'])
)
scaler = RobustScaler()
scaled_matrix = scaler.fit_transform(df_cluster_scaled)
df_cluster_scaled = pd.DataFrame(scaled_matrix, columns=df_cluster_scaled.columns)

# Fixed seed and 10 initialisations; the cluster count is a manual choice.
n_clusters = 4
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
clusters = kmeans.fit_predict(df_cluster_scaled)
df_cluster['cluster'] = clusters

# Scatter of the unscaled values on log-log axes, coloured by cluster label.
plt.figure(figsize=(12, 8))
scatter = plt.scatter(
    df_cluster['total_issues'],
    df_cluster['total_resolution_hours'],
    c=df_cluster['cluster'],
    cmap='viridis',
    alpha=0.7,
)
plt.colorbar(scatter, label='Cluster')
plt.title('Clusters of Projects by Issues and Resolution Hours')
plt.xlabel('Total Issues')
plt.ylabel('Total Resolution Hours')
plt.xscale('log')
plt.yscale('log')
plt.grid(True, alpha=0.3)
plt.savefig('project_clusters.png')
plt.close()

# Per-cluster summary of the two unscaled inputs.
summary_functions = ['mean', 'median', 'std', 'count']
cluster_stats = df_cluster.groupby('cluster')[clustering_features].agg(summary_functions)
print("Project statistics by cluster:")
print(cluster_stats)

# 3. Recommended approach: Stratified modeling with specialized handling for each segment

# Identify extreme outliers based on statistical measures
def detect_extreme_outliers(df, column, method='iqr', threshold=3):
    """
    Identify extreme outliers in one column that should be handled specially.

    Parameters
    ----------
    df : DataFrame
        Frame holding the column to inspect.
    column : str
        Name of a numeric column in ``df``.
    method : {'iqr', 'zscore', 'percentile'}
        'iqr'        -> values above q3 + threshold * IQR.
        'zscore'     -> values whose absolute z-score exceeds ``threshold``
                        (population standard deviation).
        'percentile' -> values above the (100 - 100/threshold)-th percentile,
                        e.g. threshold=10 flags the top 10%.
    threshold : float
        Strictness parameter whose meaning depends on ``method`` (see above).

    Returns
    -------
    tuple
        ``(positions, labels)``: an integer ndarray of row positions and the
        matching list of index labels from ``df.index``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names, or if
        ``method='percentile'`` is used with ``threshold < 1``.
    """
    if method not in ('iqr', 'zscore', 'percentile'):
        raise ValueError(
            f"Unknown method {method!r}; expected 'iqr', 'zscore' or 'percentile'"
        )
    if method == 'percentile' and threshold < 1:
        raise ValueError("threshold must be >= 1 for the 'percentile' method")

    values = df[column].to_numpy(dtype=float)
    # Statistics are computed on the non-missing values only; a single NaN
    # would otherwise turn every percentile/mean into NaN and hide all
    # outliers. Missing entries themselves are never flagged.
    valid = values[~np.isnan(values)]
    mask = np.zeros(values.shape, dtype=bool)

    if valid.size > 0:
        # Comparisons against NaN entries are simply False.
        with np.errstate(invalid='ignore'):
            if method == 'iqr':
                q1, q3 = np.percentile(valid, [25, 75])
                upper_bound = q3 + threshold * (q3 - q1)
                mask = values > upper_bound
            elif method == 'zscore':
                std = valid.std()
                # A constant column has no spread and therefore no outliers;
                # this also avoids dividing by zero.
                if std > 0:
                    z_scores = (values - valid.mean()) / std
                    mask = np.abs(z_scores) > threshold
            else:
                percentile = 100 - (100 / threshold)
                upper_bound = np.percentile(valid, percentile)
                mask = values > upper_bound

    outlier_indices = np.flatnonzero(mask)
    return outlier_indices, df.index[outlier_indices].tolist()

# Run the three detectors on the total-hours target.
print("\n--- Extreme Outlier Detection ---")
outlier_column = 'total_resolution_hours'
iqr_outliers, iqr_indices = detect_extreme_outliers(df, outlier_column, method='iqr', threshold=3)
zscore_outliers, zscore_indices = detect_extreme_outliers(df, outlier_column, method='zscore', threshold=3)
# threshold=20 selects the top 5% for the percentile method.
percentile_outliers, percentile_indices = detect_extreme_outliers(df, outlier_column, method='percentile', threshold=20)

print(f"IQR method: {len(iqr_outliers)} extreme outliers detected")
print(f"Z-score method: {len(zscore_outliers)} extreme outliers detected")
print(f"Percentile method: {len(percentile_outliers)} extreme outliers detected")

# Only rows flagged by all three detectors count as extreme.
common_outliers = set(iqr_indices).intersection(zscore_indices, percentile_indices)
print(f"Number of outliers detected by all three methods: {len(common_outliers)}")

# --- Part 4: recommendation text ---
print("\n--- Recommended Approach ---")
print("1. Stratified modeling - Create separate models for different project size categories")
print("2. Specialized transformations for each stratum")
print("3. Consider removing or special handling for the most extreme outliers detected by multiple methods")

# Independent copy of the data without the rows flagged by every detector.
is_extreme = df.index.isin(list(common_outliers))
df_no_extreme_outliers = df[~is_extreme].copy()
print(f"\nDataset shape after removing extreme outliers: {df_no_extreme_outliers.shape}")

# Add a log1p companion column (suffix "_log") for each target.
for feature in resolution_hours_features:
    log_column = f'{feature}_log'
    df_no_extreme_outliers[log_column] = np.log1p(df_no_extreme_outliers[feature])

# 3x3 grid: one row per target; columns show the original data, the data
# after outlier removal, and the log-transformed data after outlier removal.
plt.figure(figsize=(14, 10))

for row_number, feature in enumerate(resolution_hours_features):
    panels = [
        (df[feature], f'Original {feature}', 'Hours'),
        (df_no_extreme_outliers[feature], 'After Extreme Outlier Removal', 'Hours'),
        (df_no_extreme_outliers[f'{feature}_log'], 'Log-transformed (No Extreme Outliers)', 'Log(Hours+1)'),
    ]
    # Subplot slots are numbered row by row, starting at 1.
    for panel_offset, (panel_data, panel_title, panel_xlabel) in enumerate(panels, start=1):
        plt.subplot(3, 3, row_number * 3 + panel_offset)
        sns.histplot(panel_data, bins=30)
        plt.title(panel_title)
        plt.xlabel(panel_xlabel)

plt.tight_layout()
plt.savefig('distribution_comparison.png')
plt.close()

# --- Part 5: write the modelling datasets ---
df_no_extreme_outliers.to_csv('se_effort_no_extreme_outliers.csv', index=False)

# One CSV per project-size label; labels with no rows are skipped.
for size in size_labels:
    in_this_size = df_no_extreme_outliers['project_size'] == size
    size_df = df_no_extreme_outliers[in_this_size].copy()
    if len(size_df) == 0:
        continue
    size_slug = size.lower().replace(" ", "_")
    size_df.to_csv(f'se_effort_{size_slug}_projects.csv', index=False)
    print(f"Created dataset for {size} projects: {size_df.shape[0]} records")

# Closing report: files produced by this cell, then modelling advice.
closing_lines = [
    "\nAnalysis and dataset preparation complete!",
    "Generated files:",
    "- total_resolution_hours_distribution.png: Original data distribution",
    "- resolution_hours_by_project_size.png: Boxplots by project size",
    "- project_clusters.png: Clustering visualization",
    "- distribution_comparison.png: Effect of outlier removal",
    "- se_effort_no_extreme_outliers.csv: Dataset with extreme outliers removed",
    "- se_effort_[size]_projects.csv: Stratified datasets by project size",
    "\nRecommendations for modeling:",
    "1. For general-purpose modeling: Use 'se_effort_no_extreme_outliers.csv' with log transformation",
    "2. For size-specific modeling: Use the stratified datasets",
    "3. Consider building ensemble models that specialize in different project sizes",
    "4. Use the extreme outliers list to flag potentially problematic projects during prediction",
]
for closing_line in closing_lines:
    print(closing_line)

In [None]:
import pandas as pd
import dtale

# CSV to inspect; point this at a different file to browse another dataset.
csv_file_path = "./se_effort_no_extreme_outliers.csv"

# Load the file into memory.
df = pd.read_csv(csv_file_path)

# Launch D-Tale on the frame with cell editing disabled, then open the
# session in a browser tab.
dtale_options = {'ignore_duplicate': True, 'allow_cell_edits': False}
d = dtale.show(df, **dtale_options)
d.open_browser()

FileNotFoundError: [Errno 2] No such file or directory: './processed_data/se_effort_no_extreme_outliers.csv'