In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

def load_data(filepath):
    """Read the project table from a CSV source and report its dimensions.

    Args:
        filepath: Anything ``pandas.read_csv`` accepts (path or file-like).

    Returns:
        The parsed ``DataFrame``.
    """
    projects = pd.read_csv(filepath)
    row_total, column_total = projects.shape
    print(f"Loaded {row_total} projects with {column_total} columns")
    return projects

def create_project_scale_features(df):
    """Add project scale features to ``df`` in place and return it.

    Each feature is only created when the columns it needs are present:

    * ``project_duration_days`` -- days between ``project_start_date`` and
      ``project_latest_date`` (both are converted to datetime in place).
    * ``project_size`` -- 'Small' / 'Medium' / 'Large' / 'Very Large' from
      ``issue_count`` (cut points 500, 5000, 20000).
    * ``unique_contributors`` and ``contributor_to_issue_ratio`` -- from the
      creator/reporter columns.
    * ``avg_issues_per_day`` -- ``issue_count`` over the duration, with the
      duration floored at one day.
    * ``project_age_category`` -- 'New' / 'Established' / 'Mature' / 'Legacy'
      from the duration (cut points 1, 3 and 5 years of 365 days).

    Rows whose source value is missing are labelled 'Unknown' in the two
    categorical columns.

    Args:
        df: Project-level ``DataFrame``; it is modified in place.

    Returns:
        The same ``DataFrame`` with the new columns added.
    """
    has_issue_count = 'issue_count' in df.columns

    # Convert date strings to datetime
    if 'project_start_date' in df.columns and 'project_latest_date' in df.columns:
        df['project_start_date'] = pd.to_datetime(df['project_start_date'])
        df['project_latest_date'] = pd.to_datetime(df['project_latest_date'])

        # Project duration
        df['project_duration_days'] = (df['project_latest_date'] - df['project_start_date']).dt.days

    # Project size categorization
    if has_issue_count:
        issues = df['issue_count']
        size_conditions = [
            issues < 500,
            (issues >= 500) & (issues < 5000),
            (issues >= 5000) & (issues < 20000),
            issues >= 20000,
        ]
        size_values = ['Small', 'Medium', 'Large', 'Very Large']
        # An explicit string default is required: the implicit default of
        # np.select is the integer 0, which cannot share a dtype with the
        # string choices (TypeError on recent NumPy, the label '0' on older).
        df['project_size'] = np.select(size_conditions, size_values, default='Unknown')

    # Team size ratio
    # NOTE(review): presumably these columns hold distinct creator/reporter
    # counts, so a person in both roles is counted twice -- confirm upstream.
    contributor_cols = ['fields.creator.key_<lambda>', 'fields.reporter.key_<lambda>']
    if all(col in df.columns for col in contributor_cols):
        df['unique_contributors'] = df[contributor_cols].sum(axis=1)
        if has_issue_count:
            # A zero issue count becomes NaN instead of inf so that later
            # scaling/clustering steps do not choke on infinities.
            df['contributor_to_issue_ratio'] = (
                df['unique_contributors'] / df['issue_count'].replace(0, np.nan)
            )

    # Activity density
    if 'project_duration_days' in df.columns and has_issue_count:
        df['avg_issues_per_day'] = df['issue_count'] / df['project_duration_days'].clip(lower=1)

    # Project age category
    if 'project_duration_days' in df.columns:
        duration = df['project_duration_days']
        age_conditions = [
            duration < 365,
            (duration >= 365) & (duration < 365 * 3),
            (duration >= 365 * 3) & (duration < 365 * 5),
            duration >= 365 * 5,
        ]
        age_values = ['New', 'Established', 'Mature', 'Legacy']
        df['project_age_category'] = np.select(age_conditions, age_values, default='Unknown')

    return df

def create_issue_type_features(df):
    """Derive issue-type mix ratios from the per-type count columns.

    Columns whose name contains ``fields.issuetype.name_<lambda>_`` are
    treated as per-type counts. When none exist, ``df`` is returned untouched.
    Otherwise the frame gains a row total, one ``<type>_ratio`` column per
    major issue type present, a combined ``feature_ratio`` and a
    ``bug_to_feature_ratio``.

    Args:
        df: Project-level ``DataFrame``; it is modified in place.

    Returns:
        The same ``DataFrame``.
    """
    prefix = 'fields.issuetype.name_<lambda>_'

    type_count_cols = list(filter(lambda name: prefix in name, df.columns))
    if len(type_count_cols) == 0:
        return df

    # Row-wise total across every issue-type column
    totals = df[type_count_cols].sum(axis=1)
    df['total_issues_by_type'] = totals

    # Share of each major issue type
    major_types = (
        'Bug', 'Task', 'Improvement', 'Enhancement',
        'Story', 'Sub-task', 'Feature Request', 'New Feature',
    )
    for type_name in major_types:
        source_col = prefix + type_name
        if source_col not in df.columns:
            continue
        ratio_col = type_name.lower().replace(" ", "_") + '_ratio'
        df[ratio_col] = df[source_col].div(totals)

    # Combined share of all feature-like types
    feature_like = [
        prefix + suffix
        for suffix in ('New Feature', 'Feature', 'Feature Request')
        if prefix + suffix in df.columns
    ]
    if feature_like:
        df['feature_ratio'] = df[feature_like].sum(axis=1) / totals

    # Maintenance vs innovation: bugs relative to features
    if {'bug_ratio', 'feature_ratio'}.issubset(df.columns):
        safe_feature_share = df['feature_ratio'].replace(0, 0.0001)
        df['bug_to_feature_ratio'] = df['bug_ratio'] / safe_feature_share

    return df

def create_resolution_features(df):
    """Add issue-resolution efficiency features to ``df`` in place.

    Each feature is only created when the columns it needs are present:

    * ``resolution_efficiency`` -- ``1 / median_resolution_hours``; a zero
      median is mapped to an efficiency of 0 rather than infinity.
    * ``resolution_variability`` -- ``resolution_hours_std`` divided by
      ``avg_resolution_hours`` (a zero mean is treated as 1).
    * ``bug_resolution_hours`` -- ``avg_resolution_hours * bug_ratio``.
    * ``resolution_speed_category`` -- 'Slow' / 'Medium' / 'Fast' split at
      the 0.33 and 0.66 quantiles of ``resolution_efficiency``; rows without
      an efficiency value are labelled 'Unknown'.

    Args:
        df: Project-level ``DataFrame``; it is modified in place.

    Returns:
        The same ``DataFrame``.
    """
    # Resolution efficiency (inverse of median time)
    if 'median_resolution_hours' in df.columns:
        # Swapping 0 for inf makes the inverse come out as 0 instead of
        # raising a divide-by-zero warning and producing inf.
        inverse = 1 / df['median_resolution_hours'].replace(0, float('inf'))
        # Any infinity that still slips through must not reach the
        # quantile/scaling steps.
        df['resolution_efficiency'] = inverse.replace([np.inf, -np.inf], 0)

    # Resolution variability (coefficient of variation)
    if 'resolution_hours_std' in df.columns and 'avg_resolution_hours' in df.columns:
        df['resolution_variability'] = df['resolution_hours_std'] / df['avg_resolution_hours'].replace(0, 1)

    # Bug resolution time (weighted by bug ratio)
    if 'bug_ratio' in df.columns and 'avg_resolution_hours' in df.columns:
        df['bug_resolution_hours'] = df['avg_resolution_hours'] * df['bug_ratio']

    # Fast vs slow resolution categorization
    if 'resolution_efficiency' in df.columns:
        efficiency = df['resolution_efficiency']
        lower_cut, upper_cut = efficiency.quantile([0.33, 0.66]).values

        resolution_conditions = [
            efficiency <= lower_cut,
            (efficiency > lower_cut) & (efficiency <= upper_cut),
            efficiency > upper_cut,
        ]
        resolution_categories = ['Slow', 'Medium', 'Fast']
        # Explicit string default: the implicit integer 0 of np.select has no
        # common dtype with the string labels (TypeError on recent NumPy).
        df['resolution_speed_category'] = np.select(
            resolution_conditions, resolution_categories, default='Unknown'
        )

    return df

def create_complexity_features(df):
    """Add a composite complexity score and category to ``df`` in place.

    Whichever of ``avg_components_per_issue``, ``avg_labels_per_issue`` and
    ``avg_links_per_issue`` are present get standardised with
    ``StandardScaler`` and averaged per row into ``complexity_score``. The
    score is then split at its 0.33 and 0.66 quantiles into
    ``complexity_category`` ('Low' / 'Medium' / 'High'); rows without a score
    are labelled 'Unknown'. When none of the source columns exist, ``df`` is
    returned unchanged.

    Args:
        df: Project-level ``DataFrame``; it is modified in place.

    Returns:
        The same ``DataFrame``.
    """
    # Use existing complexity metrics from the original dataset
    complexity_features = [
        'avg_components_per_issue',
        'avg_labels_per_issue',
        'avg_links_per_issue'
    ]
    complexity_features = [f for f in complexity_features if f in df.columns]
    if not complexity_features:
        return df

    # Standardize each feature. The scaled frame must carry df's own index:
    # with a default RangeIndex the assignment below would align on labels
    # and misplace (or blank out) scores whenever df has been filtered or
    # re-indexed.
    scaler = StandardScaler()
    scaled = pd.DataFrame(
        scaler.fit_transform(df[complexity_features]),
        columns=complexity_features,
        index=df.index,
    )

    # Combine into a single score
    df['complexity_score'] = scaled.mean(axis=1)

    # Create complexity categories
    score = df['complexity_score']
    lower_cut, upper_cut = score.quantile([0.33, 0.66]).values

    complexity_conditions = [
        score <= lower_cut,
        (score > lower_cut) & (score <= upper_cut),
        score > upper_cut,
    ]
    complexity_categories = ['Low', 'Medium', 'High']
    # Explicit string default: the implicit integer 0 of np.select has no
    # common dtype with the string labels (TypeError on recent NumPy).
    df['complexity_category'] = np.select(
        complexity_conditions, complexity_categories, default='Unknown'
    )

    return df

def create_team_dynamics_features(df):
    """Add team-dynamics features to ``df`` in place.

    Each feature is only created when the columns it needs are present:

    * ``contributor_efficiency`` -- ``issue_count`` per contributor (a zero
      contributor count is treated as 1).
    * ``contributor_density`` -- contributors per project day, with the
      duration floored at one day.
    * ``team_size_category`` -- 'Small Team' / 'Medium Team' / 'Large Team'
      split at the 0.33 and 0.66 quantiles of ``unique_contributors``; rows
      without a contributor count are labelled 'Unknown'.

    Args:
        df: Project-level ``DataFrame``; it is modified in place.

    Returns:
        The same ``DataFrame``.
    """
    has_contributors = 'unique_contributors' in df.columns

    # Contributor efficiency
    if has_contributors and 'issue_count' in df.columns:
        df['contributor_efficiency'] = df['issue_count'] / df['unique_contributors'].replace(0, 1)

    # Contributor density
    if has_contributors and 'project_duration_days' in df.columns:
        # Same one-day floor as avg_issues_per_day, so zero and negative
        # durations cannot produce infinite or negative densities.
        df['contributor_density'] = df['unique_contributors'] / df['project_duration_days'].clip(lower=1)

    # Team size categories
    if has_contributors:
        contributors = df['unique_contributors']
        lower_cut, upper_cut = contributors.quantile([0.33, 0.66]).values

        team_conditions = [
            contributors <= lower_cut,
            (contributors > lower_cut) & (contributors <= upper_cut),
            contributors > upper_cut,
        ]
        team_categories = ['Small Team', 'Medium Team', 'Large Team']
        # Explicit string default: the implicit integer 0 of np.select has no
        # common dtype with the string labels (TypeError on recent NumPy).
        df['team_size_category'] = np.select(team_conditions, team_categories, default='Unknown')

    return df

def cluster_projects(df, n_clusters=5, plot_path='project_clusters.png'):
    """Cluster projects with KMeans on their numeric features and plot them.

    Numeric columns are selected, filtered by name, cleaned, standardised and
    clustered. A two-component PCA projection of the same scaled matrix is
    stored for plotting. ``df`` gains the columns ``cluster``, ``pca_x`` and
    ``pca_y`` and a scatter plot is written to ``plot_path``.

    Args:
        df: Project-level ``DataFrame``; it is modified in place.
        n_clusters: Number of KMeans clusters.
        plot_path: Where the scatter plot image is saved.

    Returns:
        The same ``DataFrame`` with the three new columns.

    Raises:
        ValueError: If no usable numeric feature column remains (scikit-learn
            raises the same type for other unusable inputs).
    """
    # Select numeric columns for clustering
    numeric_cols = df.select_dtypes(include=np.number).columns.tolist()

    # Exclude ID columns, counts, and target variables.
    # NOTE(review): 'id' is a substring match, so it also drops any column
    # whose name merely contains "id" -- confirm that is intended.
    exclude_patterns = ['project_id', 'id', '_count', 'total_']
    # Columns written by an earlier call must not feed back into the
    # clustering when the function is re-run on the same frame.
    derived_cols = {'cluster', 'pca_x', 'pca_y'}
    cluster_features = [
        col for col in numeric_cols
        if col not in derived_cols
        and not any(pattern in str(col) for pattern in exclude_patterns)
    ]

    # Treat infinities (e.g. from ratio features) as missing, drop columns
    # with no data at all (their median is NaN and cannot fill anything),
    # then fill the remaining gaps with the column median.
    feature_matrix = df[cluster_features].replace([np.inf, -np.inf], np.nan)
    feature_matrix = feature_matrix.dropna(axis=1, how='all')
    feature_matrix = feature_matrix.fillna(feature_matrix.median())

    if feature_matrix.shape[1] == 0:
        raise ValueError("No usable numeric feature columns available for clustering")

    # Scale features
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(feature_matrix)

    # Apply KMeans
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    df['cluster'] = kmeans.fit_predict(scaled_data)

    # Apply PCA for visualization
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(scaled_data)

    # Add PCA components to dataframe
    df['pca_x'] = pca_result[:, 0]
    df['pca_y'] = pca_result[:, 1]

    # Visualize clusters. The figure is deliberately left open so notebook
    # front ends can still render it inline after it has been saved.
    plt.figure(figsize=(12, 10))

    for i in range(n_clusters):
        members = df[df['cluster'] == i]
        plt.scatter(
            members['pca_x'],
            members['pca_y'],
            s=100,
            alpha=0.7,
            label=f'Cluster {i} ({len(members)} projects)'
        )

    plt.title('Project Clusters', fontsize=16)
    plt.xlabel('Principal Component 1', fontsize=14)
    plt.ylabel('Principal Component 2', fontsize=14)
    plt.legend()
    plt.grid(alpha=0.3)
    plt.savefig(plot_path, dpi=300)

    return df

def analyze_clusters(df):
    """Summarise each cluster and print a short profile of it.

    For every distinct value of ``df['cluster']`` (visited in sorted order)
    the profile holds:

    * ``size`` / ``percentage`` -- row count and share of all rows.
    * ``sample_projects`` -- up to five values of ``project_name`` (an empty
      list when that column is absent).
    * ``feature_stats`` -- per feature: cluster mean, overall mean and the
      percentage difference between them (0 when the overall mean is 0).
    * ``distinctive_features`` -- up to five ``(feature, pct_diff)`` pairs
      with the largest absolute difference; undefined (NaN) differences are
      left out of the ranking.
    * ``description`` -- a sentence built from the top three of those.
    * ``<column>_distribution`` -- value counts of each categorical feature
      column that exists.

    Args:
        df: Project-level ``DataFrame`` containing a ``cluster`` column.

    Returns:
        A dict mapping ``"Cluster <label>"`` to its profile dict.
    """
    # Select features for analysis
    feature_cols = [
        'issue_count', 'project_duration_days', 'unique_contributors',
        'avg_issues_per_day', 'contributor_to_issue_ratio',
        'bug_ratio', 'task_ratio', 'improvement_ratio', 'feature_ratio',
        'resolution_efficiency', 'resolution_variability',
        'complexity_score', 'contributor_efficiency'
    ]
    feature_cols = [col for col in feature_cols if col in df.columns]

    categorical_cols = [
        col for col in ['project_size', 'project_age_category', 'resolution_speed_category',
                        'complexity_category', 'team_size_category']
        if col in df.columns
    ]
    has_names = 'project_name' in df.columns

    # Calculate overall stats for comparison
    overall_stats = df[feature_cols].mean()

    # Analyze each cluster
    cluster_profiles = {}

    for cluster in sorted(df['cluster'].unique()):
        cluster_data = df[df['cluster'] == cluster]

        # Seeded like the KMeans step so repeated runs list the same projects.
        if has_names:
            sample_projects = cluster_data['project_name'].sample(
                min(5, len(cluster_data)), random_state=42
            ).tolist()
        else:
            sample_projects = []

        # Basic statistics
        profile = {
            'size': len(cluster_data),
            'percentage': round(len(cluster_data) / len(df) * 100, 1),
            'sample_projects': sample_projects
        }

        # Feature statistics and differences from overall
        feature_stats = {}
        for feature in feature_cols:
            feature_mean = cluster_data[feature].mean()
            overall_mean = overall_stats[feature]

            # Calculate percentage difference
            if overall_mean != 0:
                pct_diff = ((feature_mean - overall_mean) / overall_mean) * 100
            else:
                pct_diff = 0

            feature_stats[feature] = {
                'mean': feature_mean,
                'overall_mean': overall_mean,
                'pct_diff': pct_diff
            }

        # Find most distinctive features (largest percentage differences).
        # NaN differences are skipped: they carry no information and make the
        # sort order unpredictable because NaN compares false to everything.
        distinctive_features = sorted(
            (
                (name, stats['pct_diff'])
                for name, stats in feature_stats.items()
                if pd.notna(stats['pct_diff'])
            ),
            key=lambda item: abs(item[1]),
            reverse=True
        )[:5]  # Top 5 most distinctive

        profile['distinctive_features'] = distinctive_features
        profile['feature_stats'] = feature_stats

        # Top 3 as (feature, magnitude, direction), shared by the description
        # and the printed summary.
        highlights = [
            (name, abs(pct_diff), "higher" if pct_diff > 0 else "lower")
            for name, pct_diff in distinctive_features[:3]
        ]

        # Generate a simple description based on distinctive features
        profile['description'] = "Projects where " + ", ".join(
            f"{name} is {magnitude:.1f}% {direction} than average"
            for name, magnitude, direction in highlights
        )

        # Add categorical distributions
        for cat_col in categorical_cols:
            profile[f'{cat_col}_distribution'] = cluster_data[cat_col].value_counts().to_dict()

        # Store the profile
        cluster_profiles[f"Cluster {cluster}"] = profile

        # Print brief profile (names are stringified so non-text or missing
        # project names cannot break the join).
        print(f"\nCluster {cluster} ({len(cluster_data)} projects, {profile['percentage']}%):")
        print(f"  Sample projects: {', '.join(str(name) for name in sample_projects[:3])}")
        print("  Key characteristics:")
        for name, magnitude, direction in highlights:
            print(f"    - {name}: {magnitude:.1f}% {direction} than average")

    return cluster_profiles

def generate_features(input_file):
    """Run the whole feature pipeline on one CSV file and save the result.

    Loads the data, applies every feature-creation step in order, clusters
    the projects, prints the cluster profiles and writes the enriched table
    next to the input as ``<input name without extension>_features.csv``.

    Args:
        input_file: Path of the CSV file to process.

    Returns:
        A ``(df, cluster_profiles)`` tuple: the enriched ``DataFrame`` and the
        dict produced by ``analyze_clusters``.
    """
    import os

    # Load data
    df = load_data(input_file)

    # Create features
    print("Creating project scale features...")
    df = create_project_scale_features(df)

    print("Creating issue type features...")
    df = create_issue_type_features(df)

    print("Creating resolution features...")
    df = create_resolution_features(df)

    print("Creating complexity features...")
    df = create_complexity_features(df)

    print("Creating team dynamics features...")
    df = create_team_dynamics_features(df)

    # Cluster projects
    print("Clustering projects...")
    df = cluster_projects(df)

    # Analyze clusters
    print("Analyzing clusters...")
    cluster_profiles = analyze_clusters(df)

    # Save results. The output name is derived from the extension only:
    # a plain str.replace('.csv', ...) leaves the name unchanged when the
    # input has no lowercase ".csv" (so the input itself would be
    # overwritten) and also rewrites ".csv" occurring in directory names.
    root, _extension = os.path.splitext(input_file)
    output_file = f"{root}_features.csv"
    df.to_csv(output_file, index=False)
    print(f"Features saved to {output_file}")

    return df, cluster_profiles

# Script / notebook entry point: run the full pipeline on the exported data set.
if __name__ == "__main__":
    # Use the file directly (path is relative to the current working directory)
    input_file = "../DataSets/data_export_1741699774916.csv"
    # df is the enriched project table, clusters the per-cluster profile dict
    df, clusters = generate_features(input_file)



Loaded ../DataSets/data_export_1741699774916.csv: 159 rows, 100 columns
Starting data cleaning...
Replacing 1 negative values in min_resolution_hours with 0
Dropped 6 columns with constant values
Cleaning complete. Resulting dataset: 159 rows, 94 columns
Added team metrics: ['creator_count', 'reporter_count', 'team_size_estimate', 'issues_per_team_member', 'resolution_hours_per_team_member', 'team_role_diversity']

Analyzing feature importance for predicting total_resolution_hours...
Identified 8 features that account for 90% of importance
Top 5 most important features: issue_count, issuetype.name_Suggestion, issuetype.name_Bug, issuetype.name_Support Request, issuetype.name_Public Security Vulnerability
Analyzed correlations between 8 key features

Using Elbow Method to determine optimal number of clusters...
Elbow method suggests optimal number of clusters: 3

Using Silhouette Analysis to determine optimal number of clusters...
Silhouette analysis suggests optimal number of cluster