In [None]:
import pandas as pd
import os
import glob
import re
from collections import defaultdict
import json

# Ordered (pattern, category) rules used to bucket column names. The first
# rule whose pattern matches the START of the name wins, so the combined
# "priority + type" patterns must come before the generic "priority_*_count"
# rule; otherwise the generic rule shadows them and they are never reached.
_COLUMN_CATEGORY_RULES = (
    (re.compile(r'priority_.*_type_.*_count'), 'priority_type_count'),
    (re.compile(r'priority_.*_type_.*_avg_resolution_hours'), 'priority_type_resolution'),
    (re.compile(r'priority_.*_count'), 'priority_count'),
    (re.compile(r'priority_.*_pct'), 'priority_pct'),
    (re.compile(r'type_.*_count'), 'type_count'),
    (re.compile(r'type_.*_pct'), 'type_pct'),
    (re.compile(r'type_.*_resolution_rate'), 'type_resolution_rate'),
)

# The priority level is matched lazily ("+?") so that the optional
# "_-_p<N>" / "___p<N>" suffix is captured by its own group instead of being
# swallowed into the level name by a greedy match.
_PRIORITY_VARIANT_RE = re.compile(r'priority_([\w-]+?)(?:_-_p\d+|___p\d+)?_(count|pct)')
_TYPE_VARIANT_RE = re.compile(r'type_([\w-]+)_(count|pct)')


def _categorize_column(col):
    """Return the name of the pattern category a single column belongs to."""
    for pattern, category in _COLUMN_CATEGORY_RULES:
        if pattern.match(col):
            return category
    if col.startswith('pct_'):
        return 'percentage_metrics'
    if col.endswith('_ratio'):
        return 'ratio_metrics'
    return 'other'


def _find_name_variants(columns, pattern, prefix, normalize_dashes=False):
    """Group columns by their standardized name; keep groups with 2+ spellings."""
    groups = defaultdict(list)
    # Sorted iteration keeps the variant lists stable between runs.
    for col in sorted(columns):
        found = pattern.match(col)
        if not found:
            continue
        label, metric = found.groups()
        if normalize_dashes:
            label = label.replace('-', '_')  # Standardize - vs _
        groups[f"{prefix}_{label}_{metric}"].append(col)
    return {key: cols for key, cols in groups.items() if len(cols) > 1}


def inventory_project_columns(base_dir, output_inventory_file=None):
    """
    Inventories all columns across project files, analyzing naming patterns
    to identify similar columns with different names.

    Only the header row of each ``project_level_*.csv`` file is read. Files
    that cannot be read are reported on stdout and skipped.

    Parameters:
    -----------
    base_dir : str
        Base directory containing repository folders with project-level data
    output_inventory_file : str, optional
        Path to save the column inventory as JSON. Missing parent
        directories are created.

    Returns:
    --------
    dict
        Dictionary with the keys ``total_unique_columns``,
        ``column_patterns``, ``repositories``, ``columns_by_repo``,
        ``repos_by_column`` and ``similar_columns``.

    Raises:
    -------
    ValueError
        If ``base_dir`` does not exist or contains no sub-folders.
    """
    # Check if the base directory exists
    if not os.path.exists(base_dir):
        raise ValueError(f"Base directory not found: {base_dir}")

    # Get a list of all repository folders (sorted for reproducible output)
    repo_folders = sorted(
        f for f in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, f))
    )
    if not repo_folders:
        raise ValueError(f"No repository folders found in {base_dir}")

    print(f"Found {len(repo_folders)} repository folders in {base_dir}")

    # Containers tracking where each column name was seen
    all_columns = set()
    column_patterns = defaultdict(set)
    column_by_repo = defaultdict(set)
    repo_by_column = defaultdict(set)

    # Process each repository folder
    for repo_folder in repo_folders:
        repo_path = os.path.join(base_dir, repo_folder)

        # Find all project-level CSV files in the repository folder
        csv_files = sorted(glob.glob(os.path.join(repo_path, "project_level_*.csv")))

        if not csv_files:
            print(f"No project-level CSV files found in {repo_path}, skipping...")
            continue

        print(f"Processing repository: {repo_folder} - Found {len(csv_files)} project-level files")

        for csv_file in csv_files:
            try:
                # Read just the header to get column names
                header_columns = pd.read_csv(csv_file, nrows=0).columns
            except Exception as e:
                # Best effort: one unreadable file must not abort the inventory.
                print(f"  Error processing {csv_file}: {str(e)}")
                continue

            for col in header_columns:
                all_columns.add(col)
                column_by_repo[repo_folder].add(col)
                repo_by_column[col].add(repo_folder)
                column_patterns[_categorize_column(col)].add(col)

            print(f"  Processed column names from {os.path.basename(csv_file)}")

    # Convert sets to sorted lists for JSON serialization
    inventory = {
        'total_unique_columns': len(all_columns),
        'column_patterns': {k: sorted(v) for k, v in column_patterns.items()},
        'repositories': {
            'count': len(repo_folders),
            'names': repo_folders
        },
        'columns_by_repo': {k: sorted(v) for k, v in column_by_repo.items()},
        'repos_by_column': {k: sorted(v) for k, v in repo_by_column.items()}
    }

    # Pattern analysis: columns that differ only in spelling of the same
    # priority level / issue type. ``.get`` is used so that looking up an
    # absent category does not add an empty entry to the defaultdict.
    priority_cols = (
        column_patterns.get('priority_count', set())
        | column_patterns.get('priority_pct', set())
    )
    type_cols = (
        column_patterns.get('type_count', set())
        | column_patterns.get('type_pct', set())
    )

    similar_columns = _find_name_variants(priority_cols, _PRIORITY_VARIANT_RE, 'priority')
    similar_columns.update(
        _find_name_variants(type_cols, _TYPE_VARIANT_RE, 'type', normalize_dashes=True)
    )

    # Add similar columns to inventory
    inventory['similar_columns'] = dict(similar_columns)

    # Print summary
    print("\nColumn Inventory Complete!")
    print(f"Total unique columns across all repositories: {len(all_columns)}")
    print("\nColumn pattern categories:")
    for pattern, cols in column_patterns.items():
        print(f"  {pattern}: {len(cols)} columns")

    print("\nSimilar columns with different naming patterns:")
    for standard_name, variants in list(similar_columns.items())[:10]:  # Show first 10
        print(f"  {standard_name}: {len(variants)} variants")
        for v in variants[:3]:  # Show first 3 variants
            print(f"    - {v}")
        if len(variants) > 3:
            print(f"    - ... and {len(variants) - 3} more")

    if len(similar_columns) > 10:
        print(f"  ... and {len(similar_columns) - 10} more pattern groups")

    # Save inventory to file if specified
    if output_inventory_file:
        output_dir = os.path.dirname(output_inventory_file)
        if output_dir and not os.path.exists(output_dir):
            os.makedirs(output_dir, exist_ok=True)

        with open(output_inventory_file, 'w', encoding='utf-8') as f:
            json.dump(inventory, f, indent=2)

        print(f"\nInventory saved to {output_inventory_file}")

    return inventory
import numpy as np
import pandas as pd

def gini(array):
    """
    Calculate the Gini coefficient of a collection of numbers.

    Parameters:
      array : array-like
          Values to measure. Multi-dimensional input is flattened. If any
          value is negative, all values are shifted so the minimum becomes 0.

    Returns:
      float
          The Gini coefficient, or NaN when the input is empty.
    """
    # Work on a flat float copy so the caller's data is never modified and
    # integer input cannot cause in-place casting problems.
    values = np.asarray(array, dtype=float).ravel()
    if values.size == 0:
        return np.nan

    lowest = values.min()
    if lowest < 0:
        values = values - lowest

    # small constant to avoid division by zero when every value is 0
    values = np.sort(values + 1e-7)
    n = values.size
    rank = np.arange(1, n + 1)
    return np.sum((2 * rank - n - 1) * values) / (n * np.sum(values))

def add_team_metrics(aggregation_df, issues_df):
    """
    Compute and append team metrics to the aggregation DataFrame.

    Every metric is derived from optional columns of ``issues_df``; when a
    column a metric needs is missing, that metric is set to NaN (or 0 for the
    team-size counts). ``issues_df`` is only read, never modified.

    Parameters:
      aggregation_df : pandas.DataFrame
          Aggregated project-level DataFrame (typically a single row). The
          metric columns are assigned onto this object in place.
      issues_df : pandas.DataFrame
          DataFrame containing detailed issue/creator/assignee information from your CSVs.

    Returns:
      aggregation_df : pandas.DataFrame
          The input aggregation_df augmented with the new team metrics.
    """
    columns = issues_df.columns
    has_creator = 'creator' in columns
    has_assignee = 'assignee' in columns
    has_issue_type = 'issue_type' in columns

    def column_stat(name, stat):
        """Return ``issues_df[name].<stat>()``, or NaN if the column is absent."""
        return getattr(issues_df[name], stat)() if name in columns else np.nan

    # 1. Team Size and Composition
    team_size_creators = issues_df['creator'].nunique() if has_creator else 0
    team_size_assignees = issues_df['assignee'].nunique() if has_assignee else 0
    # Count distinct people across both roles; adding the two counts would
    # count anyone who both creates and is assigned issues twice.
    people = [issues_df[role] for role in ('creator', 'assignee') if role in columns]
    team_size_combined = pd.concat(people, ignore_index=True).nunique() if people else 0

    # Assuming there is a boolean column 'is_core' to indicate core team membership
    if 'is_core' in columns and has_creator and team_size_combined > 0:
        core_team_count = issues_df.loc[issues_df['is_core'].eq(True), 'creator'].nunique()
        core_team_ratio = core_team_count / team_size_combined
    else:
        core_team_ratio = np.nan

    # 2. Creator Contribution and Workload
    if has_creator:
        creator_counts = issues_df.groupby('creator').size()
        creator_workload_gini = gini(creator_counts.values) if len(creator_counts) > 0 else np.nan
        avg_issues_per_creator = creator_counts.mean()
        top_creator_contribution = creator_counts.max() / len(issues_df) if len(issues_df) > 0 else np.nan
        creator_activity_variance = creator_counts.var()
        creator_activity_std = creator_counts.std()
    else:
        creator_workload_gini = avg_issues_per_creator = top_creator_contribution = np.nan
        creator_activity_variance = creator_activity_std = np.nan

    # Diversity in contributions: average number of unique issue types per creator
    if has_issue_type and has_creator:
        creator_diversity = issues_df.groupby('creator')['issue_type'].nunique().mean()
    else:
        creator_diversity = np.nan

    # 3. Team Specialization and Complexity
    team_type_specialization_index = np.nan
    if has_issue_type:
        issue_type_counts = issues_df['issue_type'].value_counts()
        typed_total = issue_type_counts.sum()
        if typed_total > 0:  # no typed issues -> share of the top type is undefined
            team_type_specialization_index = issue_type_counts.max() / typed_total

    # Example: Use a 'complexity' column if available to compute team complexity capacity
    team_complexity_capacity = column_stat('complexity', 'mean')

    # Team resolution predictability: Inverse of the standard deviation of resolution times
    std_resolution = column_stat('resolution_time', 'std')
    if pd.notna(std_resolution) and std_resolution != 0:
        team_resolution_predictability = 1 / std_resolution
    else:
        team_resolution_predictability = np.nan

    # 4. Developer and Creator Ratios
    def developer_ratio(subset):
        """Average issues per assignee divided by average issues per creator."""
        if len(subset) == 0:
            return np.nan
        per_creator = subset.groupby('creator').size().mean()
        per_assignee = subset.groupby('assignee').size().mean()
        if pd.isna(per_creator) or per_creator == 0:
            return np.nan
        return per_assignee / per_creator

    if has_issue_type and has_creator and has_assignee:
        # Bug-related contributions
        bug_issues = issues_df[issues_df['issue_type'] == 'bug']
        if len(bug_issues) > 0:
            bug_creator_concentration = bug_issues.groupby('creator').size().max() / len(bug_issues)
        else:
            bug_creator_concentration = np.nan

        # Handling ratios (average issue counts for assignees vs. creators)
        bug_developer_ratio = developer_ratio(bug_issues)
        feature_developer_ratio = developer_ratio(issues_df[issues_df['issue_type'] == 'feature'])
    else:
        bug_creator_concentration = bug_developer_ratio = feature_developer_ratio = np.nan

    # 5. Creator Experience and Efficiency
    avg_creator_experience_days = column_stat('creator_experience_days', 'mean')
    # Same quantity as avg_issues_per_creator; kept under both names for callers.
    avg_creator_issue_count = avg_issues_per_creator
    avg_creator_specialization = column_stat('creator_specialization', 'mean')
    creator_link_density_mean = column_stat('creator_link_density', 'mean')
    creator_link_density_std = column_stat('creator_link_density', 'std')
    creator_resolution_time_variability = std_resolution
    creator_onboarding_volatility = column_stat('onboarding_time', 'std')

    # Average new creators per month using 'creator_join_date'
    if 'creator_join_date' in columns and has_creator:
        join_month = pd.to_datetime(issues_df['creator_join_date']).dt.to_period('M')
        # ``assign`` builds a temporary frame, so no helper column leaks into
        # the caller's issues_df.
        monthly = issues_df[['creator']].assign(join_month=join_month)
        avg_new_creators_per_month = monthly.groupby('join_month')['creator'].nunique().mean()
    else:
        avg_new_creators_per_month = np.nan

    # Combine all computed metrics into a dictionary
    new_metrics = {
        'team_size_creators': team_size_creators,
        'team_size_assignees': team_size_assignees,
        'team_size_combined': team_size_combined,
        'core_team_ratio': core_team_ratio,
        'creator_workload_gini': creator_workload_gini,
        'creator_diversity': creator_diversity,
        'avg_issues_per_creator': avg_issues_per_creator,
        'top_creator_contribution': top_creator_contribution,
        'creator_activity_variance': creator_activity_variance,
        'creator_activity_std': creator_activity_std,
        'team_type_specialization_index': team_type_specialization_index,
        'team_complexity_capacity': team_complexity_capacity,
        'team_resolution_predictability': team_resolution_predictability,
        'bug_creator_concentration': bug_creator_concentration,
        'bug_developer_ratio': bug_developer_ratio,
        'feature_developer_ratio': feature_developer_ratio,
        'avg_creator_experience_days': avg_creator_experience_days,
        'avg_creator_issue_count': avg_creator_issue_count,
        'avg_creator_specialization': avg_creator_specialization,
        'creator_link_density_mean': creator_link_density_mean,
        'creator_link_density_std': creator_link_density_std,
        'creator_resolution_time_variability': creator_resolution_time_variability,
        'creator_onboarding_volatility': creator_onboarding_volatility,
        'avg_new_creators_per_month': avg_new_creators_per_month
    }

    # Append these new metrics to the aggregation DataFrame (scalar broadcast
    # to every row).
    for key, value in new_metrics.items():
        aggregation_df[key] = value

    return aggregation_df

# Example integration:
# At the end of your CombineProjectsInventory.ipynb pipeline, you can add:
# aggregated_df = add_team_metrics(aggregated_df, issues_df)
# This will enrich your aggregated data with the additional team metrics.

def combine_all_projects_raw(base_dir, output_file):
    """
    Stack every project-level CSV under ``base_dir`` into a single table.

    No column standardization is applied: the result carries the union of
    all original columns, with NaN wherever a file lacked a column. Two
    bookkeeping columns, ``repository`` and ``source_file``, record where
    each row came from. Files that fail to load are reported and skipped.

    Parameters:
    -----------
    base_dir : str
        Base directory containing repository folders with project-level data
    output_file : str
        Path to save the combined data

    Returns:
    --------
    pandas.DataFrame
        Combined DataFrame with all original columns (empty, and nothing is
        written, when no file could be loaded)
    """
    # Validate the input location up front
    if not os.path.exists(base_dir):
        raise ValueError(f"Base directory not found: {base_dir}")

    # Every sub-directory of base_dir is treated as one repository
    repo_folders = []
    for entry in os.listdir(base_dir):
        if os.path.isdir(os.path.join(base_dir, entry)):
            repo_folders.append(entry)
    if len(repo_folders) == 0:
        raise ValueError(f"No repository folders found in {base_dir}")

    print(f"Found {len(repo_folders)} repository folders in {base_dir}")

    # One DataFrame per successfully loaded CSV
    frames = []

    for repo_name in repo_folders:
        repo_dir = os.path.join(base_dir, repo_name)
        matches = glob.glob(os.path.join(repo_dir, "project_level_*.csv"))

        if len(matches) == 0:
            print(f"No project-level CSV files found in {repo_dir}, skipping...")
            continue

        print(f"Processing repository: {repo_name} - Found {len(matches)} project-level files")

        for path in matches:
            file_name = os.path.basename(path)
            try:
                frame = pd.read_csv(path)

                # Tag each row with its origin
                frame['repository'] = repo_name
                frame['source_file'] = file_name

                frames.append(frame)
                print(f"  Added project data from {file_name}")
            except Exception as err:
                print(f"  Error processing {path}: {err}")

    if len(frames) == 0:
        print("No project data was found or successfully processed.")
        return pd.DataFrame()

    # Union of all columns; gaps become NaN
    combined = pd.concat(frames, ignore_index=True)
    total_rows = len(combined)

    # Missing-value report, worst columns first
    missing = combined.isna().sum().sort_values(ascending=False)
    missing_pct = (missing / total_rows * 100).round(2)

    print("\nColumns with highest NaN percentages:")
    for name, share in missing_pct.head(20).items():
        print(f"  {name}: {share}% missing ({missing[name]} out of {total_rows} rows)")

    # Make sure the destination folder exists before writing
    target_dir = os.path.dirname(output_file)
    must_create = bool(target_dir) and not os.path.exists(target_dir)
    if must_create:
        os.makedirs(target_dir, exist_ok=True)

    combined.to_csv(output_file, index=False)
    print(f"Combined raw project data saved to {output_file}")

    return combined

# Script entry point: inventory the columns, then build the raw combined table
if __name__ == "__main__":
    # Input root and the two artefacts written beneath it
    PROJECT_DATA_DIR = "./project_level_data"
    COLUMN_INVENTORY_FILE = "./project_level_data/combined/column_inventory.json"
    COMBINED_RAW_FILE = "./project_level_data/combined/combined_projects_raw.csv"

    # Step 1: catalogue every column name seen across the projects
    inventory = inventory_project_columns(
        base_dir=PROJECT_DATA_DIR,
        output_inventory_file=COLUMN_INVENTORY_FILE,
    )

    # Step 2: stack all projects as-is (no standardization) to inspect the raw data
    combined_df = combine_all_projects_raw(
        base_dir=PROJECT_DATA_DIR,
        output_file=COMBINED_RAW_FILE,
    )

    n_projects, n_columns = combined_df.shape
    print(f"\nRaw combination complete with {n_projects} projects and {n_columns} columns")

In [None]:
import pandas as pd
import dtale

# Notebook cell: load the combined project table and browse it in D-Tale.
#
# Replace this path with the location of your CSV file.
# NOTE(review): this is the same path the combine cell writes as
# COMBINED_RAW_FILE, so that cell presumably has to run first — confirm.
csv_file_path = "./project_level_data/combined/combined_projects_raw.csv"

# Read the CSV into a DataFrame (raises FileNotFoundError if the path is wrong)
df = pd.read_csv(csv_file_path)

# Start a D-Tale session and open it in the browser.
# NOTE(review): allow_cell_edits=False presumably makes the grid read-only and
# ignore_duplicate=True presumably skips D-Tale's duplicate-data check —
# confirm against the D-Tale documentation.
d = dtale.show(df, ignore_duplicate=True, allow_cell_edits=False)
d.open_browser()

FileNotFoundError: [Errno 2] No such file or directory: './project_level_data/combined_projects_raw.csv'