In [None]:
import pandas as pd
import numpy as np
import re
import os
import json
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns

class ProjectDataProcessor:
    """
    A complete pipeline for processing and standardizing project data
    from multiple repositories with different column naming patterns.
    """
    
    def __init__(self, input_file, output_dir):
        """
        Initialize the processor with input file and output directory.
        
        Parameters:
        -----------
        input_file : str
            Path to the combined raw CSV file
        output_dir : str
            Directory where processed outputs will be saved
        """
        self.input_file = input_file
        self.output_dir = output_dir
        
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)
        
        # Initialize data storage
        self.raw_data = None
        self.column_inventory = None
        self.standardized_data = None
        
        # Column pattern categories
        self.pattern_categories = {
            'core': [],  # Core project metadata
            'temporal': [],  # Time-related metrics
            'priority': [],  # Priority-related columns
            'issue_type': [],  # Issue type columns
            'priority_type': [],  # Combined priority-type columns
            'dependency': [],  # Dependency-related columns
            'resolution': [],  # Resolution metrics
            'efficiency': [],  # Efficiency metrics
            'other': []  # Uncategorized columns
        }
        
        # Column mappings from original to standardized names
        self.column_mappings = {}
    
    def load_data(self):
        """Load the raw combined data"""
        print(f"Loading data from {self.input_file}...")
        self.raw_data = pd.read_csv(self.input_file)
        print(f"Loaded {len(self.raw_data)} rows with {len(self.raw_data.columns)} columns")
        return self
    
    def analyze_columns(self):
        """Analyze column patterns and distribution"""
        if self.raw_data is None:
            raise ValueError("Data not loaded. Call load_data() first.")
        
        print("Analyzing column patterns...")
        
        # Track column presence by repository
        repo_columns = defaultdict(list)
        column_repos = defaultdict(list)
        
        # Count missing values per column
        missing_counts = self.raw_data.isna().sum()
        missing_pct = (missing_counts / len(self.raw_data) * 100).round(1)
        
        # Get unique repositories
        repositories = self.raw_data['repository'].unique()
        
        # Analyze which columns exist in which repositories
        for repo in repositories:
            repo_data = self.raw_data[self.raw_data['repository'] == repo]
            
            # Find columns that have at least one non-null value in this repo
            non_null_cols = [col for col in self.raw_data.columns 
                            if not repo_data[col].isna().all()]
            
            repo_columns[repo] = non_null_cols
            
            # Update column_repos
            for col in non_null_cols:
                column_repos[col].append(repo)
        
        # Categorize columns by naming pattern
        pattern_counts = defaultdict(int)
        
        for col in self.raw_data.columns:
            # Identify core columns
            if col in ['project_id', 'project_key', 'project_name', 'repository', 'source_file', 
                      'total_issues']:
                self.pattern_categories['core'].append(col)
            
            # Identify temporal columns
            elif any(term in col for term in ['date', 'duration', 'days', 'month', 'time', 'hours']):
                self.pattern_categories['temporal'].append(col)
                pattern_counts['temporal'] += 1
            
            # Identify priority columns
            elif col.startswith('priority_') and not '_type_' in col:
                self.pattern_categories['priority'].append(col)
                pattern_counts['priority'] += 1
            
            # Identify issue type columns
            elif col.startswith('type_') and not '_resolution_' in col:
                self.pattern_categories['issue_type'].append(col)
                pattern_counts['issue_type'] += 1
            
            # Identify combined priority-type columns
            elif '_type_' in col and 'priority_' in col:
                self.pattern_categories['priority_type'].append(col)
                pattern_counts['priority_type'] += 1
            
            # Identify dependency columns
            elif any(term in col for term in ['link', 'inward', 'outward', 'dependencies']):
                self.pattern_categories['dependency'].append(col)
                pattern_counts['dependency'] += 1
            
            # Identify resolution columns
            elif any(term in col for term in ['resolution', 'resolved']):
                self.pattern_categories['resolution'].append(col)
                pattern_counts['resolution'] += 1
            
            # Identify efficiency columns
            elif any(term in col for term in ['efficiency', 'ratio', 'balance', 'velocity']):
                self.pattern_categories['efficiency'].append(col)
                pattern_counts['efficiency'] += 1
            
            # Other columns
            else:
                self.pattern_categories['other'].append(col)
                pattern_counts['other'] += 1
        
        # Identify pattern variants for standardization
        self._identify_column_variants()
        
        # Create column inventory
        self.column_inventory = {
            'total_columns': len(self.raw_data.columns),
            'repositories': {
                'count': len(repositories),
                'names': list(repositories)
            },
            'columns_by_repository': {repo: cols for repo, cols in repo_columns.items()},
            'repositories_by_column': {col: repos for col, repos in column_repos.items()},
            'missing_percentages': {col: pct for col, pct in missing_pct.items()},
            'pattern_categories': {cat: len(cols) for cat, cols in self.pattern_categories.items()},
            'column_variants': self.column_mappings
        }
        
        # Save inventory to file
        inventory_file = os.path.join(self.output_dir, 'column_inventory_analyzed.json')
        with open(inventory_file, 'w') as f:
            # Convert sets to lists for JSON serialization
            inventory_json = {k: v if not isinstance(v, set) else list(v) 
                             for k, v in self.column_inventory.items()}
            
            # Handle nested dictionaries with sets
            for k, v in inventory_json.items():
                if isinstance(v, dict):
                    inventory_json[k] = {k2: v2 if not isinstance(v2, set) else list(v2) 
                                        for k2, v2 in v.items()}
            
            json.dump(inventory_json, f, indent=2)
        
        print(f"Column analysis complete. Results saved to {inventory_file}")
        
        # Create visualizations for column analysis
        self._create_column_analysis_visualizations()
        
        return self
    
    def _identify_column_variants(self):
        """Identify column naming variants to create standardization mappings"""
        # Group similar columns
        # 1. Priority columns
        priority_pattern = r'priority_([\w-]+)(?:_-_p\d+|___p\d+)?_(count|pct)'
        priority_columns = [col for col in self.raw_data.columns if re.match(priority_pattern, col)]
        
        for col in priority_columns:
            match = re.match(priority_pattern, col)
            if match:
                priority_level = match.group(1)
                metric_type = match.group(2)
                standard_name = f"priority_{priority_level}_{metric_type}"
                self.column_mappings[col] = standard_name
        
        # 2. Issue type columns
        type_pattern = r'type_([\w-]+)_(count|pct)'
        type_columns = [col for col in self.raw_data.columns if re.match(type_pattern, col)]
        
        for col in type_columns:
            match = re.match(type_pattern, col)
            if match:
                issue_type = match.group(1).replace('-', '_')  # Standardize - vs _
                metric_type = match.group(2)
                standard_name = f"type_{issue_type}_{metric_type}"
                self.column_mappings[col] = standard_name
        
        # 3. Priority-type combined columns
        priority_type_pattern = r'priority_([\w-]+)(?:_-_p\d+|___p\d+)?_type_([\w-]+)_(count|avg_resolution_hours)'
        priority_type_columns = [col for col in self.raw_data.columns if re.match(priority_type_pattern, col)]
        
        for col in priority_type_columns:
            match = re.match(priority_type_pattern, col)
            if match:
                priority_level = match.group(1)
                issue_type = match.group(2).replace('-', '_')
                metric_type = match.group(3)
                standard_name = f"priority_{priority_level}_type_{issue_type}_{metric_type}"
                self.column_mappings[col] = standard_name
        
        # 4. Resolution rate columns
        resolution_pattern = r'type_([\w-]+)_resolution_rate'
        resolution_columns = [col for col in self.raw_data.columns if re.match(resolution_pattern, col)]
        
        for col in resolution_columns:
            match = re.match(resolution_pattern, col)
            if match:
                issue_type = match.group(1).replace('-', '_')
                standard_name = f"type_{issue_type}_resolution_rate"
                self.column_mappings[col] = standard_name
    
    def _create_column_analysis_visualizations(self):
        """Create visualizations of column analysis"""
        # Create a visualizations directory
        viz_dir = os.path.join(self.output_dir, 'visualizations')
        os.makedirs(viz_dir, exist_ok=True)
        
        # 1. Column category distribution
        plt.figure(figsize=(12, 6))
        categories = list(self.pattern_categories.keys())
        category_counts = [len(self.pattern_categories[cat]) for cat in categories]
        
        sns.barplot(x=categories, y=category_counts)
        plt.title('Column Distribution by Category')
        plt.xlabel('Category')
        plt.ylabel('Number of Columns')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(os.path.join(viz_dir, 'column_categories.png'))
        plt.close()
        
        # 2. Repository column counts
        repo_column_counts = {repo: len(cols) for repo, cols in 
                             self.column_inventory['columns_by_repository'].items()}
        
        plt.figure(figsize=(14, 6))
        repos = list(repo_column_counts.keys())
        counts = list(repo_column_counts.values())
        
        # Sort by count
        sorted_indices = np.argsort(counts)[::-1]
        sorted_repos = [repos[i] for i in sorted_indices]
        sorted_counts = [counts[i] for i in sorted_indices]
        
        # Take top 20 for readability
        display_repos = sorted_repos[:20]
        display_counts = sorted_counts[:20]
        
        sns.barplot(x=display_repos, y=display_counts)
        plt.title('Number of Non-Null Columns by Repository (Top 20)')
        plt.xlabel('Repository')
        plt.ylabel('Number of Columns')
        plt.xticks(rotation=90)
        plt.tight_layout()
        plt.savefig(os.path.join(viz_dir, 'repository_columns.png'))
        plt.close()
        
        # 3. Missing value distribution
        missing_pct = pd.Series(self.column_inventory['missing_percentages'])
        missing_pct = missing_pct.sort_values(ascending=False)
        
        # Plot histogram of missing percentages
        plt.figure(figsize=(10, 6))
        sns.histplot(missing_pct.values, bins=20)
        plt.title('Distribution of Missing Value Percentages')
        plt.xlabel('Percentage of Missing Values')
        plt.ylabel('Number of Columns')
        plt.tight_layout()
        plt.savefig(os.path.join(viz_dir, 'missing_values_histogram.png'))
        plt.close()
        
        # 4. Top missing columns
        plt.figure(figsize=(14, 6))
        top_missing = missing_pct.head(20)
        
        sns.barplot(x=top_missing.index, y=top_missing.values)
        plt.title('Top 20 Columns with Highest Missing Percentages')
        plt.xlabel('Column')
        plt.ylabel('Missing Percentage')
        plt.xticks(rotation=90)
        plt.tight_layout()
        plt.savefig(os.path.join(viz_dir, 'top_missing_columns.png'))
        plt.close()
        
        print(f"Column analysis visualizations saved to {viz_dir}")
    
    def standardize_columns(self):
        """Standardize column names based on identified patterns"""
        if self.raw_data is None or self.column_inventory is None:
            raise ValueError("Data not loaded or columns not analyzed. Call load_data() and analyze_columns() first.")
        
        print("Standardizing column names...")
        
        # Create a copy of the raw data
        standardized_df = self.raw_data.copy()
        
        # Apply column mappings
        rename_map = {col: new_name for col, new_name in self.column_mappings.items() 
                     if col in standardized_df.columns}
        
        # Perform the renaming
        standardized_df = standardized_df.rename(columns=rename_map)
        
        # After renaming, some columns may be duplicated - combine them
        # Group by the new column names
        new_columns = standardized_df.columns
        duplicate_cols = [col for col in set(new_columns) if list(new_columns).count(col) > 1]
        
        if duplicate_cols:
            print(f"Found {len(duplicate_cols)} duplicate column sets after standardization.")
            
            # For each duplicate column set, combine the values
            for col in duplicate_cols:
                # Get all original columns that map to this standardized name
                original_cols = [c for c in standardized_df.columns if c == col]
                
                if len(original_cols) <= 1:
                    continue
                
                # Keep the first one and drop the rest
                keep_col = original_cols[0]
                drop_cols = original_cols[1:]
                
                # For each row, if the keep_col is NaN, try to fill it from the other columns
                for drop_col in drop_cols:
                    # Fill NaN values in keep_col with values from drop_col
                    standardized_df[keep_col] = standardized_df[keep_col].fillna(standardized_df[drop_col])
                
                # Drop the duplicate columns
                standardized_df = standardized_df.drop(columns=drop_cols)
                
                print(f"  Combined duplicate column set: {col} ({len(original_cols)} variants)")
        
        # Store the standardized data
        self.standardized_data = standardized_df
        
        # Save standardized data
        standardized_file = os.path.join(self.output_dir, 'standardized_data.csv')
        standardized_df.to_csv(standardized_file, index=False)
        
        print(f"Column standardization complete. Reduced from {len(self.raw_data.columns)} to {len(standardized_df.columns)} columns.")
        print(f"Standardized data saved to {standardized_file}")
        
        return self
    
    def create_feature_subsets(self):
        """Create various feature subsets for different analysis purposes"""
        if self.standardized_data is None:
            raise ValueError("Data not standardized. Call standardize_columns() first.")
        
        print("Creating feature subsets...")
        
        # Get standardized data
        df = self.standardized_data
        
        # 1. Core features dataset
        core_features = ['project_id', 'project_key', 'project_name', 'repository', 'total_issues',
                         'project_start_date', 'project_duration_days', 'avg_resolution_hours', 
                         'num_resolved_issues', 'pct_resolved_issues']
        
        # Only include columns that exist
        core_features = [col for col in core_features if col in df.columns]
        
        core_df = df[core_features]
        core_file = os.path.join(self.output_dir, 'core_features.csv')
        core_df.to_csv(core_file, index=False)
        print(f"Core features dataset saved with {len(core_features)} columns")
        
        # 2. Temporal features dataset
        temporal_features = core_features + [col for col in self.pattern_categories['temporal'] 
                                            if col in df.columns]
        
        temporal_df = df[temporal_features]
        temporal_file = os.path.join(self.output_dir, 'temporal_features.csv')
        temporal_df.to_csv(temporal_file, index=False)
        print(f"Temporal features dataset saved with {len(temporal_features)} columns")
        
        # 3. Priority features dataset
        priority_features = core_features + [col for col in self.pattern_categories['priority'] 
                                           if col in df.columns]
        
        priority_df = df[priority_features]
        priority_file = os.path.join(self.output_dir, 'priority_features.csv')
        priority_df.to_csv(priority_file, index=False)
        print(f"Priority features dataset saved with {len(priority_features)} columns")
        
        # 4. Issue type features dataset
        type_features = core_features + [col for col in self.pattern_categories['issue_type'] 
                                       if col in df.columns]
        
        type_df = df[type_features]
        type_file = os.path.join(self.output_dir, 'issue_type_features.csv')
        type_df.to_csv(type_file, index=False)
        print(f"Issue type features dataset saved with {len(type_features)} columns")
        
        # 5. Resolution and efficiency features dataset
        resolution_features = core_features + [col for col in self.pattern_categories['resolution'] + 
                                             self.pattern_categories['efficiency'] if col in df.columns]
        
        resolution_df = df[resolution_features]
        resolution_file = os.path.join(self.output_dir, 'resolution_efficiency_features.csv')
        resolution_df.to_csv(resolution_file, index=False)
        print(f"Resolution and efficiency features dataset saved with {len(resolution_features)} columns")
        
        # 6. Common features dataset (columns present in at least 50% of repositories)
        repo_count = len(df['repository'].unique())
        threshold = repo_count * 0.35
        
        common_cols = [col for col in df.columns 
                      if len(self.column_inventory['repositories_by_column'].get(col, [])) >= threshold]
        
        common_df = df[common_cols]
        common_file = os.path.join(self.output_dir, 'common_features.csv')
        common_df.to_csv(common_file, index=False)
        print(f"Common features dataset saved with {len(common_cols)} columns (present in ≥50% of repos)")
        
        # 7. Complete dataset with missing values filled
        filled_df = df.copy()
        
        # Fill numeric columns with 0
        num_cols = filled_df.select_dtypes(include=['number']).columns
        filled_df[num_cols] = filled_df[num_cols].fillna(0)
        
        # Fill string/object columns with 'Unknown'
        obj_cols = filled_df.select_dtypes(include=['object']).columns
        filled_df[obj_cols] = filled_df[obj_cols].fillna('Unknown')
        
        # Fill datetime columns with the earliest date in the dataset
        date_cols = [col for col in filled_df.columns if 'date' in col.lower()]
        for col in date_cols:
            if col in filled_df.columns:
                try:
                    filled_df[col] = pd.to_datetime(filled_df[col])
                    min_date = filled_df[col].min()
                    filled_df[col] = filled_df[col].fillna(min_date)
                except:
                    pass
        
        filled_file = os.path.join(self.output_dir, 'filled_data.csv')
        filled_df.to_csv(filled_file, index=False)
        print(f"Filled dataset saved with all {len(filled_df.columns)} columns and no missing values")
        
        return self
    
    def create_repository_specific_datasets(self):
        """Create separate datasets for each repository"""
        if self.standardized_data is None:
            raise ValueError("Data not standardized. Call standardize_columns() first.")
        
        print("Creating repository-specific datasets...")
        
        # Get standardized data
        df = self.standardized_data
        
        # Create directory for repo-specific datasets
        repos_dir = os.path.join(self.output_dir, 'repositories')
        os.makedirs(repos_dir, exist_ok=True)
        
        # Get unique repositories
        repositories = df['repository'].unique()
        
        # Create dataset for each repository
        for repo in repositories:
            # Get data for this repository
            repo_data = df[df['repository'] == repo]
            
            # Drop columns that are all NaN
            repo_data = repo_data.dropna(axis=1, how='all')
            
            # Create file name (replace invalid characters)
            safe_repo_name = re.sub(r'[\\/*?:"<>|]', "_", repo)
            repo_file = os.path.join(repos_dir, f"{safe_repo_name}.csv")
            
            # Save to file
            repo_data.to_csv(repo_file, index=False)
            
            print(f"Created dataset for {repo} with {len(repo_data.columns)} columns")
        
        print(f"Repository-specific datasets saved to {repos_dir}")
        
        return self
    
    def run_pipeline(self):
        """Run the complete data processing pipeline"""
        return (self.load_data()
                .analyze_columns()
                .standardize_columns()
                .create_feature_subsets()
                .create_repository_specific_datasets())

# Example usage
if __name__ == "__main__":
    # Where the combined raw CSV lives and where every output is written
    INPUT_FILE = "./project_level_data/combined/combined_projects_raw.csv"
    OUTPUT_DIR = "./processed_data"

    # Build the processor, then execute all pipeline stages in order
    processor = ProjectDataProcessor(input_file=INPUT_FILE, output_dir=OUTPUT_DIR)
    processor.run_pipeline()

Loading data from ./project_level_data/combined/combined_projects_raw.csv...
Loaded 971 rows with 1506 columns
Analyzing column patterns...
Column analysis complete. Results saved to ./processed_data/column_inventory_analyzed.json
Column analysis visualizations saved to ./processed_data/visualizations
Standardizing column names...
Found 22 duplicate column sets after standardization.
  Combined duplicate column set: type_blog___new_blog_request_pct (2 variants)
  Combined duplicate column set: type_sub_task_pct (2 variants)
  Combined duplicate column set: type_new_tlp___common_tasks_pct (2 variants)
  Combined duplicate column set: type_blogs___access_to_existing_blog_pct (2 variants)
  Combined duplicate column set: type_backport_sub_task_pct (2 variants)
  Combined duplicate column set: type_sub_task_count (2 variants)
  Combined duplicate column set: type_dev_sub_task_count (2 variants)
  Combined duplicate column set: type_simple_sub_task_pct (2 variants)
  Combined duplicate colu