In [1]:
import pandas as pd
import numpy as np
import os

def merge_task_and_project_data(task_data_path, project_data_path, output_dir='merged_task_data'):
    """
    Merge task-level and project-level datasets
    
    Parameters:
    - task_data_path: Path to the cleaned task-level dataset
    - project_data_path: Path to the project-level dataset
    - output_dir: Directory to save merged dataset
    
    Returns:
    - Merged DataFrame
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # 1. Load Datasets
    print("Loading datasets...")
    task_df = pd.read_csv(task_data_path)
    project_df = pd.read_csv(project_data_path)
    
    # Print initial dataset info
    print(f"Task Dataset Shape: {task_df.shape}")
    print(f"Project Dataset Shape: {project_df.shape}")
    
    # 2. Identify Project ID Columns
    # Try to find project ID columns in both datasets
    task_project_id_candidates = [
        'fields.project.id', 
        'project_id', 
        'remainder__project_id'
    ]
    
    project_id_candidates = [
        'project_id', 
        'remainder__project_id', 
        'fields.project.id'
    ]
    
    # Find matching project ID column
    task_project_id_col = next((col for col in task_project_id_candidates if col in task_df.columns), None)
    project_id_col = next((col for col in project_id_candidates if col in project_df.columns), None)
    
    if not task_project_id_col or not project_id_col:
        raise ValueError(f"Could not find matching project ID columns. Task columns: {task_df.columns}, Project columns: {project_df.columns}")
    
    print(f"Using Task Project ID Column: {task_project_id_col}")
    print(f"Using Project ID Column: {project_id_col}")
    
    # 3. Rename columns for consistent merging
    task_df = task_df.rename(columns={task_project_id_col: 'project_id'})
    project_df = project_df.rename(columns={project_id_col: 'project_id'})
    
    # 4. Select Relevant Project Features
    project_context_features = [
        # Project size/scope
        'time_power__project_duration_days',
        'count_std__total_issues',
        
        # Project composition
        'pct_minmax__type_bug_pct',
        'pct_minmax__type_task_pct',
        'pct_minmax__type_new_feature_pct',
        'pct_minmax__type_epic_pct',
        'pct_minmax__type_improvement_pct',
        'pct_minmax__type_story_pct',
        'pct_minmax__type_documentation_pct',
        
        # Priority distribution
        'pct_minmax__priority_critical_pct',
        'pct_minmax__priority_blocker_pct',
        'pct_minmax__priority_high_pct',
        'pct_minmax__priority_low_pct',
        
        # Team metrics
        'remainder__team_size_creators',
        'remainder__team_size_assignees',
        'remainder__team_size_combined',
        
        # Complexity metrics
        'stat_robust__weighted_priority_score',
        'stat_robust__issue_type_entropy',
        'stat_robust__high_to_low_priority_ratio',
        'stat_robust__bug_ratio',
        
        # Resolution time metrics
        'avg_resolution_hours',
        'median_resolution_hours',
        'total_resolution_hours'
    ]
    
    # Filter to features that actually exist in the project dataframe
    project_context_features = [f for f in project_context_features if f in project_df.columns]
    
    # Select features to merge
    project_merge_df = project_df[['project_id'] + project_context_features]
    
    # 5. Merge Datasets
    merged_df = pd.merge(
        task_df, 
        project_merge_df, 
        on='project_id', 
        how='inner'  # Use inner join to ensure only matching projects
    )
    
    # 6. Log-transform resolution hours if not already done
    if 'log_resolution_hours' not in merged_df.columns:
        if 'resolution_hours' in merged_df.columns:
            merged_df['log_resolution_hours'] = np.log1p(merged_df['resolution_hours'])
    
    # 7. Basic Data Validation
    print(f"\nMerged Dataset Shape: {merged_df.shape}")
    print(f"Number of Unique Projects: {merged_df['project_id'].nunique()}")
    print("\nMerged Dataset Columns:")
    print(merged_df.columns.tolist())
    
    # 8. Save Merged Dataset
    output_path = os.path.join(output_dir, 'merged_project_task_data.csv')
    merged_df.to_csv(output_path, index=False)
    print(f"\nMerged dataset saved to: {output_path}")
    
    return merged_df

# Example usage
if __name__ == "__main__":
    # Update these paths to match your specific dataset locations
    task_data_path = './cleaned_jira_dataset.csv'
    project_data_path = 'prepared_processed_data/common_features_scaled_with_original_targets.csv'
    
    # Merge the datasets
    merged_data = merge_task_and_project_data(
        task_data_path, 
        project_data_path
    )

Loading datasets...


  task_df = pd.read_csv(task_data_path)


Task Dataset Shape: (2259837, 40)
Project Dataset Shape: (971, 178)
Using Task Project ID Column: fields.project.id
Using Project ID Column: remainder__project_id

Merged Dataset Shape: (1575833, 63)
Number of Unique Projects: 711

Merged Dataset Columns:
['id', 'fields.issuetype.id', 'fields.issuetype.name', 'project_id', 'fields.project.key', 'fields.project.name', 'fields.created', 'fields.priority.name', 'fields.priority.id', 'fields.updated', 'fields.status.name', 'fields.creator.active', 'priority_id', 'issue_type_id', 'is_completed', 'type_task', 'type_bug', 'inward_count', 'outward_count', 'is_resolved', 'age_days', 'type_sub_task', 'created_is_weekend', 'created_hour', 'created_month', 'created_year', 'resolution_hours', 'log_resolution_hours', 'is_type_bug', 'is_type_task', 'is_type_story', 'is_type_improvement', 'is_type_new_feature', 'is_type_epic', 'is_type_sub-task', 'is_priority_blocker', 'is_priority_critical', 'is_priority_major', 'is_priority_minor', 'is_priority_triv

In [None]:
import pandas as pd
import dtale

# Replace this path with the location of your CSV file
csv_file_path = "./merged_task_data/merged_project_task_data.csv"

# Read the CSV into a DataFrame
df = pd.read_csv(csv_file_path)

# Start a D-Tale session and open it in the browser
d = dtale.show(df, ignore_duplicate=True, allow_cell_edits=False)
d.open_browser()

Loading task dataset...


  task_df = pd.read_csv(task_data_path)


Training Random Forest Regressor (Task-Only)...

Task-Only Model Performance:
Mean Squared Error: 117444005.7987
Root Mean Squared Error: 10837.1586
Mean Absolute Error: 4047.7512
R-squared: 0.0145
Mean Magnitude of Relative Error: inf

Task-Only Analysis complete. Check the 'task_only_results' directory.
