In [1]:
import pandas as pd
import dtale

# Replace this path with the location of your CSV file
csv_file_path = "../FeatureCleaning/jira_extracted_data/Hyperledger/10001_Sawtooth.csv"

# Read the CSV into a DataFrame
df = pd.read_csv(csv_file_path)

# Start a D-Tale session and open it in the browser
d = dtale.show(df, ignore_duplicate=True, allow_cell_edits=False)
d.open_browser()

In [None]:
import os
import pandas as pd
import glob

# Configuration
base_folder = "../FeatureCleaning/jira_extracted_data"  # Change this to your base folder containing all repositories
output_file = "./TaskLevel/consolidated_task_data.csv"

# Collect all task data into a single dataframe
all_tasks = []

# Traverse through all repository folders
for repo_folder in glob.glob(os.path.join(base_folder, "*")):
    if os.path.isdir(repo_folder):
        repo_name = os.path.basename(repo_folder)
        
        # Find all CSV files in this repository
        csv_files = glob.glob(os.path.join(repo_folder, "*.csv"))
        
        print(f"Processing repository: {repo_name} ({len(csv_files)} files)")
        
        # Process each CSV file
        for csv_file in csv_files:
            file_name = os.path.basename(csv_file)
            print(f"  Reading file: {file_name}")
            
            # Read the CSV file
            try:
                df = pd.read_csv(csv_file)
                
                # Add repository and file information
                df['repository'] = repo_name
                df['source_file'] = file_name
                
                # Append to our collection
                all_tasks.append(df)
                print(f"    Added {len(df)} rows")
            except Exception as e:
                print(f"    Error reading {file_name}: {e}")

# Combine all dataframes
if all_tasks:
    combined_df = pd.concat(all_tasks, ignore_index=True)
    
    # Save the consolidated file
    combined_df.to_csv(output_file, index=False)
    print(f"\nSuccessfully created consolidated task data file:")
    print(f"- File: {output_file}")
    print(f"- Total rows: {len(combined_df)}")
    print(f"- Columns: {', '.join(combined_df.columns)}")
else:
    print("No data found to consolidate")

In [6]:
import pandas as pd
import dtale

# Replace this path with the location of your CSV file
csv_file_path = "./TaskLevel/consolidated_task_data.csv"

# Read the CSV into a DataFrame
df = pd.read_csv(csv_file_path)

# Start a D-Tale session and open it in the browser
d = dtale.show(df, ignore_duplicate=True, allow_cell_edits=False)
d.open_browser()


Columns (10,23,26) have mixed types. Specify dtype option on import or set low_memory=False.



In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import time

# Configuration
csv_file_path = "./TaskLevel/consolidated_task_data.csv"
output_dir = "./TaskLevel/analysis_output"
missing_threshold = 35  # Threshold for dropping columns (%)
chunk_size = 50000  # Process in chunks to manage memory

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

def analyze_full_dataset(filepath, chunk_size=50000):
    """
    Analyze the entire dataset in chunks to find missing values and data types
    
    Args:
        filepath: Path to the CSV file
        chunk_size: Size of chunks to process at once
        
    Returns:
        DataFrame with missing value analysis and data type information
    """
    print(f"Starting analysis of {filepath}")
    start_time = time.time()
    
    # Check if file exists
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"File not found: {filepath}")
    
    # First pass: get column names and count total rows
    print("First pass: counting rows and getting column names...")
    total_rows = 0
    
    # Get column names from the first row
    columns = pd.read_csv(filepath, nrows=0).columns.tolist()
    
    # Count rows without loading entire file
    for chunk in pd.read_csv(filepath, chunksize=chunk_size):
        total_rows += len(chunk)
        
    print(f"Total rows: {total_rows}")
    print(f"Total columns: {len(columns)}")
    
    # Initialize counters for missing values and data type analysis
    missing_counts = {col: 0 for col in columns}
    
    # Data type detection structures
    non_null_sample_values = defaultdict(list)
    unique_values = defaultdict(set)
    numeric_cols = set()
    likely_date_cols = set()
    likely_boolean_cols = set()
    
    # Second pass: analyze missing values and collect sample data for type detection
    print("Second pass: analyzing missing values and data types...")
    chunk_count = 0
    
    for chunk in pd.read_csv(filepath, chunksize=chunk_size):
        chunk_count += 1
        
        # Update missing value counts
        for col in columns:
            if col in chunk.columns:
                missing_counts[col] += chunk[col].isna().sum()
        
        # Collect data for type detection (from first few chunks only)
        if chunk_count <= 5:  # Limit sample collection to first 5 chunks
            for col in columns:
                if col in chunk.columns:
                    # Skip columns with all missing values in this chunk
                    if chunk[col].isna().all():
                        continue
                    
                    # Get non-null values for sampling
                    non_null_vals = chunk[col].dropna()
                    if len(non_null_vals) > 0:
                        # Add a small sample to our collection
                        sample = non_null_vals.sample(min(20, len(non_null_vals)))
                        non_null_sample_values[col].extend(sample.tolist())
                        
                        # Track unique values (up to a limit)
                        if len(unique_values[col]) < 1000:  # Limit unique value tracking
                            unique_values[col].update(sample.tolist())
                        
                        # Check if column is numeric
                        if pd.api.types.is_numeric_dtype(chunk[col]):
                            numeric_cols.add(col)
                        
                        # Check if column might be a date
                        if col not in likely_date_cols and pd.api.types.is_object_dtype(chunk[col]):
                            date_samples = pd.to_datetime(sample, errors='coerce')
                            if date_samples.notna().sum() / len(date_samples) > 0.9:
                                likely_date_cols.add(col)
                        
                        # Check if column might be boolean
                        if col not in likely_boolean_cols:
                            bool_values = [True, False, 0, 1, "0", "1", "true", "false", "True", "False"]
                            if all(str(val).lower() in [str(bv).lower() for bv in bool_values] for val in sample):
                                likely_boolean_cols.add(col)
        
        # Report progress
        if chunk_count % 10 == 0:
            elapsed = time.time() - start_time
            print(f"Processed {chunk_count} chunks ({chunk_count * chunk_size} rows) in {elapsed:.2f} seconds")
    
    # Calculate missing percentages
    missing_percent = {col: (count / total_rows) * 100 for col, count in missing_counts.items()}
    
    # Determine data types
    data_types = {}
    for col in columns:
        if col in likely_date_cols:
            data_types[col] = 'datetime'
        elif col in likely_boolean_cols:
            data_types[col] = 'boolean'
        elif col in numeric_cols:
            data_types[col] = 'numeric'
        elif col in unique_values and len(unique_values[col]) < 20:
            data_types[col] = 'categorical'
        elif col in unique_values and len(unique_values[col]) < 100:
            data_types[col] = 'categorical_high_cardinality'
        else:
            data_types[col] = 'text'
    
    # Create DataFrame with missing value statistics
    missing_df = pd.DataFrame({
        'column': list(missing_counts.keys()),
        'missing_count': list(missing_counts.values()),
        'missing_percent': [missing_percent[col] for col in missing_counts.keys()],
        'data_type': [data_types.get(col, 'unknown') for col in missing_counts.keys()]
    })
    
    # Sort by missing percentage
    missing_df = missing_df.sort_values('missing_percent', ascending=False)
    
    # Add action based on threshold
    missing_df['action'] = np.where(
        missing_df['missing_percent'] > missing_threshold, 
        'drop', 
        'impute'
    )
    
    # Add sample values
    missing_df['sample_values'] = missing_df['column'].apply(
        lambda col: str(non_null_sample_values.get(col, [])[:3])
    )
    
    # Add unique value counts
    missing_df['unique_count'] = missing_df['column'].apply(
        lambda col: len(unique_values.get(col, set()))
    )
    
    elapsed = time.time() - start_time
    print(f"Analysis completed in {elapsed:.2f} seconds")
    
    return missing_df, total_rows

def determine_imputation_strategies(missing_df):
    """
    Determine appropriate imputation strategies based on data types and missing percentages
    
    Args:
        missing_df: DataFrame with missing value analysis
        
    Returns:
        DataFrame with recommended imputation strategies
    """
    # Add imputation strategy column
    missing_df['imputation_strategy'] = 'none'
    
    # Strategies based on data type
    for idx, row in missing_df.iterrows():
        col = row['column']
        dtype = row['data_type']
        missing_pct = row['missing_percent']
        
        if row['action'] == 'drop':
            missing_df.loc[idx, 'imputation_strategy'] = 'drop_column'
            continue
        
        # Low missing values (<5%)
        if missing_pct < 5:
            if dtype == 'numeric':
                missing_df.loc[idx, 'imputation_strategy'] = 'median'
            elif dtype in ['categorical', 'categorical_high_cardinality', 'boolean']:
                missing_df.loc[idx, 'imputation_strategy'] = 'mode'
            elif dtype == 'datetime':
                missing_df.loc[idx, 'imputation_strategy'] = 'drop_rows'
            else:
                missing_df.loc[idx, 'imputation_strategy'] = 'drop_rows'
                
        # Medium missing values (5-15%)
        elif missing_pct < 15:
            if dtype == 'numeric':
                missing_df.loc[idx, 'imputation_strategy'] = 'median_by_group'
            elif dtype in ['categorical', 'categorical_high_cardinality', 'boolean']:
                missing_df.loc[idx, 'imputation_strategy'] = 'mode_by_group'
            elif dtype == 'datetime':
                missing_df.loc[idx, 'imputation_strategy'] = 'drop_rows'
            else:
                missing_df.loc[idx, 'imputation_strategy'] = 'drop_rows'
                
        # High missing values (15-35%)
        else:
            if dtype == 'numeric':
                missing_df.loc[idx, 'imputation_strategy'] = 'median_by_group'
            elif dtype in ['categorical', 'boolean']:
                missing_df.loc[idx, 'imputation_strategy'] = 'new_category'
            elif dtype == 'categorical_high_cardinality':
                missing_df.loc[idx, 'imputation_strategy'] = 'mode_by_group'
            elif dtype == 'datetime':
                missing_df.loc[idx, 'imputation_strategy'] = 'drop_rows'
            else:
                missing_df.loc[idx, 'imputation_strategy'] = 'drop_rows'
    
    return missing_df

def generate_imputation_code(missing_df):
    """
    Generate Python code for implementing the recommended imputation strategies
    
    Args:
        missing_df: DataFrame with missing value analysis and strategies
        
    Returns:
        String containing Python code for imputation
    """
    # Group columns by imputation strategy
    strategy_groups = {}
    for _, row in missing_df.iterrows():
        strategy = row['imputation_strategy']
        if strategy not in strategy_groups:
            strategy_groups[strategy] = []
        
        strategy_groups[strategy].append((row['column'], row['data_type']))
    
    # Generate code
    code_lines = [
        "import pandas as pd",
        "import numpy as np",
        "from sklearn.impute import SimpleImputer",
        "",
        "# Function to impute missing values based on recommended strategies",
        "def impute_missing_values(df, grouping_cols=None):",
        "    \"\"\"",
        "    Impute missing values using recommended strategies",
        "    ",
        "    Args:",
        "        df: DataFrame to process",
        "        grouping_cols: Columns to group by for group-based imputation",
        "    ",
        "    Returns:",
        "        DataFrame with imputed values",
        "    \"\"\"",
        "    # Make a copy to avoid modifying the original",
        "    imputed_df = df.copy()",
        "",
        "    # Use default grouping columns if none provided",
        "    if grouping_cols is None:",
        "        # Check if these columns exist in the dataframe",
        "        possible_groups = ['fields.issuetype.name', 'fields.priority.name', 'fields.project.key']",
        "        grouping_cols = [col for col in possible_groups if col in df.columns]",
        "",
        "    # If no grouping columns are available, use median/mode without grouping",
        "    has_groups = len(grouping_cols) > 0",
        "",
    ]
    
    # Add drop column code if needed
    if 'drop_column' in strategy_groups and strategy_groups['drop_column']:
        cols_to_drop = [col for col, _ in strategy_groups['drop_column']]
        code_lines.extend([
            "    # 1. Drop columns with too many missing values",
            f"    cols_to_drop = {cols_to_drop}",
            "    print(f\"Dropping {len(cols_to_drop)} columns with >35% missing values\")",
            "    imputed_df = imputed_df.drop(columns=[col for col in cols_to_drop if col in imputed_df.columns])",
            "",
        ])
    
    # Add median imputation code if needed
    if 'median' in strategy_groups and strategy_groups['median']:
        numeric_cols = [col for col, dtype in strategy_groups['median'] if dtype == 'numeric']
        if numeric_cols:
            code_lines.extend([
                "    # 2. Simple median imputation for numeric columns",
                f"    median_cols = {numeric_cols}",
                "    existing_median_cols = [col for col in median_cols if col in imputed_df.columns]",
                "    if existing_median_cols:",
                "        print(f\"Applying median imputation to {len(existing_median_cols)} columns\")",
                "        imputer = SimpleImputer(strategy='median')",
                "        imputed_df[existing_median_cols] = imputer.fit_transform(imputed_df[existing_median_cols])",
                "",
            ])
    
    # Add mode imputation code if needed
    if 'mode' in strategy_groups and strategy_groups['mode']:
        categorical_cols = [col for col, dtype in strategy_groups['mode'] 
                          if dtype in ['categorical', 'categorical_high_cardinality', 'boolean']]
        if categorical_cols:
            code_lines.extend([
                "    # 3. Simple mode imputation for categorical columns",
                f"    mode_cols = {categorical_cols}",
                "    existing_mode_cols = [col for col in mode_cols if col in imputed_df.columns]",
                "    if existing_mode_cols:",
                "        print(f\"Applying mode imputation to {len(existing_mode_cols)} columns\")",
                "        for col in existing_mode_cols:",
                "            mode_val = imputed_df[col].mode()[0] if not imputed_df[col].mode().empty else None",
                "            imputed_df[col] = imputed_df[col].fillna(mode_val)",
                "",
            ])
    
    # Add grouped median imputation code if needed
    if 'median_by_group' in strategy_groups and strategy_groups['median_by_group']:
        grouped_numeric_cols = [col for col, dtype in strategy_groups['median_by_group'] if dtype == 'numeric']
        if grouped_numeric_cols:
            code_lines.extend([
                "    # 4. Grouped median imputation for numeric columns",
                f"    grouped_median_cols = {grouped_numeric_cols}",
                "    existing_grouped_median_cols = [col for col in grouped_median_cols if col in imputed_df.columns]",
                "    if existing_grouped_median_cols and has_groups:",
                "        print(f\"Applying grouped median imputation to {len(existing_grouped_median_cols)} columns\")",
                "        for col in existing_grouped_median_cols:",
                "            # Calculate medians by group",
                "            group_medians = imputed_df.groupby(grouping_cols)[col].median()",
                "            # For each combination of grouping values, fill with the group median",
                "            for group_values, median_value in group_medians.items():",
                "                if not isinstance(group_values, tuple):",
                "                    group_values = (group_values,)",
                "                if pd.notna(median_value):",
                "                    # Create a mask for this group",
                "                    mask = pd.Series(True, index=imputed_df.index)",
                "                    for i, group_col in enumerate(grouping_cols):",
                "                        mask = mask & (imputed_df[group_col] == group_values[i])",
                "                    # Apply the group median to missing values in this group",
                "                    mask = mask & imputed_df[col].isna()",
                "                    imputed_df.loc[mask, col] = median_value",
                "            # For any remaining NaNs, use overall median",
                "            overall_median = imputed_df[col].median()",
                "            imputed_df[col] = imputed_df[col].fillna(overall_median)",
                "    elif existing_grouped_median_cols:",
                "        # Fall back to simple median if no grouping columns",
                "        imputer = SimpleImputer(strategy='median')",
                "        imputed_df[existing_grouped_median_cols] = imputer.fit_transform(imputed_df[existing_grouped_median_cols])",
                "",
            ])
    
    # Add grouped mode imputation code if needed
    if 'mode_by_group' in strategy_groups and strategy_groups['mode_by_group']:
        grouped_cat_cols = [col for col, dtype in strategy_groups['mode_by_group'] 
                           if dtype in ['categorical', 'categorical_high_cardinality', 'boolean']]
        if grouped_cat_cols:
            code_lines.extend([
                "    # 5. Grouped mode imputation for categorical columns",
                f"    grouped_mode_cols = {grouped_cat_cols}",
                "    existing_grouped_mode_cols = [col for col in grouped_mode_cols if col in imputed_df.columns]",
                "    if existing_grouped_mode_cols and has_groups:",
                "        print(f\"Applying grouped mode imputation to {len(existing_grouped_mode_cols)} columns\")",
                "        for col in existing_grouped_mode_cols:",
                "            # Calculate modes by group",
                "            for group_values, group_df in imputed_df.groupby(grouping_cols):",
                "                if not isinstance(group_values, tuple):",
                "                    group_values = (group_values,)",
                "                # Get mode for this group",
                "                mode_series = group_df[col].mode()",
                "                if not mode_series.empty:",
                "                    mode_value = mode_series[0]",
                "                    # Create a mask for this group",
                "                    mask = pd.Series(True, index=imputed_df.index)",
                "                    for i, group_col in enumerate(grouping_cols):",
                "                        mask = mask & (imputed_df[group_col] == group_values[i])",
                "                    # Apply the group mode to missing values in this group",
                "                    mask = mask & imputed_df[col].isna()",
                "                    imputed_df.loc[mask, col] = mode_value",
                "            # For any remaining NaNs, use overall mode",
                "            mode_val = imputed_df[col].mode()[0] if not imputed_df[col].mode().empty else None",
                "            imputed_df[col] = imputed_df[col].fillna(mode_val)",
                "    elif existing_grouped_mode_cols:",
                "        # Fall back to simple mode if no grouping columns",
                "        for col in existing_grouped_mode_cols:",
                "            mode_val = imputed_df[col].mode()[0] if not imputed_df[col].mode().empty else None",
                "            imputed_df[col] = imputed_df[col].fillna(mode_val)",
                "",
            ])
    
    # Add new category imputation code if needed
    if 'new_category' in strategy_groups and strategy_groups['new_category']:
        new_cat_cols = [col for col, dtype in strategy_groups['new_category'] 
                       if dtype in ['categorical', 'boolean']]
        if new_cat_cols:
            code_lines.extend([
                "    # 6. New category imputation for categorical columns",
                f"    new_category_cols = {new_cat_cols}",
                "    existing_new_cat_cols = [col for col in new_category_cols if col in imputed_df.columns]",
                "    if existing_new_cat_cols:",
                "        print(f\"Applying new category imputation to {len(existing_new_cat_cols)} columns\")",
                "        for col in existing_new_cat_cols:",
                "            # Fill missing with a new category 'Unknown'",
                "            imputed_df[col] = imputed_df[col].fillna('Unknown')",
                "",
            ])
    
    # Add code to drop rows with remaining NaNs in key columns
    code_lines.extend([
        "    # 7. Finally, drop rows with remaining NaNs in essential columns",
        "    essential_columns = ['fields.issuetype.name', 'fields.created', 'key']",
        "    existing_essential = [col for col in essential_columns if col in imputed_df.columns]",
        "    if existing_essential:",
        "        before_rows = len(imputed_df)",
        "        imputed_df = imputed_df.dropna(subset=existing_essential)",
        "        dropped_rows = before_rows - len(imputed_df)",
        "        print(f\"Dropped {dropped_rows} rows with missing values in essential columns\")",
        "",
        "    return imputed_df",
        "",
        "# Example usage:",
        "# df = pd.read_csv('your_file.csv')",
        "# imputed_df = impute_missing_values(df)",
        "# imputed_df.to_csv('imputed_data.csv', index=False)",
    ])
    
    return "\n".join(code_lines)

# Main execution
try:
    # Check if file exists
    if not os.path.exists(csv_file_path):
        print(f"Error: File not found at {csv_file_path}")
    else:
        # Analyze full dataset
        missing_analysis, total_rows = analyze_full_dataset(csv_file_path, chunk_size)
        
        # Determine imputation strategies
        missing_analysis = determine_imputation_strategies(missing_analysis)
        
        # Save missing value analysis
        analysis_output = os.path.join(output_dir, 'missing_value_analysis.csv')
        missing_analysis.to_csv(analysis_output, index=False)
        print(f"Missing value analysis saved to {analysis_output}")
        
        # Generate imputation code
        imputation_code = generate_imputation_code(missing_analysis)
        code_output = os.path.join(output_dir, 'imputation_code.py')
        with open(code_output, 'w') as f:
            f.write(imputation_code)
        print(f"Imputation code generated and saved to {code_output}")
        
        # Create summary visualizations
        plt.figure(figsize=(10, 8))
        
        # Create histogram of missing value percentages
        plt.subplot(2, 1, 1)
        sns.histplot(missing_analysis['missing_percent'], bins=20)
        plt.axvline(x=missing_threshold, color='red', linestyle='--', 
                    label=f'Drop threshold ({missing_threshold}%)')
        plt.title('Distribution of Missing Value Percentages')
        plt.xlabel('Missing Percentage')
        plt.ylabel('Number of Columns')
        plt.legend()
        
        # Create bar chart of action counts
        plt.subplot(2, 1, 2)
        action_counts = missing_analysis['action'].value_counts()
        sns.barplot(x=action_counts.index, y=action_counts.values)
        plt.title('Recommended Actions for Columns')
        plt.xlabel('Action')
        plt.ylabel('Number of Columns')
        
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'missing_value_summary.png'))
        
        # Create pie chart of data types
        plt.figure(figsize=(10, 6))
        type_counts = missing_analysis['data_type'].value_counts()
        plt.pie(type_counts, labels=type_counts.index, autopct='%1.1f%%')
        plt.title('Column Data Types')
        plt.savefig(os.path.join(output_dir, 'data_type_distribution.png'))
        
        # Print summary statistics
        print("\n=== MISSING VALUE ANALYSIS SUMMARY ===")
        print(f"Total rows in dataset: {total_rows}")
        print(f"Total columns: {len(missing_analysis)}")
        print(f"Columns to drop (>{missing_threshold}% missing): {missing_analysis['action'].value_counts()['drop']}")
        print(f"Columns to impute: {missing_analysis['action'].value_counts()['impute']}")
        
        # Print data type distribution
        print("\nData type distribution:")
        for dtype, count in type_counts.items():
            print(f"  - {dtype}: {count} columns")
        
        # Print top 10 columns with highest missing percentages
        print("\nTop 10 columns with highest missing percentages:")
        for _, row in missing_analysis.head(10).iterrows():
            print(f"  - {row['column']}: {row['missing_percent']:.2f}% missing ({row['data_type']})")

except Exception as e:
    print(f"An error occurred: {str(e)}")

In [None]:
import pandas as pd
import numpy as np
import os
import json

# Configuration
input_file = "./TaskLevel/consolidated_task_data_filtered.csv"
output_file = "./TaskLevel/processed_task_data.csv"
analysis_file = "./TaskLevel/data_analysis_report.json"  # Adjusted path
sample_size = None  # Set this to None to process all rows

# Load analysis results with error handling
try:
    with open(analysis_file, 'r') as f:
        analysis = json.load(f)
    print(f"Loaded data analysis from {analysis_file}")
except FileNotFoundError:
    print(f"Warning: Analysis file {analysis_file} not found. Using default data types.")
    analysis = {'columns': {}, 'missing_value_columns': []}

# Define data types (with fallback if analysis file wasn't found)
dtype_map = {}
if analysis['columns']:
    for col, info in analysis['columns'].items():
        if info['dtype'] == 'categorical' or info['dtype'] == 'text':
            dtype_map[col] = 'string'
        elif info['dtype'] == 'float64' or info['dtype'] == 'int64':
            dtype_map[col] = 'float64'  # Use float64 for all numeric to handle NaN
else:
    # Fallback type mapping for critical columns
    critical_columns = {
        'fields.issuetype.name': 'string',
        'fields.priority.name': 'string',
        'fields.project.key': 'string',
        'fields.status.name': 'string',
        'fields.creator.name': 'string',
        'is_completed': 'float64',
        'is_resolved': 'float64',
        'type_task': 'float64',
        'type_bug': 'float64',
        'resolution_time_days': 'float64',
        'age_days': 'float64',
        'type_sub_task': 'float64'
    }
    dtype_map.update(critical_columns)

# Define columns to drop
columns_to_drop = []
if 'missing_value_columns' in analysis:
    columns_to_drop = [col['column'] for col in analysis['missing_value_columns'] 
                      if col['missing_percentage'] > 35]
columns_to_drop.extend(['fields.creator.avatarUrls.48x48', 'fields.creator.avatarUrls.24x24', 
                        'fields.creator.avatarUrls.16x16', 'fields.creator.avatarUrls.32x32', 
                        'fields.creator.self'])

# Set pandas options for safer processing
pd.set_option('mode.use_inf_as_na', True)

try:
    # Read data with appropriate types and error handling
    if sample_size:
        print(f"Reading {sample_size} rows from {input_file}")
        df = pd.read_csv(input_file, dtype=dtype_map, nrows=sample_size, low_memory=False)
    else:
        print(f"Reading all rows from {input_file}")
        df = pd.read_csv(input_file, dtype=dtype_map, low_memory=False)

    print(f"Loaded {len(df)} rows")

    # Drop high-missing columns
    print(f"Dropping columns with high missing rates or URL data")
    columns_to_drop = [col for col in columns_to_drop if col in df.columns]
    if columns_to_drop:
        df = df.drop(columns=columns_to_drop)
        print(f"Dropped {len(columns_to_drop)} columns")

    # Explicitly convert date columns
    date_columns = ['fields.created', 'fields.updated', 'fields.resolutiondate']
    print("Converting date columns to datetime...")
    for col in date_columns:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors='coerce')

    print("Creating features for effort estimation...")

    # Process each feature carefully with try/except blocks
    try:
        # 1. Process datetime fields for temporal features
        if 'fields.created' in df.columns:
            print("Creating temporal features...")
            df['created_day_of_week'] = df['fields.created'].dt.dayofweek
            df['created_is_weekend'] = (df['fields.created'].dt.dayofweek >= 5).astype(float)
            df['created_hour'] = df['fields.created'].dt.hour
            df['created_month'] = df['fields.created'].dt.month
            df['created_year'] = df['fields.created'].dt.year
    except Exception as e:
        print(f"Error creating temporal features: {str(e)}")

    try:
        # 2. Calculate resolution hours
        if all(col in df.columns for col in ['fields.created', 'fields.resolutiondate']):
            print("Calculating resolution hours...")
            df['resolution_hours'] = np.nan
            mask = df['fields.resolutiondate'].notna()
            if mask.any():
                df.loc[mask, 'resolution_hours'] = (
                    (df.loc[mask, 'fields.resolutiondate'] - df.loc[mask, 'fields.created']).dt.total_seconds() / 3600
                )
            df['log_resolution_hours'] = np.log1p(df['resolution_hours'].fillna(0).clip(lower=0))
    except Exception as e:
        print(f"Error calculating resolution hours: {str(e)}")

    try:
        # 3. Create one-hot encodings
        if 'fields.issuetype.name' in df.columns:
            print("Creating issue type encodings...")
            for issue_type in ['Bug', 'Task', 'Story', 'Improvement', 'New Feature', 'Epic', 'Sub-task']:
                col_name = f'is_type_{issue_type.lower().replace(" ", "_")}'
                df[col_name] = (df['fields.issuetype.name'].str.lower() == issue_type.lower()).astype(float)
    except Exception as e:
        print(f"Error creating issue type encodings: {str(e)}")

    try:
        if 'fields.priority.name' in df.columns:
            print("Creating priority encodings...")
            for priority in ['Blocker', 'Critical', 'Major', 'Minor', 'Trivial']:
                col_name = f'is_priority_{priority.lower()}'
                df[col_name] = df['fields.priority.name'].str.lower().str.contains(priority.lower(), na=False).astype(float)
    except Exception as e:
        print(f"Error creating priority encodings: {str(e)}")

    # Imputation section with more robust error handling
    print("Performing imputation for missing values...")

    # Calculate age in days for all issues (needed for imputation)
    try:
        if 'fields.created' in df.columns and 'age_days' not in df.columns:
            print("Calculating age_days...")
            current_time = pd.Timestamp.now()
            if 'fields.created' in df.columns and len(df) > 0 and df['fields.created'].iloc[0] is not None:
                if hasattr(df['fields.created'].iloc[0], 'tzinfo') and df['fields.created'].iloc[0].tzinfo is not None:
                    current_time = current_time.tz_localize('UTC')
            df['age_days'] = (current_time - df['fields.created']).dt.total_seconds() / (24 * 3600)
    except Exception as e:
        print(f"Error calculating age_days: {str(e)}")
        df['age_days'] = 30.0  # Fallback value

    # For unresolved issues, impute resolution hours
    try:
        unresolved_mask = df['fields.resolutiondate'].isna()
        if unresolved_mask.any():
            print(f"Imputing resolution hours for {unresolved_mask.sum()} unresolved issues")
            
            # First get global_median as fallback
            resolved_mask = ~unresolved_mask
            if resolved_mask.any() and 'resolution_hours' in df.columns:
                global_median = df.loc[resolved_mask, 'resolution_hours'].median()
                if pd.isna(global_median):  # If still no valid median
                    global_median = 24.0  # Default to 24 hours
            else:
                global_median = 24.0  # Default value
                
            # Try group-based imputation
            try:
                if all(col in df.columns for col in ['fields.issuetype.name', 'fields.priority.name']):
                    # Group by issue type and priority
                    for name, group in df[resolved_mask].groupby(['fields.issuetype.name', 'fields.priority.name']):
                        if len(group) > 0 and 'resolution_hours' in group.columns:
                            median_hours = group['resolution_hours'].median()
                            if pd.isna(median_hours):
                                median_hours = global_median
                                
                            # Create safe mask for this group
                            if isinstance(name, tuple) and len(name) == 2:
                                issue_type, priority = name
                                type_mask = (df['fields.issuetype.name'] == issue_type)
                                prio_mask = (df['fields.priority.name'] == priority)
                                group_mask = type_mask & prio_mask & unresolved_mask
                                
                                if group_mask.any():
                                    # Apply age-based adjustment
                                    age_factor = 1.0 + 0.1 * (df.loc[group_mask, 'age_days'] / 30.0).clip(0, 10.0)
                                    df.loc[group_mask, 'resolution_hours'] = median_hours * age_factor
            except Exception as e:
                print(f"Error in group-based imputation: {str(e)}")
                
            # For any remaining missing values, use global median
            missing_hours = df['resolution_hours'].isna() & unresolved_mask
            if missing_hours.any():
                age_factor = 1.0 + 0.1 * (df.loc[missing_hours, 'age_days'] / 30.0).clip(0, 10.0)
                df.loc[missing_hours, 'resolution_hours'] = global_median * age_factor
                
            # Update log transform
            if 'log_resolution_hours' in df.columns:
                imputed_mask = unresolved_mask & df['resolution_hours'].notna() & (df['resolution_hours'] > 0)
                if imputed_mask.any():
                    df.loc[imputed_mask, 'log_resolution_hours'] = np.log1p(df.loc[imputed_mask, 'resolution_hours'])
    except Exception as e:
        print(f"Error imputing unresolved issues: {str(e)}")

    # Save processed dataset
    print(f"Saving processed dataset with {len(df.columns)} columns to {output_file}")
    df.to_csv(output_file, index=False)

    # Print summary of new features
    try:
        original_columns = set(pd.read_csv(input_file, nrows=0).columns)
        new_columns = [col for col in df.columns if col not in original_columns]
        
        print(f"\nCreated {len(new_columns)} new features:")
        for col in sorted(new_columns):
            non_null = df[col].notna().sum()
            print(f"- {col}: {non_null} non-null values ({non_null/len(df)*100:.1f}%)")
    except Exception as e:
        print(f"Error generating feature summary: {str(e)}")

    print("\nProcessing complete!")

except Exception as e:
    print(f"Critical error in processing: {str(e)}")
    import traceback
    traceback.print_exc()

Loaded data analysis from ./TaskLevel/data_analysis_report.json
Reading 1000 rows from ./TaskLevel/consolidated_task_data_filtered.csv
Loaded 1000 rows
Dropping columns with high missing rates or URL data
Dropped 9 columns
Converting date columns to datetime...
Creating features for effort estimation...
Creating temporal features...
Calculating resolution hours...
Creating issue type encodings...
Creating priority encodings...
Performing imputation for missing values...
Imputing resolution hours for 274 unresolved issues
Saving processed dataset with 55 columns to ./TaskLevel/processed_task_data.csv

Created 19 new features:
- created_day_of_week: 1000 non-null values (100.0%)
- created_hour: 1000 non-null values (100.0%)
- created_is_weekend: 1000 non-null values (100.0%)
- created_month: 1000 non-null values (100.0%)
- created_year: 1000 non-null values (100.0%)
- is_priority_blocker: 1000 non-null values (100.0%)
- is_priority_critical: 1000 non-null values (100.0%)
- is_priority_ma


use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.



In [24]:
import pandas as pd
import dtale

# Replace this path with the location of your CSV file
csv_file_path = "./TaskLevel/processed_task_data.csv"

# Read the CSV into a DataFrame
df = pd.read_csv(csv_file_path)

# Start a D-Tale session and open it in the browser
d = dtale.show(df, ignore_duplicate=True, allow_cell_edits=False)
d.open_browser()