In [None]:
from pymongo import MongoClient
import pandas as pd
import numpy as np
import json
import ast
import gc
import os
import tempfile
from datetime import datetime

# Connect to the database.
# The connection string is taken from the MONGO_URI environment variable so that
# credentials do not need to be stored in the notebook. The previous hard-coded
# local-development URI is kept only as a fallback for backward compatibility.
client = MongoClient(os.environ.get("MONGO_URI", "mongodb://admin:password@localhost:27017/"))
db = client['JiraRepos']

# Load the Jira Data Sources JSON. Its top-level keys are used further down as
# the default list of repository (collection) names to process.
# The encoding is given explicitly so the file is read the same way on every platform.
with open('../0. DataDefinition/jira_data_sources.json', encoding='utf-8') as f:
    jira_data_sources = json.load(f)

def fix_data_types(df, numeric_threshold=0.9):
    """
    Infer a better dtype for every non-list column of ``df`` (modified in place).

    Columns holding at least one Python ``list`` value are left untouched.
    For every other column the values are coerced to numbers; when the share of
    values that survive the coercion is at least ``numeric_threshold`` the
    coerced (numeric) column replaces the original, otherwise the column is
    cast to the ``category`` dtype. No date handling is performed here.

    Parameters:
        df (pd.DataFrame): Frame whose columns should be re-typed.
        numeric_threshold (float): Minimum fraction of numeric-convertible
            values required to treat a column as numeric.

    Returns:
        pd.DataFrame: The same ``df`` object, with converted columns.
    """
    for column_name in df.columns:
        values = df[column_name]

        # List-valued columns cannot be meaningfully coerced; skip them.
        contains_list = values.apply(lambda item: isinstance(item, list)).any()
        if contains_list:
            continue

        as_numbers = pd.to_numeric(values, errors='coerce')
        numeric_share = as_numbers.notnull().mean()

        if numeric_share >= numeric_threshold:
            df[column_name] = as_numbers
            continue

        df[column_name] = values.astype('category')

    return df

def _jira_safe_feature_name(value):
    """Turn a category label into a lowercase, underscore-separated column suffix."""
    return str(value).lower().replace(' ', '_').replace('-', '_')


def _jira_parse_date_str(x):
    """Map null, empty, or "Missing" date values to pd.NaT; pass anything else through unchanged."""
    if pd.isnull(x):
        return pd.NaT
    s = str(x).strip()
    if s.lower() == "missing" or s == "":
        return pd.NaT
    return x


def _jira_convert_date_columns(df, date_columns):
    """Convert the listed columns (when present) to timezone-aware UTC datetimes; unparseable values become NaT."""
    for col in date_columns:
        if col in df.columns:
            cleaned = df[col].apply(_jira_parse_date_str)
            df[col] = pd.to_datetime(cleaned, errors="coerce", utc=True)
    return df


def _jira_count_issuelinks(issuelinks):
    """Return ``{'inward_count': n, 'outward_count': m}`` for one raw ``fields.issuelinks`` value."""
    # After the CSV round-trip the value is normally the text form of a list.
    if isinstance(issuelinks, str):
        try:
            issuelinks = ast.literal_eval(issuelinks)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a parseable Python literal: treat as "no links".
            return {'inward_count': 0, 'outward_count': 0}

    # Missing values (NaN/None) and anything that is not a sequence of links count as zero.
    if not isinstance(issuelinks, (list, tuple)):
        return {'inward_count': 0, 'outward_count': 0}

    links = [link for link in issuelinks if isinstance(link, dict)]
    return {
        'inward_count': sum(1 for link in links if 'inwardIssue' in link),
        'outward_count': sum(1 for link in links if 'outwardIssue' in link),
    }


def _jira_add_indicator_columns(df, source_col, prefix):
    """Add one 0/1 column ``{prefix}_{label}`` per distinct non-null value of ``source_col``."""
    created = set()
    for label in df[source_col].dropna().unique():
        column = f'{prefix}_{_jira_safe_feature_name(label)}'
        flag = (df[source_col] == label).astype(int)
        # Two labels can sanitise to the same column name (e.g. "Sub-task" and
        # "Sub task"); merge them instead of letting the later one overwrite the earlier.
        if column in created:
            flag = df[column] | flag
        df[column] = flag
        created.add(column)
    return df


def _jira_dump_batches(collection, query_filter, projection, jira_name, batch_size, max_batches, temp_dir):
    """Write matching documents to per-batch CSV files in ``temp_dir`` and return the file paths."""
    batch_files = []
    total_processed = 0
    batch_count = 0
    last_id = None

    while True:
        # Check if we've reached the maximum number of batches
        if max_batches is not None and batch_count >= max_batches:
            print(f"Reached maximum batch limit ({max_batches}). Stopping extraction.")
            break

        # Page by _id range with an explicit sort. Without a sort the result
        # order is not guaranteed, so skip()/limit() paging can return a
        # document twice or miss it; range paging also avoids re-scanning the
        # skipped documents on every batch.
        page_filter = dict(query_filter)
        if last_id is not None:
            page_filter["_id"] = {"$gt": last_id}

        # A fresh cursor is still created for each batch.
        cursor = collection.find(page_filter, projection).sort("_id", 1).limit(batch_size)
        batch = list(cursor)
        if not batch:
            break
        last_id = batch[-1]["_id"]

        # Flatten nested documents into dotted column names
        batch_df = pd.json_normalize(batch, sep='.')

        # Fix data types
        batch_df = fix_data_types(batch_df)

        # Add repository name
        batch_df['repository'] = jira_name

        # Save to CSV
        batch_file = os.path.join(temp_dir, f"{jira_name}_batch_{batch_count}.csv")
        batch_df.to_csv(batch_file, index=False)
        batch_files.append(batch_file)

        batch_count += 1
        total_processed += len(batch)
        print(f"  - Processed batch {batch_count} ({len(batch)} issues, {total_processed} total)")

        # Free memory before fetching the next batch
        del batch
        del batch_df
        gc.collect()

    return batch_files


def _jira_combine_batches(batch_files):
    """Read the batch CSV files and concatenate them once; return None if none could be read."""
    chunks = []
    for i, file in enumerate(batch_files):
        try:
            print(f"Processing batch file {i+1}/{len(batch_files)}")
            chunks.append(pd.read_csv(file, low_memory=False))
        except Exception as e:
            # Best effort: an unreadable batch is reported and skipped.
            print(f"Error reading batch file {file}: {e}")

    if not chunks:
        return None
    # A single concat avoids re-copying the accumulated frame for every batch.
    return pd.concat(chunks, ignore_index=True)


def _jira_add_features(final_df):
    """Add the derived feature columns to the combined frame and return it."""
    # 1. Process date fields
    date_columns = ["fields.created", "fields.updated", "fields.resolutiondate"]
    final_df = _jira_convert_date_columns(final_df, date_columns)

    # All rows are kept, even those with missing dates. To drop rows with
    # invalid dates instead, use: final_df = final_df.dropna(subset=date_columns)

    # 2. Extract priority information into separate columns
    if 'fields.priority.name' in final_df.columns:
        final_df['priority_name'] = final_df['fields.priority.name']
    if 'fields.priority.id' in final_df.columns:
        final_df['priority_id'] = final_df['fields.priority.id']

    # 3. Extract issue type and status information into separate columns
    if 'fields.issuetype.name' in final_df.columns:
        final_df['issue_type'] = final_df['fields.issuetype.name']
    if 'fields.issuetype.id' in final_df.columns:
        final_df['issue_type_id'] = final_df['fields.issuetype.id']
    if 'fields.status.name' in final_df.columns:
        final_df['status'] = final_df['fields.status.name']

    # Add is_completed field based on status
    if 'status' in final_df.columns:
        # Common completion status terms (may need adjustment based on your workflow)
        completed_statuses = ['Done', 'Closed', 'Resolved', 'Complete', 'Completed', 'Fixed',
                              'Finished', 'Released', 'Delivered', 'Verified']

        # Case-insensitive match on whole words only. A plain substring test
        # would also flag "Abandoned" (contains "done"), "Unresolved",
        # "Incomplete" or "Unverified" as completed.
        completed_pattern = '(?<![A-Za-z])(?:' + '|'.join(completed_statuses) + ')(?![A-Za-z])'
        final_df['is_completed'] = (
            final_df['status']
            .astype(str)
            .str.contains(completed_pattern, case=False, regex=True, na=False)
            .astype(int)
        )

    # 4. Create binary features for each issue type (type_{type})
    if 'issue_type' in final_df.columns:
        final_df = _jira_add_indicator_columns(final_df, 'issue_type', 'type')

    # 5. Create binary features for each priority level (priority_{priority})
    if 'priority_name' in final_df.columns:
        final_df = _jira_add_indicator_columns(final_df, 'priority_name', 'priority')

    # 6. Process issue links
    if 'fields.issuelinks' in final_df.columns:
        link_counts = final_df['fields.issuelinks'].apply(_jira_count_issuelinks)
        # Build the count columns on the frame's own index so rows stay aligned.
        link_df = pd.DataFrame(list(link_counts), index=final_df.index)
        for col in link_df.columns:
            final_df[col] = link_df[col]

    # 7. Create temporal features focusing on task estimation
    if all(col in final_df.columns for col in ['fields.created', 'fields.resolutiondate']):
        # Both columns were converted to UTC datetimes in step 1.
        created = final_df['fields.created']
        resolved = final_df['fields.resolutiondate']

        # Add resolved status flag
        final_df['is_resolved'] = resolved.notna().astype(int)

        # Time to resolution in days; NaN for unresolved issues (NaT propagates)
        # and for negative durations, which are treated as data errors.
        resolution_days = (resolved - created).dt.total_seconds() / (24 * 3600)
        final_df['resolution_time_days'] = resolution_days.where(resolution_days >= 0)

        # Age in days for all issues (resolved and unresolved)
        current_time = pd.Timestamp.now(tz='UTC')
        final_df['age_days'] = (current_time - created).dt.total_seconds() / (24 * 3600)

    # 8. Estimation accuracy features
    if all(col in final_df.columns for col in ['fields.timeoriginalestimate', 'fields.timespent']):
        # Coerce first so a stray non-numeric value cannot break the comparison.
        estimate = pd.to_numeric(final_df['fields.timeoriginalestimate'], errors='coerce')
        spent = pd.to_numeric(final_df['fields.timespent'], errors='coerce')

        # Ratio of time spent to estimated time; only defined when both are positive.
        valid = (estimate > 0) & (spent > 0)
        final_df['estimation_ratio'] = (spent / estimate).where(valid)

    return final_df


def extract_jira_with_features(jira_name, project_name=None, batch_size=1200, output_file=None, max_batches=None):
    """
    Extract Jira data with basic feature engineering.

    Documents are read from the ``jira_name`` collection of the module-level
    ``db`` in batches, each batch is written to a temporary CSV, and the CSVs
    are then combined into one DataFrame to which derived columns are added.
    The temporary files are removed before the function returns.

    Parameters:
        jira_name (str): Name of the Jira repository (collection name)
        project_name (list | str): Optional project ID, or list of project IDs,
            matched against ``fields.project.id``
        batch_size (int): Number of documents per batch (must be >= 1)
        output_file (str): Optional path to save the output CSV
        max_batches (int): Optional maximum number of batches to process

    Returns:
        pd.DataFrame | None: Processed dataframe with extracted features, or
        None when the collection is empty or no data could be collected.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    print(f"\nExtracting data from: {jira_name} ...")

    # A zero limit means "no limit" to MongoDB and previously caused an endless loop.
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    collection = db[jira_name]

    # Get total count
    total_issues = collection.count_documents({})
    if total_issues == 0:
        print(f"⚠️ No documents found for '{jira_name}', skipping.")
        return None

    # Define fields we need
    needed_fields = {
        # Essential fields
        "id": 1,
        "key": 1,

        # Classification fields
        "fields.issuetype.name": 1,
        "fields.issuetype.id": 1,
        "fields.priority.name": 1,
        "fields.priority.id": 1,
        "fields.status.name": 1,

        # Temporal data
        "fields.created": 1,
        "fields.updated": 1,
        "fields.resolutiondate": 1,

        # Estimation fields
        "fields.timeoriginalestimate": 1,
        "fields.timeestimate": 1,
        "fields.timespent": 1,

        # Issue links
        "fields.issuelinks": 1,

        # Project context
        "fields.project.id": 1,
        "fields.project.key": 1,
        "fields.project.name": 1,

        # Team context ("assignee" was previously misspelled, so it was never returned)
        "fields.assignee": 1,
        "fields.creator": 1
    }

    # Create query filter
    query_filter = {}
    if project_name:
        if isinstance(project_name, list):
            query_filter["fields.project.id"] = {"$in": project_name}
        else:
            query_filter["fields.project.id"] = project_name

    # Process in batches and store in CSV for memory efficiency. The context
    # manager removes the directory and everything in it, including a file
    # left behind by a write that failed part-way.
    with tempfile.TemporaryDirectory() as temp_dir:
        batch_files = _jira_dump_batches(
            collection, query_filter, needed_fields, jira_name, batch_size, max_batches, temp_dir
        )

        if not batch_files:
            print("No data collected.")
            return None

        print(f"Combining {len(batch_files)} processed batches...")
        final_df = _jira_combine_batches(batch_files)

    if final_df is None or len(final_df) == 0:
        print("No valid data after combining batches.")
        return None

    print(f"Combined DataFrame has {len(final_df)} rows and {len(final_df.columns)} columns.")

    # --- Feature Engineering ---
    final_df = _jira_add_features(final_df)

    # Save the output if requested
    if output_file:
        print(f"Saving data to {output_file}")
        final_df.to_csv(output_file, index=False)

    return final_df


def extract_all_repositories(repositories=None, output_dir="jira_extracted_data", max_batches=None):
    """
    Extract data from multiple repositories, creating separate files for each project.

    For every repository the distinct ``fields.project.id`` values are looked
    up, and each project is extracted with ``extract_jira_with_features`` into
    ``<output_dir>/<repository>/<project_id>_<project_name>.csv``.

    Errors are reported with ``print`` and do not stop the run: a failure while
    preparing a repository skips that repository, and a failure inside one
    project skips only that project.

    Parameters:
        repositories (list): List of repository names to process; defaults to
            all keys of ``jira_data_sources``
        output_dir (str): Directory to save output files
        max_batches (int): Maximum number of batches to process per project
    """
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Use all repositories if none specified
    if repositories is None:
        repositories = list(jira_data_sources.keys())

    for repository in repositories:
        print(f"\n{'='*50}")
        print(f"Processing repository: {repository}")
        print(f"{'='*50}")

        # First, extract all project IDs for the repository
        print(f"Fetching project IDs for {repository}...")
        try:
            project_ids = list(db[repository].distinct("fields.project.id"))
            print(f"Found {len(project_ids)} projects in {repository}")

            # Create a repo-specific directory
            repo_dir = os.path.join(output_dir, repository)
            os.makedirs(repo_dir, exist_ok=True)
        except Exception as e:
            print(f"Error processing repository {repository}: {e}")
            continue

        # For each project ID, extract data and save to a separate CSV.
        # Each project has its own error boundary so one bad project does not
        # abort the remaining projects of the repository.
        for project_id in project_ids:
            try:
                # Get the project name (for a more descriptive filename)
                project_name_doc = db[repository].find_one(
                    {"fields.project.id": project_id},
                    {"fields.project.name": 1}
                )

                project_name = "unknown"
                project_info = ((project_name_doc or {}).get("fields") or {}).get("project") or {}
                raw_name = project_info.get("name") if isinstance(project_info, dict) else None
                if raw_name:
                    # Make the project name safe for a filename; str() guards
                    # against a non-string name stored in the document.
                    project_name = str(raw_name).replace(' ', '_').replace('/', '_').replace('\\', '_')

                print(f"\nProcessing project: {project_name} (ID: {project_id})")

                # Create a unique output filename
                output_file = os.path.join(repo_dir, f"{project_id}_{project_name}.csv")

                # Extract data for this project
                df = extract_jira_with_features(
                    jira_name=repository,
                    project_name=project_id,
                    output_file=output_file,
                    max_batches=max_batches
                )

                if df is not None:
                    print(f"Extraction complete for project {project_name}. Shape of data: {df.shape}")
                    print(f"Data saved to: {output_file}")
                else:
                    print(f"Extraction failed or no data found for project {project_name}.")
            except Exception as e:
                print(f"Error processing project {project_id} in repository {repository}: {e}")

    print("\nAll repositories processed.")


if __name__ == "__main__":
    # Repositories to extract in this run; edit the list as needed.
    # Other names that have been used here: "MariaDB", "Mojang".
    repositories_to_process = ["Apache"]

    # Hand the selection to the multi-repository driver. Leave max_batches as
    # None to pull every batch, or set an integer to cap the batches per project.
    extract_all_repositories(
        max_batches=None,
        output_dir="jira_extracted_data",
        repositories=repositories_to_process,
    )


Processing repository: Jira
Fetching project IDs for Jira...
Found 30 projects in Jira

Processing project: Jira_Server_and_Data_Center (ID: 10240)

Extracting data from: Jira ...
  - Processed batch 1 (1200 issues, 1200 total)
  - Processed batch 2 (1200 issues, 2400 total)
  - Processed batch 3 (1200 issues, 3600 total)
  - Processed batch 4 (1200 issues, 4800 total)
  - Processed batch 5 (1200 issues, 6000 total)
  - Processed batch 6 (1200 issues, 7200 total)
  - Processed batch 7 (1200 issues, 8400 total)
  - Processed batch 8 (1200 issues, 9600 total)
  - Processed batch 9 (1200 issues, 10800 total)
  - Processed batch 10 (1200 issues, 12000 total)
  - Processed batch 11 (1200 issues, 13200 total)
  - Processed batch 12 (1200 issues, 14400 total)
  - Processed batch 13 (1200 issues, 15600 total)
  - Processed batch 14 (1200 issues, 16800 total)
  - Processed batch 15 (1200 issues, 18000 total)
  - Processed batch 16 (1200 issues, 19200 total)
  - Processed batch 17 (1200 issues,

In [1]:
import pandas as pd
import dtale

# Path of the extracted CSV to inspect; replace it with the file you want to view.
# It follows the <output_dir>/<repository>/<project_id>_<project_name>.csv layout
# that extract_all_repositories writes.
csv_file_path = "./jira_extracted_data/MongoDB/10000_Core_Server.csv"

# Read the CSV into a DataFrame (dtypes are inferred by pandas)
df = pd.read_csv(csv_file_path)

# Start a D-Tale session for the frame and open it in the browser.
# NOTE(review): allow_cell_edits=False presumably makes the grid read-only and
# ignore_duplicate=True presumably skips D-Tale's duplicate-data check — confirm
# against the D-Tale documentation.
d = dtale.show(df, ignore_duplicate=True, allow_cell_edits=False)
d.open_browser()