### Import Libraries

In [2]:
# Database & File IO
from pymongo import MongoClient
import json5 as json

# Standard Data Manipulation
from collections import defaultdict
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', None)  # We want to see all data
from statistics import mean, median

# Tracking Time
from time import time

### Load Data Sources

In [None]:
# Connect to the database
client = MongoClient("mongodb://admin:password@localhost:27017/")
db = client['JiraRepos']

# Load the Jira Data Sources JSON
with open('../0. DataDefinition/jira_data_sources.json') as f:
    jira_data_sources = json.load(f)

# Load the Jira Issue Types Information (Downloaded using the DataDownload script)
with open('../0. DataDefinition/jira_issuetype_information.json') as f:
    jira_issuetype_information = json.load(f)

# Load the Jira Issue Link Types Information (Downloaded using the DataDownload script)
with open('../0. DataDefinition/jira_issuelinktype_information.json') as f:
    jira_issuelinktype_information = json.load(f)

# Load the Jira Thematic Analysis JSON
# with open('./jira_issuetype_thematic_analysis.json') as f:
#     issuetype_themes_codes = json.load(f)

### Define Helpful Globals

In [4]:
ALL_JIRAS = [jira_name for jira_name in jira_data_sources.keys()]

### Define Data Structures

In [5]:
# These are the global dataframes that we will perform our analysis on.
df_jiras = pd.DataFrame(
    np.nan,
    columns=['Born', 'Issues', 'DIT', 'UIT', 'Links', 'DLT', 'ULT', 'Changes', 'Ch/I', 'UP', 'Comments', 'Co/I'],
    index=ALL_JIRAS + ['Sum', 'Median', 'Std Dev']
)

### Query Data for Stats

In [None]:
def populate_df_jiras(df_jiras, jiras=ALL_JIRAS):
    
    def extract_number_of_issues(jira_name):
        # Query for the count of all issues
        num_issues = db[jira_name].count_documents({})
        # Return value
        return num_issues
            
    def extract_number_of_documented_issuetypes(jira_name):
        # Extract the number of documented issue types from the downloaded issuetype_information JSON downloaded earlier, and return
        return len(jira_issuetype_information[jira_name])
        
    def extract_number_of_used_issuetypes(jira_name):
        # Query for unique set of issuetypes in the final state of the issue
        query_result = list(db[jira_name].aggregate([
            # We only need the issuetype name for the final state evaluation
            { '$project': { '_id': 0, 'issuetype_name': '$fields.issuetype.name' } },
            # Create a unique set of these names
            { '$group': { '_id': None, 'issuetype_names': { '$addToSet': '$issuetype_name' } } }
        ]))
        # Extract the query
        unique_issuetypes_final = set(query_result[0]['issuetype_names']) if query_result else set()
        # Query for unique set of issuetypes in the issue history
        query_result = list(db[jira_name].aggregate([
            # Unwind the histories and items to work with individual change items
            { '$unwind': '$changelog.histories' },
            { '$unwind': '$changelog.histories.items' },
            # We only want the changes to the 'issuetype' field
            { '$match': { 'changelog.histories.items.field': 'issuetype' } },
            # Select and rename the nested 'fromString' attribute. We only care what the issueType was BEFORE changing.
            # We have the subsequent 'toString' values in the next change 'fromString' or the final state extracted above.
            { '$project': { '_id': 0, 'issuetype_name': '$changelog.histories.items.fromString' } },
            # Create a unique set of these names
            { '$group': { '_id': None, 'issuetype_names': { '$addToSet': '$issuetype_name' } } }
        ]))
        # Extract the query
        unique_issuetypes_history = set(query_result[0]['issuetype_names']) if query_result else 0
        # Union the two sets together, and count the items, and return
        return len(set.union(unique_issuetypes_final, unique_issuetypes_history))
    
    def extract_number_of_issuelinks(jira_name):
        # Extract the issuelinks
        issuelinks_result = list(db[jira_name].aggregate([
            # Limit to issues with issuelinks
            { '$match': { 'fields.issuelinks': { '$exists': True, '$ne': [] } } },
            # Limit the object data to just the issuelink ids, and rename/condense into a single field
            { '$project': { '_id': 0, 'issuelink_ids_issue': '$fields.issuelinks.id' } },
            # Create a new "row" for each issue link, since issues can have multiple issuelinks each
            { '$unwind': '$issuelink_ids_issue' },
            # Create a unique set of issuelink ids. Issuelinks link multiple issues together, but we only want to count this link once.
            { '$group': { '_id': None, 'issuelink_unique_ids': { '$addToSet': '$issuelink_ids_issue' } } }
        ]))
        num_issuelinks = len(set(issuelinks_result[0]['issuelink_unique_ids'])) if issuelinks_result else 0
        # Extract the subtasks
        subtasks_result = list(db[jira_name].aggregate([
            # Limit to issues with subtasks
            { '$match': { 'fields.subtasks': { '$exists': True, '$ne': [] } } },
            # Limit the object data to just the size of the subtask arrays.
            { '$project': { '_id': 0, 'num_issue_subtasks': { '$size': '$fields.subtasks' } } },
            # Count the subtask arrays across the entire jira dataset
            { '$group': { '_id': None, 'num_subtasks': { '$sum': '$num_issue_subtasks' } } }
        ]))
        num_subtasks = subtasks_result[0]['num_subtasks'] if subtasks_result else 0
        # Extract the epic links
        epiclinkfield_dict = {
            'Apache': 'customfield_12311120',
            'Hyperledger': 'customfield_10006',
            'IntelDAOS': 'customfield_10092',
            'JFrog': 'customfield_10806',
            'Jira': 'customfield_12931',
            'JiraEcosystem': 'customfield_12180',
            'MariaDB': 'customfield_10600',
            'Mindville': 'customfield_10000',
            'Mojang': 'customfield_11602',
            'MongoDB': 'customfield_10857',
            'Qt': 'customfield_10400',
            'RedHat': 'customfield_12311140',
            'Sakai': 'customfield_10772',
            'SecondLife': 'customfield_10871',
            'Sonatype': 'customfield_11500',
            'Spring': 'customfield_10680'
        }
        epiclinks_result = list(db[jira_name].aggregate([
            # Rename the field since every Jira uses a different customfield name
            { '$project': { 'epiclink_field': f"$fields.{epiclinkfield_dict[jira_name]}" } },
            # Limit to issues with epiclink fields
            { '$match': { 'epiclink_field': { '$exists': True, '$ne': None } } },
            # Count the number of records in the aggregation
            { '$count': 'num_epiclinks' }
        ]))
        num_epiclinks = epiclinks_result[0]['num_epiclinks'] if epiclinks_result else 0  # Some repos have no epiclinks, so we need to catch this
        # Total the number of issuelinks by summing the three values above, and return
        return sum([num_issuelinks, num_subtasks, num_epiclinks])
    
    def extract_number_of_documented_issuelinktypes(jira_name):
        # Extract the number of documented issue link types from the downloaded issuelinktype_information JSON downloaded earlier, and return
        return len(jira_issuelinktype_information[jira_name]) if jira_name in jira_issuelinktype_information else 0
    
    def extract_number_of_used_issuelinktypes(jira_name):
        # Query for unique set of issuelinktypes in the final state of the issue
        query_result = list(db[jira_name].aggregate([
            # Unwind the issuelinks into individual records
            { '$unwind': '$fields.issuelinks' },
            # Select and rename the issuelink type name to prepare for the group operator
            { '$project': { '_id': 0, 'issuelinktype_name': '$fields.issuelinks.type.name' } },
            # Create a unique set of the issuelink type names
            { '$group': { '_id': None, 'issuelinktype_names': { '$addToSet': '$issuelinktype_name' } } }
        ]))
        # Extract the query, and return value
        return len(set(query_result[0]['issuelinktype_names'])) if query_result else 0
    
    def extract_born(jira_name):
        # Get the first N issues in each repo to check for the initial "birth" of the repo
        created_dates = [issue['fields']['created'] for issue in
            db[jira_name].aggregate([
                # We only need the created field
                { '$project': { '_id': 0, 'fields.created': 1 } },
                # Sort the items by created date (ascending) to get the earliest dates first
                { '$sort': { 'fields.created': 1 } },
                # We only technically need the first item, but practically there are issues that need to be manually reviewed below
                { '$limit': 500 }
            ])
        ]
        # Manual analaysis of the created dates revealed a number of broken or testing issues that should be ignored
        if jira_name == 'Apache':
            created_dates = created_dates[289:]
        elif jira_name == 'Jira':
            created_dates = created_dates[1:]
        elif jira_name == 'IntelDAOS':
            created_dates = created_dates[1:]
        elif jira_name == 'Qt':
            created_dates = created_dates[7:]
        # Return value
        return created_dates[0][:4]
    
    def extract_number_of_changes(jira_name):
        # Query for the number of changes
        query_result = list(db[jira_name].aggregate([
            # We only need one attribute of the change to count it
            { '$project': { '_id': 0, 'changelog.histories.items.field': 1 } },
            # Unwind the histories and items arrays into single elements so we can count them
            { '$unwind': '$changelog.histories' },
            { '$unwind': '$changelog.histories.items' },
            # Count number of elements in our aggregation, which is now the number of items
            { '$count': 'num_changes' }
        ]))
        # Extract the query result and return
        return query_result[0]['num_changes'] if query_result else 0
    
    def extract_number_of_unique_projects(jira_name):
        # Query for a unique set of project ids in the final state of the issue
        query_result = list(db[jira_name].aggregate([
            # Limit to just the final project name on each issue
            { '$project': { '_id': 0, 'project_name': '$fields.project.name' } },
            # Create a unique set of project names across the entire Jira
            { '$group': { '_id': None, 'project_names': { '$addToSet': '$project_name' } } }
        ]))
        # Extract the query result
        unique_projects_final = set(query_result[0]['project_names']) if query_result else set()
        # Query for a unique set of project ids in the issue history
        query_result = list(db[jira_name].aggregate([
            # Unwind the histories and items to work with individual change items
            { '$unwind': '$changelog.histories' },
            { '$unwind': '$changelog.histories.items' },
            # Select only changes where the project field was changed
            { '$match': { 'changelog.histories.items.field': 'project' } },
            # Rename the nested 'fromString' field containing the previously slected project
            { '$project': { '_id': 0, 'project_name': '$changelog.histories.items.fromString' } },
            # Create a unique set of these project names
            { '$group': { '_id': None, 'project_names': { '$addToSet': '$project_name' } } }
        ]))
        # Extract the query result
        unique_projects_history = set(query_result[0]['project_names']) if query_result else set()
        # Union the two sets together, count the items, and return
        return len(set.union(unique_projects_final, unique_projects_history))
    
    def extract_number_of_comments(jira_name):
        # Query for the number of changes
        query_result = list(db[jira_name].aggregate([
            # Get issues with comments
            { '$match': { 'fields.comments': { '$ne': None } } },
            # We only need one attribute of the change to count it
            { '$project': { '_id': 0, 'num_comments_per_issue': { '$size': '$fields.comments' } } },
            # Group the sizes so we can sum all to a single value for the repo
            { '$group': { '_id': None, 'num_comments': { '$sum': '$num_comments_per_issue' } } },
        ]))
        # Extract the query result and return
        return query_result[0]['num_comments'] if query_result else 0
    
    print('This script takes ~90 minutes when executed across all Jiras.')
    
    # Populate the table with the answers to our questions
    for jira_name in jiras:
        print(f"\tWorking on Jira: {jira_name} ...")
        
        ## Issues and their Types ##
        
        # Attribute: Issues (number of issues)
        df_jiras.loc[jira_name, 'Issues'] = extract_number_of_issues(jira_name)
        # Attribute: DIT (documented issue types)
        df_jiras.loc[jira_name, 'DIT'] = extract_number_of_documented_issuetypes(jira_name)
        # Attribute: UIT (used issue types)
        df_jiras.loc[jira_name, 'UIT'] = extract_number_of_used_issuetypes(jira_name)
        
        ## Issue Links and their Types ##

        # Attribute: Links (number of links)
        df_jiras.loc[jira_name, 'Links'] = extract_number_of_issuelinks(jira_name)
        # Attribute: DLT (documented link types)
        df_jiras.loc[jira_name, 'DLT'] = extract_number_of_documented_issuelinktypes(jira_name)
        # Attribute: ULD (used link types)
        df_jiras.loc[jira_name, 'ULT'] = extract_number_of_used_issuelinktypes(jira_name)
        
        ## General Information ##
        
        # Attribute: Born (first issue added)
        df_jiras.loc[jira_name, 'Born'] = extract_born(jira_name)
        # Attribute: Changes (number of changes)
        df_jiras.loc[jira_name, 'Changes'] = extract_number_of_changes(jira_name)
        # Attribute: Ch/I (number of changes per issue)
        df_jiras.loc[jira_name, 'Ch/I'] = round(df_jiras.loc[jira_name, 'Changes'] / df_jiras.loc[jira_name, 'Issues'])
        # Attribute: UP (unique projects)
        df_jiras.loc[jira_name, 'UP'] = extract_number_of_unique_projects(jira_name)
        # Attribute: Comments (number of comments)
        df_jiras.loc[jira_name, 'Comments'] = extract_number_of_comments(jira_name)
        # Attribute: Co/I (number of comments per issue)
        df_jiras.loc[jira_name, 'Co/I'] = round(df_jiras.loc[jira_name, 'Comments'] / df_jiras.loc[jira_name, 'Issues'])
        
        
    print('Complete')
    return df_jiras
        
df_jiras = populate_df_jiras(
    df_jiras,
    ## Test to see if the script works (database created, data inside, etc.) ##
    # jiras=['Hyperledger'],
    ## To test the script in less than 90 minutes, uncomment the following line and see the result of a few select Jira repos ##
    # jiras=['Hyperledger', 'IntelDAOS', 'JFrog', 'Sakai', 'SecondLife', 'Sonatype', 'Spring'],
)


In [None]:
import json
import dtale
import pandas as pd
from pymongo import MongoClient

def explore_jiras_in_dtale(selected_jiras=None):
    """
    Connects to 'JiraRepos' MongoDB, iterates through Jira collections,
    creates a DataFrame for each, and opens a D-Tale session so you can
    explore all columns interactively.

    Parameters
    ----------
    selected_jiras : list of str, optional
        If provided, only these Jira repos will be loaded. Otherwise,
        all Jiras from the config file will be loaded.
    """

    # 1. Connect to MongoDB
    client = MongoClient("mongodb://admin:password@localhost:27017/")
    db = client["JiraRepos"]
    
    # 2. Load your JIRA data sources config
    with open("../0. DataDefinition/jira_data_sources.json") as f:
        jira_data_sources = json.load(f)
    
    # 3. Determine all available JIRA names from the config
    all_jiras = list(jira_data_sources.keys())

    # If specific repos were requested, filter them
    if selected_jiras is not None and len(selected_jiras) > 0:
        # Only keep repos that exist in the config
        all_jiras = [j for j in all_jiras if j in selected_jiras]
        if not all_jiras:
            print(f"⚠️ No valid Jiras found in {selected_jiras}. Check your input.")
            return

    # 4. For each JIRA collection, create a DataFrame & open D-Tale
    for jira_name in all_jiras:
        print(f"Loading issues from collection: {jira_name} ...")
        issues = list(db[jira_name].find())
        
        if not issues:
            print(f"⚠️ No documents found for '{jira_name}', skipping.")
            continue
        
        # Create a raw DataFrame with all columns (all fields from each document)
        df = pd.DataFrame(issues)
        
        # Show in D-Tale
        print(f"Opening D-Tale for {jira_name}... (close browser tab to move on)")
        d = dtale.show(
            df,
            ignore_duplicate=True,   # avoid errors if run multiple times
            allow_cell_edits=False   # optional, disallow edits if you prefer
        )
        # Optionally open a browser tab automatically
        d.open_browser()
        
        # If you want to block execution until you close D-Tale, you can do:
        # d.wait_for_close()
        # Otherwise, the script moves on to next Jira.

    print("✅ D-Tale sessions launched for the selected Jira DataFrames.")

# --- If you want to run this script directly, test with some custom repos ---
if __name__ == "__main__":
    # Example usage #1: Use ALL repos defined in the config
    explore_jiras_in_dtale()

    # Example usage #2: Only visualize 'Hyperledger' and 'SecondLife' 
    # (comment out the first call if you want only the second)
    # explore_jiras_in_dtale(["Hyperledger", "SecondLife"])

In [None]:
import json
import dtale
import pandas as pd
from pymongo import MongoClient

def explore_all_fields_in_dtale(selected_jiras=None):
    """
    Connects to 'JiraRepos' in MongoDB, iterates through JIRA collections,
    flattens each document (including nested fields), creates a DataFrame,
    and opens a D-Tale session so you can explore all columns interactively.
    
    Parameters
    ----------
    selected_jiras : list of str, optional
        If provided, only these Jira repos will be loaded. Otherwise,
        all Jiras from the config file will be loaded.
    """
    # 1. Connect to MongoDB
    client = MongoClient("mongodb://admin:password@localhost:27017/")
    db = client["JiraRepos"]
    
    # 2. Load your JIRA data sources config
    with open("../0. DataDefinition/jira_data_sources.json") as f:
        jira_data_sources = json.load(f)
    
    # 3. Determine all available JIRA names from the config
    all_jiras = list(jira_data_sources.keys())
    
    # Filter if specific repos were requested
    if selected_jiras is not None and len(selected_jiras) > 0:
        all_jiras = [j for j in all_jiras if j in selected_jiras]
        if not all_jiras:
            print(f"⚠️ No valid Jiras found in {selected_jiras}. Check your input.")
            return
    
    # 4. For each JIRA collection, load, flatten, and display in D-Tale
    for jira_name in all_jiras:
        print(f"Loading issues from collection: {jira_name} ...")
        issues = list(db[jira_name].find())
        
        if not issues:
            print(f"⚠️ No documents found for '{jira_name}', skipping.")
            continue
        
        # Flatten nested JSON using pd.json_normalize.
        # You can adjust the separator if desired (default is '.')
        df = pd.json_normalize(issues, sep='.')
        
        # Show in D-Tale
        print(f"Opening D-Tale for {jira_name}... (close browser tab to move on)")
        d = dtale.show(
            df,
            ignore_duplicate=True,   # avoid errors if run multiple times
            allow_cell_edits=False   # optional, disallow edits if you prefer
        )
        d.open_browser()
        
    print("✅ D-Tale sessions launched for the selected Jira DataFrames.")

# --- If you want to run this script directly, test with some custom repos ---
if __name__ == "__main__":
    # Example usage: Load ALL repos defined in the config
    # explore_all_fields_in_dtale()

    # Example usage: Only visualize 'Hyperledger' and 'SecondLife'
    explore_all_fields_in_dtale(["Hyperledger", "SecondLife"])

In [None]:
import json
import dtale
import pandas as pd
from pymongo import MongoClient
from concurrent.futures import ThreadPoolExecutor, as_completed

def flatten_histories(histories):
    """
    Given a list of changelog history entries, returns a DataFrame
    where each row corresponds to one change item.
    """
    rows = []
    for history in histories:
        history_id = history.get("id")
        # Choose either 'name' or 'displayName' from the author
        author = history.get("author", {}).get("name")
        created = history.get("created")
        items = history.get("items", [])
        for item in items:
            rows.append({
                "changelog.history_id": history_id,
                "changelog.author": author,
                "changelog.created": created,
                "changelog.field": item.get("field"),
                "changelog.fieldtype": item.get("fieldtype"),
                "changelog.from": item.get("from"),
                "changelog.fromString": item.get("fromString"),
                "changelog.to": item.get("to"),
                "changelog.toString": item.get("toString")
            })
    return pd.DataFrame(rows)

def process_issue_histories(issue):
    """
    Process one issue: if it has a changelog with histories,
    flatten them and attach the issue identifier.
    """
    if "changelog" in issue and "histories" in issue["changelog"]:
        histories = issue["changelog"]["histories"]
        df_history = flatten_histories(histories)
        # Add an identifier for the issue (using 'key' if available, else 'id')
        df_history["issue_key"] = issue.get("key", issue.get("id"))
        return df_history
    return None

def extract_and_flatten_histories(issues):
    """
    Extracts and flattens changelog.histories from a list of issues using parallel processing.
    Uses a ThreadPoolExecutor for compatibility in notebook environments.
    """
    flattened_histories = []
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(process_issue_histories, issue): issue for issue in issues}
        for future in as_completed(futures):
            result = future.result()
            if result is not None and not result.empty:
                flattened_histories.append(result)
    if flattened_histories:
        return pd.concat(flattened_histories, ignore_index=True)
    return pd.DataFrame()

def explore_all_fields_in_dtale(selected_jiras=None):
    """
    Connects to the MongoDB 'JiraRepos' database, loads issues from the specified
    Jira collections, and:
      - Flattens the main issue documents using pd.json_normalize.
      - Extracts and flattens the changelog.histories field from each issue using parallel processing.
      - Aggregates the flattened histories (grouped by issue) and merges them as new columns into the main DataFrame.
      - Opens a single D-Tale session with the merged data.
    
    Parameters
    ----------
    selected_jiras : list of str, optional
        If provided, only these Jira repos will be loaded. Otherwise, all repos in the config are used.
    """
    # Connect to MongoDB
    client = MongoClient("mongodb://admin:password@localhost:27017/")
    db = client["JiraRepos"]
    
    # Load JIRA data sources config
    with open("../0. DataDefinition/jira_data_sources.json") as f:
        jira_data_sources = json.load(f)
    
    # Determine available JIRA names from the config
    all_jiras = list(jira_data_sources.keys())
    if selected_jiras is not None and len(selected_jiras) > 0:
        all_jiras = [j for j in all_jiras if j in selected_jiras]
        if not all_jiras:
            print(f"⚠️ No valid Jiras found in {selected_jiras}.")
            return
    
    merged_dfs = []  # To store merged DataFrames from each repo
    for jira_name in all_jiras:
        print(f"\nLoading issues from collection: {jira_name} ...")
        issues = list(db[jira_name].find())
        if not issues:
            print(f"⚠️ No documents found for '{jira_name}', skipping.")
            continue
        
        # Flatten the main issue structure using pd.json_normalize
        df_main = pd.json_normalize(issues, sep='.')
        
        # Extract and flatten changelog.histories using parallel processing
        df_histories = extract_and_flatten_histories(issues)
        
        if not df_histories.empty:
            # Aggregate histories by issue: group by 'issue_key' and collect each column's values as a list.
            agg_histories = df_histories.groupby("issue_key").agg(lambda x: list(x)).reset_index()
            # Ensure df_main has a column that can join with 'issue_key'. Use 'key' if available, else 'id'.
            if "key" not in df_main.columns:
                df_main["key"] = df_main["id"]
            # Merge the aggregated histories into the main dataframe.
            df_merged = pd.merge(df_main, agg_histories, how="left", left_on="key", right_on="issue_key")
            # Optionally, drop the redundant 'issue_key' column.
            df_merged.drop(columns=["issue_key"], inplace=True)
        else:
            df_merged = df_main
        
        merged_dfs.append(df_merged)
    
    # Combine all repos into one DataFrame if more than one was processed.
    if merged_dfs:
        final_df = pd.concat(merged_dfs, ignore_index=True)
    else:
        print("No data to display.")
        return
    
    # Open a single D-Tale session for the merged DataFrame
    print("Opening D-Tale for merged issues data (including aggregated changelog.histories)...")
    d = dtale.show(final_df, ignore_duplicate=True, allow_cell_edits=False)
    d.open_browser()
    
    print("✅ D-Tale session launched for the merged Jira DataFrame.")

if __name__ == "__main__":
    # Example usage: visualize specific Jira repos
    explore_all_fields_in_dtale(["Hyperledger", "SecondLife"])

In [None]:
import json
import dtale
import pandas as pd
import random
from pymongo import MongoClient
from concurrent.futures import ThreadPoolExecutor, as_completed
import dateparser

def convert_date_columns_dateparser(df, date_columns):
    """
    Convert the specified date columns from string to datetime using dateparser.
    
    Parameters:
      df (pd.DataFrame): Input DataFrame containing date strings.
      date_columns (list): List of column names to convert.
    
    Returns:
      pd.DataFrame: The DataFrame with specified columns converted to datetime objects.
    """
    for col in date_columns:
        if col in df.columns:
            df[col] = df[col].apply(lambda x: dateparser.parse(x, 
                                    settings={'RETURN_AS_TIMEZONE_AWARE': True, 'TIMEZONE': 'UTC'})
                                    if pd.notnull(x) else pd.NaT)
    return df

def flatten_histories(histories):
    """
    Given a list of changelog history entries, returns a DataFrame
    where each row corresponds to one change item.
    """
    rows = []
    for history in histories:
        history_id = history.get("id")
        author = history.get("author", {}).get("name")
        created = history.get("created")
        items = history.get("items", [])

        for item in items:
            rows.append({
                "changelog.history_id": history_id,
                "changelog.author": author,
                "changelog.created": created,
                "changelog.field": item.get("field"),
                "changelog.fieldtype": item.get("fieldtype"),
                "changelog.from": item.get("from"),
                "changelog.fromString": item.get("fromString"),
                "changelog.to": item.get("to"),
                "changelog.toString": item.get("toString")
            })
    
    return pd.DataFrame(rows)

def process_issue_histories(issue):
    """
    Process one issue: If it has a changelog with histories,
    flatten them and attach the issue identifier.
    """
    if "changelog" in issue and "histories" in issue["changelog"]:
        histories = issue["changelog"]["histories"]
        df_history = flatten_histories(histories)
        df_history["issue_key"] = issue.get("key", issue.get("id"))
        return df_history
    return None

def extract_and_flatten_histories(issues):
    """
    Extracts and flattens changelog.histories from a list of issues using parallel processing.
    Uses a ThreadPoolExecutor for better performance.
    """
    flattened_histories = []
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(process_issue_histories, issue): issue for issue in issues}
        for future in as_completed(futures):
            result = future.result()
            if result is not None and not result.empty:
                flattened_histories.append(result)

    if flattened_histories:
        return pd.concat(flattened_histories, ignore_index=True)
    
    return pd.DataFrame()

def explore_all_fields_in_dtale(selected_jiras=None, sample_ratio=0.2):
    """
    Connects to the MongoDB 'JiraRepos' database, loads only 20% of issues from each repository,
    flattens changelog histories, and launches a D-Tale session.

    Parameters
    ----------
    selected_jiras : list of str, optional
        If provided, only these Jira repos will be loaded. Otherwise, all repos in the config are used.
    sample_ratio : float, optional
        The fraction of issues to load per repo (default is 20%).
    """
    # Connect to MongoDB
    client = MongoClient("mongodb://admin:password@localhost:27017/")
    db = client["JiraRepos"]
    
    # Load JIRA data sources config
    with open("../0. DataDefinition/jira_data_sources.json") as f:
        jira_data_sources = json.load(f)

    # Determine available JIRA names from the config
    all_jiras = list(jira_data_sources.keys())
    if selected_jiras is not None and len(selected_jiras) > 0:
        all_jiras = [j for j in all_jiras if j in selected_jiras]
        if not all_jiras:
            print(f"⚠️ No valid Jiras found in {selected_jiras}.")
            return

    merged_dfs = []  # To store processed DataFrames

    for jira_name in all_jiras:
        print(f"\nLoading issues from collection: {jira_name} ...")
        issues = list(db[jira_name].find())

        if not issues:
            print(f"⚠️ No documents found for '{jira_name}', skipping.")
            continue
        
        # Randomly sample 20% of issues
        sample_size = max(1, int(len(issues) * sample_ratio))  # At least 1 issue per repo
        sampled_issues = random.sample(issues, sample_size)

        

        # Flatten the main issue structure using pd.json_normalize
        df_main = pd.json_normalize(sampled_issues, sep='.')

        # Extract and flatten changelog.histories
        df_histories = extract_and_flatten_histories(sampled_issues)

        if not df_histories.empty:
            # Aggregate histories by issue: group by 'issue_key' and collect each column's values as a list.
            agg_histories = df_histories.groupby("issue_key").agg(lambda x: list(x)).reset_index()

            # Ensure df_main has a column that can join with 'issue_key'. Use 'key' if available, else 'id'.
            if "key" not in df_main.columns:
                df_main["key"] = df_main["id"]

            # Merge the aggregated histories into the main dataframe.
            df_merged = pd.merge(df_main, agg_histories, how="left", left_on="key", right_on="issue_key")
            df_merged.drop(columns=["issue_key"], inplace=True, errors='ignore')
        else:
            df_merged = df_main

        merged_dfs.append(df_merged)

    # Combine all sampled repos into one DataFrame
    if merged_dfs:
        final_df = pd.concat(merged_dfs, ignore_index=True)
    else:
        print("No data to display.")
        return
    
    # Convert known date columns using dateparser
    date_columns = ["fields.created", "fields.updated", "fields.resolutiondate", "changelog.created"]
    final_df = convert_date_columns_dateparser(final_df, date_columns)
    
     # Convert known date columns using dateparser
    print("Converting date columns to date objects")
    date_columns = ["fields.created", "fields.updated", "fields.resolutiondate", "changelog.created"]
    final_df = convert_date_columns_dateparser(final_df, date_columns)
    
    
    # Open D-Tale for visualization
    print("Opening D-Tale for sampled issues data (including aggregated changelog.histories)...")
    d = dtale.show(final_df, ignore_duplicate=True, allow_cell_edits=False)
    d.open_browser()
    
    print("✅ D-Tale session launched for the sampled Jira DataFrame.")

if __name__ == "__main__":
    # Example usage: visualize specific Jira repos, loading only 20% of their issues
    explore_all_fields_in_dtale(["Hyperledger", "SecondLife"], sample_ratio=0.2)


Loading issues from collection: Hyperledger ...


KeyboardInterrupt: 

In [None]:
import json
import dtale
import pandas as pd
import random
from pymongo import MongoClient
from concurrent.futures import ThreadPoolExecutor, as_completed
import dateparser

def fix_data_types(df, numeric_threshold=0.9):
    """
    Converts DataFrame columns stored as strings into proper data types.
    For each column:
      - If at least `numeric_threshold` of values can be converted to numeric, convert the column.
      - Otherwise, cast the column to a 'category' dtype.
    Columns containing list-like entries (from aggregation) are skipped.
    
    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame.
    numeric_threshold : float, optional
        The fraction of values that must be numeric for conversion (default 0.9).
    
    Returns
    -------
    df : pd.DataFrame
        DataFrame with updated types.
    """
    for col in df.columns:
        # Skip columns with list-like entries (e.g., aggregated changelog data)
        if df[col].apply(lambda x: isinstance(x, list)).any():
            continue
        numeric_series = pd.to_numeric(df[col], errors='coerce')
        non_null_ratio = numeric_series.notnull().mean()
        if non_null_ratio >= numeric_threshold:
            df[col] = numeric_series
        else:
            df[col] = df[col].astype('category')
    return df

def flatten_histories(histories):
    """
    Given a list of changelog history entries, returns a DataFrame where each row corresponds to one change item.
    """
    rows = []
    for history in histories:
        history_id = history.get("id")
        author = history.get("author", {}).get("name")
        created = history.get("created")
        items = history.get("items", [])
        for item in items:
            rows.append({
                "changelog.history_id": history_id,
                "changelog.author": author,
                "changelog.created": created,
                "changelog.field": item.get("field"),
                "changelog.fieldtype": item.get("fieldtype"),
                "changelog.from": item.get("from"),
                "changelog.fromString": item.get("fromString"),
                "changelog.to": item.get("to"),
                "changelog.toString": item.get("toString")
            })
    return pd.DataFrame(rows)

def process_issue_histories(issue):
    """
    Process one issue by flattening its changelog histories.
    After flattening, the DataFrame is converted to proper data types.
    """
    if "changelog" in issue and "histories" in issue["changelog"]:
        histories = issue["changelog"]["histories"]
        df_history = flatten_histories(histories)
        # Convert types right after flattening histories
        df_history = fix_data_types(df_history)
        df_history["issue_key"] = issue.get("key", issue.get("id"))
        return df_history
    return None

def extract_and_flatten_histories(issues):
    """
    Extracts and flattens changelog.histories from a list of issues using parallel processing.
    """
    flattened_histories = []
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(process_issue_histories, issue): issue for issue in issues}
        for future in as_completed(futures):
            result = future.result()
            if result is not None and not result.empty:
                flattened_histories.append(result)
    if flattened_histories:
        return pd.concat(flattened_histories, ignore_index=True)
    return pd.DataFrame()

def explore_all_fields_in_dtale(selected_jiras=None, sample_ratio=0.2):
    """
    Connects to the MongoDB 'JiraRepos' database, samples issues from each repository,
    flattens changelog histories with parallel processing, converts data types appropriately,
    and launches a D-Tale session for interactive visualization.
    
    Parameters
    ----------
    selected_jiras : list of str, optional
        If provided, only these Jira repos will be loaded.
    sample_ratio : float, optional
        The fraction of issues to load per repo (default is 0.2).
    """
    # Connect to MongoDB
    client = MongoClient("mongodb://admin:password@localhost:27017/")
    db = client["JiraRepos"]
    
    # Load JIRA data sources config
    with open("../0. DataDefinition/jira_data_sources.json") as f:
        jira_data_sources = json.load(f)
    
    all_jiras = list(jira_data_sources.keys())
    if selected_jiras is not None and len(selected_jiras) > 0:
        all_jiras = [j for j in all_jiras if j in selected_jiras]
        if not all_jiras:
            print(f"⚠️ No valid Jiras found in {selected_jiras}.")
            return
    
    merged_dfs = []  # To store processed DataFrames
    
    for jira_name in all_jiras:
        print(f"\nLoading issues from collection: {jira_name} ...")
        issues = list(db[jira_name].find())
        if not issues:
            print(f"⚠️ No documents found for '{jira_name}', skipping.")
            continue
        
        # Randomly sample a fraction of issues (ensuring at least one issue)
        sample_size = max(1, int(len(issues) * sample_ratio))
        sampled_issues = random.sample(issues, sample_size)
        
        # Flatten the main issue structure and convert data types immediately
        df_main = pd.json_normalize(sampled_issues, sep='.')
        df_main = fix_data_types(df_main)
        
        # Extract and flatten changelog.histories (each flattened DataFrame is already converted)
        df_histories = extract_and_flatten_histories(sampled_issues)
        
        if not df_histories.empty:
            # Aggregate histories by issue: group by 'issue_key' and collect values as lists.
            agg_histories = df_histories.groupby("issue_key").agg(lambda x: list(x)).reset_index()
            
            # Ensure df_main has a key to join on.
            if "key" not in df_main.columns:
                df_main["key"] = df_main["id"]
            
            # Merge aggregated histories into the main DataFrame.
            df_merged = pd.merge(df_main, agg_histories, how="left", left_on="key", right_on="issue_key")
            df_merged.drop(columns=["issue_key"], inplace=True, errors='ignore')
        else:
            df_merged = df_main
        
        merged_dfs.append(df_merged)
    
    if merged_dfs:
        final_df = pd.concat(merged_dfs, ignore_index=True)
    else:
        print("No data to display.")
        return
    
    # At this point the main DataFrame has been fixed and aggregated data (lists) is left untouched.
    print("Data types have been fixed for flattened data. Launching D-Tale session...")
    d = dtale.show(final_df, ignore_duplicate=True, allow_cell_edits=False)
    d.open_browser()
    
    print("✅ D-Tale session launched for the processed Jira DataFrame.")

if __name__ == "__main__":
    # Example usage: visualize specific Jira repos, loading only 20% of their issues.
    explore_all_fields_in_dtale(["Hyperledger", "SecondLife"], sample_ratio=0.2)


Loading issues from collection: Hyperledger ...

Loading issues from collection: SecondLife ...



The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.


The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.



Data types have been fixed for flattened data. Launching D-Tale session...
✅ D-Tale session launched for the processed Jira DataFrame.



The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.





In [None]:
import json
import dtale
import pandas as pd
import random
from pymongo import MongoClient
from concurrent.futures import ThreadPoolExecutor, as_completed
from sklearn.impute import SimpleImputer

def fix_data_types(df, numeric_threshold=0.9):
    """
    Convert DataFrame columns (stored as strings) to appropriate data types,
    excluding any date formatting.

    For each column that does not contain list-like entries:
      - If at least `numeric_threshold` fraction of values can be converted to numeric,
        the column is converted to a numeric dtype.
      - Otherwise, the column is cast to 'category' dtype.
    
    Note: Date-like strings remain as strings.
    """
    for col in df.columns:
        if df[col].apply(lambda x: isinstance(x, list)).any():
            continue
        numeric_series = pd.to_numeric(df[col], errors='coerce')
        if numeric_series.notnull().mean() >= numeric_threshold:
            df[col] = numeric_series
        else:
            df[col] = df[col].astype('category')
    return df

def flatten_histories(histories):
    """
    Flatten a list of changelog history entries into a DataFrame.
    Each row represents a single change item.
    """
    rows = []
    for history in histories:
        history_id = history.get("id")
        author = history.get("author", {}).get("name")
        created = history.get("created")
        items = history.get("items", [])
        for item in items:
            rows.append({
                "changelog.history_id": history_id,
                "changelog.author": author,
                "changelog.created": created,
                "changelog.field": item.get("field"),
                "changelog.fieldtype": item.get("fieldtype"),
                "changelog.from": item.get("from"),
                "changelog.fromString": item.get("fromString"),
                "changelog.to": item.get("to"),
                "changelog.toString": item.get("toString")
            })
    return pd.DataFrame(rows)

def process_issue_histories(issue):
    """
    Process a single issue's changelog histories by flattening them and applying type conversion.
    Adds an 'issue_key' for later merging.
    """
    if "changelog" in issue and "histories" in issue["changelog"]:
        histories = issue["changelog"]["histories"]
        df_history = flatten_histories(histories)
        df_history = fix_data_types(df_history)
        df_history["issue_key"] = issue.get("key", issue.get("id"))
        return df_history
    return None

def extract_and_flatten_histories(issues):
    """
    Extract and flatten changelog histories from a list of issues using parallel processing.
    """
    flattened_histories = []
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(process_issue_histories, issue): issue for issue in issues}
        for future in as_completed(futures):
            result = future.result()
            if result is not None and not result.empty:
                flattened_histories.append(result)
    if flattened_histories:
        return pd.concat(flattened_histories, ignore_index=True)
    return pd.DataFrame()

def summarize_changelog_histories(df_histories):
    """
    Summarize flattened changelog histories by counting the number of changes per field.
    Returns a DataFrame with one row per issue keyed by 'issue_key'.
    """
    summary = df_histories.groupby('issue_key')['changelog.field'].value_counts().unstack(fill_value=0).reset_index()
    summary = summary.rename(columns=lambda x: f'changelog_count_{x}' if x != 'issue_key' else x)
    return summary

def drop_zero_dominated_columns(df, prefix='changelog_count_', zero_threshold=0.8):
    """
    Drop numeric columns with names starting with the specified prefix if the fraction of zeros exceeds zero_threshold.
    """
    cols_to_drop = []
    for col in df.columns:
        if col.startswith(prefix) and df[col].dtype.kind in 'biufc':  # numeric types
            frac_zeros = (df[col] == 0).mean()
            if frac_zeros > zero_threshold:
                cols_to_drop.append(col)
    if cols_to_drop:
        df = df.drop(columns=cols_to_drop)
    return df

def process_issue_links(issuelinks):
    """
    Process the 'fields.issuelinks' JSON array and extract features:
      - Total number of links.
      - Count and binary flag for each link type.
    """
    features = {"issuelinks_total": 0}
    link_types = {}
    if isinstance(issuelinks, list):
        features["issuelinks_total"] = len(issuelinks)
        for link in issuelinks:
            lt = link.get("type", {}).get("name", "Unknown")
            link_types[lt] = link_types.get(lt, 0) + 1
    else:
        features["issuelinks_total"] = 0
    for lt, count in link_types.items():
        features[f"issuelinks_{lt.lower()}_count"] = count
        features[f"has_issuelinks_{lt.lower()}"] = 1 if count > 0 else 0
    return features

def process_comments(comments):
    """
    Process the 'fields.comments' JSON array and extract summary features:
      - Total number of comments.
      - Average and maximum comment length.
      - Number of unique authors.
    """
    features = {
        "comment_count": 0,
        "avg_comment_length": 0,
        "max_comment_length": 0,
        "unique_authors_count": 0
    }
    if not isinstance(comments, list) or len(comments) == 0:
        return features
    comment_bodies = [c.get('body', '') for c in comments if isinstance(c, dict)]
    authors = [c.get('author', {}).get('name') for c in comments if isinstance(c, dict)]
    features["comment_count"] = len(comment_bodies)
    lengths = [len(body) for body in comment_bodies]
    if lengths:
        features["avg_comment_length"] = sum(lengths) / len(lengths)
        features["max_comment_length"] = max(lengths)
    unique_authors = {a for a in authors if a is not None}
    features["unique_authors_count"] = len(unique_authors)
    return features

def process_repo(jira_name, db, sample_ratio):
    """
    Process a single Jira repository:
      - Load issues from MongoDB.
      - Sample a fraction of issues.
      - Flatten main issue data and apply type conversion.
      - Extract and flatten changelog histories, then summarize them (without from/to transitions).
      - Merge the changelog summary with the main DataFrame.
    """
    print(f"\nProcessing repository: {jira_name} ...")
    issues = list(db[jira_name].find())
    if not issues:
        print(f"⚠️ No documents found for '{jira_name}', skipping.")
        return None
    sample_size = max(1, int(len(issues) * sample_ratio))
    sampled_issues = random.sample(issues, sample_size)
    
    # Process main issue structure
    df_main = pd.json_normalize(sampled_issues, sep='.')
    df_main = fix_data_types(df_main)
    
    # Process and summarize changelog histories (without from/to transitions)
    df_histories = extract_and_flatten_histories(sampled_issues)
    if not df_histories.empty:
        changelog_summary = summarize_changelog_histories(df_histories)
        if "key" not in df_main.columns:
            df_main["key"] = df_main["id"]
        df_merged = pd.merge(df_main, changelog_summary, how="left", left_on="key", right_on="issue_key")
        df_merged.drop(columns=["issue_key"], inplace=True, errors='ignore')
    else:
        df_merged = df_main
    return df_merged

def drop_high_missing_columns(df, threshold=0.3):
    """
    Drop columns from the DataFrame where the fraction of missing values exceeds the threshold.
    """
    return df.loc[:, df.isnull().mean() <= threshold]

def impute_missing_values(df, numeric_strategy='median', categorical_strategy='constant', fill_value='Missing'):
    """
    Impute missing values using scikit-learn's SimpleImputer.
      - Numeric columns: impute with the specified strategy (default: median).
      - Categorical columns: impute with a constant value (default: "Missing").
    """
    numeric_cols = df.select_dtypes(include=['number']).columns
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    if len(numeric_cols) > 0:
        num_imputer = SimpleImputer(strategy=numeric_strategy)
        df[numeric_cols] = num_imputer.fit_transform(df[numeric_cols])
    if len(categorical_cols) > 0:
        cat_imputer = SimpleImputer(strategy=categorical_strategy, fill_value=fill_value)
        df[categorical_cols] = cat_imputer.fit_transform(df[categorical_cols])
    return df

def explore_all_fields_in_dtale(selected_jiras=None, sample_ratio=0.2, missing_threshold=0.3, zero_threshold=0.8):
    """
    Connect to the MongoDB 'JiraRepos' database, sample issues from selected repositories,
    process and flatten changelog histories (summarizing them without from/to transitions),
    process JSON array fields (issuelinks and comments) into engineered features,
    drop columns with excessive missing data, impute missing values,
    drop changelog summary columns dominated by zeros,
    and launch a D-Tale session for interactive visualization.
    """
    client = MongoClient("mongodb://admin:password@localhost:27017/")
    db = client["JiraRepos"]
    
    with open("../0. DataDefinition/jira_data_sources.json") as f:
        jira_data_sources = json.load(f)
    
    all_jiras = list(jira_data_sources.keys())
    if selected_jiras and len(selected_jiras) > 0:
        all_jiras = [j for j in all_jiras if j in selected_jiras]
        if not all_jiras:
            print(f"⚠️ No valid Jira repositories found for {selected_jiras}.")
            return
    
    merged_dfs = []
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(process_repo, jira_name, db, sample_ratio): jira_name for jira_name in all_jiras}
        for future in as_completed(futures):
            result = future.result()
            if result is not None:
                merged_dfs.append(result)
    
    if merged_dfs:
        final_df = pd.concat(merged_dfs, ignore_index=True)
    else:
        print("No data to display.")
        return
    
    # Drop columns with high missing ratios
    final_df = drop_high_missing_columns(final_df, threshold=missing_threshold)
    
    # Process JSON array field for issuelinks
    if "fields.issuelinks" in final_df.columns:
        issuelinks_features = final_df["fields.issuelinks"].apply(process_issue_links)
        issuelinks_df = pd.json_normalize(issuelinks_features)
        final_df = pd.concat([final_df.drop(columns=["fields.issuelinks"]), issuelinks_df], axis=1)
    
    # Process JSON array field for comments
    if "fields.comments" in final_df.columns:
        comments_features = final_df["fields.comments"].apply(process_comments)
        comments_df = pd.json_normalize(comments_features)
        final_df = pd.concat([final_df.drop(columns=["fields.comments"]), comments_df], axis=1)
    
    # Impute missing values
    final_df = impute_missing_values(final_df)
    
    # Drop changelog summary columns dominated by zeros
    final_df = drop_zero_dominated_columns(final_df, prefix='changelog_count_', zero_threshold=zero_threshold)
    
    print("Data processed: types fixed, changelog histories summarized (without from/to transitions), JSON array features engineered, high-missing columns dropped, missing values imputed, and zero-dominated changelog fields dropped.")
    d = dtale.show(final_df, ignore_duplicate=True, allow_cell_edits=False)
    d.open_browser()
    print("✅ D-Tale session launched successfully.")

if __name__ == "__main__":
    explore_all_fields_in_dtale(["Hyperledger", "SecondLife"], sample_ratio=0.2, missing_threshold=0.3, zero_threshold=0.8)

In [None]:
import json
import dtale
import pandas as pd
import random
import numpy as np
from pymongo import MongoClient
from concurrent.futures import ThreadPoolExecutor, as_completed
from sklearn.impute import SimpleImputer
from sentence_transformers import SentenceTransformer

# Load Sentence Transformer model (you can choose a smaller model if needed)
desc_model = SentenceTransformer('all-MiniLM-L6-v2')

def fix_data_types(df, numeric_threshold=0.9):
    """
    Convert DataFrame columns (stored as strings) to appropriate data types,
    excluding any date formatting.

    For each column that does not contain list-like entries:
      - If at least `numeric_threshold` fraction of values can be converted to numeric,
        the column is converted to a numeric dtype.
      - Otherwise, the column is cast to 'category' dtype.
    
    Note: Date-like strings remain as strings.
    """
    for col in df.columns:
        if df[col].apply(lambda x: isinstance(x, list)).any():
            continue
        numeric_series = pd.to_numeric(df[col], errors='coerce')
        if numeric_series.notnull().mean() >= numeric_threshold:
            df[col] = numeric_series
        else:
            df[col] = df[col].astype('category')
    return df

def flatten_histories(histories):
    """
    Flatten a list of changelog history entries into a DataFrame.
    Each row represents a single change item.
    """
    rows = []
    for history in histories:
        history_id = history.get("id")
        author = history.get("author", {}).get("name")
        created = history.get("created")
        items = history.get("items", [])
        for item in items:
            rows.append({
                "changelog.history_id": history_id,
                "changelog.author": author,
                "changelog.created": created,
                "changelog.field": item.get("field"),
                "changelog.fieldtype": item.get("fieldtype"),
                "changelog.from": item.get("from"),
                "changelog.fromString": item.get("fromString"),
                "changelog.to": item.get("to"),
                "changelog.toString": item.get("toString")
            })
    return pd.DataFrame(rows)

def process_issue_histories(issue):
    """
    Process a single issue's changelog histories by flattening them and applying type conversion.
    Adds an 'issue_key' for later merging.
    """
    if "changelog" in issue and "histories" in issue["changelog"]:
        histories = issue["changelog"]["histories"]
        df_history = flatten_histories(histories)
        df_history = fix_data_types(df_history)
        df_history["issue_key"] = issue.get("key", issue.get("id"))
        return df_history
    return None

def extract_and_flatten_histories(issues):
    """
    Extract and flatten changelog histories from a list of issues using parallel processing.
    """
    flattened_histories = []
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(process_issue_histories, issue): issue for issue in issues}
        for future in as_completed(futures):
            result = future.result()
            if result is not None and not result.empty:
                flattened_histories.append(result)
    if flattened_histories:
        return pd.concat(flattened_histories, ignore_index=True)
    return pd.DataFrame()

def summarize_changelog_histories(df_histories):
    """
    Summarize flattened changelog histories by counting the number of changes per field.
    Returns a DataFrame with one row per issue keyed by 'issue_key'.
    """
    summary = df_histories.groupby('issue_key')['changelog.field'].value_counts().unstack(fill_value=0).reset_index()
    summary = summary.rename(columns=lambda x: f'changelog_count_{x}' if x != 'issue_key' else x)
    return summary

def drop_zero_dominated_columns(df, prefix='changelog_count_', zero_threshold=0.8):
    """
    Drop numeric columns with names starting with the specified prefix if the fraction of zeros exceeds zero_threshold.
    """
    cols_to_drop = []
    for col in df.columns:
        if col.startswith(prefix) and df[col].dtype.kind in 'biufc':  # numeric types
            frac_zeros = (df[col] == 0).mean()
            if frac_zeros > zero_threshold:
                cols_to_drop.append(col)
    if cols_to_drop:
        df = df.drop(columns=cols_to_drop)
    return df

def process_issue_links(issuelinks):
    """
    Process the 'fields.issuelinks' JSON array and extract features:
      - Total number of links.
      - Count and binary flag for each link type.
    """
    features = {"issuelinks_total": 0}
    link_types = {}
    if isinstance(issuelinks, list):
        features["issuelinks_total"] = len(issuelinks)
        for link in issuelinks:
            lt = link.get("type", {}).get("name", "Unknown")
            link_types[lt] = link_types.get(lt, 0) + 1
    else:
        features["issuelinks_total"] = 0
    for lt, count in link_types.items():
        features[f"issuelinks_{lt.lower()}_count"] = count
        features[f"has_issuelinks_{lt.lower()}"] = 1 if count > 0 else 0
    return features

def process_comments(comments):
    """
    Process the 'fields.comments' JSON array and extract summary features:
      - Total number of comments.
      - Average and maximum comment length.
      - Number of unique authors.
    """
    features = {
        "comment_count": 0,
        "avg_comment_length": 0,
        "max_comment_length": 0,
        "unique_authors_count": 0
    }
    if not isinstance(comments, list) or len(comments) == 0:
        return features
    comment_bodies = [c.get('body', '') for c in comments if isinstance(c, dict)]
    authors = [c.get('author', {}).get('name') for c in comments if isinstance(c, dict)]
    features["comment_count"] = len(comment_bodies)
    lengths = [len(body) for body in comment_bodies]
    if lengths:
        features["avg_comment_length"] = sum(lengths) / len(lengths)
        features["max_comment_length"] = max(lengths)
    unique_authors = {a for a in authors if a is not None}
    features["unique_authors_count"] = len(unique_authors)
    return features

def process_description_field(descriptions):
    """
    Process the 'fields.description' field by generating dense embeddings
    using a pre-trained Sentence Transformer. The resulting embedding vector
    is expanded into multiple columns (one per dimension).
    """
    # Ensure descriptions are strings and fill missing values with empty string
    descriptions = descriptions.fillna("").astype(str)
    embeddings = descriptions.apply(lambda x: desc_model.encode(x, show_progress_bar=False))
    emb_array = np.vstack(embeddings.values)
    emb_df = pd.DataFrame(emb_array, index=descriptions.index, 
                          columns=[f"desc_emb_{i}" for i in range(emb_array.shape[1])])
    return emb_df

def process_repo(jira_name, db, sample_ratio):
    """
    Process a single Jira repository:
      - Load issues from MongoDB.
      - Sample a fraction of issues.
      - Flatten main issue data and apply type conversion.
      - Extract and flatten changelog histories, then summarize them (without from/to transitions).
      - Merge the changelog summary with the main DataFrame.
    """
    print(f"\nProcessing repository: {jira_name} ...")
    issues = list(db[jira_name].find())
    if not issues:
        print(f"⚠️ No documents found for '{jira_name}', skipping.")
        return None
    sample_size = max(1, int(len(issues) * sample_ratio))
    sampled_issues = random.sample(issues, sample_size)
    
    # Process main issue structure
    df_main = pd.json_normalize(sampled_issues, sep='.')
    df_main = fix_data_types(df_main)
    
    # Process and summarize changelog histories (without from/to transitions)
    df_histories = extract_and_flatten_histories(sampled_issues)
    if not df_histories.empty:
        changelog_summary = summarize_changelog_histories(df_histories)
        if "key" not in df_main.columns:
            df_main["key"] = df_main["id"]
        df_main = pd.merge(df_main, changelog_summary, how="left", left_on="key", right_on="issue_key")
        df_main.drop(columns=["issue_key"], inplace=True, errors='ignore')
    
    return df_main

def drop_high_missing_columns(df, threshold=0.3):
    """
    Drop columns from the DataFrame where the fraction of missing values exceeds the threshold.
    """
    return df.loc[:, df.isnull().mean() <= threshold]

def impute_missing_values(df, numeric_strategy='median', categorical_strategy='constant', fill_value='Missing'):
    """
    Impute missing values using scikit-learn's SimpleImputer.
      - Numeric columns: impute with the specified strategy (default: median).
      - Categorical columns: impute with a constant value (default: "Missing").
    """
    numeric_cols = df.select_dtypes(include=['number']).columns
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    if len(numeric_cols) > 0:
        num_imputer = SimpleImputer(strategy=numeric_strategy)
        df[numeric_cols] = num_imputer.fit_transform(df[numeric_cols])
    if len(categorical_cols) > 0:
        cat_imputer = SimpleImputer(strategy=categorical_strategy, fill_value=fill_value)
        df[categorical_cols] = cat_imputer.fit_transform(df[categorical_cols])
    return df

def explore_all_fields_in_dtale(selected_jiras=None, sample_ratio=0.2, missing_threshold=0.3, zero_threshold=0.8):
    """
    Connect to the MongoDB 'JiraRepos' database, sample issues from selected repositories,
    process and flatten changelog histories (summarizing them without from/to transitions),
    process JSON array fields (issuelinks, comments) into engineered features,
    process the 'fields.description' field into dense embedding features,
    drop columns with excessive missing data, impute missing values,
    drop changelog summary columns dominated by zeros,
    and launch a D-Tale session for interactive visualization.
    """
    client = MongoClient("mongodb://admin:password@localhost:27017/")
    db = client["JiraRepos"]
    
    with open("../0. DataDefinition/jira_data_sources.json") as f:
        jira_data_sources = json.load(f)
    
    all_jiras = list(jira_data_sources.keys())
    if selected_jiras and len(selected_jiras) > 0:
        all_jiras = [j for j in all_jiras if j in selected_jiras]
        if not all_jiras:
            print(f"⚠️ No valid Jira repositories found for {selected_jiras}.")
            return
    
    merged_dfs = []
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(process_repo, jira_name, db, sample_ratio): jira_name for jira_name in all_jiras}
        for future in as_completed(futures):
            result = future.result()
            if result is not None:
                merged_dfs.append(result)
    
    if merged_dfs:
        final_df = pd.concat(merged_dfs, ignore_index=True)
    else:
        print("No data to display.")
        return
    
    # Drop columns with high missing ratios
    final_df = drop_high_missing_columns(final_df, threshold=missing_threshold)
    
    # Process JSON array field for issuelinks
    if "fields.issuelinks" in final_df.columns:
        issuelinks_features = final_df["fields.issuelinks"].apply(process_issue_links)
        issuelinks_df = pd.json_normalize(issuelinks_features)
        final_df = pd.concat([final_df.drop(columns=["fields.issuelinks"]), issuelinks_df], axis=1)
    
    # Process JSON array field for comments
    if "fields.comments" in final_df.columns:
        comments_features = final_df["fields.comments"].apply(process_comments)
        comments_df = pd.json_normalize(comments_features)
        final_df = pd.concat([final_df.drop(columns=["fields.comments"]), comments_df], axis=1)
    
    # Process the 'fields.description' field to create dense embeddings
    if "fields.description" in final_df.columns:
        desc_embeddings = process_description_field(final_df["fields.description"])
        final_df = pd.concat([final_df.drop(columns=["fields.description"]), desc_embeddings], axis=1)
    
    # Impute missing values
    final_df = impute_missing_values(final_df)
    
    # Drop changelog summary columns dominated by zeros
    final_df = drop_zero_dominated_columns(final_df, prefix='changelog_count_', zero_threshold=zero_threshold)
    
    print("Data processed: types fixed, changelog histories summarized, JSON array features engineered, description embeddings added, high-missing columns dropped, missing values imputed, and zero-dominated changelog fields dropped.")
    d = dtale.show(final_df, ignore_duplicate=True, allow_cell_edits=False)
    d.open_browser()
    print("✅ D-Tale session launched successfully.")

if __name__ == "__main__":
    explore_all_fields_in_dtale(["Hyperledger", "SecondLife"], sample_ratio=0.2, missing_threshold=0.3, zero_threshold=0.8)

In [None]:
import json
import dtale
import pandas as pd
import random
import numpy as np
import dateparser
from pymongo import MongoClient
from concurrent.futures import ThreadPoolExecutor, as_completed
from sklearn.impute import SimpleImputer
from sentence_transformers import SentenceTransformer

# Load the Sentence Transformer model (you can choose a smaller one if needed)
desc_model = SentenceTransformer('all-MiniLM-L6-v2')

def convert_date_columns_dateparser(df, date_columns):
    """
    Convert the specified date columns from string to datetime using dateparser.
    
    Parameters:
      df (pd.DataFrame): Input DataFrame containing date strings.
      date_columns (list): List of column names to convert.
    
    Returns:
      pd.DataFrame: The DataFrame with specified columns converted to datetime objects.
    """
    for col in date_columns:
        if col in df.columns:
            df[col] = df[col].apply(lambda x: dateparser.parse(x, 
                                    settings={'RETURN_AS_TIMEZONE_AWARE': True, 'TIMEZONE': 'UTC'})
                                    if pd.notnull(x) else pd.NaT)
    return df


def fix_data_types(df, numeric_threshold=0.9):
    """
    Convert DataFrame columns (stored as strings) to appropriate data types,
    excluding any date formatting.

    For each column that is not list-like:
      - If at least `numeric_threshold` fraction of values can be converted to numeric,
        the column is converted to a numeric dtype.
      - Otherwise, the column is cast to 'category' dtype.
    (Date-like strings remain as strings.)
    """
    for col in df.columns:
        if df[col].apply(lambda x: isinstance(x, list)).any():
            continue
        numeric_series = pd.to_numeric(df[col], errors='coerce')
        if numeric_series.notnull().mean() >= numeric_threshold:
            df[col] = numeric_series
        else:
            df[col] = df[col].astype('category')
    return df


def flatten_histories(histories):
    """
    Flatten a list of changelog history entries into a DataFrame.
    Each row represents a single change item.
    """
    rows = []
    for history in histories:
        history_id = history.get("id")
        author = history.get("author", {}).get("name")
        created = history.get("created")
        items = history.get("items", [])
        for item in items:
            rows.append({
                "changelog.history_id": history_id,
                "changelog.author": author,
                "changelog.created": created,
                "changelog.field": item.get("field"),
                "changelog.fieldtype": item.get("fieldtype"),
                "changelog.from": item.get("from"),
                "changelog.fromString": item.get("fromString"),
                "changelog.to": item.get("to"),
                "changelog.toString": item.get("toString")
            })
    return pd.DataFrame(rows)


def process_issue_histories(issue):
    """
    Process a single issue's changelog histories:
      - Flatten the histories.
      - Apply type conversion.
      - Add an 'issue_key' (using 'key' if available, else 'id') for merging.
    """
    if "changelog" in issue and "histories" in issue["changelog"]:
        histories = issue["changelog"]["histories"]
        df_history = flatten_histories(histories)
        df_history = fix_data_types(df_history)
        df_history["issue_key"] = issue.get("key", issue.get("id"))
        return df_history
    return None


def extract_and_flatten_histories(issues):
    """
    Extract and flatten changelog histories from a list of issues using parallel processing.
    """
    flattened_histories = []
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(process_issue_histories, issue): issue for issue in issues}
        for future in as_completed(futures):
            result = future.result()
            if result is not None and not result.empty:
                flattened_histories.append(result)
    if flattened_histories:
        return pd.concat(flattened_histories, ignore_index=True)
    return pd.DataFrame()


def summarize_changelog_histories(df_histories):
    """
    Summarize flattened changelog histories by counting the number of changes per field.
    Returns a DataFrame with one row per issue (keyed by 'issue_key').
    """
    summary = df_histories.groupby('issue_key')['changelog.field'].value_counts().unstack(fill_value=0).reset_index()
    summary = summary.rename(columns=lambda x: f'changelog_count_{x}' if x != 'issue_key' else x)
    return summary


def drop_zero_dominated_columns(df, prefix='changelog_count_', zero_threshold=0.8):
    """
    Drop numeric columns with names starting with `prefix` if more than `zero_threshold`
    fraction of their values are zeros.
    """
    cols_to_drop = []
    for col in df.columns:
        if col.startswith(prefix) and df[col].dtype.kind in 'biufc':
            frac_zeros = (df[col] == 0).mean()
            if frac_zeros > zero_threshold:
                cols_to_drop.append(col)
    if cols_to_drop:
        df = df.drop(columns=cols_to_drop)
    return df


def process_issue_links(issuelinks):
    """
    Process the 'fields.issuelinks' JSON array and extract features:
      - Total number of links.
      - Count and binary flag for each link type.
    """
    features = {"issuelinks_total": 0}
    link_types = {}
    if isinstance(issuelinks, list):
        features["issuelinks_total"] = len(issuelinks)
        for link in issuelinks:
            lt = link.get("type", {}).get("name", "Unknown")
            link_types[lt] = link_types.get(lt, 0) + 1
    else:
        features["issuelinks_total"] = 0
    for lt, count in link_types.items():
        features[f"issuelinks_{lt.lower()}_count"] = count
        features[f"has_issuelinks_{lt.lower()}"] = 1 if count > 0 else 0
    return features


def process_comments(comments):
    """
    Process the 'fields.comments' JSON array and extract summary features:
      - Total number of comments.
      - Average and maximum comment length.
      - Number of unique authors.
    """
    features = {
        "comment_count": 0,
        "avg_comment_length": 0,
        "max_comment_length": 0,
        "unique_authors_count": 0
    }
    if not isinstance(comments, list) or len(comments) == 0:
        return features
    comment_bodies = [c.get('body', '') for c in comments if isinstance(c, dict)]
    authors = [c.get('author', {}).get('name') for c in comments if isinstance(c, dict)]
    features["comment_count"] = len(comment_bodies)
    lengths = [len(body) for body in comment_bodies]
    if lengths:
        features["avg_comment_length"] = sum(lengths) / len(lengths)
        features["max_comment_length"] = max(lengths)
    unique_authors = {a for a in authors if a is not None}
    features["unique_authors_count"] = len(unique_authors)
    return features


def process_description_field(descriptions):
    """
    Process the 'fields.description' field by generating dense embeddings
    using a pre-trained Sentence Transformer. The resulting embedding vector
    is expanded into multiple columns (one per dimension).
    """
    descriptions = descriptions.fillna("").astype(str)
    embeddings = descriptions.apply(lambda x: desc_model.encode(x, show_progress_bar=False))
    emb_array = np.vstack(embeddings.values)
    emb_df = pd.DataFrame(emb_array, index=descriptions.index,
                          columns=[f"desc_emb_{i}" for i in range(emb_array.shape[1])])
    return emb_df


def process_repo(jira_name, db, sample_ratio):
    """
    Process a single Jira repository:
      - Load issues from MongoDB.
      - Sample a fraction of issues.
      - Flatten main issue data and apply type conversion.
      - Extract and flatten changelog histories, then summarize them (without from/to transitions).
      - Merge the changelog summary with the main DataFrame.
    """
    print(f"\nProcessing repository: {jira_name} ...")
    issues = list(db[jira_name].find())
    if not issues:
        print(f"⚠️ No documents found for '{jira_name}', skipping.")
        return None
    sample_size = max(1, int(len(issues) * sample_ratio))
    sampled_issues = random.sample(issues, sample_size)
    
    df_main = pd.json_normalize(sampled_issues, sep='.')
    df_main = fix_data_types(df_main)
    
    df_histories = extract_and_flatten_histories(sampled_issues)
    if not df_histories.empty:
        changelog_summary = summarize_changelog_histories(df_histories)
        if "key" not in df_main.columns:
            df_main["key"] = df_main["id"]
        df_main = pd.merge(df_main, changelog_summary, how="left", left_on="key", right_on="issue_key")
        df_main.drop(columns=["issue_key"], inplace=True, errors='ignore')
    
    return df_main


def drop_high_missing_columns(df, threshold=0.3):
    """
    Drop columns from the DataFrame where the fraction of missing values exceeds the threshold.
    """
    return df.loc[:, df.isnull().mean() <= threshold]


def impute_missing_values(df, numeric_strategy='median', categorical_strategy='constant', fill_value='Missing'):
    """
    Impute missing values using scikit-learn's SimpleImputer.
      - Numeric columns: impute with the specified strategy (default: median).
      - Categorical columns: impute with a constant value (default: "Missing").
    """
    numeric_cols = df.select_dtypes(include=['number']).columns
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    if len(numeric_cols) > 0:
        num_imputer = SimpleImputer(strategy=numeric_strategy)
        df[numeric_cols] = num_imputer.fit_transform(df[numeric_cols])
    if len(categorical_cols) > 0:
        cat_imputer = SimpleImputer(strategy=categorical_strategy, fill_value=fill_value)
        df[categorical_cols] = cat_imputer.fit_transform(df[categorical_cols])
    return df


def explore_all_fields_in_dtale(selected_jiras=None, sample_ratio=0.2, missing_threshold=0.3,
                                zero_threshold=0.8, open_dtale=True):
    """
    Connect to the MongoDB 'JiraRepos' database, sample issues from selected repositories,
    process and flatten changelog histories (summarizing them without from/to transitions),
    process JSON array fields (issuelinks, comments) into engineered features,
    process the 'fields.description' field into dense embedding features,
    drop columns with excessive missing data, impute missing values,
    and drop changelog summary columns dominated by zeros.
    
    If open_dtale is True, launch a D-Tale session for interactive visualization;
    otherwise, simply return the final DataFrame.
    """
    # Connect to MongoDB
    client = MongoClient("mongodb://admin:password@localhost:27017/")
    db = client["JiraRepos"]

    # Load Jira data sources configuration
    with open("../0. DataDefinition/jira_data_sources.json") as f:
        jira_data_sources = json.load(f)

    all_jiras = list(jira_data_sources.keys())
    if selected_jiras and len(selected_jiras) > 0:
        all_jiras = [j for j in all_jiras if j in selected_jiras]
        if not all_jiras:
            print(f"⚠️ No valid Jira repositories found for {selected_jiras}.")
            return

    # Process each repository in parallel
    merged_dfs = []
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(process_repo, jira_name, db, sample_ratio): jira_name for jira_name in all_jiras}
        for future in as_completed(futures):
            result = future.result()
            if result is not None:
                merged_dfs.append(result)

    if merged_dfs:
        final_df = pd.concat(merged_dfs, ignore_index=True)
    else:
        print("No data to display.")
        return

    # Drop columns with high missing ratios
    final_df = drop_high_missing_columns(final_df, threshold=missing_threshold)

    # Process JSON array field for issuelinks
    if "fields.issuelinks" in final_df.columns:
        issuelinks_features = final_df["fields.issuelinks"].apply(process_issue_links)
        issuelinks_df = pd.json_normalize(issuelinks_features)
        final_df = pd.concat([final_df.drop(columns=["fields.issuelinks"]), issuelinks_df], axis=1)

    # Process JSON array field for comments
    if "fields.comments" in final_df.columns:
        comments_features = final_df["fields.comments"].apply(process_comments)
        comments_df = pd.json_normalize(comments_features)
        final_df = pd.concat([final_df.drop(columns=["fields.comments"]), comments_df], axis=1)

    # Process the 'fields.description' field to generate dense embeddings
    if "fields.description" in final_df.columns:
        desc_embeddings = process_description_field(final_df["fields.description"])
        final_df = pd.concat([final_df.drop(columns=["fields.description"]), desc_embeddings], axis=1)

    # Impute missing values
    final_df = impute_missing_values(final_df)

    # Drop changelog summary columns dominated by zeros
    final_df = drop_zero_dominated_columns(final_df, prefix='changelog_count_', zero_threshold=zero_threshold)

    if open_dtale:
        print("Data processed. Launching D-Tale session...")
        d = dtale.show(final_df, ignore_duplicate=True, allow_cell_edits=False)
        d.open_browser()
        print("✅ D-Tale session launched successfully.")

    return final_df


def export_final_df():
    """
    Run the full OverviewAnalysis pipeline and return the final DataFrame with all engineered features.
    This version does not launch D-Tale (open_dtale=False) so that it can be used directly as the training dataset.
    
    Returns:
        pd.DataFrame: The final processed DataFrame ready for training.
    """
    final_df = explore_all_fields_in_dtale(
        selected_jiras=["Hyperledger", "SecondLife"],
        sample_ratio=0.05,
        missing_threshold=0.3,
        zero_threshold=0.8,
        open_dtale=True
    )
    return final_df


# For testing purposes, you can run export_final_df() if executing this module directly.
if __name__ == "__main__":
    df_for_training = export_final_df()


Processing repository: Hyperledger ...

Processing repository: SecondLife ...



The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.


The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.


The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.



Data processed. Launching D-Tale session...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


✅ D-Tale session launched successfully.
