## Visualising Retrieved Data

This Jupyter Notebook visualises core components of the data generated by the previous notebooks in order to highlight key attributes and relationships in the data. This can help redistribution managers better understand which trends, dependencies and relationships exist, supporting prioritisation and decision-making in the redistribution process.

In [None]:
# Import required modules and libraries
import os
import sys
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import networkx as nx

# Make the parent of the current working directory importable: it is appended
# to sys.path before the project-local import that follows.
# NOTE(review): this relies on the notebook being started from its own folder,
# since the path is derived from os.getcwd() - confirm.
cwd = os.getcwd()
parent_dir = os.path.dirname(cwd)
sys.path.append(parent_dir)

# Import modules from other project scripts
from data_constants import *


##### Functions to Visualise Extracted Data

In [None]:
def _alphabetic_suffix(index: int) -> str:
    """Return the spreadsheet-style letter code for a zero-based index (0 -> "a", 25 -> "z", 26 -> "aa")."""
    suffix = ""
    remaining = index + 1
    # Bijective base-26: there is no "zero" digit, hence the shift by one on every step
    while remaining > 0:
        remaining, letter_offset = divmod(remaining - 1, 26)
        suffix = chr(ord("a") + letter_offset) + suffix
    return suffix


def create_author_labels(data_dict: dict) -> dict:
    """Takes a dict and returns a new dict that maps each key to a short, readable label.

    Labels have the form "User <letters>", assigned in the iteration order of the input dict
    ("User a" for the first key, "User b" for the second key and so on). After "User z" the
    labels continue with "User aa", "User ab", ... so that they stay alphabetic and unique
    for any number of keys. This is to avoid displaying long hash values when plotting data,
    as it compromises readability.

    Args:
        data_dict (dict): Input dict for which labels are to be created. Only its keys are used.

    Returns:
        dict: New dict mapping each key to a user label in alphabetic order.
    """
    return {
        key: f"User {_alphabetic_suffix(idx)}"
        for idx, key in enumerate(data_dict)
    }


In [None]:
def plot_pie_chart(data: dict, title: str) -> None:
    """Draw a pie chart from a mapping of categories to values and show it.

    Each slice is annotated with its percentage share (one decimal place) and
    coloured using seaborn's "pastel" palette.

    Args:
        data (dict): The data to plot. Keys are used as the slice labels,
        values as the slice sizes.
        title (str): The title of the plot.

    Returns:
        None.
    """
    pastel_colours = sns.color_palette("pastel")

    plt.figure(figsize=(8, 8))
    plt.pie(
        data.values(),
        labels=data.keys(),
        autopct='%1.1f%%',
        startangle=140,
        colors=pastel_colours,
    )
    plt.title(title)
    plt.show()
    

In [None]:
def plot_histogram(data: list, title: str, x_label: str, bins: int = 10,
                   y_label: str = 'Number of Repositories') -> None:
    """Plot a histogram for the given data.

    Args:
        data (list): A list containing the data to plot.
        title (str): The title of the plot.
        x_label (str): The label for the x-axis.
        bins (int): The number of bins for the histogram.
        y_label (str): The label for the y-axis. Defaults to 'Number of Repositories',
        so existing calls keep their previous axis label.

    Returns:
        None.
    """
    plt.figure(figsize=(10, 6))
    # rwidth < 1 leaves a small gap between neighbouring bars
    plt.hist(data, bins=bins, color='#ffb86e', rwidth=0.8)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    # Horizontal grid lines only
    plt.grid(axis='y', alpha=0.75)
    plt.show()


In [None]:
def plot_bar_chart(data: dict, title: str, x_label: str, y_label: str) -> None:
    """Draw a bar chart from a mapping of categories to values and show it.

    The x tick labels are rotated by 45 degrees and right-aligned.

    Args:
        data (dict): The data to plot. Keys become the categories on the x-axis,
        values become the bar heights on the y-axis.
        title (str): The title of the plot.
        x_label (str): The label for the x-axis.
        y_label (str): The label for the y-axis.

    Returns:
        None.
    """
    categories, heights = data.keys(), data.values()

    plt.figure(figsize=(10, 6))
    plt.bar(categories, heights, color='#ffb86e')

    # Titles and axis labels
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)

    # Tick label orientation and final layout
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()


In [None]:
def plot_collaborations(community_data: dict, top_n_users: int) -> None:
    """Plots collaborations between authors and submitters in the GAP community.

    A directed graph is built with one node per author and per submitter, and an
    edge from a submitter to every author whose interaction entry contains that
    submitter. All author nodes are drawn in red; only the top submitters (by the
    number of interaction entries they appear in) are drawn, each in its own colour
    together with its outgoing edges.

    Args:
        community_data (dict): A dict containing community data. The keys "authors",
        "submitters" and "interactions" are read; "interactions" is iterated as a
        mapping from an author to a collection of submitters.
        top_n_users (int): The number of submitters to be included in the plot.

    Returns:
        None.
    """
    interactions = community_data["interactions"]

    # Build the graph: author nodes first, then submitter nodes
    # NOTE(review): a user listed in both collections presumably ends up typed
    # "submitter", because the second call sets node_type again - confirm.
    graph = nx.DiGraph()
    graph.add_nodes_from(community_data["authors"], node_type="author")
    graph.add_nodes_from(community_data["submitters"], node_type="submitter")

    def node_type_of(node):
        return graph.nodes[node]["node_type"]

    # One edge per (submitter -> author) pair, skipping users that are not nodes
    for author, author_submitters in interactions.items():
        for submitter in author_submitters:
            if author in graph.nodes and submitter in graph.nodes:
                graph.add_edge(submitter, author)

    # Rank submitters by the number of interaction entries that contain them
    submitter_collaborations = {}
    for node in graph.nodes:
        if node_type_of(node) != "submitter":
            continue
        submitter_collaborations[node] = sum(
            1 for group in interactions.values() if node in group
        )
    ranked_submitters = sorted(
        submitter_collaborations, key=submitter_collaborations.get, reverse=True
    )
    top_n_submitters = ranked_submitters[:top_n_users]

    plt.figure(figsize=(12, 8))
    # Fixed seed keeps the layout reproducible between runs
    pos = nx.spring_layout(graph, seed=42, k=1)

    # Authors are always drawn in red
    author_nodes = [node for node in graph.nodes if node_type_of(node) == "author"]
    nx.draw_networkx_nodes(graph, pos, nodelist=author_nodes, node_size=200, node_color="red", alpha=0.7, label="Authors")

    # Each top submitter gets its own colour from the default colour cycle,
    # shared by the node and by the lines to the authors it interacted with
    for position, submitter in enumerate(top_n_submitters):
        node_colour = f"C{position}"
        nx.draw_networkx_nodes(graph, pos, nodelist=[submitter], node_size=200, node_color=node_colour, alpha=0.7, label="Submitters")

        colour_edges = [
            (source, target)
            for source, target in graph.out_edges(submitter)
            if node_type_of(target) == "author"
        ]
        nx.draw_networkx_edges(graph, pos, edgelist=colour_edges, edge_color=node_colour, arrowsize=5, alpha=0.5)

    plt.title(f"Interactions between Authors and Top {top_n_users} Issue Submitters")
    plt.axis("off")
    # Text box in the upper right corner that acts as a simple legend
    plt.text(0.8, 0.95, "Red: Authors\nColoured Nodes: Submitters\nColoured Lines: Interactions", transform=plt.gca().transAxes, fontsize=12, bbox=dict(boxstyle="round,pad=0.3", edgecolor="black", facecolor="white", alpha=0.7))
    plt.show()


##### Analyse and Visualise Extracted Data

In [None]:
# Global variables to regulate the scope for certain user and repository visualisations
# top_n_repos: how many repositories are kept in the "top repositories by open issues" chart
# top_n_users: how many authors/submitters are kept in the top-author charts and the collaboration plot
top_n_repos = 10
top_n_users = 10


In [None]:
# Folder that holds the JSON files read by this notebook
data_folder = "collected_data"

# Build the path of every input file first ...
repo_file_path = os.path.join(data_folder, "repo_data.json")
monitoring_file_path = os.path.join(data_folder, "monitoring_data.json")
testing_file_path = os.path.join(data_folder, "testing_data.json")
community_file_path = os.path.join(data_folder, "community_data.json")

# ... then load repository, monitoring, testing and community data, in that order
repo_data = load_data(repo_file_path)
monitoring_data = load_data(monitoring_file_path)
testing_data = load_data(testing_file_path)
community_data = load_data(community_file_path)


##### Repository Data: Visualising Key Metrics

In [None]:
# Display key metrics for repositories based on their current state

# Histogram of repository ages, converted from days to years
repo_ages = [repo['age_in_days'] / 365 for repo in repo_data]
plot_histogram(
    repo_ages,
    "Distribution of Repositories by Age",
    "Age in Years",
)

# Histogram of the total number of releases per repository
total_releases = [repo['total_releases'] for repo in repo_data]
plot_histogram(
    total_releases,
    "Distribution of Repositories by Total Releases",
    "Number of Releases",
)

# Bar chart of the repositories with the most open issues (largest first)
open_issues = {repo['repo']: repo['open_issues_count'] for repo in repo_data}
ranked_open_issues = sorted(open_issues.items(), key=lambda pair: pair[1], reverse=True)
open_issues = dict(ranked_open_issues[:top_n_repos])
plot_bar_chart(
    open_issues,
    f"Distribution of Top {top_n_repos} Repositories by Open Issues",
    "Repository",
    "Number of Open Issues",
)

# Pie chart of open versus closed pull requests, summed over all repositories
open_prs = sum(repo['open_pull_requests'] for repo in repo_data)
closed_prs = sum(repo['closed_pull_requests'] for repo in repo_data)
plot_pie_chart(
    {'Open PRs': open_prs, 'Closed PRs': closed_prs},
    "Distribution of Pull Requests (Open vs. Closed)",
)


##### Monitoring Data: Visualising Key Metrics

In [None]:
# Pull the three package collections out of the monitoring data;
# a missing key is treated as an empty collection
all_previous_and_maybe_next = monitoring_data.get('all_previous_and_maybe_next', [])
previous_and_maybe_next_labels = monitoring_data.get('previous_and_maybe_next_labels', [])
packages_with_different_versions = monitoring_data.get('packages_with_different_versions', [])

# Number of packages per prediction category. The label-based category refers to
# the labels "automatic pr", "new package" and "update package".
status_count = {}
status_count['Previous and Maybe Next'] = len(all_previous_and_maybe_next)
status_count['Previous and Maybe Next Labels'] = len(previous_and_maybe_next_labels)
status_count['In Next Release'] = len(packages_with_different_versions)

# Show the category sizes as a bar chart
plot_bar_chart(
    data=status_count,
    title='Next Release Predictions Based on PackageDistro Repository',
    x_label='Predictions for Next Release',
    y_label='Number of Packages',
)


##### Testing Data: Visualising Key Metrics

In [None]:
# Count, over all repositories, how many packages have tested CI versions,
# a required PackageInfo version, both of them, or no testing entry at all
ci_tests_count = 0
pkg_tests_count = 0
both_tests_count = 0
no_tests_count = 0

for repo_entry in repo_data:
    repo_name = repo_entry["repo"]

    # Packages without any entry in the testing data count as "no tests"
    if repo_name not in testing_data:
        no_tests_count += 1
        continue

    test_info = testing_data[repo_name]
    has_ci_versions = "tested_ci_versions" in test_info
    has_required_version = "required_pkginfo_version" in test_info

    if has_ci_versions:
        ci_tests_count += 1
    if has_required_version:
        pkg_tests_count += 1
    if has_ci_versions and has_required_version:
        both_tests_count += 1

categories = ["Tested Versions", "Required Test Version", "Both Test Formats", "No Test Formats"]
counts = [ci_tests_count, pkg_tests_count, both_tests_count, no_tests_count]

plot_bar_chart(
    dict(zip(categories, counts)),
    'Number of GAP Packages with Tested Versions in CI and Required PackageInfo Version',
    'Categories',
    'Counts',
)


In [None]:
# Plot correlation between number of test files and lines of test code
test_line_count = {}
test_file_count = {}

# Only keep packages that report BOTH metrics. Collecting the two values
# independently would let the arrays below differ in length or pair up values
# of different packages, which breaks or silently distorts the correlation.
for package, data in testing_data.items():
    if "tst_file_count" in data and "total_lines_in_tst_files" in data:
        test_file_count[package] = data["tst_file_count"]
        test_line_count[package] = data["total_lines_in_tst_files"]

# Convert data to numpy arrays for calculations (same package order in both arrays)
num_test_files = np.array(list(test_file_count.values()))
total_lines_test_files = np.array(list(test_line_count.values()))

# Calculate correlation coefficient between test file count and test line count;
# [0, 1] picks the off-diagonal entry of the 2x2 correlation matrix
correlation_coefficient = np.corrcoef(num_test_files, total_lines_test_files)[0, 1]

# Create a scatter plot with linear regression line and confidence interval of 50%
plt.figure(figsize=(8, 6))
sns.regplot(x=num_test_files, y=total_lines_test_files, color='#ff9934', ci=50, label="Data Points")

# Provide labels and create the plot
plt.text(0.98, 0.03, "Linear Regression Line\nand Confidence Interval (50%)", transform=plt.gca().transAxes,
         fontsize=12, color="#ff9934", ha='right', va='bottom')
plt.xlabel("Number of Test Files")
plt.ylabel("Total Lines in Test Files")
plt.title(f"Correlation between Test Files and Lines of Test Code\nCorrelation Coefficient: {correlation_coefficient:.2f}")
plt.legend()
plt.tight_layout()
plt.show()


##### Community Data: Visualising Key Metrics

In [None]:
# Define community data for visualisation
# Each lookup raises KeyError if the key is missing from the loaded community data.
# NOTE(review): the cells below take len() of the first three, iterate
# author_repo_counts and interactions via .items()/.keys(), and read
# first_commit_by_author via .values() - confirm against the data collection notebook.
authors = community_data['authors']
submitters = community_data['submitters']
author_submitters = community_data['author_submitters']
author_repo_counts = community_data['author_repo_counts']
interactions = community_data['interactions']
first_author_commit = community_data['first_commit_by_author']


In [None]:
# Bar chart with the number of users per contributor type:
# authors, issue submitters, and users that are both
count_contributor_labels = {
    label: len(members)
    for label, members in (
        ("Authors", authors),
        ("Submitters", submitters),
        ("Author-Submitters", author_submitters),
    )
}

plot_bar_chart(
    count_contributor_labels,
    'Number of Authors, Issue Submitters and Author-Submitters',
    'Contributor Type',
    'Count',
)


In [None]:
# Rank authors by their repository contribution count (largest first)
# and keep the first top_n_users of them
sorted_contributors = sorted(author_repo_counts.items(), key=lambda pair: pair[1], reverse=True)
top_contributors = dict(sorted_contributors[:top_n_users])

# Replace the author identifiers with short labels for display;
# the label function only looks at the keys, so the dict can be passed as-is
labels_dict = create_author_labels(top_contributors)
top_contributor_labels = [labels_dict[author] for author in top_contributors]
data_with_labels = dict(zip(top_contributor_labels, top_contributors.values()))

plot_bar_chart(
    data_with_labels,
    f'Top {top_n_users} Authors and Their Repository Contributions',
    'Authors',
    'Repository Contribution Count',
)


In [None]:
# Rank authors by the number of other users submitting issues to their repos
# (largest first) and keep the first top_n_users of them
top_authors = sorted(interactions.keys(), key=lambda author: len(interactions[author]), reverse=True)[:top_n_users]
interactions_count = [len(interactions[author]) for author in top_authors]

# Replace the author identifiers with short labels for display
labels_dict = create_author_labels(dict.fromkeys(top_authors, 0))
top_author_labels = [labels_dict[author] for author in top_authors]

plot_bar_chart(
    dict(zip(top_author_labels, interactions_count)),
    f'Top {top_n_users} Authors with Most Issue Submitter Interactions',
    'Authors',
    'Interactions Count',
)


In [None]:
# Get dynamic overview on the historic developments of contributor community based on first commit
# Count how many authors made their first commit in each year
first_commit_years = {}
for commit_date in first_author_commit.values():
    if commit_date != "No commits":
        # NOTE(review): assumes the date string ends with a four-digit year - confirm the stored format
        year = int(commit_date[-4:])
        first_commit_years[year] = first_commit_years.get(year, 0) + 1

# Order chronologically; years are unique dict keys, so sorting the pairs sorts by year
sorted_years = sorted(first_commit_years.items())

# Unpacking zip(*[]) raises ValueError, so fall back to empty tuples when no
# author has a recorded first commit
years, counts = zip(*sorted_years) if sorted_years else ((), ())

plot_bar_chart(
    data=dict(sorted_years),
    title='Number of Contributors by Year of First Commit',
    x_label='Year',
    y_label='Number of Contributors'
)


In [None]:
# Create a NetworkX plot to represent interactions between authors and issue submitters,
# limited to the top_n_users submitters
plot_collaborations(community_data, top_n_users)


##### Save Notebook For Dashboard

After the script has been executed once and the outputs have been generated, it is very important to **save the file** before starting the Streamlit dashboard. If not, the outputs will not be available on the dashboard.