## GAP Data Analytics, Data Visualisation

This Jupyter Notebook is for visualising core components of the data generated from the previous notebooks, to point out key attributes and relationships in the data. This could be helpful in assisting redistribution managers in better understanding what trends, dependencies and relationships exist, which could be useful for prioritising and decision-making in the redistribution process.

In [None]:
# Import required modules and libraries
import os
import sys
import json
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx

# Make the parent of the current working directory importable: resolve the
# working directory, take its parent, and append that parent to sys.path.
# NOTE(review): presumably the parent directory is where the project's own
# modules (e.g. data_constants) live - confirm against the repo layout.
cwd = os.getcwd()
parent_dir = os.path.dirname(cwd)
sys.path.append(parent_dir)

# Import modules from other project scripts
from data_constants import *


##### Functions to Visualise Extracted Data

In [None]:
def load_data(file_path: str) -> dict:
    """Load a JSON document from disk and hand back its decoded contents.

    Args:
        file_path (str): Location of the JSON file to read.

    Returns:
        dict: The data decoded from the JSON file.
    """
    # The file is always decoded as UTF-8, independent of the platform default
    with open(file_path, mode="r", encoding="utf-8") as json_file:
        return json.load(json_file)

In [None]:
def _alphabetic_suffix(index: int) -> str:
    """Return the spreadsheet-style lowercase suffix for a zero-based index.

    0 -> "a", 25 -> "z", 26 -> "aa", 27 -> "ab", and so on.
    """
    suffix = ""
    remaining = index + 1
    # Bijective base-26: there is no "zero" digit, hence the -1 before each division
    while remaining > 0:
        remaining, offset = divmod(remaining - 1, 26)
        suffix = chr(ord("a") + offset) + suffix
    return suffix


def create_author_labels(data_dict: dict) -> dict:
    """Map each key of a dictionary to a short label like "User x".

    The label letter follows the iteration order of the keys ("User a" for the
    first key, "User b" for the second key and so on). After "User z" the labels
    continue with two letters ("User aa", "User ab", ...), so every key still gets
    a unique, readable label when there are more than 26 keys. This is to avoid
    displaying long keys when plotting data, as it makes it less readable.

    Args:
        data_dict (dict): Input dict for which labels need to be created. Only
            its keys are used.

    Returns:
        dict: New dict mapping each key to a label like "User x".
    """
    return {
        key: f"User {_alphabetic_suffix(idx)}"
        for idx, key in enumerate(data_dict)
    }


In [None]:
def plot_bar_chart(data: dict, title: str, x_label: str, y_label: str) -> None:
    """Draw and display a light-blue bar chart for a mapping of categories to values.

    Args:
        data (dict): The data to plot. Keys become the categories on the x-axis
            and values become the heights of the corresponding bars.
        title (str): The title of the plot.
        x_label (str): The label for the x-axis.
        y_label (str): The label for the y-axis.

    Returns:
        None: The chart is shown via ``plt.show()``; nothing is returned.
    """
    plt.figure(figsize=(10, 6))

    # Materialise the dict views so the bar positions and heights are plain lists
    categories = list(data.keys())
    heights = list(data.values())
    plt.bar(categories, heights, color='lightblue')

    # Titles and axis labels
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)

    # Slant the category names so long labels do not overlap
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()


In [None]:
def plot_collaborations(community_data: dict, top_n_users: int) -> None:
    """Plot collaborations between authors and submitters in the GAP community.

    Builds a directed graph with one node per author and per submitter and an edge
    submitter -> author for every interaction whose endpoints are both known nodes.
    All author nodes are drawn in red; the ``top_n_users`` submitters that appear in
    the most authors' interaction entries are drawn, each in its own colour, together
    with their edges to authors.

    Args:
        community_data (dict): Community data with the keys "authors", "submitters"
            and "interactions". "interactions" is read as a mapping from each author
            to a collection of submitters (assumed to be a list of names - confirm
            against the notebook that generates the data).
        top_n_users (int): The number of top submitters to be displayed in the plot.

    Returns:
        None: The function plots the graph but does not return anything.
    """
    interactions = community_data["interactions"]

    # Add nodes for authors and submitters. A user listed in both collections ends
    # up with node_type "submitter", because the second call updates the attribute.
    G = nx.DiGraph()
    G.add_nodes_from(community_data["authors"], node_type="author")
    G.add_nodes_from(community_data["submitters"], node_type="submitter")

    # Add edges for collaborations between authors and submitters
    for author, submitters in interactions.items():
        for submitter in submitters:
            if author in G.nodes and submitter in G.nodes:
                G.add_edge(submitter, author)

    # Count, in one pass, how many authors each submitter interacted with.
    # set() makes sure a submitter listed twice for one author is counted once.
    collaboration_counts = {}
    for linked_submitters in interactions.values():
        for submitter in set(linked_submitters):
            collaboration_counts[submitter] = collaboration_counts.get(submitter, 0) + 1

    # Keep graph-node order so that ties are broken exactly as before (stable sort)
    submitter_collaborations = {
        node: collaboration_counts.get(node, 0)
        for node in G.nodes
        if G.nodes[node]["node_type"] == "submitter"
    }
    top_n_submitters = sorted(submitter_collaborations, key=submitter_collaborations.get, reverse=True)[:top_n_users]

    plt.figure(figsize=(12, 8))
    # Fixed seed keeps the layout reproducible between runs
    pos = nx.spring_layout(G, seed=42, k=1)

    # Draw nodes with different colours for the submitters, while authors are always red
    author_nodes = [node for node in G.nodes if G.nodes[node]["node_type"] == "author"]
    nx.draw_networkx_nodes(G, pos, nodelist=author_nodes, node_size=200, node_color="red", alpha=0.7, label="Authors")
    for i, submitter in enumerate(top_n_submitters):
        # "C<i>" picks the i-th colour of matplotlib's default colour cycle
        node_colour = f"C{i}"
        nx.draw_networkx_nodes(G, pos, nodelist=[submitter], node_size=200, node_color=node_colour, alpha=0.7, label="Submitters")

        # Draw coloured lines to show collaborations with each submitter; only the
        # submitter's own outgoing edges that end at an author node are relevant
        colour_edges = [(u, v) for u, v in G.out_edges(submitter) if G.nodes[v]["node_type"] == "author"]
        nx.draw_networkx_edges(G, pos, edgelist=colour_edges, edge_color=node_colour, arrowsize=5, alpha=0.5)

    plt.title("Collaborations between Authors and Submitters")
    plt.axis("off")

    # Add a label for what each colour represents
    plt.text(0.8, 0.95, "Red: Authors\nColoured Nodes: Submitters\nColoured Lines: Interactions", transform=plt.gca().transAxes, fontsize=12, bbox=dict(boxstyle="round,pad=0.3", edgecolor="black", facecolor="white", alpha=0.7))

    plt.show()


##### Analyse and Visualise Extracted Data

In [None]:
# Adjustable global setting: how many top-ranked users the "top N" plots
# in this notebook display
top_n_users = 5

In [None]:
# Folder that holds the JSON files read by this notebook
data_folder = "collected_data"

# Build the path of every input file first ...
repo_file_path = os.path.join(data_folder, "repo_data.json")
monitoring_file_path = os.path.join(data_folder, "distro_data.json")
testing_file_path = os.path.join(data_folder, "testing_data.json")
community_file_path = os.path.join(data_folder, "community_data.json")

# ... then load the repo, monitoring, testing and community data from them
repo_data = load_data(repo_file_path)
monitoring_data = load_data(monitoring_file_path)
testing_data = load_data(testing_file_path)
community_data = load_data(community_file_path)


In [None]:
# Pull the community fields used for visualisation into their own variables
authors, submitters, author_submitters, author_repo_counts, interactions = (
    community_data[field]
    for field in ('authors', 'submitters', 'author_submitters', 'author_repo_counts', 'interactions')
)


In [None]:
# Count the number of packages with CI tests, PackageInfo tests, both test types and no test types.
# "CI Tests" and "PackageInfo Tests" each include the packages that have both test types.
ci_tests_count = 0
pkg_tests_count = 0
both_tests_count = 0
no_tests_count = 0

for package_info in repo_data:
    package = package_info["repo"]
    # A package without an entry in testing_data is treated like one with an empty entry
    test_info = testing_data.get(package) or {}
    has_ci_test = "ci_file_version" in test_info
    has_pkg_test = "pkginfo_version" in test_info

    if has_ci_test:
        ci_tests_count += 1
    if has_pkg_test:
        pkg_tests_count += 1
    if has_ci_test and has_pkg_test:
        both_tests_count += 1
    # Also covers packages that have an entry but neither version key, which
    # would otherwise not be counted in any category
    if not has_ci_test and not has_pkg_test:
        no_tests_count += 1

categories = ["CI Tests", "PackageInfo Tests", "Both Tests", "No Tests"]
counts = [ci_tests_count, pkg_tests_count, both_tests_count, no_tests_count]

plot_bar_chart(
    data=dict(zip(categories, counts)),
    title='Number of GAP Packages with CI Tests and PackageInfo Tests',
    x_label='Categories',
    y_label='Counts'
)


In [None]:
# Bar chart of how many users are authors, submitters, or both at once
count_contributor_labels = dict(
    zip(
        ("Authors", "Submitters", "Author-Submitters"),
        map(len, (authors, submitters, author_submitters)),
    )
)

plot_bar_chart(
    data=count_contributor_labels,
    title='Number of Authors, Submitters, and Author-Submitters',
    x_label='Contributor Type',
    y_label='Count'
)

In [None]:
# Rank authors by their repo contribution count (highest first) and keep the top N
sorted_contributors = sorted(author_repo_counts.items(), key=lambda pair: pair[1], reverse=True)
top_contributors = dict(sorted_contributors[:top_n_users])

# Swap the raw author names for short labels from create_author_labels
# (only the keys of the dict passed in are used for labelling)
labels_dict = create_author_labels(top_contributors)
top_contributor_labels = [labels_dict[author] for author in top_contributors]
data_with_labels = dict(zip(top_contributor_labels, top_contributors.values()))

plot_bar_chart(
    data=data_with_labels,
    title=f'Top {top_n_users} Authors and Their Repo Contributions',
    x_label='Authors',
    y_label='Repo Contribution Count'
)


In [None]:
# Bar chart of the authors whose interaction entries hold the most issue submitters
top_authors = sorted(interactions, key=lambda name: len(interactions[name]), reverse=True)[:top_n_users]
interactions_count = [len(interactions[author]) for author in top_authors]

# Short labels for the top authors, produced by create_author_labels
labels_dict = create_author_labels(dict.fromkeys(top_authors, 0))
top_author_labels = [labels_dict[author] for author in top_authors]

plot_bar_chart(
    data=dict(zip(top_author_labels, interactions_count)),
    title=f'Top {top_n_users} Authors with Most Issue Submitter Interactions',
    x_label='Authors',
    y_label='Interactions Count'
)

In [None]:
# Create a NetworkX plot to represent interactions between authors and submitters,
# highlighting the top_n_users most active submitters
plot_collaborations(community_data, top_n_users)
