## GAP Data Analytics, Data Visualisation

This Jupyter Notebook is for visualising core components of the data generated from the previous notebooks, to point out key attributes and relationships in the data. This could be helpful in assisting redistribution managers in better understanding what trends, dependencies and relationships exist, which could be useful for prioritising and decision-making in the redistribution process.

In [None]:
# Import required modules and libraries
import os
import sys
import json
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from collections import Counter
import seaborn as sns

# Get current working directory and append parent directory for module imports
cwd = os.getcwd()
parent_dir = os.path.dirname(cwd)
sys.path.append(parent_dir)

# Import modules from other project scripts
from data_constants import *


##### Functions to Visualise Extracted Data

In [None]:
def load_data(file_path: str) -> dict:
    """Parse the JSON document stored at ``file_path``.

    The file is opened as UTF-8 text and decoded with ``json.load``.

    Args:
        file_path (str): Location of the JSON file to read.

    Returns:
        dict: The decoded JSON content.
    """
    with open(file_path, encoding="utf-8") as json_file:
        return json.load(json_file)

In [None]:
def _alphabetic_suffix(index: int) -> str:
    """Convert a zero-based index to a spreadsheet-style letter sequence
    (0 -> "a", 25 -> "z", 26 -> "aa", 27 -> "ab", ...).
    """
    letters = []
    remaining = index + 1  # work 1-based: the sequence has no "zero" letter
    while remaining > 0:
        remaining, offset = divmod(remaining - 1, 26)
        letters.append(chr(ord("a") + offset))
    # Letters were produced least-significant first
    return "".join(reversed(letters))


def create_author_labels(data_dict: dict) -> dict:
    """Takes a dictionary and returns a new dictionary that maps each key to a label like "User x",
    where x corresponds to a letter in alphabetic order (e.g., "User a" for the first key,
    "User b" for the second key and so on). This is to avoid displaying long keys when
    plotting data, as it makes it less readable.

    When there are more than 26 keys, the lettering continues with two-letter
    suffixes ("User aa", "User ab", ...) instead of running past "z" into
    non-alphabetic characters.

    Args:
        data_dict (dict): Input dict for which labels need to be created.
            Only its keys (in iteration order) are used.

    Returns:
        dict: New dict mapping each key to a label like "User x".
    """
    return {
        key: f"User {_alphabetic_suffix(idx)}"
        for idx, key in enumerate(data_dict)
    }


In [None]:
def plot_bar_chart(data: dict, title: str, x_label: str, y_label: str) -> None:
    """Draw and show a light-blue bar chart, one bar per entry in ``data``.

    Args:
        data (dict): Mapping of category (placed on the x-axis) to the
            value that sets the height of that category's bar.
        title (str): Text shown as the plot title.
        x_label (str): Text shown under the x-axis.
        y_label (str): Text shown beside the y-axis.

    Returns:
        None: The chart is displayed via ``plt.show()``; nothing is returned.
    """
    categories = data.keys()
    heights = data.values()

    plt.figure(figsize=(10, 6))
    plt.bar(categories, heights, color='lightblue')

    # Titles and axis captions
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)

    # Slant the category names so they do not overlap
    plt.xticks(rotation=45, ha='right')

    plt.tight_layout()
    plt.show()


##### Analyse and Visualise Extracted Data

In [None]:
# Locate the collected community data and read it in
data_folder = "collected_data"
file_path = os.path.join(data_folder, "community_data.json")
data = load_data(file_path)

# Pull out the sections of the loaded data that the plots below rely on
authors, submitters, author_submitters, author_repo_counts, interactions = (
    data[section]
    for section in (
        'authors',
        'submitters',
        'author_submitters',
        'author_repo_counts',
        'interactions',
    )
)

# Adjustable setting: how many users the "top N" charts display
top_n_users = 10

In [None]:
# Bar chart comparing the sizes of the three contributor groups:
# authors, submitters, and users who are both
count_contributor_labels = {
    group_name: len(members)
    for group_name, members in (
        ("Authors", authors),
        ("Submitters", submitters),
        ("Author-Submitters", author_submitters),
    )
}

plot_bar_chart(
    count_contributor_labels,
    'Number of Authors, Submitters, and Author-Submitters',
    'Contributor Type',
    'Count',
)

In [None]:
# Rank authors by repo contribution count (highest first) and keep the top N
sorted_contributors = sorted(
    author_repo_counts.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
top_contributors = dict(sorted_contributors[:top_n_users])

# Replace the raw author keys with short "User x" labels for a readable x-axis
labels_dict = create_author_labels(dict.fromkeys(top_contributors, 0))
top_contributor_labels = [labels_dict[name] for name in top_contributors]
data_with_labels = dict(zip(top_contributor_labels, top_contributors.values()))

plot_bar_chart(
    data_with_labels,
    f'Top {top_n_users} Authors and Their Repo Contributions',
    'Authors',
    'Repo Contribution Count',
)

In [None]:
# Bar chart of the authors with the most entries in `interactions`
# (i.e. the most other users submitting issues to their repos), limited to the top N
top_authors = sorted(
    interactions,
    key=lambda name: len(interactions[name]),
    reverse=True,
)[:top_n_users]
interactions_count = [len(interactions[name]) for name in top_authors]

# Replace the raw author keys with short "User x" labels for a readable x-axis
labels_dict = create_author_labels(dict.fromkeys(top_authors, 0))
top_author_labels = [labels_dict[name] for name in top_authors]

plot_bar_chart(
    dict(zip(top_author_labels, interactions_count)),
    f'Top {top_n_users} Authors with Most Issue Submitter Interactions',
    'Authors',
    'Interactions Count',
)