## GAP Data Analytics, Community Study

This Jupyter Notebook is intended to provide a deeper understanding of the community behind GAP as distributed through GitHub. It studies the members who develop, release and collaborate on GAP packages on GitHub, in order to gather valuable information on their collaboration trends and patterns. In the interest of privacy, contributor usernames are hashed upon extraction, and only the resulting hash values are used to compute and generate the statistical data analysis.

In [None]:
# Import required modules and libraries
import os
import sys
import json
import hashlib
from github import Repository

# Get current working directory and make its parent directory importable
# (presumably where the project scripts imported below live — confirm)
cwd = os.getcwd()
parent_dir = os.path.dirname(cwd)
# Guard against duplicates: re-running this cell would otherwise append the
# same path to sys.path again on every execution
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Import modules from other project scripts
from data_constants import *


### Studying the community

Several variables related to authors and collaborations can provide valuable input on how the community behind GAP functions and what dependencies might exist. Further investigating the frequency of contributions, who contributes to what, and where connections are made yields an understanding of who the people behind the GAP packages are, how they collaborate and what the trends point to.

In [None]:
# Define global variables for the Jupyter Notebook
# NOTE(review): `g` and `ORG_NAME_PACKAGES` are not defined in this notebook;
# presumably they are provided by the `data_constants` star import — confirm.
# Look up the organization by name
org = g.get_organization(ORG_NAME_PACKAGES)
# Request only the organization's public repositories; `repos` is what the
# metrics cell passes to community_contributors()
repos = org.get_repos(type="public")


##### Functions to Retrieve Community Metrics

In [None]:
def hash_username(author_name: str) -> str:
    """Return the SHA-256 hex digest of an author name.

    The name is encoded with the default string encoding (UTF-8) before it
    is fed to the hash function.

    Args:
        author_name (str): The author name to be hashed.

    Returns:
        str: The hexadecimal SHA-256 digest of the author name.
    """
    hasher = hashlib.sha256()
    hasher.update(author_name.encode())
    return hasher.hexdigest()


In [None]:
def community_contributors(repos: Repository) -> tuple:
    """Collect hashed author and issue-submitter statistics across repositories.

    For every repository, the contributors ("authors") and the users who
    opened issues ("submitters") are retrieved. Usernames are passed through
    ``hash_username`` immediately, so only hash values are stored or returned.

    Args:
        repos (Repository): Iterable of GitHub repository objects. Each one
            must provide ``get_contributors()`` and ``get_issues(state=...)``.

    Returns:
        tuple: Five elements, in this order:
            1. a set of hash values for all users that are authors,
            2. a set of hash values for all users that submitted an issue,
            3. a dict mapping each author hash to the number of
               repositories that author contributed to,
            4. a set of hash values for users who are both authors and
               submitters,
            5. a dict mapping each author hash to a list of the hashes of
               submitters (other than the author) who opened issues in a
               repository the author contributed to.
    """
    all_authors = set()
    all_submitters = set()
    author_repo_counts = {}
    authors_contributed_together = {}

    for repo in repos:
        # Hash every contributor exactly once per repo; the hashes are reused
        # below instead of re-hashing each login for every submitter
        contributor_hashes = [
            hash_username(contributor.login)
            for contributor in repo.get_contributors()
        ]
        for contributor_hash in contributor_hashes:
            # set.add is idempotent, so no membership check is needed
            all_authors.add(contributor_hash)
            author_repo_counts[contributor_hash] = author_repo_counts.get(contributor_hash, 0) + 1

        # Keep track of submitters so that each submitter is only counted once per repo
        # NOTE(review): GitHub's issues endpoint also returns pull requests,
        # so PR authors are presumably counted as submitters — confirm this
        # is intended.
        submitters_in_repo = {
            hash_username(issue.user.login)
            for issue in repo.get_issues(state="all")
        }
        all_submitters.update(submitters_in_repo)

        # Record, per author, which other users submitted issues to a repo
        # the author contributed to
        for submitter in submitters_in_repo:
            for contributor_hash in contributor_hashes:
                if submitter == contributor_hash:
                    # A user does not "interact" with themselves
                    continue
                # Lists (not sets) are kept so the result stays JSON-serialisable
                interactions = authors_contributed_together.setdefault(contributor_hash, [])
                if submitter not in interactions:
                    interactions.append(submitter)

    authors_submitters = all_submitters.intersection(all_authors)

    return all_authors, all_submitters, author_repo_counts, authors_submitters, authors_contributed_together


##### Get and Display Community Metrics

In [None]:
# Run the collection and bind each element of the returned tuple to its own name
(
    all_authors,
    all_submitters,
    author_repo_counts,
    author_submitters,
    authors_contributed_together,
) = community_contributors(repos)

# Report the headline totals
print(f"Total number of authors for all GAP packages: {len(all_authors)}")
print(f"Total number of submitters for all GAP packages: {len(all_submitters)}")
print(f"Total number of authors who were also submitters for all GAP packages: {len(author_submitters)}")

# List every author, ordered from most to fewest repositories contributed to
sorted_contributors = list(author_repo_counts.items())
sorted_contributors.sort(key=lambda pair: pair[1], reverse=True)
for value, count in sorted_contributors:
    print(f"Author Hash Value: {value}\tRepo Contribution Count: {count}")


In [None]:
# Export collected data to JSON file to store them for later use and better overview
data_folder = "collected_data"

# Sets are not JSON-serialisable, so they are converted to lists; sorting
# them makes the exported file reproducible between runs
data = {
    'authors': sorted(all_authors),
    'submitters': sorted(all_submitters),
    'author_repo_counts': author_repo_counts,
    'author_submitters': sorted(author_submitters),
    'interactions': authors_contributed_together
}

# Make sure the data folder exists; open() would otherwise raise
# FileNotFoundError when the folder has not been created yet
os.makedirs(data_folder, exist_ok=True)

# Create a file path for the JSON file inside the data folder
file_path = os.path.join(data_folder, "community_data.json")

# Write the data to the JSON file
with open(file_path, "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

print("Community data has been exported to the 'community_data.json' file in the 'collected_data' folder.")
