## Community Study Data Retrieval

This Jupyter Notebook aims to provide a deeper understanding of the community behind the GAP packages distributed through GitHub. It studies the members who develop, release and collaborate on GAP packages on GitHub, in order to gather valuable information on their collaboration trends and patterns. In the interest of privacy, contributor usernames are hashed upon extraction; the hash value is then the variable used to compute and generate the statistical data analysis.

In [None]:
# Import required modules and libraries
import os
import sys
import json
import hashlib
from datetime import datetime
from dateutil.relativedelta import relativedelta
from github import Repository, RateLimitExceededException

# Append the parent of the current working directory to sys.path so that project
# modules located one level above this notebook can be imported (presumably this
# is where data_constants, imported below, lives -- confirm against the repo layout).
# NOTE: the path is derived from os.getcwd(), so this depends on the directory the
# notebook kernel was started from.
cwd = os.getcwd()
parent_dir = os.path.dirname(cwd)
sys.path.append(parent_dir)

# Import modules from other project scripts
from data_constants import *


### Studying the community

Several variables related to authors and collaborations can provide valuable input on how the community behind GAP functions, and what dependencies might exist. Further investigating the frequency of contributions, who contributes to what, and where connections are made yields an understanding of who the people behind the GAP packages are, how they collaborate, and what the trends point to.

##### Functions to Retrieve Community Metrics

In [None]:
def hash_username(author_name: str) -> str:
    """Return the SHA-256 hex digest of an author name.

    The name is encoded with the default string encoding (UTF-8) before hashing,
    so the same name always yields the same 64-character hexadecimal string.

    Args:
        author_name (str): The author name to be hashed.

    Returns:
        str: The hexadecimal SHA-256 digest of the author name.
    """
    hasher = hashlib.sha256()
    hasher.update(author_name.encode())
    return hasher.hexdigest()


In [None]:
def get_commits_by_contributor(repo: Repository, contributors: set, inactive_contributors: dict) -> None:
    """Record the contributors whose latest commit to the repository is older than 12 months.

    For every contributor, the most recent commit they authored in ``repo`` is looked up.
    If that commit is older than the threshold (12 months before now), the hashed username
    is stored in ``inactive_contributors`` together with the date of that commit.

    Args:
        repo (Repository): The GitHub repository to get the commits from.
        contributors (set): GitHub user objects representing the contributors.
        inactive_contributors (dict): Updated in place. Maps the hashed username of each
            inactive contributor to their latest contribution date (``DD-MM-YYYY``).

    Raises:
        RateLimitExceededException: Re-raised instead of being logged, so that the caller
            can wait for the GitHub API limit to reset.
    """
    # Local import: the file-level import only brings in ``datetime`` itself
    from datetime import timezone

    # Calculate the date threshold for inactive contributors (timezone-aware, UTC)
    threshold_date = datetime.now(timezone.utc) - relativedelta(months=12)

    for contributor in contributors:
        try:
            # Commits are listed newest first, so the first item is the latest commit.
            # No ``since`` filter is applied: a contributor is inactive precisely when
            # their latest commit lies *before* the threshold.
            latest_commit = next(iter(repo.get_commits(author=contributor)), None)
            if latest_commit is None:
                continue

            # Use the date of the commit itself; ``commit.author.created_at`` would be
            # the creation date of the author's GitHub account.
            commit_date = latest_commit.commit.author.date
            if commit_date is None:
                continue
            if commit_date.tzinfo is None:
                # Naive datetimes (returned by some PyGithub versions) are treated as UTC
                commit_date = commit_date.replace(tzinfo=timezone.utc)

            if commit_date < threshold_date:
                # NOTE: the dict is keyed by contributor only, so when it is shared
                # across several repositories the entry reflects the last repository
                # in which the contributor was found inactive.
                contributor_hash = hash_username(contributor.login)
                inactive_contributors[contributor_hash] = commit_date.strftime("%d-%m-%Y")

        except RateLimitExceededException:
            # Must reach the caller's rate-limit handling rather than being swallowed
            raise
        except Exception as e:
            print(f"Error while processing {repo.name}: {e}")
            continue

In [None]:
def community_contributors(repos: Repository) -> tuple:
    """Collect community metrics (authors, issue submitters and their interactions) for the given repositories.

    All usernames are hashed with ``hash_username`` before being stored.

    Args:
        repos (Repository): Iterable of GitHub repositories.

    Returns:
        tuple: The following items, in this order:
            all_authors (set): Hash values for all users that are authors.
            all_submitters (set): Hash values for all users that are issue submitters.
            author_repo_counts (dict): Number of repositories an author contributed to.
            authors_submitters (set): Hash values for users who are both authors and submitters.
            inactive_contributors (dict): Inactive contributors and their latest contribution date.
            authors_contributed_together (dict): For each author, the sorted list of issue
                submitters (other than the author) that interacted with their repos.
            first_commit_by_author (dict): The earliest commit date (``DD-MM-YYYY``) found for
                each contributor across the repositories, or ``"No commits"`` if none was found.

    Raises:
        RateLimitExceededException: Re-raised instead of being logged, so that the caller
            can wait for the GitHub API limit to reset.
    """
    all_authors = set()
    all_submitters = set()
    author_repo_counts = {}
    inactive_contributors = {}
    # Working structures, converted to their JSON-friendly form before returning
    submitters_by_author = {}     # contributor hash -> set of submitter hashes
    earliest_commit_date = {}     # contributor hash -> datetime, or None when no commit was found

    for repo in repos:
        contributors = repo.get_contributors()

        # Hash every contributor once per repository and reuse the result below
        hashed_contributors = [
            (contributor, hash_username(contributor.login)) for contributor in contributors
        ]

        # Get all authors and their contribution count
        for _, contributor_hash in hashed_contributors:
            all_authors.add(contributor_hash)
            author_repo_counts[contributor_hash] = author_repo_counts.get(contributor_hash, 0) + 1

        # Get inactive contributors based on the given threshold
        get_commits_by_contributor(repo, contributors, inactive_contributors)

        # Get the first commit date for each contributor to the repo
        for contributor, contributor_hash in hashed_contributors:
            try:
                # Commits are listed newest first; the reversed list starts at the oldest one
                commits = repo.get_commits(author=contributor)
                oldest_commit = next(iter(commits.reversed), None)
                # Date of the commit itself, not the creation date of the author's account
                commit_date = None if oldest_commit is None else oldest_commit.commit.author.date
            except RateLimitExceededException:
                # Must reach the caller's rate-limit handling rather than being swallowed
                raise
            except Exception as e:
                print(f"Error while getting first commit for {contributor.login} in {repo.name}: {e}")
                continue

            if commit_date is None:
                # Keep a date found in another repository, if any
                earliest_commit_date.setdefault(contributor_hash, None)
            else:
                known_date = earliest_commit_date.get(contributor_hash)
                if known_date is None or commit_date < known_date:
                    earliest_commit_date[contributor_hash] = commit_date

        # Get all submitters for the repo (issues without a user are skipped)
        submitters_in_repo = {
            hash_username(issue.user.login)
            for issue in repo.get_issues(state="all")
            if issue.user is not None
        }
        all_submitters.update(submitters_in_repo)

        # Get all interactions: every submitter of the repo, except the author themselves
        for _, contributor_hash in hashed_contributors:
            other_submitters = submitters_in_repo - {contributor_hash}
            if other_submitters:
                submitters_by_author.setdefault(contributor_hash, set()).update(other_submitters)

    # Get all users that are both authors and submitters
    authors_submitters = all_submitters & all_authors

    # Lists (sorted for a reproducible export) so the result can be serialised to JSON
    authors_contributed_together = {
        author: sorted(submitters) for author, submitters in submitters_by_author.items()
    }
    first_commit_by_author = {
        author: "No commits" if commit_date is None else commit_date.strftime("%d-%m-%Y")
        for author, commit_date in earliest_commit_date.items()
    }

    return all_authors, all_submitters, author_repo_counts, authors_submitters, inactive_contributors, authors_contributed_together, first_commit_by_author

In [None]:
def export_community_data() -> None:
    """Collect the community data and export it to ``collected_data/community_data.json``.

    The collection is retried in a loop: when the GitHub API rate limit is exceeded and
    fewer than 100 requests remain, the program waits via ``wait_until_reset`` before
    starting the collection again from the beginning.

    ``g``, ``ORG_NAME_PACKAGES`` and ``wait_until_reset`` are not defined in this
    notebook; presumably they are provided by the ``data_constants`` star import
    (TODO confirm).

    Args:
        None.

    Returns:
        None.
    """
    output_dir = "collected_data"
    file_path = os.path.join(output_dir, "community_data.json")

    while True:
        try:
            # Define organisation and repos
            org = g.get_organization(ORG_NAME_PACKAGES)
            repos = org.get_repos(type="public")

            # Get the data by calling the function and store it appropriately
            (
                all_authors,
                all_submitters,
                author_repo_counts,
                author_submitters,
                inactive_contributors,
                authors_contributed_together,
                first_commit_by_author,
            ) = community_contributors(repos)
            data = {
                'authors': list(all_authors),
                'submitters': list(all_submitters),
                'author_repo_counts': author_repo_counts,
                'author_submitters': list(author_submitters),
                'inactive_contributors': inactive_contributors,
                'interactions': authors_contributed_together,
                'first_commit_by_author': first_commit_by_author,
            }

            # Create the output folder if needed, so that the collected data is not
            # lost to a FileNotFoundError after all API calls have been made
            os.makedirs(output_dir, exist_ok=True)

            with open(file_path, "w", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False, indent=4)

            print("Community data has been exported to the 'community_data.json' file in the 'collected_data' folder.")
            break

        except RateLimitExceededException:
            remaining_requests, _ = g.rate_limiting
            reset_time = g.rate_limiting_resettime
            # NOTE(review): when 100 or more requests remain, the loop retries
            # immediately without waiting -- confirm this is intended.
            if remaining_requests < 100:
                wait_until_reset(reset_time)

##### Get and Export Community Metrics

In [None]:
# Run the export: collects the community metrics through the GitHub API and
# writes them to the 'community_data.json' file in the 'collected_data' folder
export_community_data()
