## GAP Data Analytics, Package Actions

This Jupyter Notebook investigates tests, actions and workflows for the GAP packages hosted on GitHub.

In [None]:
# Import required libraries and packages
import os
import sys
import re
import requests
import json
from collections import Counter

# Get current working directory and append parent directory for module imports
# NOTE(review): this puts the parent of the current working directory on sys.path
# before `data_constants` is imported below; presumably that module lives in the
# parent directory and the notebook is started from a sub-directory — confirm.
cwd = os.getcwd()
parent_dir = os.path.dirname(cwd)
sys.path.append(parent_dir)

# Import modules from other project scripts
from data_constants import *

In [None]:
# Define repositories that are public for gap-packages organisation on GitHub
# NOTE(review): `g` and `ORG_NAME_PACKAGES` are not defined in this notebook; presumably
# both come from the star import of `data_constants` (a GitHub client object and the
# organisation name) — confirm there.
org = g.get_organization(ORG_NAME_PACKAGES)
# Only repositories of type "public" are requested; the cells below iterate over `repos`.
repos = org.get_repos(type="public")

In [None]:
# Function to check for a test directory and if it is empty
def check_test_directory(repo):
    """Report whether *repo* has a top-level ``tst`` directory and whether it is empty.

    Args:
        repo: Object exposing ``get_contents(path)``; the entries it returns
            must provide ``type``, ``name`` and ``path`` attributes.

    Returns:
        A ``(test_directory_exists, test_directory_empty)`` tuple of booleans.
        The second flag can only be ``True`` when the first one is.
    """
    # Look through the repository root for the first directory entry called "tst"
    root_entries = repo.get_contents("")
    tst_dir = next(
        (entry for entry in root_entries if entry.type == "dir" and entry.name == "tst"),
        None,
    )

    # No tst directory at all: nothing exists, so nothing can be empty
    if tst_dir is None:
        return False, False

    # The directory exists; it counts as empty when its listing has no entries
    tst_listing = repo.get_contents(tst_dir.path)
    return True, len(tst_listing) == 0

# Check every repository exactly once and keep the (exists, empty) result.
# Each check_test_directory call issues GitHub API requests, so the results are
# reused for both the total and the empty-directory notices below instead of
# querying every repository a second time.
test_directory_status = [check_test_directory(repo) for repo in repos]

# Counter to keep track of repositories with test directories
repositories_with_tests = sum(1 for test_exists, _ in test_directory_status if test_exists)

# Print the total number of repositories with test directories
print(f"Total repositories with test directories: {repositories_with_tests}")

# Notify user if any of the test directories are empty (one line per empty directory)
for test_exists, test_empty in test_directory_status:
    if test_exists and test_empty:
        print("Test Directory Empty")


In [None]:
# Iterate over repositories again and find tst directories with .tst files
for repo in repos:
    test_exists, test_empty = check_test_directory(repo)

    # Skip repositories without a tst directory or with an empty one
    if not test_exists or test_empty:
        continue

    # List the tst directory and count the plain files whose name ends in ".tst"
    contents = repo.get_contents("tst")
    tst_file_count = sum(
        1 for item in contents if item.type == "file" and item.name.endswith(".tst")
    )

    # Only repositories that actually contain .tst files are reported
    if tst_file_count:
        print(f"Repository: {repo.name}")
        print(f"Number of .tst files in tst directory: {tst_file_count}")

In [None]:
# Get test information based on what is listed in the CI.yml files
#
# Results collected by this cell:
#   count_repos_with_ci_file - number of repositories that have a ci.yml workflow (any letter case)
#   version_info             - repository name -> list of "X.Y" strings captured from "stable-X.Y"
#   packages_without_tests   - repositories whose ci.yml mentions no "stable-X.Y" at all
count_repos_with_ci_file = 0
version_info = {}
packages_without_tests = []

# Compiled once instead of on every iteration; captures the "X.Y" part of "stable-X.Y"
pattern = re.compile(r"stable-(\d+\.\d+)")

# Upper bound in seconds for downloading one workflow file, so that a stalled
# connection cannot hang the whole cell
CI_DOWNLOAD_TIMEOUT = 30

for repo in repos:
    repo_name = repo.name

    try:
        contents = repo.get_contents("")
        has_github_dir = any(content.name == ".github" and content.type == "dir" for content in contents)
        if not has_github_dir:
            continue

        # A .github directory does not imply a workflows directory. Requesting a
        # missing path makes the API call fail, which would be reported below as
        # an error although the repository simply has no workflows.
        github_dir_contents = repo.get_contents(".github")
        has_workflows = any(
            content.name == "workflows" and content.type == "dir" for content in github_dir_contents
        )
        if not has_workflows:
            continue

        workflows_contents = repo.get_contents(".github/workflows")
        if not isinstance(workflows_contents, list):
            continue

        # Single scan: first workflow file named ci.yml (case-insensitive), or None
        ci_file = next((file for file in workflows_contents if file.name.lower() == "ci.yml"), None)
        if ci_file is None:
            continue
        count_repos_with_ci_file += 1

        # Fail loudly on HTTP errors; otherwise an error page would be scanned
        # and the package wrongly listed as having no test data.
        response = requests.get(ci_file.download_url, timeout=CI_DOWNLOAD_TIMEOUT)
        response.raise_for_status()
        ci_file_contents = response.text

        matches = pattern.findall(ci_file_contents)
        if matches:
            version_info[repo_name] = matches
        else:
            packages_without_tests.append(repo_name)

    except Exception as e:
        # Best effort: report the failing repository and carry on with the next one
        print(f"Error occurred while analyzing repository '{repo_name}': {str(e)}")

print(f"Number of repositories with 'CI.yml' file: {count_repos_with_ci_file}")

# if version_info:
#     print("Versions each package was tested on in the 'CI.yml' files:")
#     for package, versions in version_info.items():
#         versions_str = ", ".join(versions)
#         print(f"Package: {package} | Versions: {versions_str}")
# else:
#     print("No version info found in 'CI.yml' files.")

num_packages_without_tests = len(packages_without_tests)
print(f"Number of packages without any test data in their 'CI.yml' files: {num_packages_without_tests}")
if num_packages_without_tests > 0:
    print("Packages without any test data in their 'CI.yml' files:")
    for package in packages_without_tests:
        print(f"Package: {package}")


In [None]:
# See what versions of GAP the packages are most commonly tested on in CI.yml files
# Group the packages by version in a single pass over version_info.
# A CI.yml file may name the same "stable-X.Y" more than once; dict.fromkeys
# removes those repeats (keeping first-seen order) so that each package is
# counted once per version and "Count" always equals the number of packages listed.
packages_by_version = {}
for package, versions in version_info.items():
    for version in dict.fromkeys(versions):
        packages_by_version.setdefault(version, []).append(package)

# Count the frequency of each version (number of packages testing on it)
version_counts = Counter({version: len(packages) for version, packages in packages_by_version.items()})

# Sort versions based on their frequency in descending order
sorted_versions = sorted(version_counts.items(), key=lambda x: x[1], reverse=True)

for version, count in sorted_versions:
    packages = packages_by_version[version]
    package_list = ", ".join(packages)
    print(f"Version: {version} | Count: {count} | Packages: {package_list}")

In [None]:
# Get test information based on what is listed in the PackageInfo.g files
# Confirm that all repositories with a PackageInfo.g file also have dependencies, as this is where test info would be
count_repos_with_packageinfo = 0
count_packages_with_dependencies = 0

# List of (repository name, GAP version) tuples taken from the PackageInfo.g files
gap_versions = []

# Defined once instead of on every iteration. Matches `GAP := "<text><version>"`
# and captures the trailing run of digits and dots before the closing quote.
version_pattern = r'GAP\s+:=\s+"[^"]*?([\d.]+)"'

for repo in repos:
    repo_name = repo.name

    try:
        # Check if the repository has a PackageInfo.g file (name compared case-insensitively)
        contents = repo.get_contents("", ref="HEAD")
        packageinfo_file = next((file for file in contents if file.name.lower() == "packageinfo.g"), None)
        if packageinfo_file is None:
            continue
        count_repos_with_packageinfo += 1

        # Read the contents of the PackageInfo.g file. Undecodable bytes are
        # replaced rather than raising, so one stray non-UTF-8 character does
        # not drop the whole package from the statistics.
        packageinfo_content = packageinfo_file.decoded_content.decode("utf-8", errors="replace")

        # Check if the PackageInfo.g file contains the "Dependencies" section
        if "Dependencies" in packageinfo_content:
            count_packages_with_dependencies += 1

        # Extract the GAP version from the PackageInfo.g file (first match only)
        version_match = re.search(version_pattern, packageinfo_content)
        if version_match:
            gap_version = version_match.group(1)
            gap_versions.append((repo_name, gap_version))

    except Exception as e:
        # Best effort, but never silent: report the failing repository in the
        # same way as the CI.yml cell and continue with the next one
        print(f"Error occurred while analyzing repository '{repo_name}': {str(e)}")

print(f"Number of repositories with 'PackageInfo.g' file: {count_repos_with_packageinfo}")
print(f"Number of packages with 'Dependencies' section: {count_packages_with_dependencies}")

# if gap_versions:
#     print("Tested GAP Versions:")
#     for package, version in gap_versions:
#         print(f"Package: {package} | Tested GAP Version: {version}")
# else:
#     print("No Tested GAP Versions found in 'PackageInfo.g' files.")

In [None]:
# Create a dictionary to hold the version testing data
# Shape: package name -> {"CI_Version": [...], "PackageInfo_Version": [...]}
version_testing_data = {}

# Add version info from CI.yml files to the dictionary
for package, versions in version_info.items():
    version_testing_data.setdefault(package, {"CI_Version": [], "PackageInfo_Version": []})
    version_testing_data[package]["CI_Version"].extend(versions)

# Add GAP version info from PackageInfo.g files to the dictionary
for package, version in gap_versions:
    version_testing_data.setdefault(package, {"CI_Version": [], "PackageInfo_Version": []})
    version_testing_data[package]["PackageInfo_Version"].append(version)

# Define the path for the JSON file (relative to the current working directory)
json_path = os.path.join("collected_data", "version_testing.json")

# Make sure the output directory exists; without this a fresh checkout fails
# with FileNotFoundError after all the data has already been collected
os.makedirs(os.path.dirname(json_path), exist_ok=True)

# Write the data to the JSON file with an explicit encoding so the result does
# not depend on the platform default
with open(json_path, "w", encoding="utf-8") as json_file:
    json.dump(version_testing_data, json_file, indent=4)

print(f"Version testing data exported to: {json_path}")
