## GAP Data Analytics, Package Actions

This Jupyter Notebook investigates tests, actions and workflows for the GAP packages hosted on GitHub.

In [None]:
# Import required libraries and packages
import os
import sys
import re
import requests
import json

# Get current working directory and append parent directory for module imports
cwd = os.getcwd()
parent_dir = os.path.dirname(cwd)
sys.path.append(parent_dir)

# Import modules from other project scripts
from data_constants import *

In [None]:
# Define repositories that are public for gap-packages organisation on GitHub
# NOTE(review): `g` and `ORG_NAME_PACKAGES` are not defined in this notebook;
# presumably both are provided by the `data_constants` star import -- confirm.
org = g.get_organization(ORG_NAME_PACKAGES)
# `repos` is the collection every analysis function below iterates over
repos = org.get_repos(type="public")

In [None]:
# Define function to check for a test directory and if it's empty
def check_for_tst_dir(repo):
    """Report whether *repo* has a root-level ``tst`` directory and if it is empty.

    Args:
        repo: Object exposing ``get_contents(path)``; the returned entries
            must provide ``type``, ``name`` and ``path`` attributes.

    Returns:
        A ``(tst_dir_exists, tst_dir_empty, repositories_with_tests)`` tuple.
        The last element is ``1`` when a ``tst`` directory was found and
        ``0`` otherwise.
    """
    root_entries = repo.get_contents("")  # Root directory listing of the repository

    # Only the first root-level directory literally named "tst" is considered
    tst_entry = next(
        (
            entry
            for entry in root_entries
            if entry.type == "dir" and entry.name == "tst"
        ),
        None,
    )
    if tst_entry is None:
        return False, False, 0

    is_empty = len(repo.get_contents(tst_entry.path)) == 0
    return True, is_empty, 1

In [None]:
# Recursive function to count tst files in tst directory and subdirectories
def _count_remote_lines(url, timeout=30):
    """Return the number of lines of the text at *url*, or 0 if it cannot be fetched."""
    try:
        # Without a timeout a stalled connection would block the scan forever
        response = requests.get(url, timeout=timeout)
        # Do not count the lines of an HTTP error page as test lines
        response.raise_for_status()
    except requests.RequestException as error:
        print(f"Could not download '{url}': {error}")
        return 0
    return len(response.text.splitlines())


def process_tst_directory(repo, directory_path, tst_file_info):
    """Recursively count ``.tst`` files and their lines below *directory_path*.

    Args:
        repo: Object exposing ``get_contents(path)``; the returned entries
            must provide ``type``, ``name`` and ``download_url`` attributes.
        directory_path: Repository-relative path of the directory to scan.
        tst_file_info: Not read or modified here; it is only forwarded to the
            recursive calls and kept so existing callers keep working.

    Returns:
        A ``(num_tst_files, total_lines)`` tuple covering this directory and
        all of its subdirectories. A ``.tst`` file whose download fails is
        still counted as a file but contributes no lines.
    """
    num_tst_files = 0
    total_lines = 0

    for item in repo.get_contents(directory_path):
        if item.type == "file" and item.name.endswith(".tst"):
            num_tst_files += 1
            total_lines += _count_remote_lines(item.download_url)
        elif item.type == "dir":
            subdirectory_path = f"{directory_path}/{item.name}"
            subdir_num_tst_files, subdir_total_lines = process_tst_directory(
                repo, subdirectory_path, tst_file_info
            )
            num_tst_files += subdir_num_tst_files
            total_lines += subdir_total_lines

    return num_tst_files, total_lines

In [None]:
# Define function to analyse the contents of the tst directories and count tst directories with .tst files
def analyse_tst_files(repos):
    """Collect ``.tst`` file statistics for every repository in *repos*.

    Args:
        repos: Iterable of repository objects exposing ``name`` and whatever
            ``check_for_tst_dir`` and ``process_tst_directory`` require.

    Returns:
        A ``(tst_dirs_with_files, total_test_files, tst_files_info)`` tuple:
        the number of repositories whose ``tst`` directory holds at least one
        ``.tst`` file, the number of ``.tst`` files over all repositories, and
        a list with one ``{"repository", "num_tst_files", "total_lines"}``
        dict per repository that has ``.tst`` files.
    """
    tst_dirs_with_files = 0
    total_test_files = 0
    tst_files_info = []

    for repo in repos:
        repo_name = repo.name
        try:
            test_exists, tst_dir_empty, _ = check_for_tst_dir(repo)
            if not test_exists or tst_dir_empty:
                continue

            tst_file_info = {
                "repository": repo_name,
                "num_tst_files": 0,
                "total_lines": 0,
            }
            num_tst_files, lines = process_tst_directory(repo, "tst", tst_file_info)
        except Exception as e:
            # Same per-repository handling as ci_version_testing: one failing
            # repository must not abort the whole scan.
            print(f"Error occurred while analyzing repository '{repo_name}': {str(e)}")
            continue

        total_test_files += num_tst_files
        if num_tst_files > 0:
            tst_dirs_with_files += 1
            tst_file_info["num_tst_files"] = num_tst_files
            tst_file_info["total_lines"] = lines
            tst_files_info.append(tst_file_info)

    return tst_dirs_with_files, total_test_files, tst_files_info

In [None]:
# Define function to retrieve version information from CI.yml files
def ci_version_testing(repos):
    """Extract the ``stable-x.y`` versions referenced in each repository's CI.yml.

    Args:
        repos: Iterable of repository objects exposing ``name`` and
            ``get_contents(path)``.

    Returns:
        A ``(repos_with_ci_file, ci_tested_version, repos_without_ci_tests)``
        tuple: the number of repositories with a ``.github/workflows/ci.yml``
        (matched case-insensitively), a dict mapping repository name to the
        list of version strings found in that file, and the names of the
        repositories whose CI file contains no such version.
    """
    ci_file_count = 0
    versions_by_repo = {}
    ci_without_versions = []

    for repo in repos:
        repo_name = repo.name
        try:
            root_entries = repo.get_contents("")
            has_github_dir = any(
                entry.name == ".github" and entry.type == "dir"
                for entry in root_entries
            )
            if not has_github_dir:
                continue

            workflow_entries = repo.get_contents(".github/workflows")
            # Anything other than a directory listing is ignored
            if not isinstance(workflow_entries, list):
                continue

            ci_file = next(
                (entry for entry in workflow_entries if entry.name.lower() == "ci.yml"),
                None,
            )
            if ci_file is None:
                continue

            ci_file_count += 1
            ci_text = requests.get(ci_file.download_url).text
            stable_versions = re.findall(r"stable-(\d+\.\d+)", ci_text)
            if stable_versions:
                versions_by_repo[repo_name] = stable_versions
            else:
                ci_without_versions.append(repo_name)
        except Exception as e:
            print(f"Error occurred while analyzing repository '{repo_name}': {str(e)}")

    return ci_file_count, versions_by_repo, ci_without_versions


In [None]:
# Define function to retrieve GAP version information from PackageInfo.g files
def pkginfo_version_testing(repos):
    """Extract the GAP version named in each repository's PackageInfo.g file.

    Args:
        repos: Iterable of repository objects exposing ``name`` and
            ``get_contents(path, ref=...)``; the returned entries must provide
            ``name`` and ``decoded_content`` (bytes).

    Returns:
        A ``(repos_with_pkginfo_file, repos_with_dependencies,
        pkg_tested_version)`` tuple: the number of repositories with a
        root-level ``PackageInfo.g`` (matched case-insensitively), how many of
        those files contain the text ``Dependencies``, and a list of
        ``(repository name, GAP version)`` pairs for the files in which a
        ``GAP := "..."`` entry was found.
    """
    # Captures the trailing version number of e.g.  GAP := ">= 4.10"
    version_pattern = re.compile(r'GAP\s+:=\s+"[^"]*?([\d.]+)"')

    repos_with_pkginfo_file = 0
    repos_with_dependencies = 0
    pkg_tested_version = []

    for repo in repos:
        repo_name = repo.name
        try:
            contents = repo.get_contents("", ref="HEAD")
            pkginfo_file = next(
                (file for file in contents if file.name.lower() == "packageinfo.g"),
                None,
            )
            if pkginfo_file is None:
                continue

            repos_with_pkginfo_file += 1
            # A stray non-UTF-8 byte (e.g. in an author name) must not discard
            # the whole file; the patterns searched for are plain ASCII.
            pkginfo_content = pkginfo_file.decoded_content.decode(
                "utf-8", errors="replace"
            )
            if "Dependencies" in pkginfo_content:
                repos_with_dependencies += 1

            version_match = version_pattern.search(pkginfo_content)
            if version_match:
                pkg_tested_version.append((repo_name, version_match.group(1)))
        except Exception as e:
            # Report the failure instead of hiding it, matching the handling
            # in ci_version_testing; the remaining repositories are still
            # processed.
            print(f"Error occurred while analyzing repository '{repo_name}': {str(e)}")

    return repos_with_pkginfo_file, repos_with_dependencies, pkg_tested_version

In [None]:
# Count the .tst files in the repositories' tst directories and print the totals;
# tst_files_info keeps the per-repository numbers for the JSON export
tst_dirs_with_files, total_test_files, tst_files_info = analyse_tst_files(repos)
print(f"Repositories with test directories containing files: {tst_dirs_with_files}")
print(f"Total number of test files for all packages: {total_test_files}")

# Retrieve CI.yml file information
repos_with_ci_file, ci_tested_version, repos_without_ci_tests = ci_version_testing(repos)
print(f"Number of repositories with CI.yml file: {repos_with_ci_file}")
num_packages_without_tests = len(repos_without_ci_tests)
# Only list the packages when at least one CI.yml had no 'stable-x.y' version in it
if num_packages_without_tests > 0:
    print(f"Packages without any test data in their CI.yml files {repos_without_ci_tests}")

# Retrieve GAP version information from PackageInfo.g files
repos_with_pkginfo_file, repos_with_dependencies, pkg_tested_version = pkginfo_version_testing(repos)
print(f"Number of repositories with 'PackageInfo.g' file: {repos_with_pkginfo_file}")
print(f"Number of packages with 'Dependencies' section: {repos_with_dependencies}")

In [None]:
# Export collected data to JSON file to store them for later use and better overview
# Folder (relative to the current working directory) that receives the JSON file
data_folder = "collected_data"

# Create a dictionary to hold the version testing data
version_testing_data = {}

# Add all repositories as keys so packages without any findings still appear
for repo in repos:
    package = repo.name
    version_testing_data[package] = {}

# Add the .tst file statistics to the dictionary
for tst_file_info in tst_files_info:
    package = tst_file_info["repository"]
    if "num_tst_files" in tst_file_info and "total_lines" in tst_file_info:
        # setdefault avoids a KeyError should a package be missing from the
        # keys created above
        package_data = version_testing_data.setdefault(package, {})
        package_data["tst_file_count"] = tst_file_info["num_tst_files"]
        package_data["total_lines_in_tst_files"] = tst_file_info["total_lines"]

# Add version info from CI.yml files to the dictionary
for package, versions in ci_tested_version.items():
    if versions:
        version_testing_data.setdefault(package, {})["ci_file_version"] = versions

# Add GAP version info from PackageInfo.g files to the dictionary
for package, version in pkg_tested_version:
    if version:
        version_testing_data.setdefault(package, {})["pkginfo_version"] = [version]

# Define the path for the JSON file
file_path = os.path.join(data_folder, "testing_data.json")

# open() does not create missing directories, so make sure the folder exists
os.makedirs(data_folder, exist_ok=True)

# Write the data to the JSON file; explicit encoding keeps the output
# independent of the platform default
with open(file_path, "w", encoding="utf-8") as json_file:
    json.dump(version_testing_data, json_file, indent=4)

print(f"Version testing data exported to the '{data_folder}' folder.")