# PubMed article GitHub repository software information entropy

This notebook focuses on gathering PubMed article repository software information entropy data.

The GitHub repositories associated with PubMed articles are identified by querying the PubMed API for GitHub links within article abstracts.

## PubMed GitHub Repositories Analysis

The following code is used to analyze PubMed article GitHub repository software information entropy using existing methods from the Software Gardening Almanack package.
Repositories are processed in parallel, one batch at a time; the results of each batch are written to a separate Parquet file, and together these files compose the full dataset.

In [1]:
from concurrent.futures import ProcessPoolExecutor, as_completed

import pandas as pd

from almanack import process_repo_for_analysis


def repository_analysis():
    """
    Analyzes PubMed GitHub repositories to compute normalized information entropy

    Returns:
        pd.DataFrame: A DataFrame containing the results for the last batch of repositories processed
                      (an empty DataFrame with the same columns when there are no repository URLs).
                      Each row includes:
                      - 'Repository URL': The URL of the repository.
                      - 'Normalized Total Entropy': The normalized entropy value for the repository.
                      - 'Date of First Commit': The date of the first commit in the repository.
                      - 'Date of Last Commit': The date of the most recent commit in the repository.
                      - 'Time of Existence (days)': The number of days between the first and last commit.

    Notes:
        - Reads repository URLs from the "github_link" column of the Parquet file located at
          "gather-pubmed-repos/pubmed_github_links.parquet" (relative to the working directory).
        - Processes repositories in batches of 500 to manage the large datasets efficiently.
        - Results are saved in Parquet files named `repository_analysis_results_batch_X.parquet`, where X is the batch number.
        - A repository whose processing raises an exception is reported on stdout and recorded
          as a row containing its URL and None for every other column.
    """
    result_columns = [
        "Repository URL",
        "Normalized Total Entropy",
        "Date of First Commit",
        "Date of Last Commit",
        "Time of Existence (days)",
    ]

    df = pd.read_parquet("gather-pubmed-repos/pubmed_github_links.parquet")

    repo_urls = df["github_link"].tolist()

    batch_size = 500
    total_repos = len(repo_urls)
    repo_count = 0

    # Bind the return value up front so that an empty URL list yields a
    # well-formed (empty) DataFrame instead of an UnboundLocalError.
    df_results = pd.DataFrame(columns=result_columns)

    # Process repositories in batches; batch numbers start at 1
    for batch_number, start in enumerate(range(0, total_repos, batch_size), start=1):
        # Slicing clamps at the end of the list, so the final batch may be shorter
        batch_urls = repo_urls[start : start + batch_size]
        batch_results = []

        # Create a ProcessPoolExecutor to process repositories in parallel
        with ProcessPoolExecutor(max_workers=16) as executor:
            # Submit tasks to process each repository URL and store future results
            futures = {
                executor.submit(process_repo_for_analysis, repo_url): repo_url
                for repo_url in batch_urls
            }
            # Iterate over completed futures as they finish
            for future in as_completed(futures):
                repo_count += 1
                # Get the repository URL corresponding to the completed future
                repo_url = futures[future]
                try:
                    (
                        normalized_total_entropy,
                        first_commit_date,
                        most_recent_commit_date,
                        time_of_existence,
                    ) = future.result()
                except Exception as error:
                    # Best-effort: keep going, but report the failure rather than
                    # swallowing it, and record a placeholder row for the repository.
                    print(f"Repository {repo_count}: {repo_url}, failed: {error!r}")
                    batch_results.append([repo_url, None, None, None, None])
                    continue

                print(
                    f"Repository {repo_count}: {repo_url}, {normalized_total_entropy}"
                )
                batch_results.append(
                    [
                        repo_url,
                        normalized_total_entropy,
                        first_commit_date,
                        most_recent_commit_date,
                        time_of_existence,
                    ]
                )

        # Create DataFrame for the current batch
        df_results = pd.DataFrame(batch_results, columns=result_columns)

        # Save batch results to Parquet file
        batch_filename = f"repository_analysis_results_batch_{batch_number}.parquet"
        df_results.to_parquet(batch_filename)
        print(f"Batch {batch_number} results saved to {batch_filename}")

    return df_results


# Run the analysis over every repository URL. Each batch is written to its own
# Parquet file by repository_analysis(); the DataFrame kept here holds only the
# results of the last batch processed.
df_results = repository_analysis()