In [1]:
# !pip install PyYAML python-dotenv
# !pip install langchain langchain-community openai

In [2]:
import os

import pandas as pd
import requests
import yaml
from dotenv import load_dotenv
from langchain.chat_models import AzureChatOpenAI
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import PromptTemplate
from langchain_core.exceptions import OutputParserException
from tqdm import tqdm

In [3]:
# Pull settings from a local .env file into the process environment.
load_dotenv()

# Azure OpenAI connection settings, read from the environment in one pass.
endpoint, deployment, subscription_key = (
    os.getenv(var_name)
    for var_name in ("ENDPOINT_URL", "DEPLOYMENT_NAME", "AZURE_OPENAI_API_KEY")
)

# Fail fast when any of the three settings is unset or empty.
if not (endpoint and deployment and subscription_key):
    raise ValueError("Missing required environment variables. Please check your .env file.")

In [4]:
# Configuration
# When True, only accounts listed in TARGET_REPOS are processed and the output
# files are named "filtered_repositories.*"; when False, every fetched account
# is processed and the output files are named "all_repositories.*".
USE_FILTERED_ACCOUNTS = True  # Set to False to process all accounts

In [5]:
def filter_repositories(accounts, target_repos):
    """
    Filter accounts to only include the specified GitHub account names.

    Names are compared case-insensitively, so a target written as 'GSA' also
    matches an entry spelled 'gsa'. The spelling used in ``accounts`` is the
    one kept in the result.

    Args:
        accounts (dict): Mapping of country to a list of account usernames.
            A country whose value is None or empty is skipped.
        target_repos (list): Account usernames to keep.

    Returns:
        dict: Country -> matching usernames, in their original order. Countries
        with no matching username are omitted.
    """
    # Normalise once so each membership test is a constant-time set lookup.
    wanted = {str(name).lower() for name in target_repos}

    filtered_accounts = {}
    for country, usernames in accounts.items():
        # `usernames or []` tolerates a country key that has no entries.
        matches = [
            username for username in (usernames or [])
            if str(username).lower() in wanted
        ]
        if matches:  # Only add countries that have matching accounts
            filtered_accounts[country] = matches
    return filtered_accounts

In [6]:
# Accounts to keep when USE_FILTERED_ACCOUNTS is True.
# Despite the variable name, these are GitHub account names (matched against the
# usernames in the fetched accounts mapping), not individual repositories, and
# they span several governments rather than only the UK.
TARGET_REPOS = [
    'alphagov', 'i-dot-ai', 'canada-ca', 'govtechsg', 'GSA', 'ec-europa', 'opengovsg'
]

In [7]:
def fetch_gov_github_accounts(url, timeout=30):
    """
    Download a YAML document and return its parsed content.

    Args:
        url (str): Address of the YAML document.
        timeout (float): Seconds to wait for the server before the request is
            abandoned. Without a timeout a stalled connection would block the
            notebook indefinitely.

    Returns:
        The object produced by ``yaml.safe_load`` for the response body, or
        None when the response status is not 200.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None
    return yaml.safe_load(response.text)

def fetch_repository_details(username, token, max_repos=30, timeout=30):
    """
    Fetch basic metadata and a README excerpt for an account's repositories.

    Args:
        username (str): GitHub account whose repositories are listed.
        token (str): GitHub token sent in the ``Authorization`` header.
        max_repos (int | None): Maximum number of repositories to return. The
            default of 30 equals the single default-sized page the API returns
            when no paging parameters are sent; pass a larger number, or None
            for no limit, to page through further results.
        timeout (float): Seconds to wait for each HTTP request.

    Returns:
        list[dict] | None: One dict per repository with the keys ``name``,
        ``description``, ``stars``, ``forks``, ``language`` and ``readme``
        (first 100 characters of the README, or "README not available").
        None when the first listing request does not return HTTP 200.
    """
    headers = {'Authorization': f'token {token}'}
    repos_url = f"https://api.github.com/users/{username}/repos"
    page_size = 100  # largest page size the GitHub REST API accepts

    # Walk the paginated listing until enough repositories are collected.
    repos_data = []
    page = 1
    while max_repos is None or len(repos_data) < max_repos:
        repos_response = requests.get(
            repos_url,
            headers=headers,
            params={'per_page': page_size, 'page': page},
            timeout=timeout,
        )
        if repos_response.status_code != 200:
            if page == 1:
                return None
            # Keep what was already listed instead of discarding it.
            print(f"Stopped listing repositories for {username} at page {page} "
                  f"(HTTP {repos_response.status_code})")
            break
        batch = repos_response.json()
        repos_data.extend(batch)
        if len(batch) < page_size:
            break  # a short page means there is nothing further to fetch
        page += 1

    if max_repos is not None:
        repos_data = repos_data[:max_repos]

    full_repo_details = []
    for repo in repos_data:
        repo_details = {
            'name': repo['name'],
            'description': repo['description'] or "No description",
            'stars': repo['stargazers_count'],
            'forks': repo['forks'],
            'language': repo['language'] or "None specified"
        }

        # Fetch the README file; any failure along the way leaves the placeholder.
        readme_text = "README not available"
        readme_url = f"https://api.github.com/repos/{username}/{repo['name']}/readme"
        readme_response = requests.get(readme_url, headers=headers, timeout=timeout)
        if readme_response.status_code == 200:
            download_url = readme_response.json().get('download_url')
            if download_url:
                raw_response = requests.get(download_url, timeout=timeout)
                # Only accept the body of a successful download, so an error
                # page is never stored as README content.
                if raw_response.status_code == 200:
                    readme_text = raw_response.text[:100]  # Truncate for brevity
        repo_details['readme'] = readme_text

        full_repo_details.append(repo_details)

    return full_repo_details

def save_to_markdown(repos, filename):
    """
    Write repository records to ``filename`` as a Markdown table.

    Args:
        repos (iterable[dict]): Records with the keys ``name``, ``description``,
            ``stars``, ``language`` and ``readme``.
        filename (str): Path of the file to create or overwrite.

    The file is written as UTF-8 so non-ASCII README text does not depend on
    the platform's default encoding. Text cells are flattened to one line and
    have ``|`` escaped, because a raw newline or pipe would break the table row.
    The README cell is limited to 50 characters followed by "...".
    """
    def _cell(value, limit=None):
        # Collapse newlines/tabs/repeated spaces so the value stays on one row.
        text = " ".join(str(value).split())
        if limit is not None:
            text = text[:limit]
        # Escape after truncating so an escape sequence is never cut in half.
        return text.replace("|", "\\|")

    with open(filename, 'w', encoding='utf-8') as f:
        f.write('| Repository Name | Description | Stars | Language | README |\n')
        f.write('|-----------------|-------------|-------|----------|--------|\n')
        for repo in repos:
            f.write(
                f"| {_cell(repo['name'])} | {_cell(repo['description'])} | {repo['stars']} "
                f"| {_cell(repo['language'])} | {_cell(repo['readme'], 50)}... |\n"
            )


# Load environment variables from .env file
load_dotenv('.env')

# Access environment variables
# The token is passed to fetch_repository_details for the GitHub API requests;
# a missing or empty value stops the notebook here.
github_token = os.getenv('GITHUB_TOKEN')
if not github_token:
    raise ValueError("GITHUB_TOKEN not found in environment variables. Please check your .env file.")



In [8]:

# Main execution
# Download the YAML list of government GitHub accounts. fetch_gov_github_accounts
# returns None when the response status is not 200, so all_accounts may be None.
url = "https://raw.githubusercontent.com/github/government.github.com/gh-pages/_data/governments.yml"
all_accounts = fetch_gov_github_accounts(url)



In [9]:
# Show the fetched mapping (country -> list of account names) as the cell output.
all_accounts


{'Argentina': ['argob',
  'cifasis',
  'gcba',
  'inti-cmnb',
  'municipalidad-de-vicente-lopez',
  'municipioriogrande'],
 'Australia': ['actesa',
  'actgov',
  'agnsw',
  'AtlasOfLivingAustralia',
  'ausdto',
  'australianantarcticdatacentre',
  'AustralianAntarcticDivision',
  'berowrarfb',
  'bom-radar',
  'city-of-melbourne',
  'commerce-wa-ols',
  'consumerdataright',
  'data61',
  'datagovau',
  'dbca-wa',
  'dpc-sdp',
  'dpipwe',
  'dssgovaus',
  'envris',
  'Fire-and-Rescue-NSW',
  'gccgisteam',
  'GeoscienceAustralia',
  'govau',
  'govcms',
  'gs-dawr',
  'healthgovau',
  'Healthway',
  'hiscom',
  'innovationgovau',
  'IPAustralia',
  'Landgate',
  'nla',
  'NSW-eTendering',
  'NSWPlanning',
  'pmcau',
  'PublicRecordOfficeVictoria',
  'qld-gov-au',
  'srnsw',
  'SunshineCoastCouncil',
  'treasury-aus',
  'victoriangovernment',
  'wagov',
  'wamuseum'],
 'Austria': ['datagvat'],
 'Belgium': ['belgianpolice',
  'CIRB',
  'Fedict',
  'inbo',
  'NationalBankBelgium',
  'onroer

In [10]:
# fetch_gov_github_accounts returns None when the download fails. Fall back to
# an empty mapping so this cell reports "no data" instead of raising inside
# filter_repositories.
if all_accounts is None:
    print("Warning: the accounts list could not be fetched; nothing to process.")
available_accounts = all_accounts or {}

# Choose whether to use filtered or all accounts
if USE_FILTERED_ACCOUNTS:
    print("Using filtered accounts...")
    accounts_to_process = filter_repositories(available_accounts, TARGET_REPOS)
else:
    print("Using all accounts...")
    accounts_to_process = available_accounts

# Process repositories: collect one record per repository across all accounts.
all_repos = []
for country, usernames in accounts_to_process.items():
    print(f"Processing {len(usernames)} repositories for {country}")
    for username in usernames:
        print(f"Fetching data for {username}...")
        repo_details = fetch_repository_details(username, github_token)
        if repo_details:
            all_repos.extend(repo_details)
            print(f"Successfully processed {len(repo_details)} repositories for {username}")
        else:
            # Covers both a failed request (None) and an account with no repositories ([]).
            print(f"Failed to fetch data for {username}")

# Save results
if all_repos:
    # Choose appropriate filename based on whether we're using filtered accounts
    base_filename = "filtered_repositories" if USE_FILTERED_ACCOUNTS else "all_repositories"

    # Save as markdown
    markdown_file = f"{base_filename}.md"
    save_to_markdown(all_repos, markdown_file)
    print(f"\nResults saved to {markdown_file}")

    # Save as CSV
    repos_df = pd.DataFrame(all_repos)
    csv_file = f"{base_filename}.csv"
    repos_df.to_csv(csv_file, index=False)
    print(f"Results also saved as {csv_file}")
else:
    print("No repository data collected.")

Using filtered accounts...
Processing 1 repositories for Canada
Fetching data for canada-ca...
Successfully processed 30 repositories for canada-ca
Processing 1 repositories for European Union
Fetching data for ec-europa...
Successfully processed 30 repositories for ec-europa
Processing 2 repositories for Singapore
Fetching data for govtechsg...
Successfully processed 30 repositories for govtechsg
Fetching data for opengovsg...
Successfully processed 30 repositories for opengovsg
Processing 2 repositories for U.K. Central
Fetching data for alphagov...
Successfully processed 30 repositories for alphagov
Fetching data for i-dot-ai...
Successfully processed 30 repositories for i-dot-ai

Results saved to filtered_repositories.md
Results also saved as filtered_repositories.csv


In [11]:
# Build a DataFrame with one row per collected repository and display it.
repos_df = pd.DataFrame(all_repos)
repos_df

Unnamed: 0,name,description,stars,forks,language,readme
0,a11y,No description,5,8,HTML,\n([Français](#gabarit-pour-dépôts-de-code-sou...
1,accelerators_accelerateurs-aws,[AWS] Tools and templates to accelerate GC ser...,16,3,None specified,# GC Accelerators (AWS)\n\nAWS has developed a...
2,accelerators_accelerateurs-azure,[AZURE] Tools and templates to accelerate GC s...,37,14,PowerShell,([Français](#gc-accelerateurs-azure))\n\n## Az...
3,accelerators_accelerateurs-gcp,[GCP] Tools and templates to accelerate GC ser...,10,8,HCL,\n# GC Accelerators (GCP)\n\n[![Open this proj...
4,aia-eia,Algorithmic Impact Assessment - Évaluation de ...,3,2,HTML,[![Build Status](https://travis-ci.com/canada-...
...,...,...,...,...,...,...
175,lex-graph,Building a knowledge graph of UK legislation,8,1,Jupyter Notebook,# Lex Graph 🕸️\n[![PRs Welcome](https://img.sh...
176,local-gov-hack,No description,15,7,None specified,# 🏙️ Local Government Innovation Hackathon\n\n...
177,mvp-python-cookiecutter,Unofficial cookiecutter for a basic Python repo.,0,0,Makefile,# Minimal Python Cookiecutter\n\nThis is a ver...
178,platform-terraform-test,No description,0,0,HCL,# I.AI Platform AWS / Terraform Exercise\n\n##...


In [12]:
# !pip install langchain langchain-anthropic langchain-community

In [13]:
from langchain.prompts import PromptTemplate
from langchain_anthropic import ChatAnthropic
from tqdm import tqdm
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain_core.exceptions import OutputParserException

# NOTE(review): anthropic_api_key is read here but is not used by any code
# visible in this notebook — confirm whether it is still needed.
anthropic_api_key = os.getenv('ANTHROPIC_KEY')
# Independent copy of the repository table, so changes to df do not touch repos_df.
df = repos_df.copy()

def classify_repos(df, topic_list, model_name="gpt-4o"):
    """
    Summarize each repository with the LLM and assign it a topic label.

    Every row is processed exactly once: a summary is generated from the row's
    description and README excerpt, and that same summary is used to choose the
    topic. Rows are handled in order, because a newly generated topic is added
    to the candidate list offered for the rows that follow.

    Args:
        df (pandas.DataFrame): Table with "description" and "readme" columns.
            The input frame is not modified.
        topic_list (list[str]): Initial candidate topics. The input list is not
            modified.
        model_name (str): Accepted but currently not used; the client is
            configured from the module-level ``deployment``, ``endpoint`` and
            ``subscription_key``.

    Returns:
        tuple: ``(classified, topics)`` where ``classified`` is a copy of ``df``
        with added "topic" and "summary" columns, and ``topics`` is the
        candidate list extended with any newly generated labels.
    """
    # Initialize Azure OpenAI client
    llm = AzureChatOpenAI(
        openai_api_version="2025-01-01-preview",
        azure_deployment=deployment,
        azure_endpoint=endpoint,
        api_key=subscription_key,
    )

    summary_prompt = PromptTemplate(
        input_variables=["description", "readme"],
        template="""
        Please provide a summary of the following GitHub repository based on its description and README.md content. If the README.md is not in English, please first translate it to English and then generate a summary. The summary should be concise and in fewer than 5 sentences.
        
        Repository description:
        {description}
        
        README.md content:
        {readme}
        
        Summary:
        """
    )

    response_schemas = [
        ResponseSchema(name="topic", description="The selected topic for the repository.")
    ]
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

    # NOTE(review): '{{topic}}' is an escaped brace pair, so the rendered prompt
    # contains the literal text '{topic}' — confirm this is intended.
    topic_prompt = PromptTemplate(
        input_variables=["summary", "topic_list"],
        template="""
        Given the following summary of a GitHub repository and a list of potential topics, select the most appropriate topic for the repository. If none of the topics in the list are suitable, generate a new topic label.

        Repository summary:
        {summary}

        Potential topics:
        {topic_list}

        Selected topic:
        {{topic}}

        PLEASE ONLY RETURN THE TOPIC LABEL AS THE RESPONSE. DO NOT INCLUDE ANY ADDITIONAL TEXT.
        """,
        output_parser=output_parser,
    )

    # Work on copies so the caller's DataFrame and topic list stay untouched.
    classified = df.copy()
    topics = list(topic_list)

    def generate_summary(row):
        # Missing values are sent to the model as empty strings.
        description = row["description"] if pd.notnull(row["description"]) else ""
        readme = row["readme"] if pd.notnull(row["readme"]) else ""
        prompt = summary_prompt.format(description=description, readme=readme)
        return llm.invoke(prompt).content

    def classify_summary(summary):
        prompt = topic_prompt.format(summary=summary, topic_list=", ".join(topics))
        response = llm.invoke(prompt).content

        try:
            topic = output_parser.parse(response)["topic"]
        except OutputParserException:
            # The reply is not in the parser's structured format; use it as a plain label.
            topic = response.strip()

        if topic not in topics:
            topics.append(topic)

        return topic

    # Iterate over the rows themselves rather than looking each row up by its
    # README text: several repositories can share the same README excerpt
    # (for example "README not available"), and a lookup by value would always
    # pick the first of them.
    summaries = []
    labels = []
    for _, row in tqdm(classified.iterrows(), total=len(classified),
                       desc="Classifying repositories"):
        summary = generate_summary(row)
        summaries.append(summary)
        labels.append(classify_summary(summary))

    classified["topic"] = labels
    classified["summary"] = summaries

    return classified, topics

In [14]:
# Show the working copy of the repository table.
display(df)

Unnamed: 0,name,description,stars,forks,language,readme
0,a11y,No description,5,8,HTML,\n([Français](#gabarit-pour-dépôts-de-code-sou...
1,accelerators_accelerateurs-aws,[AWS] Tools and templates to accelerate GC ser...,16,3,None specified,# GC Accelerators (AWS)\n\nAWS has developed a...
2,accelerators_accelerateurs-azure,[AZURE] Tools and templates to accelerate GC s...,37,14,PowerShell,([Français](#gc-accelerateurs-azure))\n\n## Az...
3,accelerators_accelerateurs-gcp,[GCP] Tools and templates to accelerate GC ser...,10,8,HCL,\n# GC Accelerators (GCP)\n\n[![Open this proj...
4,aia-eia,Algorithmic Impact Assessment - Évaluation de ...,3,2,HTML,[![Build Status](https://travis-ci.com/canada-...
...,...,...,...,...,...,...
175,lex-graph,Building a knowledge graph of UK legislation,8,1,Jupyter Notebook,# Lex Graph 🕸️\n[![PRs Welcome](https://img.sh...
176,local-gov-hack,No description,15,7,None specified,# 🏙️ Local Government Innovation Hackathon\n\n...
177,mvp-python-cookiecutter,Unofficial cookiecutter for a basic Python repo.,0,0,Makefile,# Minimal Python Cookiecutter\n\nThis is a ver...
178,platform-terraform-test,No description,0,0,HCL,# I.AI Platform AWS / Terraform Exercise\n\n##...


In [14]:
# Classify every repository, starting from a small seed list of topics; the
# returned topic_list also contains any labels the model generated.
df, topic_list = classify_repos(repos_df, topic_list=['ai', 'web development', 'data science', 'cybersecurity'])
df.to_csv('all_gov_projects.csv', index=False)
# A bare expression is only rendered when it is the last statement in the cell.
df

  llm = AzureChatOpenAI(
Classifying repositories: 100%|███████████████████████████████████████████████████████████████████████████████████| 180/180 [05:14<00:00,  1.75s/it]
