# Good First Issue Crawler

## Preamble

In [1]:
import json
from github import Github
from tqdm import tqdm
from loguru import logger

## Workflow

In [11]:



def get_github_instance():
    """Build a GitHub client from the token stored on disk.

    The token is read from ``../secrets/github_token.txt`` (resolved against
    the current working directory) and stripped of surrounding whitespace.

    Returns:
        A ``Github`` object constructed with that token.
    """
    token_path = "../secrets/github_token.txt"
    with open(token_path, "r") as token_file:
        raw_token = token_file.read()
    # The message is emitted before the client object is actually built.
    logger.info("GitHub instance initialized")
    return Github(raw_token.strip())

def get_repositories(github_instance, repo_list):
    """Resolve repository paths into repository objects, skipping failures.

    Args:
        github_instance: Object exposing ``get_repo(path)``.
        repo_list: Iterable of ``"owner/name"`` strings; any space characters
            inside an entry are removed before the lookup.

    Returns:
        A list with the result of ``get_repo`` for every path that could be
        fetched, in input order. Paths whose lookup raises are logged as
        errors and left out.
    """
    logger.info(f"Fetching specified repositories")
    fetched = []
    for raw_path in repo_list:
        # Dropping every space turns e.g. "owner / name" into "owner/name".
        full_name = "".join(raw_path.split(" "))
        try:
            repository = github_instance.get_repo(full_name)
        except Exception as err:
            logger.error(f"Error fetching repo {full_name}: {err}")
        else:
            fetched.append(repository)
    return fetched


def get_issue_dict(issue, repo):
    """Flatten an issue and its repository into a plain dict.

    Args:
        issue: Object exposing ``title``, ``body``, ``labels`` (each with a
            ``name``), ``html_url`` and ``created_at``.
        repo: Object exposing ``full_name`` and ``html_url``.

    Returns:
        A dict with the keys ``repo``, ``repo_url``, ``title``,
        ``description``, ``labels``, ``issue_url``, ``created_at`` and an
        empty ``comments`` list for the caller to fill in.
    """
    record = {}
    record['repo'] = repo.full_name
    record['repo_url'] = repo.html_url
    record['title'] = issue.title
    record['description'] = issue.body
    record['labels'] = [tag.name for tag in issue.labels]
    record['issue_url'] = issue.html_url
    # Stored as text so the record can be written out with ``json``.
    record['created_at'] = str(issue.created_at)
    record['comments'] = []
    return record

def get_comment_dict(comment):
    """Flatten an issue comment into a plain dict.

    Args:
        comment: Object exposing ``user`` (with a ``login``, or ``None``),
            ``body`` and ``created_at``.

    Returns:
        A dict with the keys ``author``, ``body`` and ``created_at`` (as a
        string). ``author`` is ``None`` when the comment has no user.
    """
    # A comment can come back without a user (e.g. the account was deleted);
    # dereferencing ``.login`` on None would raise and abort the caller's
    # whole loop, so record a missing author instead.
    user = comment.user
    return {
        'author': user.login if user is not None else None,
        'body': comment.body,
        'created_at': str(comment.created_at),
    }

def is_good_first_issue(issue):
    """Tell whether an issue carries the "good first issue" label.

    The comparison ignores letter case but otherwise requires the label name
    to match exactly.

    Args:
        issue: Object exposing ``labels``, each with a ``name`` string.

    Returns:
        True if any label matches, False otherwise (including no labels).
    """
    wanted = "good first issue"
    for tag in issue.labels:
        if tag.name.lower() == wanted:
            return True
    return False

def process_repositories(repos, progress_bar):
    """Collect open "good first issue" issues, with comments, from each repo.

    Args:
        repos: Iterable of repository objects exposing ``full_name`` and
            ``get_issues(state=...)``.
        progress_bar: Object exposing ``update(n)``; advanced by one per
            repository whether or not processing it succeeded.

    Returns:
        A ``(issues, count)`` tuple: ``issues`` maps ``issue.id`` to the dict
        built by ``get_issue_dict`` with its ``comments`` list filled in, and
        ``count`` is the number of matching issues stored.
    """
    collected = {}
    matched = 0
    for repo in repos:
        logger.info(f"Processing repository: {repo.full_name}")
        try:
            for issue in repo.get_issues(state='open'):
                if not is_good_first_issue(issue):
                    continue
                entry = get_issue_dict(issue, repo)
                entry['comments'].extend(
                    get_comment_dict(remark) for remark in issue.get_comments()
                )
                # Only stored once all comments were gathered without error.
                collected[issue.id] = entry
                matched += 1
        except Exception as err:
            # Any failure ends work on this repository; issues already
            # stored are kept and the loop moves on to the next repository.
            logger.error(f"Error processing repo: {err}")
        finally:
            progress_bar.update(1)
    return collected, matched

def save_issues_to_file(all_issues):
    """Write the collected issues as JSON to ``issues.json``.

    The file is created in (or overwritten in) the current working directory.

    Args:
        all_issues: JSON-serializable object to dump.
    """
    output_path = 'issues.json'
    with open(output_path, 'w') as sink:
        json.dump(all_issues, fp=sink)
    logger.info("Issues saved to file")

def main():
    """Run the crawl: fetch repositories, gather issues, save them, report.

    Two progress bars are shown: one advanced per repository while issues are
    being collected, and one that is created afterwards and immediately set
    to the number of matching issues.
    """
    # add the repos you want to process here
    repo_list = [
        "matplotlib / matplotlib",
        "einsteinpy / einsteinpy",
        "zulip / zulip",
        "falconry / falcon",
        "dmlc / gluon-nlp",
        "mesonbuild  / meson",
        "scipy / scipy",
        "oilshell / oil",
        "google / TensorNetwork",
        "statsmodels / statsmodels",
        "ray-project / ray",
        "cython / cython",
        "automl / auto-sklearn",
        "facebookresearch / pythia",
        "mlflow / mlflow",
        "dask / dask",
        "streamlit / streamlit",
        "quantumblacklabs / kedro",
        "geopandas / geopandas",
        "pandas-dev / pandas",
        "astropy / astropy",
        "sympy / sympy",
    ]

    client = get_github_instance()
    repos = get_repositories(client, repo_list)

    # The bar total is the number of repositories that were actually fetched.
    repo_bar = tqdm(
        desc="Processing repositories",
        total=len(repos),
        dynamic_ncols=True,
    )
    all_issues, issue_count = process_repositories(repos, repo_bar)
    repo_bar.close()

    # This bar is only a summary display: it is filled in a single step.
    issue_bar = tqdm(
        desc="Matching issues found",
        total=issue_count,
        dynamic_ncols=True,
    )
    issue_bar.update(issue_count)
    issue_bar.close()

    save_issues_to_file(all_issues)
    logger.info(f"Total issues found: {issue_count}")

# Start the crawl only when this code is executed as the main module.
if __name__ == "__main__":
    main()

[32m2023-06-17 19:30:41.034[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_github_instance[0m:[36m4[0m - [1mGitHub instance initialized[0m
[32m2023-06-17 19:30:41.035[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_repositories[0m:[36m8[0m - [1mFetching specified repositories[0m
Processing repositories:   0%|          | 0/17 [00:00<?, ?it/s][32m2023-06-17 19:30:48.823[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_repositories[0m:[36m44[0m - [1mProcessing repository: matplotlib/matplotlib[0m
Processing repositories:   6%|▌         | 1/17 [01:07<18:03, 67.70s/it][32m2023-06-17 19:31:56.519[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_repositories[0m:[36m44[0m - [1mProcessing repository: einsteinpy/einsteinpy[0m
Processing repositories:  12%|█▏        | 2/17 [01:11<07:32, 30.19s/it][32m2023-06-17 19:32:00.459[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_repositories[0m:[36m44[0m - [1mProcessing repository: zulip/zulip[