In [1]:
import os
import datetime
import pandas as pd
import pytz
import time
import subprocess
import functools
import json
from pathlib import Path

from ghapi.all import GhApi
from ghapi.page import paged

In [None]:
# Make sure we're using the right tokens.
# The token is read from the GITHUB_TOKEN environment variable so that no
# credential is ever stored in the notebook; the variable is left in place
# for anything else in this process that reads it.
github_token = os.environ.get('GITHUB_TOKEN')
if not github_token:
    raise RuntimeError(
        "Set the GITHUB_TOKEN environment variable before running this notebook"
    )
api = GhApi(token=github_token)

In [3]:
# get repo activity
# PR added / PR closed / Total PR (change from last period)
# Issue added / Issue closed / Total Issues (change from last period)


def in_period_mask(series: pd.Series, start: datetime.datetime, end: datetime.datetime):
    """
    Build a boolean mask marking the entries of ``series`` that fall strictly
    between ``start`` and ``end`` (both bounds are exclusive).
    """
    after_start = series > start
    before_end = series < end
    return after_start & before_end


def create_issue_summary(
    data,  # Generator of fastcore.L (list) of attr-dicts
    page_wait: int = 2,
    prev_info = None,
    item_wait: float = 0.5,
):
    """
    Iterate through pages of issues and collect their information into a
    dict of column lists (suitable for ``pd.DataFrame(data=...)``).

    Assumes each item in ``data`` has attrs: 'number', 'title', 'state',
    'repository_url', 'created_at', 'closed_at'.  An item counts as a pull
    request when it has a 'pull_request' attribute.

    Parameters
    ----------
    data : iterable of pages, each page an iterable of issue items
    page_wait : seconds to sleep after each page, to dodge rate limits
    prev_info : dict of lists from an earlier call, optional
        When given, new rows are appended to it in place and the same dict is
        returned; it must already hold every column listed below.
    item_wait : seconds to sleep before reading each item

    Returns
    -------
    dict with keys 'number', 'state', 'created_at', 'closed_at', 'title',
    'is_pull', 'repository', each mapping to a list with one entry per issue.
    """
    columns = ('number', 'state', 'created_at', 'closed_at',
               'title', 'is_pull', 'repository')
    info = {column: [] for column in columns} if prev_info is None else prev_info

    # Must sleep between pages, each is a separate request, and therefore can
    # trigger rate limits
    for page_no, page in enumerate(data, start=1):
        time.sleep(page_wait)
        print(f'> grabbing issue page {page_no}')

        for n, item in enumerate(page):
            time.sleep(item_wait)
            print(f'> reading {item.number} ({n+1})...   ', end='\r')
            # Build the whole row before touching ``info`` so that an item with
            # a missing attribute cannot leave the columns with unequal lengths
            row = {
                'number': item.number,
                'title': item.title,
                'state': item.state,
                # last path component of the repository URL is the repo name
                'repository': item.repository_url.split('/')[-1],
                'is_pull': hasattr(item, 'pull_request'),
                'created_at': pd.to_datetime(item.created_at),
                'closed_at': pd.to_datetime(item.closed_at),
            }
            for column, value in row.items():
                info[column].append(value)

    return info


def _as_utc(moment: datetime.datetime) -> datetime.datetime:
    """Return ``moment`` as a UTC-aware datetime; naive input is taken to be UTC."""
    if moment.tzinfo is None:
        return pytz.utc.localize(moment)
    # ``localize`` raises on aware datetimes, so convert those instead
    return moment.astimezone(pytz.utc)


def summarize_period(df: pd.DataFrame, start, end, print_results=False):
    """
    Summarize some key metrics for data inside the period (start, end)

    Parameters
    ----------
    df : DataFrame with 'created_at', 'closed_at' and 'state' columns
    start, end : datetime bounds of the period, both exclusive.  Naive values
        are interpreted as UTC; timezone-aware values are converted to UTC.
    print_results : also print the counts and the number of rows whose
        'state' is 'open'

    Returns
    -------
    (number of rows created in the period, number of rows closed in the period)
    """
    # ensure tz aware
    start = _as_utc(start)
    end = _as_utc(end)

    # find issues created inside the period
    new_in_period = df[in_period_mask(df['created_at'], start, end)]

    # find issues closed inside the period
    closed_in_period = df[in_period_mask(df['closed_at'], start, end)]

    if print_results:
        print(f"opened in last period: {len(new_in_period)}")
        print(f"closed in last period: {len(closed_in_period)}")
        print(f"total open: {len(df[df['state'] == 'open'])}")

    return len(new_in_period), len(closed_in_period)

In [4]:
from update_github_settings import gh_api_graphql_paginated

In [None]:
# Colour-related environment switches for child processes -- presumably so the
# output captured from the `gh` CLI contains no ANSI colour codes (TODO confirm)
os.environ.update({'NO_COLOR': '1', 'CLICOLOR_FORCE': '0'})
@functools.lru_cache()
def get_packaged_graphql(filename: str) -> str:
    """
    Read a GraphQL query file and return its text with surrounding whitespace
    stripped.

    Results are memoized per ``filename``, so each file is read at most once
    per kernel session (later edits to the file are not picked up).
    """
    # Ref: https://graphql.org/learn/queries/
    # Ref: https://gist.github.com/duboisf/68fb6e22ac0a2165ca298074f0e3b553

    # Explicit encoding: the default depends on the platform locale
    with open(filename, "rt", encoding="utf-8") as fp:
        return fp.read().strip()

def run_gh(*command: str) -> bytes:
    """Run the ``gh`` executable with the given arguments and return its raw stdout."""
    argv = ["gh"]
    argv.extend(command)
    return subprocess.check_output(argv)

def create_commit_summary_for_repo(
    repo: str,
    owner: str = 'pcdshub',
    prev_info = None,
):
    """
    Collect per-commit statistics for one repository into a dict of column lists.

    The commits come from ``gh_api_graphql_paginated`` run with the packaged
    'repo_commits.graphql' query and ``repoName=repo``.

    Parameters
    ----------
    repo : repository name, also stored in the 'repo' column of every row
    owner : repository owner.
        NOTE(review): this value is not forwarded to the query; presumably the
        packaged query fixes the owner itself -- confirm.
    prev_info : dict of lists from an earlier call, optional
        When given, new rows are appended to it in place and it is returned.

    Returns
    -------
    dict with keys 'author', 'commit_sha', 'author_date', 'additions',
    'deletions', 'total', 'files_changed', 'message', 'repo'.
    """
    columns = ('author', 'commit_sha', 'author_date', 'additions',
               'deletions', 'total', 'files_changed', 'message', 'repo')
    info = {column: [] for column in columns} if prev_info is None else prev_info

    # get info
    history = gh_api_graphql_paginated(
        get_packaged_graphql('repo_commits.graphql'),
        key=('repository', 'defaultBranchRef', 'target', 'history'),
        repoName=repo,
    )

    for cm in history:
        # The login is not always available (missing key or a null
        # author/user entry); fall back to a placeholder in that case
        try:
            author = cm['author']['user']['login']
        except (KeyError, TypeError):
            author = "N/A"

        # Build the full row first: if any field is missing, nothing is
        # appended and the columns of ``info`` keep equal lengths
        row = {
            'author': author,
            'commit_sha': cm['oid'][:10],
            'author_date': pd.to_datetime(cm['authoredDate']),
            'additions': cm['additions'],
            'deletions': cm['deletions'],
            'total': cm['additions'] + cm['deletions'],
            'files_changed': cm['changedFilesIfAvailable'],
            # keep each message on one line by escaping its newlines
            'message': cm['message'].replace('\n', '\\n'),
            'repo': repo,
        }
        for column, value in row.items():
            info[column].append(value)

    return info

def create_commit_summary(
    data,  # Generator of fastcore.L (list) of attr-dicts,
    api,
    page_wait: int = 2,
    prev_info = None,
    owner: str = 'pcdshub',
    repo: str = None,
):
    """
    Collect per-commit statistics through the REST API into a dict of column lists.

    Listing commits does not give full details, so every listed commit is
    fetched again individually with ``api.repos.get_commit``.

    Parameters
    ----------
    data : iterable of pages, each page an iterable of listed commits.
        Each listed commit is assumed to expose its hash as ``.sha``.
    api : GhApi-like client providing ``repos.get_commit(owner, repo, ref)``
    page_wait : seconds to sleep after each page, to dodge rate limits
    prev_info : dict of lists from an earlier call, optional
        When given, new rows are appended to it in place and it is returned.
    owner : owner of the repository the commits belong to
    repo : name of the repository the commits belong to (required)

    Returns
    -------
    dict with keys 'commit_sha', 'author_date', 'additions', 'deletions', 'total'.

    Raises
    ------
    ValueError
        If ``repo`` is not given; the full commit cannot be looked up without it.
    """
    if repo is None:
        raise ValueError("repo is required to look up the full commit details")

    columns = ('commit_sha', 'author_date', 'additions', 'deletions', 'total')
    info = {column: [] for column in columns} if prev_info is None else prev_info

    for page_no, page in enumerate(data, start=1):
        time.sleep(page_wait)
        print(f'> grabbing commit page {page_no}')
        for commit in page:
            # Listing all commits does not give full details
            cm = api.repos.get_commit(owner=owner, repo=repo, ref=commit.sha)
            # Build the row first so a malformed response cannot leave the
            # columns of ``info`` with unequal lengths
            row = {
                'commit_sha': cm.sha[:10],
                'author_date': pd.to_datetime(cm.commit.author.date),
                'additions': cm.stats.additions,
                'deletions': cm.stats.deletions,
                'total': cm.stats.total,
            }
            for column, value in row.items():
                info[column].append(value)

    return info

# Show me the stats

In [6]:
# Period bounds are kept as naive datetimes expressed in UTC, because
# summarize_period localizes naive values as UTC.  A plain ``now()`` would
# give local wall-clock time and shift the end of the period by the UTC offset.
today = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
start_period = datetime.datetime(2021,1,1)

## Issues

In [None]:
# For use if we ended part way: load the issues saved by an earlier run.
# ``prev_df`` stays None when that file does not exist, instead of the cell
# failing with FileNotFoundError on a fresh start.
issue_cache_path = Path("../issue_stats_232repo.csv")
prev_df = pd.read_csv(issue_cache_path) if issue_cache_path.exists() else None
# Preview only when something was loaded (a bare None displays nothing)
None if prev_df is None else prev_df.head()

Unnamed: 0.1,Unnamed: 0,number,state,created_at,closed_at,title,is_pull,repository
0,0,356,open,2025-02-12 22:52:58+00:00,,First pass at allowing for cached devices to b...,True,happi
1,1,355,closed,2025-02-11 17:45:21+00:00,2025-02-11 20:32:32+00:00,"Fix minor typos in readme, add generated _vers...",True,happi
2,2,354,open,2025-01-31 18:59:31+00:00,,happi.cli.search_parser has misleading annotat...,False,happi
3,3,353,closed,2025-01-23 19:34:57+00:00,2025-02-10 19:43:58+00:00,BLD: split ui dependencies into subpackage,True,happi
4,4,352,closed,2025-01-09 16:40:58+00:00,2025-01-09 20:59:02+00:00,"BLD: Try swapping to line_profiler, seems the ...",True,happi


In [None]:
# Gather all relevant data up front first
repo_pages = paged(api.repos.list_for_org, org='pcdshub', per_page=100)

# Repositories already covered by a previous run.  A DataFrame cannot be used
# directly in a boolean test (pandas raises "truth value ... is ambiguous"),
# so compare against None explicitly; the set is built once, not per repo.
done_repos = set() if prev_df is None else set(prev_df['repository'].unique())

info_dict = None
for i, page in enumerate(repo_pages):
    print(f'processing repo page: {i}')
    for repo in page:
        if repo.name in done_repos:
            print(f"skipping, data already collected for {repo.name}")
            continue
        print('', end='\r')
        print(f'processing {repo.name}...')
        # pause between repositories to stay clear of rate limits
        time.sleep(4)
        issue_info_pages = paged(api.issues.list_for_repo,
                                 owner='pcdshub', repo=repo.name, state='all',
                                 since=start_period, per_page=100)
        # rows accumulate across repositories in the same dict
        info_dict = create_issue_summary(issue_info_pages, prev_info=info_dict)

# Holds only the repositories gathered in this run, not those in ``prev_df``
issue_df = pd.DataFrame(data=info_dict)

processing repo page: 0
skipping, data already collected for pcds-logstash
skipping, data already collected for pcds-plc-image-generator
processing lcls-plc-test-el2212-evaluation...
processing lcls_plc_kfe_gatt...
skipping, data already collected for lcls-plc-txi-optics
skipping, data already collected for PCB-Laser-Environmental-Sensor
skipping, data already collected for lcls-plc-tmo-motion
skipping, data already collected for lcls-plc-kfe-rix-motion
skipping, data already collected for lcls-plc-example-vac-interface
skipping, data already collected for pre-commit-hooks
processing ads-pydm2...
skipping, data already collected for pmps_test
skipping, data already collected for qtpyinheritance
skipping, data already collected for solid-attenuator
skipping, data already collected for lcls-plc-lfe-gem
skipping, data already collected for ophyd-ads
skipping, data already collected for pyqt-designer-plugin-entry-points
skipping, data already collected for lcls-plc-lfe-arbiter
skipping, da

In [20]:
# Preview the first rows of the issues gathered by the loop above
issue_df.head()

Unnamed: 0,number,state,created_at,closed_at,title,is_pull,repository
0,2,open,2021-07-20 18:45:14+00:00,NaT,Try pyupgrade,False,old-hutch-configs
1,6,closed,2024-08-02 17:04:11+00:00,2024-08-02 21:11:00+00:00,Make user in charge compulsory,True,netconfig
2,5,open,2023-02-08 18:26:11+00:00,NaT,Note: repository exists also on gitea,False,netconfig
3,4,open,2023-02-08 17:24:29+00:00,NaT,Fail loudly when edits not performed on psdev,False,netconfig
4,3,closed,2021-09-23 21:59:37+00:00,2022-03-10 23:42:22+00:00,Py3k,True,netconfig


# Commit Stats
We're doing this through GraphQL because it lets us make fewer queries.

In [11]:
repo_pages = paged(api.repos.list_for_org, org='pcdshub', per_page=100)
fp = "../commit_stats2.csv"
fp_old = "../commit_stats.csv"

# Repositories already present in the earlier export are skipped below
prev_df = pd.read_csv(fp_old, usecols=['repo'])
done_repos = set(prev_df['repo'].unique())

# Gather all relevant commit data up front first
new_frames = []
for i, page in enumerate(repo_pages):
    print(f'processing repo page: {i}')
    for repo in page:
        if repo.name in done_repos:
            print(f"skipping, data already collected for {repo.name}")
            continue
        print('', end='\r')
        print(f'processing {repo.name} commits...')
        time.sleep(2)
        try:
            # Start from a fresh dict for every repository: only this
            # repository's rows are written below, and a failure part-way
            # through cannot corrupt rows gathered earlier
            repo_info = create_commit_summary_for_repo(repo=repo.name)
        except Exception:
            print('skipping, likely no valid commits')
            time.sleep(2)
            continue

        repo_df = pd.DataFrame(data=repo_info)
        if repo_df.empty:
            # nothing to write; also avoids creating a headerless file
            continue

        # Append just the new rows; re-writing the accumulated rows on every
        # iteration would duplicate them in the CSV
        out_path = Path(fp)
        write_header = not out_path.exists() or out_path.stat().st_size == 0
        repo_df.to_csv(fp, mode='a', header=write_header, index=False)
        new_frames.append(repo_df)

# All commits gathered in this run, across repositories
commit_df = pd.concat(new_frames, ignore_index=True) if new_frames else pd.DataFrame()

processing repo page: 0
skipping, data already collected for happi
skipping, data already collected for pcds-recipes
skipping, data already collected for pcdsdevices
skipping, data already collected for lightpath
skipping, data already collected for pswalker
skipping, data already collected for hxrsnd
skipping, data already collected for archapp
skipping, data already collected for transfocate
skipping, data already collected for lcls-plc-xrt-optics
processing skywalker-docs commits...
skipping, likely no valid commits
processing QDarkStyleSheet commits...
skipping, likely no valid commits
skipping, data already collected for roadrunner
skipping, data already collected for typhos
skipping, data already collected for engineering_tools
skipping, data already collected for pcds-envs
skipping, data already collected for cookiecutter-pcds-python
skipping, data already collected for pytmc
skipping, data already collected for hutch-python
skipping, data already collected for pcdshub.github.io

In [10]:
# How many distinct repositories does the earlier commit export cover?
fp_old = '../commit_stats.csv'
prev_df = pd.read_csv(fp_old, usecols=['repo'])
prev_df['repo'].nunique(dropna=False)

859

In [16]:
# Distinct repositories written to the new commit export so far
pd.read_csv(fp, usecols=['repo'])['repo'].nunique(dropna=False)

123

In [None]:
# gather these stats for all the repos
# NOTE(review): this cell cannot run as written -- `repos`, `issue_info` and
# `create_commit_df` are not defined in any cell shown in this notebook, and
# it looks like a leftover from an earlier implementation. TODO confirm/rewrite.
data = {'repo': [], 'issues_opened': [], 'issues_closed': [], 'total_issues': [],
        'pulls_opened': [], 'pulls_closed': [], 'total_pulls': [],
        'n_commits': [], 'additions': [], 'deletions': [], 'total_changes': []}

for repo in repos:
    # NOTE(review): `issue_df` is passed unfiltered here, so these two counts
    # cover every gathered repository rather than only `repo` -- confirm intent
    opened_iss, closed_iss = summarize_period(issue_df, start_period, today)
    # rows of this repository that are pull requests
    pull_info = issue_df[issue_df['is_pull'] & (issue_df['repository']==repo.name)]
    opened_pr, closed_pr = summarize_period(pull_info, start_period, today)
    data['repo'].append(repo.name)
    data['issues_opened'].append(opened_iss)
    data['issues_closed'].append(closed_iss)
    data['pulls_opened'].append(opened_pr)
    data['pulls_closed'].append(closed_pr)
    # TODO: check only open pulls/issues
    # NOTE(review): `pull_info` is a DataFrame; iterating a DataFrame yields its
    # column labels, so `p.state` is looked up on a label, not on a row
    data['total_pulls'].append(len([p for p in pull_info if p.state == 'open']))
    # data['total_issues'].append(len(list(repo.get_issues(state='open'))))
    # NOTE(review): `issue_info` is never assigned in the cells shown
    data['total_issues'].append(len([i for i in issue_info if i.state == 'open']))

    # commit stats
    time.sleep(6)
    # NOTE(review): relies on `repo.get_commits` and `create_commit_df`, neither
    # of which is provided by the code shown in this notebook
    commit_info = repo.get_commits(since=start_period)
    commit_df = create_commit_df(commit_info, start_period, today)

    # keep only the commits authored strictly inside (start_period, today)
    in_period_commits = commit_df[
        in_period_mask(commit_df['author_date'], start_period, today)
    ]
    data['n_commits'].append(len(in_period_commits))
    data['additions'].append(in_period_commits['additions'].sum())
    data['deletions'].append(in_period_commits['deletions'].sum())
    data['total_changes'].append(in_period_commits['total'].sum())
    # NOTE(review): unconditional break -- only the first repository is processed
    break

# one row per processed repository
stats_df = pd.DataFrame(data=data)

In [117]:
# Resolve the output location from the current user's home directory rather
# than one developer's absolute path, and make sure the folder exists.
stats_path = Path.home() / 'gh_loc' / 'data' / 'repo_stats.csv'
stats_path.parent.mkdir(parents=True, exist_ok=True)
# index=False keeps the row index out of the file; otherwise it comes back as
# an extra "Unnamed: 0" column when the CSV is read again
stats_df.to_csv(stats_path, index=False)