In [1]:
import os

import pandas as pd
from google.cloud import bigquery

In [2]:
def fetch_and_save_data(client, org_list, time_period, dates,
                        output_dir='numfocus-raw-data-v2', timeout=300):
    """
    Queries the GitHub archive dataset for the given organizations and saves one JSON file per date.

    For every entry in ``dates`` this runs ``SELECT *`` against
    ``githubarchive.{time_period}.{date}``, keeps the rows whose ``org.login`` is in
    ``org_list``, orders them by ``created_at``, prints the job statistics and writes
    the rows to ``{output_dir}/gh_events_{date}.json`` using ``orient='records'``.

    Errors raised while querying or saving a date are printed and the loop moves on
    to the next date; nothing is re-raised from inside the loop.

    Parameters:
        client (bigquery.Client): The BigQuery client used to execute queries.
        org_list (str): Organization logins already quoted and comma-separated for a
            SQL IN clause, e.g. "'numpy', 'scipy'". The string is interpolated
            verbatim into the query text, so it must come from trusted code.
        time_period (str): Middle component of the table path (e.g., 'day', 'month').
        dates (list): Table names to query, one per output file (e.g. 'YYYYMM'
            strings when ``time_period`` is 'month', 'YYYYMMDD' for 'day').
        output_dir (str): Directory the JSON files are written to. It is created
            (including parents) if it does not exist yet.
        timeout (float): Seconds to wait for each query's result.

    Outputs:
        JSON files containing query results for each specified date.
    """
    # Create the destination before any query runs: DataFrame.to_json does not create
    # missing directories, so without this every date would run its (large) query
    # and only then fail at the write step.
    os.makedirs(output_dir, exist_ok=True)

    for date in dates:
        try:
            query = f"""
            SELECT *
            FROM `githubarchive.{time_period}.{date}`
            WHERE org.login IN ({org_list})
            ORDER BY created_at
            """
            print(f"Querying data for {date}...")
            query_job = client.query(query)
            # Blocks until the job finishes or `timeout` seconds have elapsed.
            result = query_job.result(timeout=timeout)

            print(f"Job ID: {query_job.job_id}")
            print(f"Total bytes processed (GiB): {query_job.total_bytes_processed / (1024 ** 3)}")
            print(f"Query start time: {query_job.started}")
            print(f"Query end time: {query_job.ended}")
            print(f"Duration: {query_job.ended - query_job.started}")

            df = result.to_dataframe(progress_bar_type='tqdm')
            # Forward slash kept on purpose so the reported path is the same everywhere.
            json_filename = f'{output_dir}/gh_events_{date}.json'
            df.to_json(json_filename, orient='records')
            print(f"Data for {date} saved to {json_filename}")

        # NOTE(review): API-level failures (bad query, quota, missing table) presumably
        # surface as google.api_core exceptions and therefore land in the generic
        # branch below rather than in this one -- confirm and narrow if needed.
        except bigquery.exceptions.BigQueryError as e:
            print(f"BigQuery error for {date}: {e} \n\n")
        except Exception as e:
            print(f"Unexpected error for {date}: {e} \n\n")
        finally:
            # Printed for failed dates as well; it marks the end of the attempt, not success.
            print(f"Finished processing for {date} \n\n")

In [3]:
# Set up the BigQuery client, naming the project argument explicitly
bq_client = bigquery.Client(project='sylvan-client-422708-d4')

In [5]:
# Organization logins to query, rendered as a quoted, comma-separated SQL IN list
orgs = ", ".join(
    f"'{login}'"
    for login in (
        'matplotlib', 'numpy', 'pandas-dev', 'jupyter', 'ipython', 'scipy',
        'nteract', 'stan-dev', 'pymc-devs', 'JuliaLang', 'jump-dev', 'PyTables',
        'shogun-toolbox', 'sympy', 'FEniCS', 'yt-project', 'econ-ark', 'astropy',
        'sunpy', 'QuantEcon', 'ropensci', 'openjournals', 'Cantera', 'bokeh',
        'pydata', 'Blosc', 'dask', 'mlpack', 'zarr-developers', 'MDAnalysis',
        'scikit-image', 'InsightSoftwareConsortium', 'Open-MBEE', 'scikit-learn',
        'SciML', 'tardis-sn', 'arviz-devs', 'lfortran', 'networkx', 'OSGeo',
        'sgkit-dev', 'nipy', 'napari', 'scientific-python', 'conda', 'cupy',
        'gnu-octave', 'galaxyproject', 'Bioconductor', 'pybamm-team',
        'WorldWideTelescope', 'spyder-ide', 'gammapy', 'geopandas', 'scverse',
        'holoviz', 'petsc', 'pangeo-data', 'fortran-lang', 'conda-forge',
        'openfheorg',
    )
)

# Granularity of the archive tables, and the YYYYMM stamps to pull:
# every month of 2023 followed by January through June of 2024
query_period = 'month'
query_dates = [
    f"{year}{month:02d}"
    for year, final_month in ((2023, 12), (2024, 6))
    for month in range(1, final_month + 1)
]

In [6]:
# Run the export for every configured month
fetch_and_save_data(
    client=bq_client,
    org_list=orgs,
    time_period=query_period,
    dates=query_dates,
)

Querying data for 202301...
Job ID: b726ee6e-3048-4f06-bca5-14bd56f1a2ce
Total bytes processed (GiB): 466.9329786458984
Query start time: 2024-07-08 09:37:27.006000+00:00
Query end time: 2024-07-08 09:37:38.391000+00:00
Duration: 0:00:11.385000
Downloading: 100%|[32m█████████████████████████████████████████████████████████████[0m|[0m
Data for 202301 saved to numfocus-raw-data-v2/gh_events_202301.json
Finished processing for 202301 


Querying data for 202302...
Job ID: ce4f9eaa-68a0-4f8c-8d2b-fc87d5f68d15
Total bytes processed (GiB): 393.6564892306924
Query start time: 2024-07-08 09:49:43.318000+00:00
Query end time: 2024-07-08 09:49:57.405000+00:00
Duration: 0:00:14.087000
Downloading: 100%|[32m█████████████████████████████████████████████████████████████[0m|[0m
Data for 202302 saved to numfocus-raw-data-v2/gh_events_202302.json
Finished processing for 202302 


Querying data for 202303...
Job ID: 5dde3642-1ec2-4c13-a644-d4e4c00165f8
Total bytes processed (GiB): 429.361620815470