In [19]:
import requests
import json
import re
import numpy as np
import pandas as pd
import github_config  # a hidden file in the directory with github API username and password info

#create a blank API query that returns every code
#NOTE(review): 'ascl_id' is read from every entry below although it is not listed in the
#fl= parameter; the displayed output suggests the API returns it anyway -- confirm.
response = requests.get("https://dev.ascl.net/api/search/?q=\"\"&fl=bibcode,site_list", timeout=60)
response.raise_for_status()  # fail loudly instead of parsing an error payload as catalogue data
j = response.json()

#now, see if the values in the 'site_list' field are valid github repos
#Raw strings avoid invalid-escape warnings and the dot in "github.com" is matched literally.
#The leading greedy ".*" means that when several GitHub URLs are listed, the match
#typically lands on the last one.
validRepo = r'^.*"(https://github\.com/.*/.*").*$'
#The trailing group requires a '/' after the repo name, hence the '/' appended to the url below.
urlRegex = r'^https://github\.com/(.+?)/(.+?)(?:/.*)$'


def _parse_github_repo(site_list):
    """Extract ``(author, repo_title)`` from a 'site_list' string.

    Parameters
    ----------
    site_list : str
        Text of the 'site_list' field; it is searched for a double-quoted
        ``https://github.com/<author>/<repo>`` URL.

    Returns
    -------
    tuple of (str, str) or None
        The repository owner and name, or None when no GitHub repo URL is found.
    """
    gitRepos = re.match(validRepo, site_list)
    if not gitRepos:
        return None
    # Keep only the text up to the closing quote of the URL, then add a trailing
    # slash so urlRegex can terminate the (lazy) repo-name group.
    url = gitRepos.group(1).split('"', 1)[0] + '/'
    match = re.match(urlRegex, url)
    if not match:
        return None
    return match.group(1), match.group(2)


#collect one (author, repo, ascl_id) record per code entry that points at a github repo;
#a plain list is used because np.append copies the whole array on every call
repos = []
for code in j:
    # a missing/None 'site_list' is treated as "no sites" instead of crashing re.match
    parsed = _parse_github_repo(code.get('site_list') or '')
    if parsed is None:
        continue
    author, repo_title = parsed
    repos.append((author, repo_title, code['ascl_id']))

#now we have a list of records with the author, repo name, and an ascl_id
df = pd.DataFrame(repos, columns=['Author', 'Repo', 'ascl_id'])

#get rid of duplicates: a repo is identified by owner AND name, so repos that merely
#share a name under different owners are kept
df = df.drop_duplicates(subset=['Author', 'Repo'], keep='first')

#might need to get rid of 0000.000 ascl_ids

display(df)

Unnamed: 0,Author,Repo,ascl_id
0,EdoardoCarlesi,cmbeasy,1007.004
1,daddeptr,Needlets,1010.004
2,piernik-dev,piernik,1010.005
3,skendrew,midIR_sensitivity,1010.008
4,Starlink,starlink,1407.002
5,PrincetonUniversity,Athena-Cversion,1010.014
6,evanoconnor,GR1D,1010.022
7,mariogrs,Simfast21,1010.025
8,eggplantbren,DNest4,1010.029
9,jobovy,extreme-deconvolution,1010.032


In [21]:
#now, we can redo statistics on the data
#first, re-obtain the language data
#
#For every repo in df this cell asks the GitHub API for the repo's creation date and
#for its per-language byte counts, producing one output row per (repo, language).
#NOTE: df is rebound at the end of this cell, so re-running it without first re-running
#the cell that builds the repo table fails (the 'ascl_id' input column no longer exists).

session = requests.Session()
session.auth = (github_config.username, github_config.password)
http_base = 'https://api.github.com/repos/'
request_timeout = 30  # seconds; keeps one stalled connection from hanging the whole crawl

new_arr = []
failed_repos = []  # "author/repo" names whose language data could not be fetched

for index, row in df.iterrows():
    repo_url = http_base + row['Author'] + '/' + row['Repo']

    # Creation date is best-effort: any failure leaves it as None.
    date = None
    try:
        info = session.get(repo_url, timeout=request_timeout)
        if info.ok:
            payload = info.json()
            if isinstance(payload, dict):
                date = payload.get('created_at')
    except (requests.RequestException, ValueError):
        # network problem or a body that is not valid JSON
        date = None

    # Language data: only trust a successful response. An error response carries a JSON
    # object of its own (e.g. a "message" key) that must not be read as language rows.
    try:
        r = session.get(repo_url + '/languages', timeout=request_timeout)
        languages = r.json() if r.ok else None
    except (requests.RequestException, ValueError):
        languages = None
    if not isinstance(languages, dict):
        failed_repos.append(row['Author'] + '/' + row['Repo'])
        continue

    for key in languages:
        arr = [row['Author'], row['Repo'], row['ascl_id'], date, key, languages[key]]
        new_arr.append(arr)

if failed_repos:
    print('No language data for %d repos:' % len(failed_repos), failed_repos)

# The 'ascl-id' spelling (hyphen) is kept because this frame is what gets written to CSV.
df = pd.DataFrame(new_arr, columns=['Author', 'Repo', 'ascl-id', 'repo_date', 'Language', 'Bytes'])
display(df)

Unnamed: 0,Author,Repo,ascl-id,repo_date,Language,Bytes
0,EdoardoCarlesi,cmbeasy,1007.004,2013-04-13T10:55:00Z,C++,2313919
1,EdoardoCarlesi,cmbeasy,1007.004,2013-04-13T10:55:00Z,C,20287
2,EdoardoCarlesi,cmbeasy,1007.004,2013-04-13T10:55:00Z,Objective-C,4294
3,daddeptr,Needlets,1010.004,2014-02-24T18:28:03Z,Fortran,63589
4,daddeptr,Needlets,1010.004,2014-02-24T18:28:03Z,IDL,7343
5,daddeptr,Needlets,1010.004,2014-02-24T18:28:03Z,Perl,1782
6,piernik-dev,piernik,1010.005,2013-06-14T11:31:14Z,Fortran,2863809
7,piernik-dev,piernik,1010.005,2013-06-14T11:31:14Z,Python,120477
8,piernik-dev,piernik,1010.005,2013-06-14T11:31:14Z,Shell,24446
9,piernik-dev,piernik,1010.005,2013-06-14T11:31:14Z,Roff,7201


In [24]:
#write the collected table to disk (without the index column) so the work above
#does not have to be redone
df.to_csv(path_or_buf='language_data_with_dates.csv', index=False)