In [1]:
%%capture
%run ../path_manager.ipynb

In [2]:
import pandas as pd
import requests
from pprint import pprint
from datetime import datetime, timezone
from githubgql import gql_query, gql_get_repo_readme
import pymongo
import json
import time

In [3]:
# Connection settings for the local MongoDB instance that stores the crawl results.
MONGO_PORT = 27018
MONGO_DB_NAME = 'nlp'
MONGO_COLLECTION_NAME = 'github-bq'

mongo = pymongo.MongoClient(port=MONGO_PORT)
db = mongo[MONGO_DB_NAME]
# One document per repository is upserted into this collection further down.
collection = db[MONGO_COLLECTION_NAME]

In [4]:
%%time
# Load the per-repository module usage export and split it by script language.
mdf = pd.read_csv('./data/module_usage_per_repository/module_usage_per_repository_20191120233530.csv.gz')

# NOTE(review): 'IPyton' looks like a misspelling of 'IPython' — confirm against
# the actual values of `script_type` before changing it.
is_python_row = mdf['script_type'].isin(['Python', 'IPyton'])
is_r_row = mdf['script_type'].isin(['R'])

py_mdf = mdf[is_python_row]
r_mdf = mdf[is_r_row]

CPU times: user 7.34 s, sys: 627 ms, total: 7.96 s
Wall time: 7.96 s


In [5]:
%%time
# For every R repository, collect its non-null module names into a list
# (result: a Series indexed by repo_name whose values are lists).
r_module_docs = (
    r_mdf
    .groupby('repo_name')
    .apply(lambda repo_rows: repo_rows['module'].dropna().tolist())
)

CPU times: user 5.96 s, sys: 319 ms, total: 6.28 s
Wall time: 5.87 s


In [6]:
%%time
# For every Python repository, collect its non-null module names into a list
# (result: a Series indexed by repo_name whose values are lists).
py_module_docs = (
    py_mdf
    .groupby('repo_name')
    .apply(lambda repo_rows: repo_rows['module'].dropna().tolist())
)

CPU times: user 2min 58s, sys: 4.64 s, total: 3min 3s
Wall time: 2min 51s


In [7]:
# Sanity check: repositories whose module list is empty after dropping nulls.
has_no_modules = py_module_docs.map(len) == 0
py_module_docs[has_no_modules]

repo_name
ChaiTeaNunes/python-struct                 []
ChaiTeaNunes/python_struct                 []
Distrotech/kmod                            []
GalliumOS/kmod                             []
ImageMagick/PythonMagick                   []
                                           ..
tokyo-jesus/funk                           []
tubia/accollo                              []
voanhcuoc/tcg                              []
volosovich/Monokai-ST3-theme-for-vscode    []
xj9/funk                                   []
Length: 67, dtype: object

# Store R and Python repositories in MongoDB

In [8]:
# Stack the R and Python module lists into one Series (R entries first).
# `Series.append` was deprecated in pandas 1.4 and removed in 2.0; `pd.concat`
# produces the same result on every pandas version.
# A repository that uses both languages appears twice in the resulting index.
module_docs = pd.concat([r_module_docs, py_module_docs])
module_docs.head()

repo_name
00tau/skyline-addon-easyqc           [chron, ggplot2, plyr]
0111001101111010/cs595-f13                           [grid]
02N/mal                                          [rdyncall]
06122010/r_isi                [RSQLite, ggplot2, plyr, DBI]
0gajun/mal                                       [rdyncall]
dtype: object

In [9]:
# Unique repository names in first-seen order; a repository present in both the
# R and the Python series would otherwise be listed twice.
repo_names = module_docs.index.drop_duplicates(keep='first')

In [40]:
def check_rate_limit(rate_limit):
    """Sleep until the API quota resets when it is almost exhausted.

    Parameters
    ----------
    rate_limit : dict
        Mapping with a numeric 'remaining' entry and a 'resetAt' timestamp
        string (e.g. '2019-11-24T04:45:28Z').

    Returns
    -------
    None
        Returns immediately while at least 2 calls remain; otherwise logs a
        message through `notify` and blocks until one second after 'resetAt'.
    """
    if rate_limit['remaining'] >= 2:
        return

    reset_at = rate_limit['resetAt']
    now_utc = pd.Timestamp(datetime.now(tz=timezone.utc))
    # Parse as UTC explicitly so the subtraction is always between tz-aware values.
    seconds_until_reset = (pd.to_datetime(reset_at, utc=True) - now_utc).total_seconds()

    # Keep a one-second safety margin, but never go negative: if the reset time
    # is already in the past, time.sleep() would raise ValueError and abort the run.
    sleep_seconds = max(0.0, seconds_until_reset + 1)
    notify('running.log', f'Rate limit below threshold. Rate refreshes at {reset_at} and the program will sleep for {sleep_seconds} seconds.', verbose=True)
    time.sleep(sleep_seconds)

In [41]:
# Work out which repositories can be skipped on this run: those already stored
# in MongoDB plus those that previously produced an entry in errors.log.
stored_repos = {doc['_id'] for doc in collection.find({}, projection=['_id'])}
err_repos = set()

# errors.log only exists once at least one error has been logged, so a missing
# file simply means "no known failures yet".
try:
    with open('errors.log') as err_log:
        for line in err_log:
            # The repository name is the 4th ':'-separated field of a line.
            # Presumably each line starts with the 'YYYY-MM-DD HH:MM:SS.ffffff: '
            # prefix seen in notify's console output — verify against `notify`.
            fields = line.split(':')
            if len(fields) > 3 and fields[3].strip():
                err_repos.add(fields[3].strip())
except FileNotFoundError:
    pass

processed_repos = stored_repos.union(err_repos)
len(stored_repos), len(err_repos), len(processed_repos)

(69785, 7141, 76926)

In [None]:
%%time
# Fetch repository metadata and README through the GraphQL helper for every
# repository that has not been handled yet, and upsert one MongoDB document
# per repository.
for ix, repo_name in enumerate(repo_names):
    # Skip repositories already stored or already recorded in errors.log.
    if repo_name in processed_repos:
        continue

    notify('running.log', f'{ix + 1}. {repo_name}', verbose=False)
    r_modules = r_module_docs[repo_name] if repo_name in r_module_docs.index else []
    py_modules = py_module_docs[repo_name] if repo_name in py_module_docs.index else []

    owner, name = repo_name.split('/')
    result = gql_get_repo_readme(owner=owner, name=name)

    # A 'request' entry is treated as a failed call: log whatever the response
    # carries, back off briefly and move on to the next repository.
    if 'request' in result:
        request = result['request']
        try:
            request_error = json.dumps(request.json())
        except Exception:
            # Body is not JSON: fall back to the raw content. `Exception` (not a
            # bare except) so Ctrl-C / SystemExit can still stop the long run.
            request_error = request.content
            if isinstance(request_error, bytes):
                # errors='replace' keeps a malformed body from aborting the loop.
                request_error = request_error.decode('utf-8', errors='replace')

        notify('request_errors.log', f'{repo_name} :: {request_error}', verbose=False)
        time.sleep(5)
        continue

    rate_limit = result['data']['rateLimit']

    notify('rate_limit.log', f'{repo_name} :: {json.dumps(rate_limit)}', verbose=False)
    if 'errors' in result:
        errors = json.dumps(result['errors'])
        notify('errors.log', f'{repo_name} :: {errors}', verbose=False)
        check_rate_limit(rate_limit)
        continue

    # Only build the frame once the response is known to be error-free.
    repo_data = pd.DataFrame(result['data']['repository'])

    # Drop NaNs row by row and collapse with mode(); presumably this flattens the
    # nested fields into a single record row — TODO confirm (the author's "hax").
    repo_data = repo_data.apply(lambda x: x.dropna(), axis=1).mode()  # hax
    readme_cols = repo_data.columns[repo_data.columns.str.startswith('readme')]
    if len(readme_cols) > 0:
        # Expose the first readme* column as 'readme' and drop the readme* originals.
        readme_col = readme_cols[0]
        repo_data['readme'] = repo_data[readme_col]
        repo_data = repo_data[repo_data.columns.difference(readme_cols)]

    repo_data[['watchers', 'stargazers']] = repo_data[['watchers', 'stargazers']].astype(int)

    document = repo_data.to_dict('records')[0]

    document['owner'] = owner
    document['name'] = name
    document['py_libs'] = py_modules
    document['r_libs'] = r_modules
    document['_id'] = repo_name

    now = datetime.now(tz=timezone.utc).isoformat()
    document['last_updated_date'] = now

    # Upsert: 'insertion_date' is written only on first insert, every other
    # field is refreshed on each run.
    collection.update_one(
        {"_id": document["_id"]},
        {
            "$setOnInsert": {"insertion_date": now},
            "$set": document
        },
        upsert=True,
    )

    check_rate_limit(rate_limit)
    
#     if rate_limit['remaining'] < 2:
#         reset_at = rate_limit['resetAt']
#         sleep_seconds = (pd.to_datetime(reset_at) - pd.to_datetime(datetime.now(tz=timezone.utc).isoformat())).total_seconds()
#         sleep_seconds = sleep_seconds + 1
#         notify('running.log', f'Rate limit below threshold. Rate refreshes at {reset_at} and the program will sleep for {sleep_seconds} seconds.', verbose=True)
#         time.sleep(sleep_seconds)

2019-11-24 22:31:12.095258: Rate limit below threshold. Rate refreshes at 2019-11-25T03:48:21Z and the program will sleep for 1029.904909 seconds.
2019-11-24 23:29:38.614375: Rate limit below threshold. Rate refreshes at 2019-11-25T04:56:56Z and the program will sleep for 1638.385785 seconds.
2019-11-25 00:44:47.668792: Rate limit below threshold. Rate refreshes at 2019-11-25T05:56:57Z and the program will sleep for 730.331379 seconds.
2019-11-25 01:45:04.920475: Rate limit below threshold. Rate refreshes at 2019-11-25T06:56:58Z and the program will sleep for 714.079689 seconds.
2019-11-25 02:39:06.112695: Rate limit below threshold. Rate refreshes at 2019-11-25T07:56:59Z and the program will sleep for 1073.887472 seconds.
2019-11-25 03:39:31.529644: Rate limit below threshold. Rate refreshes at 2019-11-25T09:35:02Z and the program will sleep for 3331.470517 seconds.
2019-11-25 05:23:25.998241: Rate limit below threshold. Rate refreshes at 2019-11-25T10:35:03Z and the program will slee

In [43]:
# Total number of distinct repositories the crawl loop iterates over.
len(repo_names)

577320

# Old version

In [None]:
# %%time
# for ix, repo_name in enumerate(r_module_docs.index):
#     if repo_name in processed_repos:
#         continue
        
#     notify('running.log', f'{ix + 1}. {repo_name}', verbose=False)
#     r_modules = r_module_docs[repo_name]
#     py_modules = py_module_docs[repo_name] if repo_name in py_module_docs.index else []
    
#     owner, name = repo_name.split('/')
#     result = gql_get_repo_readme(owner=owner, name=name)

#     repo_data = pd.DataFrame(result['data']['repository'])
#     rate_limit = result['data']['rateLimit']
    
#     notify('rate_limit.log', f'{repo_name} :: {json.dumps(rate_limit)}', verbose=False)
#     if 'errors' in result:
#         errors = json.dumps(result['errors'])
#         notify('errors.log', f'{repo_name} :: {errors}', verbose=False)
#         continue

#     if 'text' in repo_data.index:
#         text = repo_data.loc['text'].dropna()
#         rtype = text.index[text.index.str.startswith('readme')]
#         text = text.rename(index={rtype[0]: 'readme', 'updatedAt': 'repo_updated_at'})
#     else:
#         text = repo_data.loc['id'].dropna()
#         text['readme'] = ''
#         text = text.rename(index={'updatedAt': 'repo_updated_at'})

#     text['owner'] = owner
#     text['name'] = name
#     text['Python'] = py_modules
#     text['R'] = r_modules
#     text['_id'] = repo_name
#     document = text.to_dict()

#     now = datetime.now(tz=timezone.utc).isoformat()
#     document['last_updated_date'] = now
    
#     collection.update_one(
#         {"_id": document["_id"]},
#         {
#             "$setOnInsert": {"insertion_date": now},
#             "$set": document
#         },
#         upsert=True,
#     )
    
#     if rate_limit['remaining'] < 2:
#         reset_at = rate_limit['resetAt']
#         sleep_seconds = (pd.to_datetime(reset_at) - pd.to_datetime(datetime.now(tz=timezone.utc).isoformat())).total_seconds()
#         sleep_seconds = sleep_seconds + 5
#         notify('running.log', f'Rate limit below threshold. Rate refreshes at {reset_at} and the program will sleep for {sleep_seconds} seconds.', verbose=True)
#         time.sleep(sleep_seconds)

# Raw query examples

In [None]:
# "mcaceresb", name: "stata-gtools"
# Fetch a single repository by hand to inspect the raw response structure.
result = gql_get_repo_readme(owner='mcaceresb', name='stata-gtools')
# Alternative example — a repository without a README:
# result = gql_get_repo_readme(owner='LisaNeef', name='ClimateInGermany')  # No readme

In [None]:
# Stand-alone copy of the per-repository transformation used in the crawl loop,
# applied to the manually fetched `result`.
repo_data = pd.DataFrame(result['data']['repository'])

# Drop NaNs row by row and collapse with mode(); presumably this flattens the
# nested fields into a single record row — TODO confirm (the author's "hax").
repo_data = repo_data.apply(lambda x: x.dropna(), axis=1).mode()  # hax
# Columns whose name starts with 'readme' (the sample response shows readme0, readme1, ...).
readme_cols = repo_data.columns[repo_data.columns.str.startswith('readme')]
if len(readme_cols) > 0:
    # Expose the first readme* column as 'readme' and drop the readme* originals.
    readme_col = readme_cols[0]
    repo_data['readme'] = repo_data[readme_col]
    repo_data = repo_data[repo_data.columns.difference(readme_cols)]
# Store the two counters as integers.
repo_data[['watchers', 'stargazers']] = repo_data[['watchers', 'stargazers']].astype(int)

In [3]:
# Example query for a repository whose README is a README.md file;
# pretty-print the raw response to inspect its structure.

result = gql_get_repo_readme(owner='lfkrebs', name='stata-cookbook')
pprint(result)

{'data': {'rateLimit': {'cost': 1,
                        'limit': 5000,
                        'remaining': 4999,
                        'resetAt': '2019-11-24T04:45:28Z'},
          'repository': {'description': 'This is the Stata cookbook for '
                                        '“Introduction to Data Science” in the '
                                        'M.Sc. Public Policy & Human '
                                        'Development.',
                         'fork_count': 27,
                         'homepage_url': 'https://www.maastrichtuniversity.nl/education/master/master-public-policy-and-human-development',
                         'languages': {'edges': [{'node': {'name': 'Stata'}}]},
                         'license_info': {'name': 'MIT License'},
                         'owner': {'id': 'MDQ6VXNlcjExNTQ1Mzgy'},
                         'primary_language': {'name': 'Stata'},
                         'readme0': None,
                         'readme1': None

In [None]:
# # README.rst

# result = gql_get_repo_readme(owner='Ogeon', name='Climate-Visualizer')
# pprint(result)

In [None]:
# import certifi
# certifi.where()

In [None]:
# import os
# os.environ['REQUESTS_CA_BUNDLE'] = os.path.join(
#     '/etc/ssl/certs/',
#     'ca-bundle.crt')

# os.environ['REQUESTS_CA_BUNDLE'] = os.path.join(
#     '/home/wb536061/wbes2474/NLP/SCRIPTS/github',
#     'intermediate-certs.pem')

# os.environ['SSL_CERT_FILE'] = os.path.join(
#     '/home/wb536061/wbes2474/NLP/SCRIPTS/github',
#     'wbg-github.cer')

In [None]:
# request.content
# b'{"message":"This endpoint requires you to be authenticated.","documentation_url":"https://developer.github.com/v3/#authentication"}'