In [11]:
# load packages 
import os
import psycopg2 as pg
from sqlalchemy import create_engine
import pandas as pd
import requests as r
import string 
import json
import base64
import urllib.request
import itertools 
import numpy as np
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from github import Github, RateLimitExceededException, BadCredentialsException, BadAttributeException, GithubException, UnknownObjectException, BadUserAgentException
import warnings
import datetime

import multiprocessing
#from multiprocessing.pool import ThreadPool as Pool
from multiprocessing import Pool, freeze_support

import concurrent.futures

warnings.simplefilter(action='ignore', category=FutureWarning)

In [12]:
# connect to the database, download data, limit to repos with at least 20,000 commits?
connection = pg.connect(host = 'postgis1', database = 'sdad', 
                        user = os.environ.get('db_user'), 
                        password = os.environ.get('db_pwd'))

#PATs access token, saved as a dataframe
github_pats = '''SELECT * FROM gh_2007_2020.pats_update'''
github_pats = pd.read_sql_query(github_pats, con=connection)
access_tokens = github_pats["token"]
num_token = '''SELECT COUNT(*) FROM gh_2007_2020.pats_update'''
num_token = pd.read_sql_query(num_token, con=connection)
num_token=num_token.iloc[0]['count']

# grab distinct users 
distinct_users = '''SELECT * FROM gh.ctrs_from_commits_lchn'''
distinct_users = pd.read_sql_query(distinct_users, con=connection)
#get rid of leading and ending space, save users to a list
users_list = distinct_users["login"].tolist()
logins = []
for s in users_list:
    logins.append(s.strip())  
print(len(logins))
print(logins[0], logins[len(logins)-1])

connection.close()

3260612
kerneq sosterbind


In [13]:
# index ranges from 0 to maximum number of PATs available
def get_access_token(github_pat_index):
    if github_pat_index < num_token:
        return github_pats.token[github_pat_index]
    else:
        print("token exceed limit")

In [14]:
def pull_user_stats(users, github_pat_index):
    df_user_stats = pd.DataFrame()
    for user in users:
        if github_pat_index >= len(access_tokens):
            github_pat_index -= len(access_tokens)
            print("***PAT access token exceed limit, restart access token loop with #", github_pat_index)
        while github_pat_index < len(access_tokens):
            try:
                access_token = get_access_token(github_pat_index)
                #print("Scrapping --", user,". Extracting access token #", github_pat_index+1,", total", num_token, "tokens are available.")
                #if false, retry until true, max number of retry is 20 times
                g = Github(access_token, retry = 20, timeout = 15)
                user = g.get_user(user)
                df_user_stats = df_user_stats.append({
                    "login": user.login, "name": user.name, "email": user.email, "bio": user.bio,
                    "company": user.company, "location": user.location,
                    "created_at": user.created_at.strftime("%m/%d/%Y %H:%M:%S"),
                    "blog": user.blog, "orgs_url": user.organizations_url,
                    "collaborators": user.collaborators, "repos_url": user.repos_url,
                    "gists_public": user.public_gists, "gists_private": user.private_gists,
                    "repos_public": user.public_repos, "repos_private": user.total_private_repos,
                    "followers": user.followers, "followers_url": user.followers_url,
                    "following": user.following, "following_url": user.following_url
                }, ignore_index = True)
            except RateLimitExceededException as e:
                print(e.status)
                print('Rate limit exceeded --', user, ", using access token #", github_pat_index)
                print("Current time:", datetime.datetime.now())
                #time.sleep(300)
                github_pat_index+=1
                print("***Exit current access token, proceed with next aceess token #", github_pat_index, "rescrape --",user)
                break
            except BadCredentialsException as e:
                print(e.status)
                print('Bad credentials exception --', user, ", using access token #", github_pat_index)
                print("Current time:", datetime.datetime.now())
                github_pat_index+=1
                print("***Exit current access token, proceed with next aceess token #", github_pat_index, "rescrape --",user)
                break
            except UnknownObjectException as e:
                print(e.status)
                print('Unknown object exception --', user)
                break
            except GithubException as e:
                print(e.status)
                print('General exception --', user)
                break
            except r.exceptions.ConnectionError as e:
                print('Retries limit exceeded --', user)
                print(str(e))
                time.sleep(10)
                continue
            except r.exceptions.Timeout as e:
                print('Time out exception --', user)
                print(str(e))
                time.sleep(10)
                continue
            break
    return df_user_stats

In [6]:
logins_subset = logins[3150001:3200000]
len(logins_subset)

49999

In [None]:
start_time = datetime.datetime.now()
print("Start scraping:", start_time)
#specify the index of pat you want use to start scraping
df_user_stats = pull_user_stats(logins_subset, 10) 
df_user_stats = df_user_stats[['login', 'name', 'email', 'bio', 'company', 'location','created_at','blog',
                               'orgs_url','collaborators','gists_public','gists_private','repos_public',
                               'repos_private','repos_url','followers','followers_url','following','following_url']]
df_user_stats.to_csv(r'/project/biocomplexity/sdad/projects_data/ncses/oss/github_user_data/76_github_user_data.csv', index = False)
end_time =  datetime.datetime.now()
print("Finished scraping", len(df_user_stats), "of", len(logins_subset), "records at", end_time)
print("It took", end_time-start_time, "to run.")

In [32]:
import os
import glob 
os.chdir('/project/biocomplexity/sdad/projects_data/ncses/oss/github_user_data/')
all_filenames = [i for i in glob.glob('*_github_user_data.csv')]
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames ])
combined_csv['login'].unique

<bound method Series.unique of 0                 Arro
1            Surge1223
2            matcarter
3             nirajsha
4        jkarnasiewicz
             ...      
48693      claude-zhou
48694    shixiaohu2206
48695          L1cardo
48696    dfernandezlop
48697         ptsneves
Name: login, Length: 3098913, dtype: object>

In [33]:
already_scraped = combined_csv['login'].tolist()
already_scraped = ~distinct_users.login.isin(already_scraped)
distinct_users = distinct_users[already_scraped]
distinct_users.count()

login    194764
dtype: int64

In [34]:
users_list = distinct_users["login"].tolist()
logins_leftovers = []
for s in users_list:
    logins_leftovers.append(s.strip())  
logins_leftovers = logins_leftovers[65001:130000]
print(len(logins_leftovers))

64999


In [None]:
start_time = datetime.datetime.now()
print("Start scraping:", start_time)
#specify the index of pat you want use to start scraping
df_user_stats = pull_user_stats(logins_leftovers, 10) 
df_user_stats = df_user_stats[['login', 'name', 'email', 'bio', 'company', 'location','created_at','blog',
                               'orgs_url','collaborators','gists_public','gists_private','repos_public',
                               'repos_private','repos_url','followers','followers_url','following','following_url']]
df_user_stats.to_csv(r'/project/biocomplexity/sdad/projects_data/ncses/oss/github_user_data/80_github_user_data.csv', index = False)
end_time =  datetime.datetime.now()
print("Finished scraping", len(df_user_stats), "of", len(logins_leftovers), "records at", end_time)
print("It took", end_time-start_time, "to run.")
