In [1]:
import datetime
from dateutil.relativedelta import relativedelta
import pandas as pd
import pprint

In [2]:
# Obtain the GitHub personal access token needed to use the API.
# A local `secret` module is tried first; otherwise the token is asked for
# interactively (getpass does not echo what is typed).
import getpass

try:
    from secret import GITHUB_TOKEN
except ImportError:
    # ImportError covers both a missing `secret` module (ModuleNotFoundError is
    # a subclass of it) and a `secret` module that does not define GITHUB_TOKEN.
    GITHUB_TOKEN = getpass.getpass("Introduce your personal access token to acces the GitHub API: ")

Introduce your personal access token to acces the GitHub API:  ········································


In [3]:
#!pip install PyGithub
from github import Github

# Client built from the token obtained above; the following cells call
# g.get_repo(...) on it to reach the repositories.
g = Github(GITHUB_TOKEN)
# Printing only shows the object's repr, as a quick check that it was created.
print(g)

<github.MainClass.Github object at 0x0000022D7B810848>


In [4]:
def get_repo_description(repo):
    '''
    Summarise a repository as a flat list.

    input: repo PyGitHub object
    Output: list with [full name, creation date, date of last push,
            number of stars, license title]

    The license title is the first line of the decoded content returned by
    repo.get_license(), stripped of surrounding whitespace. It falls back to
    an empty string when that content cannot be fetched or decoded.
    '''
    try:
        license_title = repo.get_license().decoded_content.decode().split('\n')[0].strip()
    except Exception:
        # Best effort: an unreadable license must not abort the summary.
        # Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit through.
        license_title = ''
    return [repo.full_name, repo.created_at, repo.pushed_at, repo.stargazers_count, license_title]

In [5]:
def commits_processing(repo, bh_start, bh_end):
    '''
    Count the commits of a repository before, during and after an event.

    input: repo PyGitHub object
        bh_start Initial date of the event (inclusive)
        bh_end final date of the event (inclusive)
    Output: list with the number of commits [before, during, after, total]

    Timezone-aware datetimes are converted to naive UTC before comparing, so
    naive event bounds can be combined with aware commit dates (and the other
    way round). Naive values are used as they are, i.e. they are taken as UTC.
    '''

    def _naive_utc(moment):
        # Comparing an aware datetime with a naive one raises TypeError, so
        # everything is brought to the same naive representation first.
        if moment.utcoffset() is None:
            return moment
        return moment.astimezone(datetime.timezone.utc).replace(tzinfo=None)

    start = _naive_utc(bh_start)
    end = _naive_utc(bh_end)

    before = during = after = 0
    for commit in repo.get_commits():
        authored_at = _naive_utc(commit.commit.author.date)
        if authored_at < start:
            before += 1
        elif authored_at <= end:
            # Not before the start and not past the end: inside the event.
            during += 1
        else:
            after += 1

    return [before, during, after, before + during + after]

In [6]:
# Event window used for the 2020 repositories, as naive datetimes:
# from 9 Nov 2020 00:00:00 through 13 Nov 2020 23:59:59.
BH_2020_START = datetime.datetime(2020, 11, 9, 0, 0, 0)
BH_2020_END = datetime.datetime(2020, 11, 13, 23, 59, 59)
BH_2020_START

datetime.datetime(2020, 11, 9, 0, 0)

In [7]:
# Fetch one repository and show its summary list:
# [full name, created at, pushed at, stars, first line of the license]
repo = g.get_repo('zbmed/BioHackOutcomes')
get_repo_description(repo)

['zbmed/BioHackOutcomes',
 datetime.datetime(2020, 11, 4, 15, 39, 6),
 datetime.datetime(2020, 11, 12, 16, 18, 13),
 0,
 'Apache License']

In [8]:
# Same summary for the 2020 projects repository; `repo` is rebound here and
# is the repository used by the commit counting in the next cell.
repo = g.get_repo('elixir-europe/BioHackathon-projects-2020')
get_repo_description(repo)

['elixir-europe/BioHackathon-projects-2020',
 datetime.datetime(2020, 6, 22, 15, 22, 33),
 datetime.datetime(2020, 11, 12, 17, 53, 45),
 20,
 '']

In [9]:
# Commit counts [before, during, after, total] relative to the 2020 event window
commits_processing(repo, BH_2020_START, BH_2020_END)

[90, 147, 0, 237]

In [None]:
#ToDo: change the way to parse the license

# Dictionary view of the repository summary, keyed by display label.
repo_description = {"Full name:" : repo.full_name, "Description:" : repo.description,
                    "Date created:" : repo.created_at, "Date of last push:" : repo.pushed_at,
                    "Number of stars:" : repo.stargazers_count}
try:
    # First line of the decoded license content, without surrounding whitespace
    repo_description["License:"] = (repo.get_license().decoded_content.decode().split('\n')[0]).strip()
except Exception:
    # Best effort: the "License:" entry is simply left out when the license
    # cannot be read. `Exception` (instead of a bare except) keeps
    # KeyboardInterrupt/SystemExit working.
    pass
repo_description


In [None]:
#default branch (either master or main) but we do not need to go for a branch, we can go for the repo
#branch = repo.default_branch

#Example
repo = g.get_repo('elixir-europe/BioHackathon-projects-2020')

#For each of the following, we will need to do some aggregation
#For now let's go only with commits
#lst = repo.get_downloads()
commit_lst = repo.get_commits()
#lst = repo.get_forks()
#lst = repo.get_releases()
#lst = repo.get_issues()

#Response is a paginated list, will a simple loop work?
#It looks like it, but if there are too many we will run above the GitHub API limit...
#ToDo: find a better way to traverse the list

#Show the list that was actually fetched (`lst` is never assigned in this cell)
commit_lst

In [None]:
#commit is what we need, from it we get the author name and the date
print(commit_lst[0].commit.author.name, commit_lst[0].commit.author.date) 

#next we want to aggregate committers and commits by dates
#at the end we want to get a dataframe with the following columns:
#  name, created_at, last_push, license, total_stars,
#  total_commits, commits_before, commits_during, commits_after,
#  total_committers, committers_before, committers_during, committers_after
#First, let's get the description columns and the ones for commits
#Second, let's add (to the same dataframe) the columns for committers (those need an extra aggregation by committer)

# Result with added Contributors in DataFrame

In [10]:
# Event window used for the 2019 repository, as naive datetimes:
# from 18 Nov 2019 00:00:00 through 22 Nov 2019 23:59:59.
BH_2019_START = datetime.datetime(2019, 11, 18, 0, 0, 0)
BH_2019_END = datetime.datetime(2019, 11, 22, 23, 59, 59)

# Point `repo` at the 2019 projects repository and count its commits
# [before, during, after, total] relative to that window.
repo = g.get_repo('elixir-europe/BioHackathon-projects-2019')
statistics_hackaton = commits_processing(repo, BH_2019_START, BH_2019_END)

In [22]:
def commits_processing_contributors(repo, bh_start, bh_end):
    '''
    Count commits and collect commit authors before, during and after an event.

    input: repo PyGitHub object
        bh_start Initial date of the event (inclusive)
        bh_end final date of the event (inclusive)
    Output: 1. list with the number of commits [before, during, after, total]
            2. list with three sets of author names [before, during, after]
            3. set with all authors that contributed

    Authors are identified by the author name stored in each commit, so the
    same name counts once per set. Timezone-aware datetimes are converted to
    naive UTC before comparing, so naive event bounds can be combined with
    aware commit dates (and the other way round). Naive values are used as
    they are, i.e. they are taken as UTC.
    '''

    def _naive_utc(moment):
        # Comparing an aware datetime with a naive one raises TypeError, so
        # everything is brought to the same naive representation first.
        if moment.utcoffset() is None:
            return moment
        return moment.astimezone(datetime.timezone.utc).replace(tzinfo=None)

    start = _naive_utc(bh_start)
    end = _naive_utc(bh_end)

    commit_counts = [0, 0, 0] #before, during, after
    total_contributors = [set(), set(), set()] #before, during, after
    overall_contributors = set()

    for commit in repo.get_commits():
        git_author = commit.commit.author
        author = str(git_author.name)
        authored_at = _naive_utc(git_author.date)

        # Index of the phase this commit belongs to: 0 before, 1 during, 2 after
        if authored_at < start:
            phase = 0
        elif authored_at <= end:
            phase = 1
        else:
            phase = 2

        commit_counts[phase] += 1
        total_contributors[phase].add(author)
        overall_contributors.add(author)

    total_commits = commit_counts + [sum(commit_counts)]
    return total_commits, total_contributors, overall_contributors

In [23]:
# Rebinds statistics_hackaton to the 3-tuple
# (commit counts, per-phase author sets, set of all authors)
statistics_hackaton = commits_processing_contributors(repo, BH_2019_START, BH_2019_END)

In [35]:
# Number of distinct author names with commits before the event
contributors_before = len(statistics_hackaton[1][0])

In [36]:
# Number of distinct author names with commits during the event
contributors_during = len(statistics_hackaton[1][1])

In [37]:
# Number of distinct author names with commits after the event
contributors_after = len(statistics_hackaton[1][2])

In [38]:
# Number of distinct author names over all commits
contributors_overall = len(statistics_hackaton[2])

In [41]:
#DataFrame data

# Build the description once: every get_repo_description call asks the API
# for the license again, so calling it per field repeats that request.
repo_summary = get_repo_description(repo)
repo_name = repo_summary[0]
license = repo_summary[4]

# statistics_hackaton[0] holds the commit counts [before, during, after, total]
before_hackaton, during_hackaton, after_hackaton, total_hackaton = statistics_hackaton[0]


## Output

In [51]:
#Dataframe
# One-row summary table: the single-element list sets the number of rows and
# the scalar values are placed in that same row.
summary_columns = {
    "BioHackaton Repository" : [repo_name],
    "License" : license,
    "Commits before BH": before_hackaton,
    "Commits during BH": during_hackaton,
    "Commits after BH": after_hackaton,
    "Total Commits" : total_hackaton,
    "Contributors before BH" : contributors_before,
    "Contributors during BH" : contributors_during,
    "Contributors after BH" : contributors_after,
    "Total Contributors" : contributors_overall,
}
exportable_dataframe = pd.DataFrame(summary_columns)
exportable_dataframe

Unnamed: 0,BioHackaton Repository,License,Commits before BH,Commits during BH,Commits after BH,Total Commits,Contributors before BH,Contributors during BH,Contributors after BH,Total Contributors
0,elixir-europe/BioHackathon-projects-2019,MIT License,114,435,82,631,30,32,10,54


## Export to CSV

In [52]:
# Write the summary table to a CSV file in the working directory, with the
# column names as header row and without the index column.
exportable_dataframe.to_csv('BioHackaton_stats_with_contributors.csv', index = False, header=True)