In [257]:
import pandas as pd
import requests
from requests.exceptions import HTTPError
from json.decoder import JSONDecodeError
from pprint import pprint
import base64
from github import Github

'''

It is suggested by Github to use the authenticated requests, to avoid a RateLimitExceededException
if you use the public request (without authentication) and exceed several number of requests.

'''

token = 'GitHub_authentication_token'

## JSON

In [258]:
# username
username = "elixir-europe"

#hackatons
biohackatons = ("BioHackathon-projects-2019" ,"BioHackathon-projects-2020")

# setup owner name , access_token, and headers 
headers = {'Authorization':"Token "+ token}

# url for the request
projects_dict = {}

for biohackaton in biohackatons:
    url = f"https://api.github.com/repos/{username}/{biohackaton}"
    repo = requests.get(url,headers=headers).json() # returns a json from the request
    projects_dict[biohackaton] = repo 

# pretty print for the JSON
# pprint(projects_dict["BioHackathon-projects-2019"])

## PyGitHub

In [259]:
# Github username
username = "elixir-europe"

# pygithub object uses an access token
g = Github(token)

# gets the user
user = g.get_user(username)

# gets the repo
repo = user.get_repo("BioHackathon-projects-2019")

# readme file for all projects from BioHackaton 2019
projects = 34

## Fetch README.md 

In [260]:
'''
readme_dict - dictionary containing: 

keys - project numbers
values - README.md for each Biohackaton project

'''
#dir(repo) - gives the methods and attributes that can be used

readme_dict = {}

for project in range(1, projects + 1):
    readme = repo.get_contents(f"/projects/{project}/README.md")
    readme_dict[project] = readme.decoded_content.decode()

## Fetch Repo Details

In [None]:
list(repo.get_labels())

#### Repo branches

In [375]:
repo_branches = list(repo.get_branches())

for i, e in enumerate(repo_branches):
    print("Branch", i, e)

print(commit.commit.author)
print("overall commits", repo.get_commits().totalCount)

Branch 0 Branch(name="11-text-mining-sw")
Branch 1 Branch(name="29-semantic-recommendation-system-for-life-sciences-publications")
Branch 2 Branch(name="dependabot/npm_and_yarn/projects/29/src/client/bin-links-1.1.8")
Branch 3 Branch(name="dependabot/npm_and_yarn/projects/29/src/client/handlebars-4.7.6")
Branch 4 Branch(name="dependabot/npm_and_yarn/projects/29/src/client/lodash-4.17.19")
Branch 5 Branch(name="dependabot/npm_and_yarn/projects/29/src/client/npm-6.14.6")
Branch 6 Branch(name="dependabot/npm_and_yarn/projects/29/src/client/npm-registry-fetch-4.0.5")
Branch 7 Branch(name="dependabot/npm_and_yarn/projects/29/src/client/npm-user-validate-1.0.1")
Branch 8 Branch(name="dependabot/npm_and_yarn/projects/29/src/client/websocket-extensions-0.1.4")
Branch 9 Branch(name="dependabot/npm_and_yarn/projects/29/src/web_server/elliptic-6.5.3")
Branch 10 Branch(name="dependabot/npm_and_yarn/projects/29/src/web_server/lodash-4.17.19")
Branch 11 Branch(name="dependabot/npm_and_yarn/projects/

## Commits

In [414]:
from datetime import datetime
from dateutil.relativedelta import relativedelta
import pandas as pd
import pprint

def commits_processing(repo):
    
    '''
    input: repo PyGitHub object
    Output: 1. list with overall commit days 
            2. set with all authors that contributed
    
    '''

    master = repo.get_branch("master")

    commits = list(repo.get_commits(sha="master"))

    commit_days = []
    authors = set()
    
    
    for commit in commits:
        authors.add(str(commit.commit.author))
        timeObj = commit.commit.author.date
        timeStamp = timeObj.strftime("%d-%m-%Y")
        commit_days.append(timeStamp)

    return sorted(commit_days, key = lambda date: datetime.strptime(date, "%d-%m-%Y")) , authors

overall_commit_days = commits_processing(repo)

In [390]:
def datetime_interval_processer(start_date: str, end_date: str, datetime_list: list):
    
    '''
    Input: hackaton start date
           hackaton end date
           dates of commits
           
    output: 1 a list for all commits before the hackaton days
            2 a list for all commits during the hackaton days
            3 a list for all commits after the hackaton days
    '''
    
    start = datetime.strptime(start_date, "%d-%m-%Y")
    end = datetime.strptime(end_date, "%d-%m-%Y")

    datetime_object = [datetime.strptime(date, '%d-%m-%Y') for date in datetime_list] 
    #dates are needed to be datetime objects to be able to compare them

    before_dates = []
    in_between_dates = []
    after_dates = []
    
    for d in datetime_object:
            
        if d < start:
            before_dates.append((str(d)[:10]))
            
        if d >= start and d <= end:
            in_between_dates.append(str(d)[:10])
            
        if d > end:
        
            after_dates.append((str(d)[:10]))
             
    return before_dates, in_between_dates, after_dates

biohackaton_2019 = datetime_interval_processer("18-11-2019", "22-11-2019", overall_commit_days[0])

In [391]:
def datetime_to_dict(datetime_list: list):
    
    '''
    Dictionary holding all commits for a certain period of time

    Input: list with dates, datetime format
    Output: a dictionary with:
    -keys = dates
    -values = number of commits
    
    
    '''
    hackaton_dict = {}
    
    for date in datetime_list:
        if date in hackaton_dict:
            hackaton_dict[date] += 1
        else: 
            hackaton_dict[date] = 1
            
    return hackaton_dict


In [487]:

def dict_to_df(input_dict):

    exportable_sheet = pd.DataFrame.from_dict(input_dict, orient='index', columns = ["Number of Commits"])
    exportable_sheet = exportable_sheet.rename_axis('Hacking days').reset_index()
    exportable_sheet.loc['Total', "Number of Commits"] = exportable_sheet["Number of Commits"].sum()

    return exportable_sheet



In [488]:
#Dictionary holding all commits before Hackaton Days

exportable_sheet_before = dict_to_df(datetime_to_dict(biohackaton_2019[0]))
exportable_sheet_before.tail()

Unnamed: 0,Hacking days,Number of Commits
13,2019-11-14,12.0
14,2019-11-15,20.0
15,2019-11-16,8.0
16,2019-11-17,15.0
Total,,114.0


In [489]:

#Dictionary holding all commits during Hackaton Days

exportable_sheet_during = dict_to_df(datetime_to_dict(biohackaton_2019[1]))
exportable_sheet_during

Unnamed: 0,Hacking days,Number of Commits
0,2019-11-18,14.0
1,2019-11-19,110.0
2,2019-11-20,114.0
3,2019-11-21,148.0
4,2019-11-22,49.0
Total,,435.0


In [491]:
#Dictionary holding all commits after Hackaton Days

exportable_sheet_after = dict_to_df(datetime_to_dict(biohackaton_2019[2]))
exportable_sheet_after.tail()


Unnamed: 0,Hacking days,Number of Commits
9,2020-01-22,4.0
10,2020-01-23,6.0
11,2020-02-13,8.0
12,2020-09-12,4.0
Total,,82.0


In [426]:
all_contributing_authors = {i.split("=")[1].replace(")", "") for i in overall_commit_days[1]}
all_contributing_authors

{'"6br"',
 '"Alasdair Gray"',
 '"Alex Kanitz"',
 '"Alexey Sokolov"',
 '"Anup Kumar"',
 '"Aurélien Luciani"',
 '"Björn Grüning"',
 '"Bérénice Batut"',
 '"Carlos Vega"',
 '"David Hoksza"',
 '"David Lagorce"',
 '"De"',
 '"DrYak"',
 '"Egon Willighagen"',
 '"Emma Schymanski"',
 '"Foivos Gypas"',
 '"Gurnoor Singh"',
 '"Hans Ienasescu"',
 '"Hervé Ménager"',
 '"Hirokazu Chiba"',
 '"Isuru Liyanage"',
 '"Ivan Blagoev Topolsky"',
 '"Ivan Mičetić"',
 '"Jon Ison"',
 '"Manuel Bernal Llinares"',
 '"Marc Hanauer"',
 '"Margarita Kopniczky"',
 '"Mateusz Kuzak"',
 '"Michael R. Crusoe"',
 '"Mustafa Anıl Tuncel"',
 '"Nicola Soranzo"',
 '"Piotr Gawron"',
 '"Ric"',
 '"Ricardo"',
 '"Tazro Inutano Ohta"',
 '"The Gitter Badger"',
 '"Tomas"',
 '"Toshiyuki Yokoyama"',
 '"Valentin Grouès"',
 '"Venkata P. Satagopam"',
 '"Victoria Dominguez del Angel"',
 '"Wolfgang Maier"',
 '"Yo Yehudi"',
 '"andreassteffen"',
 '"anilbey"',
 '"bonohu"',
 '"d-salgado"',
 '"dependabot[bot]"',
 '"fgypas"',
 '"gurnoor1990"',
 '"kpj"',
 

In [425]:

'''
input: a github.Repository object
output: a dictionary with repository description

'''


repo_description = {"Full name:" : repo.full_name, "Description:" : repo.description,
                    "Date created:" : repo.created_at, "Date of last push:" : repo.pushed_at,
                   "Home Page:" : repo.homepage, "Number of forks:" : repo.forks, 
                    "Number of stars:" : repo.stargazers_count}
try:
    repo_description["License:"] = base64.b64decode(repo.get_license().content.encode()).decode().split('\n')[0]
except:
    pass



In [423]:
repo_description

{'Full name:': 'elixir-europe/BioHackathon-projects-2019',
 'Description:': 'This repository is meant for the participants of the BioHackathon hosted in Paris in Nov 2019 to share ideas, create issues, manage projects, publish materials, create code, etc.',
 'Date created:': datetime.datetime(2019, 7, 5, 19, 21, 25),
 'Date of last push:': datetime.datetime(2020, 10, 16, 19, 36, 40),
 'Home Page:': None,
 'Number of forks:': 29,
 'Number of stars:': 18,
 'License:': 'MIT License'}