# 1. Data fetching and cleaning

In [1]:
%run __init__.py

INFO:root:Starting logger


In [24]:
def print_empty_cols(df):
    """Print a short report of empty-string cells for every column of ``df``.

    For each column the report shows the column name, an underline of the
    same length, how many rows hold the empty string in that column, the
    index of those rows, and a blank separator.

    Args:
        df: dataframe to inspect; only cells equal to '' are counted.
    """
    for column_name in df.columns:
        print(column_name)
        print('-' * len(column_name))
        is_empty = df[column_name] == ''
        empty_rows = df.loc[is_empty].index
        print(f"{len(empty_rows)} articles have no value for column {column_name}",
              empty_rows, '\n', sep='\n')


In [59]:
from bokeh.io import output_notebook

# Configure Bokeh so that plots are rendered inline in the notebook output.
output_notebook()

In [69]:
from herc_common import BokehHistogram

# Single histogram helper instance reused by the plotting cells further down
# (25 bins, slate-blue fill and hover colours).
hist = BokehHistogram(color_fill="mediumslateblue", color_hover="slateblue", bins=25)

## Getting the repository URLs

In [2]:
REPO_URLS_FILE = 'repo_urls.txt'

# The file holds one repository URL per line. `os` and `DATA_DIR` are not
# defined in this notebook; presumably they come from `%run __init__.py`
# -- TODO confirm.
with open(os.path.join(DATA_DIR, REPO_URLS_FILE), 'r') as f:
    # Strip all surrounding whitespace (not just '\n', so a '\r' left by a
    # CRLF file is removed too) and skip blank lines, so that no empty or
    # garbled URL is later turned into a GitHub API request.
    repo_urls = [line.strip() for line in f if line.strip()]

len(repo_urls)

50

In [3]:
# Show the first URL that was read from the file.
repo_urls[0]

'https://github.com/cmungall/LIRICAL/'

## Parsing the data

In [4]:
import getpass


# Prefer a token stored in a local `secret` module; if that module does not
# exist, ask for the token interactively instead.
# Only ModuleNotFoundError is handled: a `secret` module that exists but lacks
# GITHUB_TOKEN still raises ImportError.
try:
    from secret import GITHUB_TOKEN
except ModuleNotFoundError:
    GITHUB_TOKEN = getpass.getpass("Introduce your personal access token to acces the GitHub API: ")


In [10]:
class GitHubIssue:
    """Plain container for the parts of a GitHub issue kept by this notebook.

    Attributes:
        title: title of the issue.
        body: body text of the issue, stored exactly as passed in.
        comments: comments attached to the issue, stored exactly as passed in.
    """

    def __init__(self, title, body, comments):
        self.title, self.body, self.comments = title, body, comments


class GitHubRepoData():
    """Data collected for a single GitHub repository.

    Attributes:
        gh_id: repository identifier; the only field used for equality and hashing.
        name: repository name.
        description: repository description, stored as given.
        owner_name: login of the repository owner.
        languages: iterable of ``(language, num_bytes)`` pairs.
        readme_text: text of the repository README.
        issues: iterable of objects exposing a ``body`` attribute.
    """

    def __init__(self, gh_id, name, description,
                 owner_name, languages,
                 readme_text, issues):
        self.gh_id = gh_id
        self.name = name
        self.description = description
        self.owner_name = owner_name
        self.languages = languages
        self.readme_text = readme_text
        self.issues = issues

    def to_dict(self):
        """Return a flat dict representation suitable for a dataframe row.

        ``languages`` is serialised as ``"<language>: <bytes>"`` entries joined
        with '|', and ``issues_text`` as the issue bodies joined with newlines.
        """
        return {
            'gh_id': self.gh_id,
            'name': self.name,
            'description': self.description,
            'owner_name': self.owner_name,
            'languages': '|'.join(f"{language}: {num_bytes}"
                                  for language, num_bytes
                                  in self.languages),
            'readme_text': self.readme_text,
            # An issue without a description has body None; str.join would
            # raise TypeError on it, so treat it as an empty string.
            'issues_text': '\n'.join(issue.body or '' for issue in self.issues)
        }

    def __eq__(self, other):
        """Two repositories are equal when they share the same ``gh_id``."""
        if not isinstance(other, GitHubRepoData):
            return False
        return self.gh_id == other.gh_id

    def __hash__(self):
        # Defining __eq__ alone sets __hash__ to None, which makes instances
        # unusable in sets and as dict keys; hash on the same field as __eq__.
        return hash(self.gh_id)

    def __str__(self):
        return f"{self.gh_id} - {self.owner_name}: {self.name} - {self.description}"


In [11]:
import json
import requests

from requests.exceptions import HTTPError

from bs4 import BeautifulSoup


# Root URL of the GitHub REST API; the request helpers below append endpoint paths to it.
GITHUB_BASE_API = "https://api.github.com"

def _get_repo_info(repo_url):
    tokens = [token for token in repo_url.split('/')
              if token != '']
    return tokens[-2], tokens[-1]

def _make_github_request(url, timeout=30):
    """Perform an authenticated GET request and return the raw response body.

    Args:
        url: full URL to request.
        timeout: seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole collection run.

    Returns:
        The response body as bytes.

    Raises:
        HTTPError: if the status code is not 200. The message embeds the
            response body and the response itself is attached to the
            exception as ``response``.
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    response = requests.get(url,
                            headers={'Authorization': f"token {GITHUB_TOKEN}"},
                            timeout=timeout)
    if response.status_code != 200:
        raise HTTPError("There was an error retrieving data from GitHub: " + str(response.content),
                        response=response)
    return response.content

def get_issue_comments(comments_url):
    """Fetch the comments found at ``comments_url`` and return their bodies.

    Args:
        comments_url: URL of the comments listing of one issue.

    Returns:
        List with the ``body`` field of every comment in the response.
    """
    raw_comments = _make_github_request(comments_url)
    bodies = []
    for comment in json.loads(raw_comments):
        bodies.append(comment['body'])
    return bodies

def get_repo_contents(author, repo):
    """Return the decoded JSON description of the repository ``author/repo``."""
    repo_endpoint = f"{GITHUB_BASE_API}/repos/{author}/{repo}"
    return json.loads(_make_github_request(repo_endpoint))

def get_repo_languages(languages_url):
    """Return the languages of a repository as ``(language, num_bytes)`` pairs.

    Args:
        languages_url: URL whose JSON response maps each language to a number.

    Returns:
        List of ``(key, value)`` tuples, in the order of the decoded response.
    """
    languages_by_name = json.loads(_make_github_request(languages_url))
    return list(languages_by_name.items())

def get_repo_readme_text(author, repo):
    """Return the rendered README text of ``author/repo``, or "" if there is none.

    The README metadata is requested from the API, and the page behind its
    ``html_url`` is then downloaded and searched for the README container.

    Args:
        author: owner of the repository.
        repo: name of the repository.

    Returns:
        The stripped text of the README element, or an empty string when the
        API reports "Not Found" or no README container is present in the page.

    Raises:
        HTTPError: for any API error other than "Not Found".
    """
    try:
        res = _make_github_request(f"{GITHUB_BASE_API}/repos/{author}/{repo}/readme")
    except HTTPError as e:
        # A "Not Found" error is treated as "this repository has no README".
        if "Not Found" in str(e):
            return ""
        # Bare raise re-raises the active exception with its original traceback.
        raise
    html_url = json.loads(res)['html_url']
    # Bound the wait so that a stalled download cannot hang the whole run.
    readme_html = requests.get(html_url, timeout=30).content
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and emits a warning), so the extracted
    # text could differ between machines.
    soup = BeautifulSoup(readme_html, 'html.parser')
    readme_element = soup.find('div', {'id': 'readme'})
    if readme_element is None:
        # readme.txt file
        readme_element = soup.find('div', {'itemprop': 'text'})
    if readme_element is None:
        # Neither container exists in the page; report "no README" instead of
        # failing with AttributeError on None.
        return ""
    return readme_element.text.strip()

def get_repo_issues(author, repo):
    """Return the issues of ``author/repo`` as ``GitHubIssue`` objects.

    Args:
        author: owner of the repository.
        repo: name of the repository.

    Returns:
        List with one ``GitHubIssue`` (title, body, comment bodies) per entry
        of the API response.

    NOTE(review): only the result of a single request is used. Per the GitHub
    REST documentation this endpoint is paginated and also lists pull
    requests -- confirm whether that is intended.
    """
    issues = _make_github_request(f"{GITHUB_BASE_API}/repos/{author}/{repo}/issues")
    repo_issues = []
    for issue in json.loads(issues):
        # The issue payload carries its comment count, so the extra request is
        # skipped when it is 0. A missing count falls back to fetching.
        if issue.get('comments', 1):
            comments = get_issue_comments(issue['comments_url'])
        else:
            comments = []
        repo_issues.append(GitHubIssue(issue['title'], issue['body'], comments))
    return repo_issues
    

def parse_repo_url(repo_url):
    """Collect the data of the repository behind ``repo_url``.

    Fetches the repository description, its issues (only when the repository
    has issues enabled), its languages and its README text.

    Args:
        repo_url: URL of the GitHub repository.

    Returns:
        A ``GitHubRepoData`` instance filled with the fetched values.
    """
    author, repo = _get_repo_info(repo_url)
    repo_contents = get_repo_contents(author, repo)
    if repo_contents['has_issues']:
        issues = get_repo_issues(author, repo)
    else:
        issues = []
    languages = get_repo_languages(repo_contents['languages_url'])
    readme_text = get_repo_readme_text(author, repo)
    return GitHubRepoData(
        gh_id=repo_contents['id'],
        name=repo_contents['name'],
        description=repo_contents['description'],
        owner_name=repo_contents['owner']['login'],
        languages=languages,
        readme_text=readme_text,
        issues=issues,
    )


In [15]:
from tqdm import tqdm

# Fetch every repository in turn. Each one needs several GitHub API requests,
# so this is the slow cell of the notebook (about two minutes in the saved run).
git_dataset = []
pbar = tqdm(repo_urls)
for url in pbar:
    # Show the repository currently being processed next to the progress bar.
    pbar.set_description(f"Processing repository: {url}")
    git_dataset.append(parse_repo_url(url))


Processing repository: https://github.com/pauldevos/Basketball_Analytics: 100%|██████████| 50/50 [02:12<00:00,  2.65s/it]                 


## Creating a dataframe

In [49]:
import pandas as pd

# One row per repository, with the columns produced by GitHubRepoData.to_dict().
df = pd.DataFrame([repo.to_dict() for repo in git_dataset])
df.head()

Unnamed: 0,gh_id,name,description,owner_name,languages,readme_text,issues_text
0,216602979,LIRICAL,LIkelihood Ratio Interpretation of Clinical Ab...,cmungall,Java: 492423|FreeMarker: 13149|Python: 849,LIRICAL\n\nLIkelihood Ratio Interpretation of ...,
1,199330464,wikidata_ontomatcher,Matches ontology classes against wikidata,cmungall,Prolog: 14691|Makefile: 1472|Dockerfile: 700|S...,Match an ontology to Wikidata\nThis applicatio...,Will help with #1 and with https://github.com/...
2,253207181,ro-crate-ruby,"A Ruby gem for creating, manipulating and read...",markwilkinson,Ruby: 52724|HTML: 1319,"ro-crate-ruby\nThis is a WIP gem for creating,...",
3,212556220,Misc_Training_scripts,A place for me to keep various miscellanelous ...,markwilkinson,Shell: 15815|Ruby: 9445,Misc_Training_scripts\nA place for me to keep ...,
4,155879756,FAIRifier,A tool to make data FAIR,mikel-egana-aranguren,Java: 3514431|JavaScript: 967765|HTML: 333450|...,Dependencies:\n\nJava 8\nApache Ant\n\nBuildin...,


## Data cleaning and feature engineering

In [50]:
# Summary statistics for every column except the repository id.
df.loc[:, df.columns != 'gh_id'].describe()

Unnamed: 0,name,description,owner_name,languages,readme_text,issues_text
count,50,43,50,50,50.0,50.0
unique,50,43,27,50,48.0,6.0
top,Data-Science--Cheat-Sheet,"Temporary fork of unmerged, inaccessible pull ...",GullyAPCBurns,Java: 16959,,
freq,1,1,2,1,3.0,45.0


In [51]:
# Rows that have a missing value in at least one column.
df[df.isnull().any(axis=1)]

Unnamed: 0,gh_id,name,description,owner_name,languages,readme_text,issues_text
11,57412597,hemodonacion,,fanavarro,Perl: 97370|R: 36211,Lost in Translation\nStructure\nThis repositor...,
23,161862375,biohack18,,leechuck,Groovy: 14573,biohack18,
24,171842501,biosample_jsonld,,inutano,Ruby: 9183|Shell: 3153|Dockerfile: 244,BioSample records in JSON-LD\nBioSample is a d...,
29,151696606,JavaTermiteStarter,,SciBiteLabs,Java: 16959,JavaTermiteStarter\nSet of basic code to get y...,
39,42526998,pythonontologysearch,,jamesmalone,JavaScript: 194124|Python: 5748|HTML: 1606|CSS...,,
40,238953196,my-react-form,,twhetzel,JavaScript: 35896|CSS: 6855|HTML: 1271,React Form Demos\nDescription\nDemo project to...,
41,157134523,python-ecology-lesson-es-1,,agbeltran,Jupyter Notebook: 4579723|HTML: 60649|Python: ...,Canal de Slack en español\nLección de Data Car...,


In [55]:
# Replace missing values with empty strings, modifying `df` in place;
# presumably so the text columns can be concatenated further down.
df.fillna(value="", inplace=True)

In [56]:
# Report the empty-string cells of every column.
# NOTE(review): the saved output of this cell also lists a `full_text` column,
# which is only created in a later cell -- the output appears to come from an
# out-of-order run and should be regenerated after a kernel restart.
print_empty_cols(df)

gh_id
-----
0 articles have no value for column gh_id
Int64Index([], dtype='int64')


name
----
0 articles have no value for column name
Int64Index([], dtype='int64')


description
-----------
7 articles have no value for column description
Int64Index([11, 23, 24, 29, 39, 40, 41], dtype='int64')


owner_name
----------
0 articles have no value for column owner_name
Int64Index([], dtype='int64')


languages
---------
1 articles have no value for column languages
Int64Index([44], dtype='int64')


readme_text
-----------
3 articles have no value for column readme_text
Int64Index([8, 35, 39], dtype='int64')


issues_text
-----------
45 articles have no value for column issues_text
Int64Index([ 0,  2,  3,  4,  5,  6,  8, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20,
            21, 22, 23, 24, 25, 26, 27, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
            39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
           dtype='int64')


full_text
---------
7 articles have no value for column full_text
Int

  res_values = method(rvalues)


In [57]:
import re

def clean(text):
    """Collapse every run of whitespace in ``text`` to one space and trim the ends.

    Args:
        text: string to normalise.

    Returns:
        ``text`` with newlines, tabs and repeated spaces replaced by single
        spaces, and without leading or trailing whitespace.
    """
    # Raw string: in a regular literal '\s' is an invalid escape sequence,
    # which newer Python versions warn about.
    return re.sub(r'\s+', ' ', text).strip()

# Merge description and README into a single text per repository, then
# normalise its whitespace and preview the first 500 characters of row 0.
df['full_text'] = df["description"] + ". " + df["readme_text"]
df['full_text_cleaned'] = df['full_text'].apply(clean)
df.loc[0, 'full_text_cleaned'][:500]

'LIkelihood Ratio Interpretation of Clinical AbnormaLities. LIRICAL LIkelihood Ratio Interpretation of Clinical AbnormaLities LIRICAL is designed to provide clincially interpretable computational analysis of phenotypic abnormalities (encoded using the Human Phenotype Ontology), optionally combined with an analysis of variants and genotypes if a VCF file is provided with the results of diagnostic gene panel, exome, or genome sequencing. Detailed documentation is available This is a useful website '

## Initial exploration

### Text length

In [71]:
# Length in characters of each repository's cleaned text.
df['num_chars_text'] = df['full_text_cleaned'].apply(len)
df['num_chars_text'].describe()

count       50.000000
mean      2471.080000
std       3431.016806
min          1.000000
25%        374.000000
50%       1770.500000
75%       3026.750000
max      20382.000000
Name: num_chars_text, dtype: float64

In [70]:
# Column to plot plus the title and axis labels of the text-length histogram.
GIT_HIST_COLUMN = "num_chars_text"
GIT_HIST_TITLE = "Readme + Description length distribution"
GIT_HIST_XLABEL = "Readme and description length (# of characters)"
GIT_HIST_YLABEL = "Number of repositories"

# NOTE(review): the meaning of the last positional argument (True) is defined
# by BokehHistogram.load_plot, which is not visible here -- TODO confirm.
hist.load_plot(df, GIT_HIST_COLUMN, GIT_HIST_TITLE,
          GIT_HIST_XLABEL, GIT_HIST_YLABEL, True)

In [73]:
# Save the histogram to an SVG file under RESULTS_DIR (a name not defined in
# this notebook; presumably provided by `%run __init__.py` -- TODO confirm).
hist.save_plot(os.path.join(RESULTS_DIR, '1_Repo_text_length.svg'))

### Languages used

## Saving the dataframe

In [74]:
# Persist the dataframe, including the derived text columns, as a pickle
# file inside DATA_DIR.
GIT_DF_FILE_PATH = os.path.join(DATA_DIR, 'git_dataframe.pkl')

df.to_pickle(GIT_DF_FILE_PATH)