# 1. Data fetching and cleaning

In [1]:
%run __init__.py

INFO:root:Starting logger


## Getting the repository URLs

In [2]:
REPO_URLS_FILE = 'repo_urls.txt'

# Read one repository URL per line. Surrounding whitespace is stripped and
# blank lines are skipped, so a stray empty line (e.g. at the end of the file)
# does not turn into an empty URL that fails later during parsing.
# NOTE(review): `os` and `DATA_DIR` are not defined in this cell; presumably
# they come from the `%run __init__.py` cell above -- confirm.
with open(os.path.join(DATA_DIR, REPO_URLS_FILE), 'r') as f:
    repo_urls = [line.strip() for line in f if line.strip()]

len(repo_urls)

50

In [3]:
repo_urls[0]

'https://github.com/cmungall/LIRICAL/'

## Parsing the data

In [4]:
import getpass


# Load the GitHub API token: use the one defined in a local `secret` module
# when available, otherwise ask for it interactively so that it never has to
# be written into the notebook itself.
try:
    from secret import GITHUB_TOKEN
except ImportError:
    # ImportError covers ModuleNotFoundError (its subclass) and also the case
    # where `secret` can be imported but does not define GITHUB_TOKEN.
    GITHUB_TOKEN = getpass.getpass("GitHub token: ")


In [10]:
class GitHubIssue:
    """Plain container for an issue's title, body and comments."""

    def __init__(self, title, body, comments):
        # Values are stored as given; no copying or validation is performed.
        self.title, self.body, self.comments = title, body, comments


class GitHubRepoData():
    """Metadata and text gathered for a single GitHub repository.

    All constructor arguments are stored unchanged as attributes of the same
    name. `to_dict` expects `languages` to be an iterable of
    ``(language, num_bytes)`` pairs and `issues` to be an iterable of objects
    exposing a ``body`` attribute.

    Two instances compare equal (and hash equal) when their `gh_id` matches.
    """

    def __init__(self, gh_id, name, description,
                 owner_name, languages,
                 readme_text, issues):
        self.gh_id = gh_id
        self.name = name
        self.description = description
        self.owner_name = owner_name
        self.languages = languages
        self.readme_text = readme_text
        self.issues = issues

    def to_dict(self):
        """Flatten the repository data into a dict of scalar values.

        Returns:
            dict with keys ``gh_id``, ``name``, ``description``,
            ``owner_name``, ``languages`` (``"lang: bytes"`` entries joined
            with ``|``), ``readme_text`` and ``issues_text`` (issue bodies
            joined with newlines).
        """
        languages_text = '|'.join(f"{language}: {num_bytes}"
                                  for language, num_bytes
                                  in self.languages)
        # An issue body can be None (an issue submitted without description
        # text); str.join would raise TypeError on it, so treat it as empty.
        issues_text = '\n'.join(issue.body or '' for issue in self.issues)
        return {
            'gh_id': self.gh_id,
            'name': self.name,
            'description': self.description,
            'owner_name': self.owner_name,
            'languages': languages_text,
            'readme_text': self.readme_text,
            'issues_text': issues_text
        }

    def __eq__(self, other):
        """Repositories are equal when they share the same `gh_id`."""
        if not isinstance(other, GitHubRepoData):
            return False
        return self.gh_id == other.gh_id

    def __hash__(self):
        # Defining __eq__ alone sets __hash__ to None, which makes instances
        # unusable in sets and as dict keys. Hash on the same field that
        # equality uses so both stay consistent.
        return hash(self.gh_id)

    def __str__(self):
        return f"{self.gh_id} - {self.owner_name}: {self.name} - {self.description}"


In [11]:
import json
import requests

from requests.exceptions import HTTPError

from bs4 import BeautifulSoup


GITHUB_BASE_API = "https://api.github.com"

def _get_repo_info(repo_url):
    tokens = [token for token in repo_url.split('/')
              if token != '']
    return tokens[-2], tokens[-1]

def _make_github_request(url, timeout=30):
    """Send an authenticated GET request to GitHub and return the raw body.

    The request carries ``GITHUB_TOKEN`` in its ``Authorization`` header.

    Args:
        url: Full URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout `requests` may block indefinitely on an unresponsive
            connection.

    Returns:
        The response body as bytes.

    Raises:
        HTTPError: if the response status code is not 200. The message
            contains the response body, and the response object is attached
            to the exception as ``.response``.
        requests.exceptions.Timeout: if the server does not answer within
            `timeout` seconds.
    """
    response = requests.get(url,
                            headers={'Authorization': f"token {GITHUB_TOKEN}"},
                            timeout=timeout)
    if response.status_code != 200:
        # Keep the body in the message (callers inspect the text), and attach
        # the response so the status code can be checked programmatically.
        raise HTTPError("There was an error retrieving data from GitHub: " + str(response.content),
                        response=response)
    return response.content

def get_issue_comments(comments_url):
    """Return the ``body`` of every comment listed at `comments_url`.

    The URL is fetched through `_make_github_request` and its JSON payload is
    expected to be a sequence of objects with a ``body`` key.
    """
    payload = json.loads(_make_github_request(comments_url))
    bodies = []
    for entry in payload:
        bodies.append(entry['body'])
    return bodies

def get_repo_contents(author, repo):
    """Fetch the ``/repos/{author}/{repo}`` resource and return it decoded from JSON."""
    repo_endpoint = f"{GITHUB_BASE_API}/repos/{author}/{repo}"
    return json.loads(_make_github_request(repo_endpoint))

def get_repo_languages(languages_url):
    """Return the languages mapping at `languages_url` as a list of pairs.

    The JSON object fetched from the URL is converted to a list of
    ``(language, num_bytes)`` tuples, in the order the keys appear.
    """
    raw = _make_github_request(languages_url)
    return list(json.loads(raw).items())

def get_repo_readme_text(author, repo):
    """Return the rendered README text of a repository, or "" if unavailable.

    The README metadata is requested from the GitHub API, then the page at
    its ``html_url`` is downloaded and the text of the README container is
    extracted from the HTML.

    Args:
        author: Repository owner login.
        repo: Repository name.

    Returns:
        The stripped README text. An empty string is returned when the API
        reports "Not Found" for the README, or when no README container can
        be located in the downloaded page.

    Raises:
        HTTPError: for any API error other than "Not Found".
    """
    try:
        res = _make_github_request(f"{GITHUB_BASE_API}/repos/{author}/{repo}/readme")
    except HTTPError as e:
        # The error message contains the response body, so a missing README
        # shows up as "Not Found" in it.
        if "Not Found" in str(e):
            return ""
        # Bare raise re-raises the active exception with its traceback intact.
        raise
    html_url = json.loads(res)['html_url']
    # Bound the wait so an unresponsive server cannot hang the whole run.
    readme_html = requests.get(html_url, timeout=30).content
    # Name the parser explicitly: otherwise BeautifulSoup picks whichever one
    # happens to be installed (and warns), so results vary between machines.
    soup = BeautifulSoup(readme_html, 'html.parser')
    readme_element = soup.find('div', {'id': 'readme'})
    if readme_element is None:
        # readme.txt file
        readme_element = soup.find('div', {'itemprop': 'text'})
    if readme_element is None:
        # Neither container is present (e.g. the page layout differs from
        # what is expected); report it instead of failing on `None.text`.
        import logging
        logging.warning("Could not locate README content in %s", html_url)
        return ""
    return readme_element.text.strip()

def get_repo_issues(author, repo):
    """Fetch the repository's issues endpoint and wrap each entry as a GitHubIssue.

    For every entry in the JSON response, the title and body are taken from
    the entry and its comments are fetched from the entry's ``comments_url``.
    """
    raw_issues = _make_github_request(f"{GITHUB_BASE_API}/repos/{author}/{repo}/issues")
    parsed = []
    for entry in json.loads(raw_issues):
        title = entry['title']
        body = entry['body']
        comments = get_issue_comments(entry['comments_url'])
        parsed.append(GitHubIssue(title, body, comments))
    return parsed
    

def parse_repo_url(repo_url):
    """Collect all data for the repository at `repo_url` into a GitHubRepoData.

    Fetches the repository metadata first, then its issues (only when the
    metadata says issues are enabled), its languages and its README text.
    """
    author, repo = _get_repo_info(repo_url)
    metadata = get_repo_contents(author, repo)

    if metadata['has_issues']:
        issues = get_repo_issues(author, repo)
    else:
        issues = []

    languages = get_repo_languages(metadata['languages_url'])
    readme_text = get_repo_readme_text(author, repo)

    return GitHubRepoData(
        gh_id=metadata['id'],
        name=metadata['name'],
        description=metadata['description'],
        owner_name=metadata['owner']['login'],
        languages=languages,
        readme_text=readme_text,
        issues=issues,
    )


In [14]:
parse_repo_url("https://github.com/cmungall/wikidata_ontomatcher").to_dict()['languages']

'Prolog: 14691|Makefile: 1472|Dockerfile: 700|Shell: 278'

In [15]:
from tqdm import tqdm

# Fetch the data of every repository listed in `repo_urls`, one at a time.
# Each URL triggers several HTTP requests inside `parse_repo_url`, so this
# cell is slow; the progress bar shows which repository is being processed.
git_dataset = []
pbar = tqdm(repo_urls)
for url in pbar:
    # Show the current URL next to the progress bar.
    pbar.set_description(f"Processing repository: {url}")
    # No error handling here: an exception for any URL aborts the loop.
    git_dataset.append(parse_repo_url(url))


Processing repository: https://github.com/pauldevos/Basketball_Analytics: 100%|██████████| 50/50 [02:12<00:00,  2.65s/it]                 


## Creating a dataframe

In [16]:
import pandas as pd

# One dataframe row per repository, built from each repo's flattened dict.
df = pd.DataFrame(list(map(lambda repo: repo.to_dict(), git_dataset)))
df.head()

Unnamed: 0,gh_id,name,description,owner_name,languages,readme_text,issues_text
0,216602979,LIRICAL,LIkelihood Ratio Interpretation of Clinical Ab...,cmungall,Java: 492423|FreeMarker: 13149|Python: 849,LIRICAL\n\nLIkelihood Ratio Interpretation of ...,
1,199330464,wikidata_ontomatcher,Matches ontology classes against wikidata,cmungall,Prolog: 14691|Makefile: 1472|Dockerfile: 700|S...,Match an ontology to Wikidata\nThis applicatio...,Will help with #1 and with https://github.com/...
2,253207181,ro-crate-ruby,"A Ruby gem for creating, manipulating and read...",markwilkinson,Ruby: 52724|HTML: 1319,"ro-crate-ruby\nThis is a WIP gem for creating,...",
3,212556220,Misc_Training_scripts,A place for me to keep various miscellanelous ...,markwilkinson,Shell: 15815|Ruby: 9445,Misc_Training_scripts\nA place for me to keep ...,
4,155879756,FAIRifier,A tool to make data FAIR,mikel-egana-aranguren,Java: 3514431|JavaScript: 967765|HTML: 333450|...,Dependencies:\n\nJava 8\nApache Ant\n\nBuildin...,


## Data cleaning and feature engineering

## Initial exploration

### Readme length

### Languages used

## Saving the dataframe