# Repository Classifier - SpaCy

### Author: Socorro Dominguez Vidana
#### June 2021

## Obtaining the Readme Files

## Loading the Data

In [1]:
import pandas as pd
import numpy as np
import json

# Load py2neo
import py2neo
from py2neo import Graph
from py2neo.matching import *

# Project-local configuration module (Neo4j and GitHub API credentials)
import config as cfg

# Silence NumPy divide-by-zero floating-point warnings
np.seterr(divide = 'ignore')

# Decoder
import base64

# API
import requests

In [2]:
# Open a Bolt connection to the local Neo4j server. The credentials are read
# from the local `config` module so they never appear in the notebook itself.
graph = Graph(
    "bolt://localhost:7687",
    auth=cfg.neo4j['auth'],
    bolt=True,
    password=cfg.neo4j['password'],
)

In [3]:
# Echo the Graph object so its repr shows which server/database is targeted.
graph

Graph('bolt://neo4j@localhost:7687', name='neo4j')

In [4]:
# Id of the SUBJECT node whose linked repositories are collected.
SUBJECT_ID = 314

# Starting from the chosen subject, follow annotations to every data catalogue
# (dataCat) and from each catalogue to every code repository (codeRepo) that
# shares a `Target` annotation with it.
# The RETURN expressions double as the keys of each result record
# ('properties(dc)', 'properties(cr)', 's.id'), so they must not be renamed
# without updating the cells that read `sub_data`.
SUBJECT_REPOS_QUERY = '''
MATCH (s:SUBJECT)<-[:hasSubject]-(a:ANNOTATION)-[]->(dc:dataCat)
WHERE s.id = $subject_id
MATCH (dc)<-[:Target]-(:ANNOTATION)-[:Target]->(cr:codeRepo)
RETURN distinct properties(dc), properties(cr), s.id
'''

# The subject id is passed as a bound Cypher parameter instead of being
# spliced into the query text; `.data()` yields a list of dicts, one per row.
sub_data = graph.run(SUBJECT_REPOS_QUERY, subject_id=SUBJECT_ID).data()

In [5]:
# Column-oriented accumulator: one list per future DataFrame column. The lists
# are meant to grow in lock-step, one entry per query record.
helper_dict = {
    column: []
    for column in (
        'dacat',        # data catalogue id
        'dacat_name',   # data catalogue name
        'meta',         # raw JSON metadata string of the code repository
        'cr_item',      # repository id taken from the metadata
        'cr_name',      # repository name taken from the metadata
        'repo_url',     # repository URL taken from the metadata
        'repo_readme',  # nested README entry taken from the metadata
    )
}

In [6]:
# Flatten every query record into the parallel column lists of `helper_dict`.
#
# Each record contributes exactly one entry to every list, so the lists always
# stay the same length and can be turned into a DataFrame afterwards.
REPO_COLUMNS = ('meta', 'cr_item', 'cr_name', 'repo_url', 'repo_readme')

# Iterate over the records themselves so the last one is not skipped.
for record in sub_data:
    # Read both catalogue fields before appending anything, so a missing key
    # cannot leave the two catalogue lists out of step.
    data_cat = record['properties(dc)']
    dacat_id, dacat_name = data_cat['id'], data_cat['name']

    try:
        raw_meta = record['properties(cr)']['meta']
        repo_meta = json.loads(raw_meta)
        # Resolve every repository field first; nothing is appended until all
        # lookups have succeeded.
        repo_row = {
            'meta': raw_meta,
            'cr_item': repo_meta['id'],
            'cr_name': repo_meta['name'],
            'repo_url': repo_meta['url'],
            'repo_readme': repo_meta['readme']['readme']['readme'],
        }
    except (KeyError, TypeError, json.JSONDecodeError):
        # Absent, malformed or incomplete metadata: mark every repository
        # column with the "Missing" sentinel that is filtered out later.
        repo_row = dict.fromkeys(REPO_COLUMNS, "Missing")

    helper_dict['dacat'].append(dacat_id)
    helper_dict['dacat_name'].append(dacat_name)
    for column in REPO_COLUMNS:
        helper_dict[column].append(repo_row[column])

In [7]:
# Materialise the accumulated column lists as a DataFrame, one row per record.
data = pd.DataFrame(data=helper_dict)

In [8]:
# Discard rows whose repository metadata was flagged with the "Missing" sentinel.
data = data.loc[data['cr_item'].ne("Missing")]

In [9]:
# Drop the raw `meta` column and renumber the rows from zero.
data = data.loc[
    :,
    ['dacat', 'dacat_name', 'cr_item', 'cr_name', 'repo_url', 'repo_readme'],
].reset_index(drop=True)

In [10]:
# Preview the first rows to check the extracted columns.
data.head()

Unnamed: 0,dacat,dacat_name,cr_item,cr_name,repo_url,repo_readme
0,r3d100012894,LinkedEarth Wiki,110591925,feup-infolab/ontologies-database,https://github.com/feup-infolab/ontologies-dat...,True
1,r3d100012894,LinkedEarth Wiki,41373407,nickmckay/LiPD-utilities,https://github.com/nickmckay/LiPD-utilities,True
2,r3d100012894,LinkedEarth Wiki,61573112,cyber4paleo/ClimateLife,https://github.com/cyber4paleo/ClimateLife,True
3,r3d100012894,LinkedEarth Wiki,133586046,nickmckay/LinkedEarth-Neotoma-P418,https://github.com/nickmckay/LinkedEarth-Neoto...,True
4,r3d100012894,LinkedEarth Wiki,152458689,earthcubearchitecture-project418gui/server,https://github.com/earthcubearchitecture-proje...,True


In [11]:
# Keep only the repositories whose `repo_readme` flag is True.
data = data.loc[data['repo_readme'].eq(True)]

In [12]:
# GitHub API token, read from the local `config` module so that it is not
# hard-coded in the notebook; sent in the Authorization header by get_readme.
token = cfg.github_api['secret']

In [13]:
def get_readme(url, token = token, timeout = 10):
    """Fetch the ``README.md`` of a GitHub repository through the REST API.

    Parameters
    ----------
    url : str
        Repository web URL, e.g. ``https://github.com/owner/repo``.
    token : str
        GitHub API token sent in the ``Authorization`` header. Defaults to
        the module-level ``token`` as it was when this function was defined.
    timeout : float
        Seconds to wait for GitHub before giving up on the request.

    Returns
    -------
    bytes or str
        The base64-decoded file content as ``bytes``, or the string
        ``"Missing"`` when the request fails, times out, or the response
        carries no decodable ``content`` field (for instance when the
        repository has no ``README.md``).
    """
    # Reduce the web URL to "owner/repo"; stripping slashes keeps a trailing
    # "/" on the input from producing a double slash in the API path.
    repo_path = url.replace('https://github.com/', '').strip('/')
    new_url = 'https://api.github.com/repos/' + repo_path + '/contents/README.md'
    headers = {'Authorization': f'token {token}', 'accept': 'application/JSON'}

    try:
        # Without a timeout a single stalled connection would hang the whole
        # DataFrame.apply that drives this function.
        response = requests.get(new_url, headers=headers, timeout=timeout)
        readme = base64.b64decode(response.json()['content'])
    except (requests.RequestException, KeyError, TypeError, ValueError):
        # Network failure, non-JSON body, payload without "content", or
        # invalid base64. Only these expected failures are mapped to the
        # sentinel; KeyboardInterrupt and programming errors still propagate.
        readme = "Missing"

    return readme

In [14]:
# Download the README of every remaining repository (one GitHub API request
# per row). `assign` builds a new frame instead of writing a column into the
# filtered frame, which can trigger pandas' SettingWithCopyWarning.
data = data.assign(readme_content=data['repo_url'].apply(get_readme))

In [15]:
# Preview the frame again to check the new `readme_content` column.
data.head()

Unnamed: 0,dacat,dacat_name,cr_item,cr_name,repo_url,repo_readme,readme_content
0,r3d100012894,LinkedEarth Wiki,110591925,feup-infolab/ontologies-database,https://github.com/feup-infolab/ontologies-dat...,True,"b""[![Codacy Badge](https://api.codacy.com/proj..."
1,r3d100012894,LinkedEarth Wiki,41373407,nickmckay/LiPD-utilities,https://github.com/nickmckay/LiPD-utilities,True,"b'<h1 align=""left"">\n <br>\n <a href=""http:/..."
2,r3d100012894,LinkedEarth Wiki,61573112,cyber4paleo/ClimateLife,https://github.com/cyber4paleo/ClimateLife,True,b'# ClimateLife\nWe aim to foster a closer int...
3,r3d100012894,LinkedEarth Wiki,133586046,nickmckay/LinkedEarth-Neotoma-P418,https://github.com/nickmckay/LinkedEarth-Neoto...,True,b'# p418NotebooksR\nNotebooks in the R languag...
4,r3d100012894,LinkedEarth Wiki,152458689,earthcubearchitecture-project418gui/server,https://github.com/earthcubearchitecture-proje...,True,b'### UCAR JSON-LD Validation\n\n#### Project ...


In [16]:
# Persist the collected READMEs to a CSV file at a path relative to the
# notebook's working directory.
data.to_csv(path_or_buf='data/repository_readme_files.csv')