In [1]:
import pandas as pd
import numpy as np
import json

# Load py2neo
import py2neo
from py2neo import Graph
from py2neo.matching import *

# Interactive Plotting Libraries
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
import plotly.graph_objects as go

# Plotting Widgets
import cufflinks as cf

# Self created functions
import config as cfg
import utils

# Throughput Github Analysis

This is a research project led by Simon Goring, PhD.

The project tries to answer research questions such as:

- How do individuals and organizations use GitHub (or other public code repositories) to reference, analyze or reuse data from Data Catalogs?

- Are there clear patterns of use across public repositories?

- Do patterns of use differ by data/disciplinary domain, or do properties of the data resource (presence of an API, online documentation, size of user community) affect patterns of use? 

- Does the data reuse observed here expand our understanding of current modes of data reuse, e.g. those outlined in https://datascience.codata.org/articles/10.5334/dsj-2017-008/ ?

- What are the characteristics and shape of the Earth Science research object network?
- What are major nodes of connectivity?
- What poorly connected islands exist? 
- What is the nature of data reuse in this network?
- What downstream/second order grant products can be identified from this network?

## Current Approach

Categorizing a subset of scraped repos, with pre-defined types, which may be updated iteratively as categorization progresses (education, analysis, archiving, informational).


Using ML techniques, we might be able to classify repos according to type automatically; and could consider classifying according to repository quality/completeness. Repository quality or completeness would be defined by:

- presence/absence/length of readme
- number of commits
- number of contributors

By using neo4j, we can construct and analyze the network graph in order to get:
- Centrality and level of connection
- Identification of small networks/islands within the network
- What databases are highly connected and which are not?
- Use database properties (has API, online search portal, has R/Python package, has user forum . . .)

## Objective of the Notebook

This notebook performs an initial exploratory data analysis (EDA) on the Neo4j graph, as a first step toward building a recommendation system with graph databases.

Connect to Neo4j's graph.

User credentials can be input in the `config.py` script, imported as `cfg`. 

A `config_sample.py` script has been included. Add your credentials and rename the file to be `config.py` in your system.

In [2]:
# Connect to the local Neo4j server over Bolt; credentials are read from config.py (imported as cfg).
# NOTE(review): both `auth` and `password` are supplied. Presumably cfg.neo4j['auth'] holds only the
# user name; if it is already a (user, password) pair, `password` is redundant - confirm against config_sample.py.
graph = Graph("bolt://localhost:7687", auth=(cfg.neo4j['auth']), bolt=True, password=cfg.neo4j['password'])

In [3]:
# Echo the connection object to confirm which server URI and database the session is bound to.
graph

Graph('bolt://neo4j@localhost:7687', name='neo4j')

### Counting observations

In [4]:
# Count the distinct OBJECT nodes linked by an isType relationship to the
# TYPE node whose type is "schema:CodeRepository"; the single-row result is shown as a DataFrame.
graph.run('MATCH (crt:TYPE {type:"schema:CodeRepository"})\
           MATCH (crt)<-[:isType]-(ocr:OBJECT) \
           RETURN COUNT(DISTINCT ocr)').to_data_frame()

Unnamed: 0,COUNT(DISTINCT ocr)
0,73563


# EDA for Github Analysis

Creating the right graphs for the Github Analysis (GA) project.

We will analyze and graph the following:
- Distribution of references to DBs <- ??

- Note 'Earth Science' databases within graph / Note particular 'Subjects' within graph.
    - X = DBs; y = # of referenced repos <- ??
    - Linked repos (x) by commits (y)

- Note 'ES' and 'Subjects' commits 
    - Linked repos (x) by # of contributors (y)
    - Linked repos (x) by # of forks (y)

## Getting DataCatalogs and CodeRepos

![](img/01_graph.png)

## Getting MetaData

In [5]:
# Fetch the distinct (data catalog, code repository) property pairs for catalogs
# annotated with the keyword "earth science": an annotation carries the keyword
# and has the catalog as its Body, and a second annotation targets both the
# catalog and a code repository.
# Each clause sits on its own line; no trailing backslashes are used because,
# inside a triple-quoted string, they delete the newline and glue adjacent
# clauses together (e.g. `...})MATCH (k)...`).
data = graph.run('''
MATCH (k:KEYWORD {keyword: "earth science"})
MATCH (k)<-[:hasKeyword]-(a1:ANNOTATION)-[:Body]->(dc:dataCat)
MATCH (dc)<-[:Target]-(a2:ANNOTATION)-[:Target]->(cr:codeRepo)
RETURN distinct properties(dc), properties(cr)
''').data()

## Metadata to DF

In [6]:
# Turn the raw query records into a DataFrame using the project helper in utils.py.
# NOTE(review): the resulting columns (dacat, dacat_name, cr_item, cr_name, forks, commits,
# contributors) are as displayed in the next cell; the helper itself is not shown here.
meta_df = utils.get_metadata_1(data = data)

In [7]:
# Display the metadata frame (pandas truncates the middle rows of long frames).
meta_df

Unnamed: 0,dacat,dacat_name,cr_item,cr_name,forks,commits,contributors
0,r3d100010356dc,Unidata's RAMADDA,37471462cr,donmurray/ramadda,0,5604,1
1,r3d100010356dc,Unidata's RAMADDA,44131591cr,CINERGI/TextTeaserOnline,0,6,1
2,r3d100010356dc,Unidata's RAMADDA,185451037cr,suvarchal/IDV-dev-old,0,9265,15
3,r3d100010356dc,Unidata's RAMADDA,9786227cr,aodn-archive/DELETE_ME.ramadda,0,2526,2
4,r3d100010356dc,Unidata's RAMADDA,38782871cr,CINERGI/scraper,1,23,2
...,...,...,...,...,...,...,...
331,r3d100011758dc,Nasa's Data Portal,105052913cr,PaulMFleming/nasa_data_visualization,1,47,1
332,r3d100011758dc,Nasa's Data Portal,212045308cr,Team-Hawking/dengue-hotspot-predictor,1,54,3
333,r3d100011758dc,Nasa's Data Portal,73648855cr,dillonchanis/meteorite-map,0,10,1
334,r3d100011758dc,Nasa's Data Portal,188626320cr,lanemk/data-prov,0,33,1


In [8]:
# Summary statistics for every column; include='all' adds count/unique/top/freq
# for the text columns alongside the numeric mean/std/quantiles.
meta_df.describe(include = 'all')

Unnamed: 0,dacat,dacat_name,cr_item,cr_name,forks,commits,contributors
count,328,328,328,328,328.0,328.0,328.0
unique,13,13,319,319,,,
top,r3d100010134dc,PANGAEA,229084981cr,dataone-website-test/hugo-and-forestry,,,
freq,93,93,3,3,,,
mean,,,,,5.987805,1158.920732,5.463415
std,,,,,29.254757,4694.78514,28.739873
min,,,,,0.0,1.0,1.0
25%,,,,,0.0,12.0,1.0
50%,,,,,0.0,54.0,2.0
75%,,,,,2.0,251.5,3.0


### Plotting by Data Catalog or Code Repo

In [9]:
@interact(x=(0,500))
def show_dc_more_than(selection =['dacat','cr'], column=['forks', 'commits', 'contributors'], x = 1):
    """Show the aggregated rows of ``meta_df`` whose ``column`` value exceeds ``x``.

    Parameters
    ----------
    selection : str
        'dacat' aggregates per data catalog; 'cr' aggregates per code repository.
        Rendered by ``interact`` as a dropdown.
    column : str
        Numeric column to filter on ('forks', 'commits' or 'contributors').
    x : int
        Exclusive lower bound for ``column`` (slider from 0 to 500).

    Returns
    -------
    pandas.DataFrame
        The aggregated rows with ``column`` strictly greater than ``x``.

    Raises
    ------
    ValueError
        If ``selection`` is neither 'dacat' nor 'cr'.
    """
    if selection == 'dacat':
        # One row per catalog: number of linked repos plus totals over those repos.
        df = meta_df.groupby('dacat').agg({'dacat_name': 'max', 'cr_item' : 'count', 'forks' : 'sum', 'commits' : 'sum', 'contributors' : 'sum'}).reset_index()
    elif selection == 'cr':
        # A repository linked to several catalogs appears once per link in
        # meta_df, each row repeating the same repo-level counts. Summing those
        # rows would multiply the repo's forks/commits/contributors, so take
        # 'max' to collapse the duplicates to the repo's own value.
        df = meta_df.groupby('cr_item').agg({'cr_name': 'max', 'dacat_name': 'max',  'forks' : 'max', 'commits' : 'max', 'contributors' : 'max'}).reset_index()
    else:
        raise ValueError("selection must be 'dacat' or 'cr', got " + repr(selection))

    return df.loc[df[column] > x]

interactive(children=(Dropdown(description='selection', options=('dacat', 'cr'), value='dacat'), Dropdown(desc…

In [10]:
# Dropdown choices for the histogram below: 'All' first, then every catalog name in meta_df.
dacat_list = ['All'] + meta_df['dacat_name'].unique().tolist()

@interact
def histogram_plot(opt = ['commits', 'forks', 'contributors'],
                  dacat = dacat_list):
    """Plot a histogram of a log-transformed repository metric from ``meta_df``.

    Parameters
    ----------
    opt : str
        Column to plot ('commits', 'forks' or 'contributors'); rendered by
        ``interact`` as a dropdown.
    dacat : str
        Data catalog name to restrict the rows to, or 'All' for every catalog.

    Rows whose value is below 1 are left out because the natural log is taken.
    """
    if dacat == 'All':
        df = meta_df
    else:
        df = meta_df[meta_df['dacat_name'] == dacat]

    # Compute the log on a standalone Series rather than assigning it back into
    # a filtered slice of meta_df, which raises SettingWithCopyWarning.
    log_values = np.log(df.loc[df[opt] >= 1, opt])

    title_str = 'Log(' + opt.capitalize() + ') - Repos with "Earth Sciences" as a Keyword'
    trace = go.Histogram(x = log_values, nbinsx=50)

    # layout
    layout = go.Layout(
                title = title_str, # Graph title
                xaxis = dict(title = 'Log(' + opt.capitalize() + ') - Datacatalog: ' + dacat), # x-axis label
                yaxis = dict(title = 'Count'), # y-axis label
                hovermode ='closest' # handles multiple points landing on the same vertical
    )

    # fig
    fig = go.Figure(trace, layout)
    fig.update_traces(marker_line_width = 0.5, selector=dict(type='histogram'))
    fig.show()

interactive(children=(Dropdown(description='opt', options=('commits', 'forks', 'contributors'), value='commits…

# Analysis checking for Subject

In [11]:
# Fetch the distinct (data catalog, code repository, subject id) combinations for
# the SUBJECT nodes with ids 313, 314, 315 and 317: an annotation with a
# hasSubject link to the subject points (via any relationship) at a catalog, and
# another annotation targets both that catalog and a code repository.
# Each clause sits on its own line; no trailing backslashes are used because,
# inside a triple-quoted string, they delete the newline and glue adjacent
# clauses together (e.g. `(s:SUBJECT)WHERE ...`).
subject_data = graph.run('''
MATCH (s:SUBJECT)
WHERE s.id IN [313, 314, 315, 317]
MATCH (s)<-[:hasSubject]-(a:ANNOTATION)-[]->(dc:dataCat)
MATCH (dc)<-[:Target]-(:ANNOTATION)-[:Target]->(cr:codeRepo)
RETURN distinct properties(dc), properties(cr), s.id
''').data()

In [12]:
# Turn the subject query records into a DataFrame using the project helper in utils.py.
# NOTE(review): the helper is not shown here; the summary in the next cell shows it yields
# the metadata columns plus `subject` and `subject_str`.
subject_df = utils.create_df_subject(subject_data = subject_data)

In [13]:
# Summary statistics for every column; include='all' adds count/unique/top/freq
# for the non-numeric columns.
subject_df.describe(include='all')

Unnamed: 0,dacat,dacat_name,cr_item,cr_name,forks,commits,contributors,subject,subject_str
count,7759,7759,7759,7759,7759.0,7759.0,7759.0,7759.0,7759
unique,263,263,3238,3238,,,,4.0,4
top,r3d100011989dc,USGS Earthquake Hazards Program,38782871cr,CINERGI/scraper,,,,313.0,"Atmospheric Science, Oceanography and Climate ..."
freq,1047,1047,65,65,,,,3259.0,3259
mean,,,,,9.431757,855.517206,12.167934,,
std,,,,,100.437306,8496.701811,122.566447,,
min,,,,,0.0,1.0,1.0,,
25%,,,,,0.0,8.0,1.0,,
50%,,,,,0.0,35.0,2.0,,
75%,,,,,2.0,165.0,3.0,,


In [14]:
# Per-catalog row counts, keeping only catalogs that occur in more than 100 rows
# of subject_df so the dropdown stays short.
df_for_dacat = (
    subject_df
    .groupby('dacat_name')
    .count()
    .reset_index()
    .loc[lambda counts: counts['dacat'] > 100]
)

# Dropdown choices: 'All' first, then the retained catalog names.
dacat_list = ['All'] + df_for_dacat['dacat_name'].unique().tolist()

@interact
def histogram_plot(subject = sorted(subject_df['subject_str'].dropna().unique()),
                   option = ['commits', 'forks', 'contributors'],
                   dacat = dacat_list):
    """Plot a histogram of a log-transformed repository metric for one subject.

    Parameters
    ----------
    subject : str
        Value of ``subject_str`` to keep. The choices are sorted so the dropdown
        order (and its default) is the same on every kernel start, unlike the
        arbitrary ordering of a ``set``.
    option : str
        Column to plot ('commits', 'forks' or 'contributors').
    dacat : str
        Data catalog name to restrict the rows to, or 'All' for every catalog.

    Rows whose value is below 1 are left out because the natural log is taken.
    """
    if dacat == 'All':
        df = subject_df
    else:
        df = subject_df[subject_df['dacat_name'] == dacat]

    df = df[df['subject_str'] == subject]

    # Compute the log on a standalone Series rather than assigning it back into
    # a filtered slice of subject_df, which raises SettingWithCopyWarning.
    log_values = np.log(df.loc[df[option] >= 1, option])

    title_str = 'Log(' + option.capitalize() + ') - Repos with " '+ subject + '" as a Subject'
    trace = go.Histogram(x = log_values, nbinsx=50)

    # layout
    layout = go.Layout(
                title = title_str, # Graph title
                xaxis = dict(title = 'Log(' + option.capitalize() + ') - Datacatalog: ' + dacat), # x-axis label
                yaxis = dict(title = 'Count'), # y-axis label
                hovermode ='closest' # handles multiple points landing on the same vertical
    )

    # fig
    fig = go.Figure(trace, layout)
    fig.update_traces(marker_line_width = 0.5, selector=dict(type='histogram'))
    fig.show()

interactive(children=(Dropdown(description='subject', options=('Geophysics and Geodesy', 'Geology and Palaeont…

![img2](img/subject_graph.png)

- Green: Data Cat
- Navy blue: Subject
- Pink: Code Repo
- Light blue: Annotation

## All Data, Regardless of Subject

In [15]:
# Fetch the distinct (data catalog, code repository) property pairs for every
# catalog reached from an annotation that has a hasSubject link to any node,
# where another annotation targets both the catalog and a code repository.
# Each clause sits on its own line; no trailing backslashes are used because,
# inside a triple-quoted string, they delete the newline and glue adjacent
# clauses together.
all_data = graph.run('''
MATCH ()<-[:hasSubject]-(a:ANNOTATION)-[]->(dc:dataCat)
MATCH (dc)<-[:Target]-(:ANNOTATION)-[:Target]->(cr:codeRepo)
RETURN distinct properties(dc), properties(cr)
''').data()

In [16]:
# Number of records returned by the query above.
len(all_data)

57693

In [17]:
# Turn the full set of query records into a DataFrame using the project helper in utils.py.
# NOTE(review): the helper is not shown here; later cells rely on it providing at least
# `dacat`, `dacat_name`, `commits`, `forks` and `contributors`.
all_df = utils.create_all_df(all_data)

In [18]:
# Per-catalog row counts, keeping only catalogs that occur in more than 10 rows
# of all_df so the dropdown stays manageable.
df_for_dacat = (
    all_df
    .groupby('dacat_name')
    .count()
    .reset_index()
    .loc[lambda counts: counts['dacat'] > 10]
)

# Dropdown choices: 'All' first, then the retained catalog names.
dacat_list = ['All'] + df_for_dacat['dacat_name'].unique().tolist()

@interact
def histogram_plot(opt = ['commits', 'forks', 'contributors'],
                  dacat = dacat_list):
    """Plot a histogram of a log-transformed repository metric from ``all_df``.

    Parameters
    ----------
    opt : str
        Column to plot ('commits', 'forks' or 'contributors'); rendered by
        ``interact`` as a dropdown.
    dacat : str
        Data catalog name to restrict the rows to, or 'All' for every catalog.

    Rows whose value is below 1 are left out because the natural log is taken.
    """
    if dacat == 'All':
        df = all_df
    else:
        df = all_df[all_df['dacat_name'] == dacat]

    # Compute the log on a standalone Series rather than assigning it back into
    # a filtered slice of all_df, which raises SettingWithCopyWarning.
    log_values = np.log(df.loc[df[opt] >= 1, opt])

    title_str = 'Log(' + opt.capitalize() + ') - All Repos within Throughput.'

    trace = go.Histogram(x = log_values, nbinsx=50)

    # layout
    layout = go.Layout(
                title = title_str, # Graph title
                xaxis = dict(title = 'Log(' + opt.capitalize() + ') - Datacatalog: ' + dacat), # x-axis label
                yaxis = dict(title = 'Count'), # y-axis label
                hovermode ='closest' # handles multiple points landing on the same vertical
    )

    # fig
    fig = go.Figure(trace, layout)
    fig.update_traces(marker_line_width = 0.5, selector=dict(type='histogram'))
    fig.show()

interactive(children=(Dropdown(description='opt', options=('commits', 'forks', 'contributors'), value='commits…