In [1]:
import pandas as pd
import numpy as np
import json

# Load py2neo
import py2neo
from py2neo import Graph
from py2neo.matching import *

# Interactive Plotting Libraries
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
import plotly.graph_objects as go
import plotly.io as pio

# Plotting Widgets
import cufflinks as cf

# Self created functions
import config as cfg
import utils

# Silence only NumPy's divide-by-zero warning; the other floating-point
# error categories keep their current settings. np.seterr returns the
# previous settings, which is the dict this cell displays.
np.seterr(divide = 'ignore') 

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

# Throughput Github Analysis

This is a research project led by Simon Goring, PhD.

The project tries to answer several research questions, such as:

- How do individuals and organizations use GitHub (or other public code repositories) to reference, analyze or reuse data from Data Catalogs?

- Are there clear patterns of use across public repositories?

- Do patterns of use differ by data/disciplinary domain, or do properties of the data resource (presence of an API, online documentation, size of user community) affect patterns of use? 

- Does the data reuse observed here expand our understanding of current modes of data reuse, e.g. those outlined in https://datascience.codata.org/articles/10.5334/dsj-2017-008/ ?

- What are the characteristics and shape of the Earth Science research object network?
- What are major nodes of connectivity?
- What poorly connected islands exist? 
- What is the nature of data reuse in this network?
- What downstream/second order grant products can be identified from this network?

## Current Approach

Categorizing a subset of scraped repos, with pre-defined types, which may be updated iteratively as categorization progresses (education, analysis, archiving, informational).


Using ML techniques, we might be able to classify repos according to type automatically; and could consider classifying according to repository quality/completeness. Repository quality or completeness would be defined by:

- presence/absence/length of readme
- number of commits
- number of contributors

By using neo4j, we can construct and analyze the network graph in order to get:
- Centrality and level of connection
- Identification of small networks/islands within the network
- What databases are highly connected and which are not?
- Use database properties (has API, online search portal, has R/Python package, has user forum . . .)

## Objective of the Notebook

This notebook is used to create an initial exploratory data analysis (EDA) with Neo4j, as a first step towards a recommendation system built on graph databases.

Connect to Neo4j's graph.

User credentials can be input in the `config.py` script, imported as `cfg`. 

A `config_sample.py` script has been included. Add your credentials and rename the file to be `config.py` in your system.

In [2]:
# Open the Neo4j session over Bolt; the credentials come from config.py (cfg)
graph = Graph(
    "bolt://localhost:7687",
    auth=cfg.neo4j['auth'],
    bolt=True,
    password=cfg.neo4j['password'],
)

ConnectionRefusedError: [Errno 61] Connection refused

In [None]:
# Evaluate the connection object so the notebook displays its repr
graph

### Counting observations

In [None]:
# Count the distinct OBJECT nodes that have an isType relationship to the
# TYPE node "schema:CodeRepository"; the result is converted to a DataFrame.
graph.run('MATCH (crt:TYPE {type:"schema:CodeRepository"})\
           MATCH (crt)<-[:isType]-(ocr:OBJECT) \
           RETURN COUNT(DISTINCT ocr)').to_data_frame()

# EDA for Github Analysis

Creating the right graphs for the GA project.

We will analyze and graph the following:
- Distribution of references to DBs <- ??

- Note 'Earth Science' databases within graph / Note particular 'Subjects' within graph.
    - X = DBs; y = # of referenced repos
    - Linked repos (x) by commits (y)

- Note 'ES' and 'Subjects' commits 
    - Linked repos (x) by # of contributors (y)
    - Linked repos (x) by # of forks (y)

## Getting DataCatalogs and CodeRepos

![](img/01_graph.png)

## Getting MetaData

In [None]:
# Pull every (data catalog, code repository) pair for the keyword "earth science":
#   KEYWORD <-hasKeyword- ANNOTATION -Body-> dataCat <-Target- ANNOTATION -Target-> codeRepo
# Each returned row carries the property maps of the dataCat and codeRepo nodes.
# NOTE(review): the trailing backslashes join the query lines with no whitespace
# in between (e.g. `...})MATCH (k)...`) — confirm this is intended.
data = graph.run('''MATCH (k:KEYWORD {keyword: "earth science"})\
MATCH (k)<-[:hasKeyword]-(a1:ANNOTATION)-[:Body]->(dc:dataCat)\
MATCH (dc)<-[:Target]-(a2:ANNOTATION)-[:Target]->(cr:codeRepo)\
RETURN distinct properties(dc), properties(cr)''').data()

## Metadata to DF

In [None]:
# Hand the raw query rows to the project helper; the result is used as a DataFrame below.
# NOTE(review): later cells rely on the columns dacat, dacat_name, cr_item, cr_name,
# forks, commits and contributors — confirm against utils.get_metadata_1.
meta_df = utils.get_metadata_1(data = data)

- Distribution of references to DBs <- ??

In [None]:
# Work on a copy so that re-binding `ref` later does not touch meta_df
ref = meta_df.copy()

In [None]:
# Preview the first rows of the copy
ref.head()

In [None]:
# Collapse to one row per data catalog: keep one name and average the repo metrics
ref = (
    ref
    .groupby('dacat')
    .agg({
        'dacat_name': 'max',
        'forks': 'mean',
        'commits': 'mean',
        'contributors': 'mean',
    })
    .reset_index()
)

In [None]:
import plotly.io as pio


@interact
def histogram_plot(opt = ['commits', 'forks', 'contributors']):
    """Bar chart of the natural log of one averaged repo metric per data catalog.

    Reads the module-level ``ref`` frame (one row per data catalog with the
    mean forks/commits/contributors of its rows).

    Parameters
    ----------
    opt : str
        Metric column to plot; ``interact`` turns the list default into a
        dropdown and passes the selected string.
    """
    # Means below 1 are dropped, so the logged values are never negative.
    # .assign builds a new frame instead of writing into a slice of `ref`
    # (the former in-place assignment triggered SettingWithCopyWarning).
    catalog_means = ref[ref[opt] >= 1]
    catalog_means = catalog_means.assign(**{opt: np.log(catalog_means[opt])})

    title_str = opt.capitalize() + ' - References to DB'
    trace = go.Bar(x = catalog_means['dacat_name'], y = catalog_means[opt])

    layout = go.Layout(
        title = title_str,  # Graph title
        xaxis = dict(title = opt.capitalize() + ' - Datacatalog'),  # x-axis label
        # The bars show ln(mean metric), not a number of repositories
        yaxis = dict(title = 'ln(mean ' + opt + ')'),  # y-axis label
    )

    fig = go.Figure(trace, layout)
    # The trace is a Bar, so the selector must target 'bar'; the former
    # 'histogram' selector matched no trace and the outline was never applied.
    fig.update_traces(marker_line_width = 0.5, selector=dict(type='bar'))
    fig.update_traces(hovertemplate=None)
    fig.update_layout(hovermode='x unified')

    fig.show()

In [None]:
# Summary statistics for every column, numeric and non-numeric alike
meta_df.describe(include = 'all')

### Plotting by Data Catalog or Code Repo

In [None]:
@interact(x=(0,500))
def show_dc_more_than(selection =['dacat','cr'], column=['forks', 'commits', 'contributors'], x = 1):
    """Table of data catalogs or code repos whose summed metric exceeds ``x``.

    Reads the module-level ``meta_df`` frame.

    Parameters
    ----------
    selection : str
        ``'dacat'`` to aggregate per data catalog, ``'cr'`` per code repository.
    column : str
        Aggregated metric column used for the threshold.
    x : int
        Exclusive lower bound; the widget slider runs from 0 to 500.

    Returns
    -------
    pandas.DataFrame
        The aggregated rows where ``column`` is greater than ``x``.

    Raises
    ------
    ValueError
        If ``selection`` is neither ``'dacat'`` nor ``'cr'`` (previously this
        surfaced as an UnboundLocalError on ``df``).
    """
    metric_totals = {'forks' : 'sum', 'commits' : 'sum', 'contributors' : 'sum'}

    if selection == 'dacat':
        # One row per data catalog; cr_item becomes the number of rows in the group
        group_key = 'dacat'
        spec = {'dacat_name': 'max', 'cr_item' : 'count', **metric_totals}
    elif selection == 'cr':
        # One row per code repository
        group_key = 'cr_item'
        spec = {'cr_name': 'max', 'dacat_name': 'max', **metric_totals}
    else:
        raise ValueError("selection must be 'dacat' or 'cr', got {!r}".format(selection))

    df = meta_df.groupby(group_key).agg(spec).reset_index()
    return df.loc[df[column] > x]

In [None]:
# Dropdown choices: the pseudo-entry 'All' followed by every catalog name in meta_df
dacat_list = ['All'] + meta_df['dacat_name'].unique().tolist()

@interact
def histogram_plot(opt = ['commits', 'forks', 'contributors'],
                  dacat = dacat_list):
    """Histogram of one repo metric on a log10 axis with summary-statistic markers.

    Reads the module-level ``meta_df`` frame.

    Parameters
    ----------
    opt : str
        Metric column to plot (chosen in the ``interact`` dropdown).
    dacat : str
        Data catalog name to restrict to, or ``'All'`` for every catalog.
    """
    if dacat == 'All':
        selected = meta_df
    else:
        selected = meta_df[meta_df['dacat_name'] == dacat]

    # Values below 1 are dropped before taking log10. Working on the Series
    # avoids assigning into a filtered frame (SettingWithCopyWarning).
    selected = selected[selected[opt] >= 1]
    log_values = np.log10(selected[opt])

    title_str = opt.capitalize() + ' - Repos with "Earth Sciences" as a Keyword'
    trace = go.Histogram(x = log_values, nbinsx=40)

    # (label, position on the log10 axis, annotation height, font size)
    markers = [
        ('Mean', log_values.mean(), .95, 18),
        ('Median', log_values.median(), 0.85, 18),
        ('25per', np.percentile(log_values, 25), 0.75, 15),
        ('75per', np.percentile(log_values, 75), 0.75, 15),
    ]

    # One dotted vertical line per statistic
    shapes = [
        {'line': {'color': 'LightSeaGreen', 'dash': 'dot', 'width': 4},
         'type': 'line',
         'x0': position,
         'x1': position,
         'xref': 'x',
         'y0': -0.1,
         'y1': 1,
         'yref': 'paper'}
        for _, position, _, _ in markers
    ]

    # Labels show the statistic back-transformed with 10**position; for the
    # mean this is the mean of the logged values raised back, i.e. a
    # geometric mean of the raw metric.
    annotations = [
        dict(
            x=position,
            y=height,
            xref='x',
            yref='paper',
            text="{} = {:,.0f}".format(label, 10**position),
            showarrow=True,
            font=dict(family="Sans Serif, monospace", size=size, color="Black"),
            arrowhead=8,
            ax=1,
            ay=1,
        )
        for label, position, height, size in markers
    ]

    layout = go.Layout(
        title = title_str,  # Graph title
        xaxis = dict(title = opt.capitalize() + ' - Datacatalog: ' + dacat),  # x-axis label
        yaxis = dict(title = 'Count'),  # y-axis label
        shapes = shapes,
        annotations = annotations,
    )

    fig = go.Figure(trace, layout)
    fig.update_traces(marker_line_width = 0.5, selector=dict(type='histogram'))
    fig.update_traces(hovertemplate=None)
    fig.update_layout(hovermode='x unified')

    # Fixed ticks: positions are log10 values, labels the matching raw values.
    # The former data-driven tick computation was removed: its result was
    # always overwritten by these values, and it raised "slice step cannot be
    # zero" whenever the metric had fewer than 13 distinct values.
    fig.update_layout(
        xaxis = dict(
            tickmode = 'array',
            tickvals = [0, .5, 1, 1.5, 2, 2.5, 3, 4],
            ticktext = [1, 3.15, 10, 31.5, 100, 316, 1000, 10000]
        )
    )
    fig.show()

In [None]:
# Inspect the rows whose contributor count is exactly 7 or 8
meta_df[(meta_df['contributors'] == 7) | (meta_df['contributors'] == 8)]

In [None]:
# A tenth of the number of commit values, truncated to an int
# (np.log and the sort leave the length unchanged)
a = int(len(sorted(np.log(meta_df['commits']))) / 10)
a

In [None]:
# Sample the sorted commit counts with a stride of about a tenth of the data,
# always keeping the maximum, then count the distinct sampled values.
# max(1, ...) keeps the slice step valid when meta_df has fewer than 10 rows
# (a step of 0 raises ValueError).
a = max(1, int(len(meta_df['commits']) / 10))

range_list = sorted(list(meta_df['commits']))
range_list_skipped = range_list[0:len(range_list):a]
range_list_skipped.append(range_list[-1])
# De-duplicate the *sampled* list; the previous code rebuilt it from the full
# range_list, which silently discarded the sampling above.
range_list_skipped = sorted(list(set(range_list_skipped)))
len(range_list_skipped)

# Analysis checking for Subject

In [None]:
# Fetch (data catalog, code repository) pairs for the SUBJECT nodes whose id is
# 313, 314, 315 or 317. Each row holds the dataCat properties, the codeRepo
# properties and the subject id. The ANNOTATION -> dataCat hop matches any
# relationship type (`-[]->`).
# NOTE(review): the trailing backslashes join the query lines with no whitespace
# in between (e.g. `(s:SUBJECT)WHERE`) — confirm this is intended.
subject_data = graph.run('''MATCH (s:SUBJECT)\
WHERE s.id IN [313, 314, 315, 317]\
MATCH (s)<-[:hasSubject]-(a:ANNOTATION)-[]->(dc:dataCat)\
MATCH (dc)<-[:Target]-(:ANNOTATION)-[:Target]->(cr:codeRepo)\
RETURN distinct properties(dc), properties(cr), s.id''').data()

In [None]:
# Hand the subject query rows to the project helper; the result is used as a DataFrame below.
# NOTE(review): later cells rely on the columns dacat, dacat_name, subject_str,
# commits, forks and contributors — confirm against utils.create_df_subject.
subject_df = utils.create_df_subject(subject_data = subject_data)

In [None]:
# Preview the first two rows
subject_df.head(2)

In [None]:
# Write the subject frame to a CSV file, using a path relative to the working directory
subject_df.to_csv('geo313_317_data_throughput.csv')

In [3]:
# Summary statistics for every column, numeric and non-numeric alike
subject_df.describe(include='all')

NameError: name 'subject_df' is not defined

In [4]:
# Per-catalog non-null counts, keeping only catalogs whose `dacat` count exceeds 100
df_for_dacat = (
    subject_df
    .groupby('dacat_name')
    .count()
    .reset_index()
    .loc[lambda counts: counts['dacat'] > 100]
)

# Dropdown choices: 'All' first, then the remaining catalog names
dacat_list = ['All'] + df_for_dacat['dacat_name'].unique().tolist()

@interact
def histogram_plot(subject = list(set(subject_df['subject_str'])),
                   option = ['commits', 'forks', 'contributors'],
                   dacat = dacat_list):
    """Histogram of one repo metric for a subject on a log10 axis, saved as SVG.

    Reads the module-level ``subject_df`` frame. Each call also writes the
    figure to ``GeoGraphs_<Option>.svg`` in the working directory, so the file
    is overwritten whenever a widget value changes.

    Parameters
    ----------
    subject : str
        Value of ``subject_str`` to keep.
    option : str
        Metric column to plot.
    dacat : str
        Data catalog name to restrict to, or ``'All'`` for every catalog.
    """
    if dacat == 'All':
        selected = subject_df
    else:
        selected = subject_df[subject_df['dacat_name'] == dacat]

    selected = selected[selected['subject_str'] == subject]

    # Values below 1 are dropped before taking log10. Working on the Series
    # avoids assigning into a filtered frame (SettingWithCopyWarning).
    selected = selected[selected[option] >= 1]
    log_values = np.log10(selected[option])

    title_str = option.capitalize() + ' - Repos with " '+ subject + '" as a Subject'
    trace = go.Histogram(x = log_values, nbinsx=50, marker_color = 'white')

    # (label, position on the log10 axis, annotation height)
    markers = [
        ('Mean', log_values.mean(), .95),
        ('Median', log_values.median(), 0.87),
        ('25 percentile', np.percentile(log_values, 25), 0.79),
        ('75 percentile', np.percentile(log_values, 75), 0.71),
    ]

    # One dash-dotted vertical line per statistic
    shapes = [
        {'line': {'color': 'Black', 'dash': 'dashdot', 'width': 1},
         'type': 'line',
         'x0': position,
         'x1': position,
         'xref': 'x',
         'y0': -0.1,
         'y1': 1,
         'yref': 'paper'}
        for _, position, _ in markers
    ]

    # All labels share one x position (1.5 log units right of the mean) and are
    # stacked vertically; the text is the statistic back-transformed with 10**position.
    label_x = log_values.mean()+1.5
    annotations = [
        dict(
            x=label_x,
            y=height,
            xref='x',
            yref='paper',
            text="{} = {:,.0f}".format(label, 10**position),
            showarrow=True,
            font=dict(family="Times New Roman", size=14, color="Black"),
            arrowhead=8,
            ax=1,
            ay=1,
        )
        for label, position, height in markers
    ]

    layout = go.Layout(
        title = title_str,  # Graph title
        xaxis = dict(title = option.capitalize() + '- Datacatalog: ' + dacat),  # x-axis label
        yaxis = dict(title = 'Count'),  # y-axis label
        shapes = shapes,
        annotations = annotations,
    )

    fig = go.Figure(trace, layout)
    fig.update_traces(marker_line_width = 1, selector=dict(type='histogram'))
    fig.update_traces(hovertemplate=None)
    fig.update_layout(hovermode='x unified')

    # Fixed ticks: positions are log10 values, labels the matching raw values.
    # The former `ticks_vals_list` computation was removed: its result was never
    # used, it took log10 of values that were already logged, and indexing
    # `log_list[-3]` raised IndexError with fewer than three distinct values.
    fig.update_layout(
        xaxis = dict(
            tickmode = 'array',
            tickvals = [0,    .5,      1,    1.5,    2,    2.5,     3,       3.5,     4,        5.1, 5.7],
            ticktext = ['1', '3.15', '10', '31.5', '100', '316', '1,000', '3,162', '10,000', '125,892', '501,187'],
        ),
        template='none',
        xaxis_showgrid=False,
        yaxis_showgrid=False
    )
    fig.update_yaxes(zeroline=True, rangemode = 'tozero')
    fig.update_xaxes(zeroline=True)

    file_name = 'GeoGraphs_'+option.capitalize()+'.svg'
    fig.write_image(file_name)
    fig.show()

NameError: name 'subject_df' is not defined

![img2](img/subject_graph.png)

Green: Data Cat
Navy blue: subject
Pink: Code Repo
Light Blue: Annotation

## All Data Without Subjects

In [24]:
#all_data = graph.run('''MATCH (s:SUBJECT)<-[:hasSubject]-(a:ANNOTATION)-[]->(dc:dataCat)\
#MATCH (dc)<-[:Target]-(:ANNOTATION)-[:Target]->(cr:codeRepo)\
#RETURN distinct properties(dc), properties(cr), s.id''').data()

In [25]:
#all_data = utils.create_all_df(all_data)

In [26]:
#all_data.to_csv('all_throughput_data_w_subject.csv')

In [6]:
# Load the previously exported CSV (the Neo4j query cells above are commented out);
# index_col = 0 uses the first CSV column as the index.
all_data = pd.read_csv('output_data/all_throughput_data_w_subject.csv', index_col = 0)
all_data.head(3)

Unnamed: 0,dacat,dacat_name,cr_item,cr_name,forks,commits,contributors,subject
0,r3d100010313dc,National Earthquake Information Center,247252173cr,chinapedia/wikipedia.ko,0,326,1,3
1,r3d100010313dc,National Earthquake Information Center,247252173cr,chinapedia/wikipedia.ko,0,326,1,34
2,r3d100010313dc,National Earthquake Information Center,18522395cr,usgs/earthquake-website,54,3641,29,3


In [7]:
# Work on a copy so the column added below does not modify all_data
all_df = all_data.copy()

In [8]:
def find_geology(x):
    """Label a subject id: 314 maps to 'Geology_Palaeontology', anything else to 'Other'."""
    return 'Geology_Palaeontology' if x == 314 else 'Other'

In [9]:
# Tag every row as 'Geology_Palaeontology' (subject 314) or 'Other'
all_df['Geology_Palaeontology'] = all_df['subject'].apply(find_geology)

In [10]:
# Preview the first two rows, including the new label column
all_df.head(2)

Unnamed: 0,dacat,dacat_name,cr_item,cr_name,forks,commits,contributors,subject,Geology_Palaeontology
0,r3d100010313dc,National Earthquake Information Center,247252173cr,chinapedia/wikipedia.ko,0,326,1,3,Other
1,r3d100010313dc,National Earthquake Information Center,247252173cr,chinapedia/wikipedia.ko,0,326,1,34,Other


In [11]:
# Show only the rows labelled 'Geology_Palaeontology'
all_df[all_df['Geology_Palaeontology'] == 'Geology_Palaeontology']

Unnamed: 0,dacat,dacat_name,cr_item,cr_name,forks,commits,contributors,subject,Geology_Palaeontology
4807,r3d100012308dc,DesignSafe-CI,62251072cr,DesignSafe-CI/ansible,0,173,5,314,Geology_Palaeontology
4812,r3d100012308dc,DesignSafe-CI,39967132cr,DesignSafe-CI/portal,15,4499,18,314,Geology_Palaeontology
4817,r3d100012308dc,DesignSafe-CI,226961036cr,flintm/rsb-illustration,2,14,2,314,Geology_Palaeontology
4822,r3d100012308dc,DesignSafe-CI,140017532cr,fnets/designsafe-e2e,0,17,2,314,Geology_Palaeontology
4827,r3d100012308dc,DesignSafe-CI,59690720cr,DesignSafe-CI/community-forums,0,13,2,314,Geology_Palaeontology
...,...,...,...,...,...,...,...,...,...
378591,r3d100010267dc,International Ocean Discovery Program,157687257cr,japhir/cp-2018-42,0,11,1,314,Geology_Palaeontology
378598,r3d100010267dc,International Ocean Discovery Program,184012990cr,chinapedia/wikipedia.zh.mediawiki,8,7,1,314,Geology_Palaeontology
378605,r3d100010267dc,International Ocean Discovery Program,69621002cr,wkiri/MTE-corpus,2,43,1,314,Geology_Palaeontology
378612,r3d100010267dc,International Ocean Discovery Program,223410541cr,EarthSystemDiagnostics/climproxyrecords,0,49,1,314,Geology_Palaeontology


In [13]:
# Preview the first rows of the labelled frame
all_df.head()

Unnamed: 0,dacat,dacat_name,cr_item,cr_name,forks,commits,contributors,subject,Geology_Palaeontology
0,r3d100010313dc,National Earthquake Information Center,247252173cr,chinapedia/wikipedia.ko,0,326,1,3,Other
1,r3d100010313dc,National Earthquake Information Center,247252173cr,chinapedia/wikipedia.ko,0,326,1,34,Other
2,r3d100010313dc,National Earthquake Information Center,18522395cr,usgs/earthquake-website,54,3641,29,3,Other
3,r3d100010313dc,National Earthquake Information Center,18522395cr,usgs/earthquake-website,54,3641,29,34,Other
4,r3d100010313dc,National Earthquake Information Center,94204283cr,ttsteiger/ttsteiger.github.io,0,102,2,3,Other


In [21]:
# Number of non-null cr_item entries for every (subject, data catalog) pair
all_df_dist = (
    all_df
    .groupby(['subject', 'dacat'])['cr_item']
    .count()
    .reset_index()
)
all_df_dist.head(3)

Unnamed: 0,subject,dacat,cr_item
0,1,r3d100000005dc,3
1,1,r3d100000006dc,51
2,1,r3d100000028dc,5


In [22]:
# Split the per-catalog counts into subject 314 and everything else
geo_sel = all_df_dist.query('subject == 314')
other_sel = all_df_dist.query('subject != 314')

In [36]:
# Bar chart of repo counts per data catalog: subject 314 in black, the rest in silver
fig = go.Figure(data=[
    go.Bar(x = geo_sel['dacat'], y = geo_sel['cr_item'],
           name = 'Geosci', opacity=1.0, marker_color = 'black'),
    go.Bar(x = other_sel['dacat'], y = other_sel['cr_item'],
           name = 'Other', opacity=.7, marker_color = 'silver'),
])

fig.update_layout(
    title = 'Count of Repos per Data Catalog',
    hovermode ='closest',
    template='none',
    xaxis_showgrid=False,
    yaxis_showgrid=False,
)

# Order the catalogs by total bar height and hide their tick labels
fig.update_xaxes(
    title = 'Data Catalogs',
    categoryorder = 'total descending',
    showticklabels=False,
    linecolor='black',
    zeroline=True,
)

fig.show()
#pio.write_html(fig, file='Count_Repos_Catalog.html', auto_open=True)

# Save a static copy next to the notebook
file_name = 'Count_Repos_Catalog.svg'
fig.write_image(file_name)

In [12]:
import plotly.express as px

# Per-catalog non-null counts, keeping only catalogs whose `dacat` count exceeds 10
df_for_dacat = (
    all_df
    .groupby('dacat_name')
    .count()
    .reset_index()
    .loc[lambda counts: counts['dacat'] > 10]
)

# Dropdown choices: 'All' first, then the remaining catalog names
dacat_list = ['All'] + df_for_dacat['dacat_name'].unique().tolist()

@interact
def histogram_plot(opt = ['commits', 'forks', 'contributors'],
                  dacat = dacat_list):
    """Overlayed log10 histograms of one metric: 'Other' vs 'Geology_Palaeontology'.

    Reads the module-level ``all_df`` frame and its ``Geology_Palaeontology``
    label column.

    Parameters
    ----------
    opt : str
        Metric column to plot.
    dacat : str
        Data catalog name to restrict to, or ``'All'`` for every catalog.
    """
    if dacat == 'All':
        selected = all_df
    else:
        selected = all_df[all_df['dacat_name'] == dacat]

    # Values below 1 are dropped before taking log10. The logged values live in
    # their own Series, so nothing is assigned into a filtered frame (the former
    # in-place assignment triggered SettingWithCopyWarning).
    selected = selected[selected[opt] >= 1]
    log_values = np.log10(selected[opt])

    title_str = opt.capitalize() + '- All Repos within Throughput.'

    geology_values = log_values[selected['Geology_Palaeontology'] == 'Geology_Palaeontology']
    other_values = log_values[selected['Geology_Palaeontology'] == 'Other']

    # NOTE(review): histfunc="sum" is set but no y values are supplied —
    # confirm the bars are meant to be plain counts per bin.
    fig = go.Figure()
    fig.add_trace(go.Histogram(histfunc="sum", name = 'Other', x = other_values, nbinsx=50, opacity=0.8))
    fig.add_trace(go.Histogram(histfunc="sum", name = 'Geology_Palaeontology', x = geology_values, nbinsx=50, opacity=0.8))

    fig.update_layout(
        title = title_str,  # Graph title
        xaxis = dict(title = opt.capitalize() + '- Datacatalog: ' + dacat),  # x-axis label
        yaxis = dict(title = 'Count'),  # y-axis label
        hovermode ='closest',  # handles multiple points landing on the same vertical
    )
    fig.update_traces(marker_line_width = 0.5, selector=dict(type='histogram'))

    # Fixed ticks: positions are log10 values, labels the matching raw values
    fig.update_layout(
        xaxis = dict(
            tickmode = 'array',
            tickvals = [0, .5, 1, 1.5, 2, 2.5, 3, 4],
            ticktext = [1, 3.15, 10, 31.5, 100, 316, 1000, 10000]
        )
    )

    fig.show()

interactive(children=(Dropdown(description='opt', options=('commits', 'forks', 'contributors'), value='commits…

In [34]:
# Preview the first three rows
all_df.head(3)

Unnamed: 0,dacat,dacat_name,cr_item,cr_name,forks,commits,contributors,subject,Geology_Palaeontology
0,r3d100010313dc,National Earthquake Information Center,247252173cr,chinapedia/wikipedia.ko,0,326,1,3,Other
1,r3d100010313dc,National Earthquake Information Center,247252173cr,chinapedia/wikipedia.ko,0,326,1,34,Other
2,r3d100010313dc,National Earthquake Information Center,18522395cr,usgs/earthquake-website,54,3641,29,3,Other


In [26]:
# Mean forks / commits / contributors for every (subject, data catalog name) pair.
# The former `grouped_all_df.sort_values(by='commits')` statement was dropped:
# its result was neither assigned nor displayed, so it had no effect.
grouped_all_df = (
    all_df
    .groupby(['subject', 'dacat_name'])
    .agg({'forks':'mean', 'commits':'mean', 'contributors':'mean'})
    .reset_index()
)

@interact
def histogram_plot(opt = ['commits', 'forks', 'contributors']):
    """Per-catalog averages of one metric, subject 314 vs. the other catalogs.

    Reads the module-level ``grouped_all_df`` frame. Each call also writes the
    figure to ``ThroughputGraphs_<Opt>.svg`` in the working directory.

    Parameters
    ----------
    opt : str
        Metric column to plot.

    NOTE(review): ``drop_duplicates`` keeps only the first row per catalog
    after sorting by name, so for 'Other' a catalog with several subjects
    contributes just one subject's mean — confirm this is intended.
    """
    title_str = opt.capitalize() + '- All Repos by DaCat within Throughput.'

    means = grouped_all_df.copy()
    means = means[means[opt] >= 1]
    is_geo = means['subject'] == 314

    # Catalogs tagged with subject 314, one row per catalog name
    geo_sel = (
        means[is_geo]
        .sort_values(by = 'dacat_name')
        .reset_index()
        .drop_duplicates(subset = ['dacat_name'])
    )
    list_314 = geo_sel['dacat_name'].to_list()

    # Every other subject, minus the catalogs already shown as subject 314
    other_sel = means[~is_geo]
    other_sel = (
        other_sel[~other_sel['dacat_name'].isin(list_314)]
        .sort_values(by = 'dacat_name')
        .reset_index()
        .drop_duplicates(subset = ['dacat_name'])
    )

    # 'Other' is added first, then the subject-314 trace
    fig = go.Figure(data=[
        go.Histogram(histfunc = 'avg', x = other_sel['dacat_name'], y = other_sel[opt],
                     name = 'Other', opacity=1.0, marker_color = 'silver'),
        go.Histogram(histfunc = 'avg', x = geo_sel['dacat_name'], y = geo_sel[opt],
                     name = 'Geology_Palaeontology', opacity=1.0, marker_color = 'black'),
    ])

    fig.update_layout(
        title = title_str,
        hovermode ='closest',
        template='none',
        xaxis_showgrid=False,
        yaxis_showgrid=False,
    )
    fig.update_xaxes(
        title = 'Data Repositories',
        categoryorder = 'total descending',
        showticklabels=False,
        linecolor='black',
        zeroline=True,
    )

    # commits and forks get a fixed y range; contributors keeps plotly's automatic range
    fixed_ranges = {'commits': (0,30000), 'forks': (0,500)}
    yaxis_opts = dict(title = opt.capitalize(), linecolor='black', zeroline=True)
    if opt in fixed_ranges:
        yaxis_opts['range'] = fixed_ranges[opt]
    fig.update_yaxes(**yaxis_opts)

    file_name = 'ThroughputGraphs_'+opt.capitalize()+'.svg'
    fig.write_image(file_name)

    fig.show()

interactive(children=(Dropdown(description='opt', options=('commits', 'forks', 'contributors'), value='commits…