### Using D3 modules excluded from make_js_viz_data.ipynb

In [1]:
import requests
import logging
import pandas as pd
import numpy as np
import urllib2
import dill
import time
import json
from datetime import date, timedelta
from bs4 import BeautifulSoup
import unicodedata
import csv
import sys
sys.path.append('../code')

In [2]:
# Read the package list from ../output/D3mods_pkgs.csv; only the first CSV
# row is kept as the list of module names.
with open('../output/D3mods_pkgs.csv', 'rb') as f:
    reader = csv.reader(f)
    D3_modules = list(reader)[0]

In [3]:
# Show the loaded module names.
print(D3_modules)

['d3-hcg', 'd3-table', 'mpld3', 'd3-timelines', 'd3-horizon-chart', 'd3-axes', 'forest-d3', 'd3-component', 'd3-resume', 'd3-timeline', 'd3-hexbin', 'd3', 'd3-mesh', 'd3-geo-warp', 'd3-geo', 'd3pie', 'd3-shape', 'd3-scale-chromatic', 'react-d3-components', 'd3-x3dom-shape', 'd3-format', 'd3-binarytree', 'd3.chart', 'd3-components', 'd3-area-label', 'd3-force-cluster', 'd3-summary-tiles', 'd3-annotation', 'd3-source-sink', 'd3-axis', 'd3-tube-map', 'd3-kit', 'd3fc', 'd3-collection', 'd3-canvas-transition', 'd3-bar', 'd3-binaryree', 'd3-wrap', 'd3-view', 'd3-peaks', '@plotly/d3-sankey', 'd3-bboxCollide', 'd3-force-3d', 'd3-scale-interactive', 'd3-czip', 'd3-polygon', 'd3-hierarchy', 'd3-geo-projection', 'd3-brush-2', 'd3-let', '@zambezi/d3-rebind', 'd3-brush', 'd3-foodweb', 'd3-icon', 'd3-force-bounce', 'd3-beeswarm', 'd3-ellipse-force', 'd3-interpolate', 'g-chartcolour', 'd3panels', 'bki-d3-timer', 'd3-sankeyseq', 'd3-nelson-rules', 'd3-hypergraph', 'd3-hist2d', 'd3act', 'd3-graphviz', 

In [4]:
# Number of module names currently in the list.
len(D3_modules)

193

In [5]:
# Only individual modules are of interest, not d3 as a whole, so drop the
# entries naming the full library.
for whole_library in ('d3', 'd3.js'):
    D3_modules.remove(whole_library)

### Get GitHub forks and stars

In [6]:
# Load the personal GitHub token kept out of version control and build the
# Authorization header used for every GitHub API request.
with open("../code/secrets/github-token.nogit", "r") as f:
    # Strip surrounding whitespace: a trailing newline in the file would
    # otherwise end up inside the header value.
    token = f.read().strip()
headers = {'Authorization': 'token %s' % token}

In [7]:
def get_data_from_github(query):
    """Use github search to return stats from top query result.

    Args:
        query: module name to search for; any "@" is removed before the
            search is sent.

    Returns:
        dict with keys 'module', 'github_repo', 'stars', 'forks' and
        'description'. When the response has no usable top result the repo
        and description are 'NA' and the counts are NaN. A top result whose
        description is missing keeps its repo, stars and forks and only gets
        'NA' as description.

    Raises:
        requests.HTTPError: if GitHub answers with an error status.
    """
    # Let requests build the query string so the search term is URL-encoded.
    r = requests.get('https://api.github.com/search/repositories',
                     params={'q': query.replace("@", "")},
                     headers=headers)
    r.raise_for_status()
    try:
        res = r.json()['items'][0]
        stats = {'module': query,
                 'github_repo': res['full_name'],
                 'stars': res['stargazers_count'],
                 'forks': res['forks_count']}
        description = res['description']
    except (ValueError, KeyError, IndexError, TypeError):
        # Undecodable body, no search hits, or an unexpected result shape.
        return {'module': query, 'github_repo': 'NA', 'stars': np.nan,
                'forks': np.nan, 'description': 'NA'}

    if description is None:
        # A repository without a description must not discard its stats.
        stats['description'] = 'NA'
    else:
        stats['description'] = unicodedata.normalize(
            'NFKD', description).encode('ascii', 'ignore')
    return stats

In [8]:
# use generator to avoid repeat API calls; API limit with token: 30 api calls/min
# Query GitHub in batches of 20, pausing between batches so the search API
# limit is respected (API limit with token: 30 api calls/min).
github_batch_size = 20
github_data = []
for start in range(0, len(D3_modules), github_batch_size):
    if start:
        # Wait only between batches; no pause before the first or after the last.
        time.sleep(61)
    github_data.extend(get_data_from_github(query)
                       for query in D3_modules[start:start + github_batch_size])
print("DONE")

DONE


In [9]:
# Keep only the columns used downstream (the description is dropped).
github_columns = ['module', 'github_repo', 'stars', 'forks']
githubDF = pd.DataFrame(github_data)[github_columns]

In [10]:
# Preview the five rows with the most stars.
githubDF.sort_values(by=['stars'], ascending=False).head(5)

Unnamed: 0,module,github_repo,stars,forks
120,d3-cloud,jasondavies/d3-cloud,2175.0,757.0
168,react-d3,esbullington/react-d3,1640.0,180.0
2,mpld3,mpld3/mpld3,1490.0,259.0
15,d3-shape,d3/d3-shape,1444.0,125.0
151,d3-queue,d3/d3-queue,1320.0,128.0


### Get Stack Overflow tags and questions

In [11]:
baseurl = 'https://api.stackexchange.com/2.2/search/advanced'

def get_so_tag_counts(tag):
    """Given tag, return tag counts.

    Sends `tag` as the "tagged" parameter of the Stack Exchange advanced
    search (site "stackoverflow", filter "total") and returns the 'total'
    field of the JSON response.

    Returns 0 when the request fails or the response has no usable 'total'.
    """
    params = {
        "site": "stackoverflow",
        "key": "y38PeNERQJQIC8EPliKAVQ((",
        "tagged": tag,
        "filter": 'total'}
    try:
        r = requests.get(baseurl, params=params)
        return r.json()['total']
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Best-effort lookup: request/decoding problems count as "no hits",
        # but KeyboardInterrupt and SystemExit still propagate.
        return 0

def get_so_question_counts(package):
    """Given package, return count of questions containing package name.

    Sends `package` as the free-text "q" parameter of the Stack Exchange
    advanced search (site "stackoverflow", filter "total") and returns the
    'total' field of the JSON response.

    Returns 0 when the request fails or the response has no usable 'total'.
    """
    params = {
        "site": "stackoverflow",
        "key": "y38PeNERQJQIC8EPliKAVQ((",
        "q": package,
        "filter": "total"}
    try:
        r = requests.get(baseurl, params=params)
        return r.json()['total']
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Best-effort lookup: request/decoding problems count as "no hits",
        # but KeyboardInterrupt and SystemExit still propagate.
        return 0

In [12]:
# One record per module: its tag count first, then its question count.
so_data = []
for module in D3_modules:
    tag_total = get_so_tag_counts(module)
    question_total = get_so_question_counts(module)
    so_data.append({'module': module,
                    'tag_count': tag_total,
                    'question_count': question_total})

In [13]:
# Fix the column order of the Stack Overflow table.
so_columns = ['module', 'tag_count', 'question_count']
so_DF = pd.DataFrame(so_data)[so_columns]

In [14]:
# Preview the five rows with the highest tag counts.
so_DF.sort_values(by=['tag_count'], ascending=False).head(5)

Unnamed: 0,module,tag_count,question_count
2,mpld3,85,127
182,dagre-d3,52,74
186,d3plus,49,52
168,react-d3,29,350
31,d3fc,5,6


### Get npmjs downloads and growth rate

In [15]:
def extract_count(html):
    """Return the sum of the per-entry 'downloads' values in a JSON payload.

    `html` is a JSON document whose top-level 'downloads' key holds a list
    of objects, each carrying its own 'downloads' number.
    """
    daily_records = json.loads(html).get(u'downloads')
    return sum(record.get(u'downloads') for record in daily_records)

def get_npmjs_counts(package, start_date, end_date):
    """Return total downloads of `package` between two dates.

    Enter package and dates as strings, date format YYYY-MM-DD. The total is
    computed from the api.npmjs.org downloads/range response for
    start_date:end_date. Returns NaN when the request raises an HTTP error.
    """
    url = ('https://api.npmjs.org/downloads/range/'
           + start_date + ':' + end_date + '/' + package)
    try:
        payload = urllib2.urlopen(url).read()
        return extract_count(payload)
    except urllib2.HTTPError:
        return np.nan

def get_npmjs_cmgr(package, first_month, last_month):
    """Return the compound monthly growth rate of downloads for `package`.

    Enter package as string, each month as a list of two integers [YYYY, M].
    For each month, downloads are totalled over the range from the 1st of
    that month to 30 days later. The rate is
    (last_total / first_total) ** (1 / n) - 1, where n is the number of
    months from first_month to last_month counting both ends.

    Returns NaN when the first window has zero downloads or when a request
    raises an HTTP error.
    """
    def window_total(window_start):
        # Total downloads from window_start through window_start + 30 days.
        window_end = window_start + timedelta(days=30)
        url = ('https://api.npmjs.org/downloads/range/'
               + window_start.strftime("%Y-%m-%d") + ':'
               + window_end.strftime("%Y-%m-%d") + '/' + package)
        return extract_count(urllib2.urlopen(url).read())

    try:
        first = date(first_month[0], first_month[1], 1)
        last = date(last_month[0], last_month[1], 1)
        n_months = (last.year - first.year) * 12 + last.month - first.month + 1
        first_total = window_total(first)
        last_total = window_total(last)
    except urllib2.HTTPError:
        return np.nan

    if first_total == 0:
        # Growth from zero is undefined.
        return np.nan
    return ((last_total / float(first_total)) ** (1.0 / n_months)) - 1.0
    

def get_npmjs_stats(npm_package_list, start_date='2014-08-01',
                    end_date='2017-07-31', first_month=(2017, 2),
                    last_month=(2017, 7)):
    """Return download totals and compound monthly growth rates per package.

    Args:
        npm_package_list: iterable of npm package names.
        start_date, end_date: download-count range as 'YYYY-MM-DD' strings;
            the defaults are the range this notebook was written for.
        first_month, last_month: (YYYY, M) pairs bounding the growth-rate
            calculation; the defaults are February and July 2017.

    Returns:
        (downloads, growth_rates): two lists in the same order as
        npm_package_list.
    """
    downloads = []
    growth_rates = []
    for package in npm_package_list:
        downloads.append(get_npmjs_counts(package, start_date, end_date))
        growth_rates.append(get_npmjs_cmgr(package, first_month, last_month))
    return downloads, growth_rates

In [16]:
downloads, growth_rates = get_npmjs_stats(D3_modules)
# Build the frame column by column: stacking the names and the numbers into
# one NumPy array would coerce every value to a string and leave 'downloads'
# and 'growth_rates' as text instead of numeric columns.
downloadsDF = pd.DataFrame({'module': D3_modules,
                            'downloads': downloads,
                            'growth_rates': growth_rates},
                           columns=['module', 'downloads', 'growth_rates'])

### Put all stats together

In [17]:
# Join the GitHub, Stack Overflow and npm tables on the module name.
bigDF = pd.merge(pd.merge(githubDF, so_DF, on='module'), downloadsDF, on='module')

In [18]:
# Preview the first five rows of the combined table.
bigDF.head(5)

Unnamed: 0,module,github_repo,stars,forks,tag_count,question_count,downloads,growth_rates
0,d3-hcg,d3/d3-hcg,9.0,3.0,0,1,1021.0,-0.166516548034
1,d3-table,RajanRastogi/d3-table,4.0,1.0,0,1434,211.0,0.064440073412
2,mpld3,mpld3/mpld3,1490.0,259.0,85,127,,
3,d3-timelines,jiahuang/d3-timeline,770.0,236.0,0,136,246.0,
4,d3-horizon-chart,kmandov/d3-horizon-chart,38.0,7.0,0,14,2330.0,0.0844578398263


In [19]:
# Write the combined table to the output folder.
bigDF.to_csv(path_or_buf="../output/D3_modules_Data.csv")