## Ranking Deep Learning Libraries

In [1]:
import csv
import json
import logging
import sys
import time
import unicodedata
import urllib2
from datetime import date, timedelta

import dill
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Extend the module search path *before* importing the local helper below;
# presumably get_stack_overflow_data lives in ../code (confirm).
sys.path.append('../code')
sys.path.append('../data')

import get_stack_overflow_data as gso

### Create list of libraries

### 1 https://svds.com/understanding-ai-toolkits/

In [2]:
# Scrape the SVDS article: for every <li> without a class attribute, collect
# the lower-cased, UTF-8 encoded children of its first <a> tag.
r1 = requests.get('https://svds.com/understanding-ai-toolkits/')
r1.raise_for_status()  # stop here rather than parse an HTTP error page
soup1 = BeautifulSoup(r1.text, 'lxml')
toolkit_list1 = []
for litags in soup1.findAll('li', attrs={'class': None}):
    anchor = litags.find('a')
    if anchor is None:
        # find() returns None when the <li> holds no link; iterating None
        # would raise TypeError, so such items are skipped.
        continue
    # Iterating a tag yields its direct children.
    for atag in anchor:
        toolkit_list1.append(atag.encode('utf-8').lower())

In [3]:
# Keep only the first eight scraped entries.
toolkit_list1 = toolkit_list1[:8]

In [4]:
toolkit_list1

['tensorflow',
 'mxnet',
 'deeplearning4j',
 'torch',
 'pytorch',
 'ctnk',
 'caffe',
 'theano']

### 2 https://en.wikipedia.org/wiki/Comparison_of_deep_learning_software

In [5]:
# Download and parse the Wikipedia comparison page; the table rows are read
# in the next cell.
r2 = requests.get('https://en.wikipedia.org/wiki/Comparison_of_deep_learning_software')
r2.raise_for_status()  # stop here rather than parse an HTTP error page
soup2 = BeautifulSoup(r2.text, 'lxml')
toolkit_list2 = []

In [6]:
# Take the text of the first <td> of every table row that has one
# (rows without a <td> are skipped).
for row in soup2.findAll('tr'):
    first_cell = row.find('td')
    if first_cell:
        toolkit_list2.append(first_cell.text.encode('utf-8').lower())

In [7]:
toolkit_list2

['apache singa',
 'bigdl',
 'caffe',
 'deeplearning4j',
 'dlib',
 'keras',
 'matconvnet',
 'microsoft cognitive toolkit',
 'mxnet',
 'neural designer',
 'opennn',
 'tensorflow',
 'theano',
 'torch',
 'wolfram mathematica']

### 3 https://www.packtpub.com/books/content/top-10-deep-learning-frameworks

In [8]:
# Entered by hand: requesting the page returns 403 Forbidden.
toolkit_list3 = [
    'tensorflow',
    'theano',
    'keras',
    'caffe',
    'torch',
    'deeplearning4j',
    'mxnet',
    'microsoft cognitive toolkit',
    'lasagne',
    'bigdl',
]

In [9]:
toolkit_list3

['tensorflow',
 'theano',
 'keras',
 'caffe',
 'torch',
 'deeplearning4j',
 'mxnet',
 'microsoft cognitive toolkit',
 'lasagne',
 'bigdl']

### 4 https://twitter.com/fchollet/status/882995652233371648

In [10]:
# Entered by hand from the images (jpgs) attached to the tweet.
toolkit_list4 = [
    'tensorflow',
    'keras',
    'mxnet',
    'caffe2',
    'pytorch',
    'caffe',
    'paddle',
    'cntk',
    'deeplearning4j',
    'tflearn',
    'dlib',
    'theano',
    'chainer',
    'digits',
    'dynet',
    'sonnet',
]

In [11]:
toolkit_list4

['tensorflow',
 'keras',
 'mxnet',
 'caffe2',
 'pytorch',
 'caffe',
 'paddle',
 'cntk',
 'deeplearning4j',
 'tflearn',
 'dlib',
 'theano',
 'chainer',
 'digits',
 'dynet',
 'sonnet']

### 5 https://svds.com/wp-content/uploads/2017/02/Deep_learning_ratings_final-1024x563.png

In [12]:
# Entered by hand from the ratings image (png).
toolkit_list5 = [
    'theano',
    'tensorflow',
    'torch',
    'caffe',
    'mxnet',
    'neon',
    'cntk',
]

In [13]:
toolkit_list5

['theano', 'tensorflow', 'torch', 'caffe', 'mxnet', 'neon', 'cntk']

### combine and edit lists

In [14]:
# Pool the five source lists, then de-duplicate by passing them through a set.
all_sources = (toolkit_list1 + toolkit_list2 + toolkit_list3
               + toolkit_list4 + toolkit_list5)
toolkits = list(set(all_sources))

In [15]:
print toolkits

['sonnet', 'torch', 'neon', 'neural designer', 'tflearn', 'opennn', 'deeplearning4j', 'keras', 'paddle', 'caffe', 'tensorflow', 'ctnk', 'bigdl', 'cntk', 'pytorch', 'matconvnet', 'dlib', 'digits', 'microsoft cognitive toolkit', 'mxnet', 'caffe2', 'wolfram mathematica', 'lasagne', 'chainer', 'dynet', 'theano', 'apache singa']


In [16]:
# 'ctnk' is a typo for cntk and 'microsoft cognitive toolkit' is the same
# product as cntk, so both duplicates are dropped.
for duplicate in ('ctnk', 'microsoft cognitive toolkit'):
    toolkits.remove(duplicate)

# Replace short names with their longer, vendor-qualified form.
for short_name, qualified_name in (('digits', 'nvidia digits'),
                                   ('neon', 'nervana neon'),
                                   ('paddle', 'paddlepaddle')):
    toolkits.remove(short_name)
    toolkits.append(qualified_name)

# neural designer and wolfram mathematica are proprietary with no github
# repositories, so they are excluded.
for proprietary in ('neural designer', 'wolfram mathematica'):
    toolkits.remove(proprietary)

In [17]:
# Persist the final toolkit names as one space-separated line.
# NOTE(review): some names contain spaces themselves (e.g. 'apache singa'),
# so the file cannot be split back into names unambiguously; the format is
# kept as-is because other code may read it.
joined_names = " ".join(toolkits)
with open("../data/dl_toolkits_final.txt", "w") as out_file:
    out_file.write(joined_names)

In [18]:
# Sort the toolkit names alphabetically, in place, and show the result.
toolkits.sort()
# Parenthesised form prints the same under Python 2 and also runs on Python 3.
print(toolkits)

['apache singa', 'bigdl', 'caffe', 'caffe2', 'chainer', 'cntk', 'deeplearning4j', 'dlib', 'dynet', 'keras', 'lasagne', 'matconvnet', 'mxnet', 'nervana neon', 'nvidia digits', 'opennn', 'paddlepaddle', 'pytorch', 'sonnet', 'tensorflow', 'tflearn', 'theano', 'torch']


### get metrics

In [19]:
logging.basicConfig(level=logging.INFO)

### github stats

In [20]:
# Read the personal github API token kept outside version control.
with open("../code/secrets/github-token.nogit", "r") as f:
    # Strip surrounding whitespace: a trailing newline in the file would
    # otherwise end up inside the Authorization header value.
    token = f.read().strip()

# Authorization header sent with every github API request below.
headers = {'Authorization': 'token %s' % token}

def get_data_from_search(query):
    """Use github search to return stars, forks for top query result.

    Args:
        query: search string passed to the github repository search API.

    Returns:
        A dict with keys 'toolkit' (the query), 'full_name', 'stars' and
        'forks' taken from the first search hit, or None when the response
        contains no usable hit.

    Raises:
        requests.HTTPError: if github answers with an error status.
    """
    # Let requests build the query string so that spaces and other special
    # characters in the query are escaped correctly.
    r = requests.get('https://api.github.com/search/repositories',
                     params={'q': query},
                     headers=headers)
    r.raise_for_status()
    try:
        res = r.json()['items'][0]
        return {'toolkit': query, 'full_name': res['full_name'],
                'stars': int(res['stargazers_count']), 'forks': int(res['forks_count'])}
    except (IndexError, KeyError, TypeError, ValueError):
        # Empty result list, missing field or malformed body: keep the
        # best-effort contract (return None) but leave a trace in the log.
        logging.warning('No usable github search result for %r', query)
        return None

In [21]:
# Query github once per toolkit and keep only the searches that produced a hit.
data = []
for toolkit_name in toolkits:
    hit = get_data_from_search(toolkit_name)
    if hit is not None:
        data.append(hit)

In [22]:
# Tabulate the search hits with a fixed column order.
github_columns = ['toolkit', 'full_name', 'forks', 'stars']
github = pd.DataFrame(data)[github_columns]

In [23]:
github

Unnamed: 0,toolkit,full_name,forks,stars
0,apache singa,apache/incubator-singa,253,1036
1,bigdl,intel-analytics/BigDL,401,1990
2,caffe,BVLC/caffe,12371,20155
3,caffe2,caffe2/caffe2,1233,5628
4,chainer,chainer/chainer,772,2925
5,cntk,Microsoft/CNTK,3190,12366
6,deeplearning4j,deeplearning4j/deeplearning4j,3590,7175
7,dlib,davisking/dlib,941,2724
8,dynet,clab/dynet,435,1723
9,keras,fchollet/keras,7067,19504


In [24]:
# github search returned the wrong repo for nvidia digits, torch and
# paddlepaddle, so those rows are overwritten with figures fetched directly
# from the intended repositories.
# NOTE(review): the earlier comment also listed intel bigdl, but no override
# for bigdl exists in this cell.
manual_repos = [
    ('nvidia digits', 'NVIDIA/DIGITS'),
    ('torch', 'torch/torch7'),
    ('paddlepaddle', 'PaddlePaddle/Paddle'),
]
for toolkit_name, repo_name in manual_repos:
    r = requests.get('https://api.github.com/repos/' + repo_name, headers=headers)
    # Surface HTTP errors here instead of as a KeyError on the lookups below.
    r.raise_for_status()
    res = r.json()
    row_mask = github['toolkit'] == toolkit_name
    github.loc[row_mask, 'full_name'] = repo_name
    github.loc[row_mask, 'forks'] = res['forks_count']
    github.loc[row_mask, 'stars'] = res['stargazers_count']

In [25]:
github.sort_values(['stars'], ascending=False).head()

Unnamed: 0,toolkit,full_name,forks,stars
19,tensorflow,tensorflow/tensorflow,34355,69781
2,caffe,BVLC/caffe,12371,20155
9,keras,fchollet/keras,7067,19504
5,cntk,Microsoft/CNTK,3190,12366
12,mxnet,apache/incubator-mxnet,4179,11127


In [26]:
github.to_csv("../data/DL_toolkits_results_github.csv", index=False)

### stackoverflow stats

In [27]:
# Derive the tag spelling of every toolkit name by replacing spaces with
# hyphens, then fetch the counts for those tags through the gso helper.
tag_list = [name.replace(' ', '-') for name in toolkits]
tag_counts = gso.get_tag_counts(tag_list)

# Keep the 'name' and 'count' fields and give them this notebook's column names.
df_tags = pd.DataFrame(tag_counts)[['name', 'count']].rename(
    columns={'name': 'toolkit', 'count': 'so_tag_counts'})
df_tags

Unnamed: 0,toolkit,so_tag_counts
0,tensorflow,16462
1,keras,4098
2,theano,2329
3,caffe,2191
4,torch,774
5,dlib,421
6,cntk,301
7,tflearn,245
8,lasagne,227
9,pytorch,224


In [28]:
# Map every tag to the value gso.get_body_count returns for it.
body_counts = dict((tag, gso.get_body_count(tag)) for tag in tag_list)

In [29]:
# One row per tag: the dict keys become the index, which is then moved into
# an ordinary column before both columns are named.
df_questions = pd.DataFrame.from_dict(body_counts, orient='index').reset_index()
df_questions.columns = ['toolkit', 'so_question_count']

In [30]:
# Outer join so that toolkits present in only one of the two tables are kept.
so = pd.merge(df_tags, df_questions, on='toolkit', how='outer')

In [31]:
# Turn the hyphenated tags back into space-separated toolkit names, then
# preview the rows with the highest tag counts.
so['toolkit'] = so['toolkit'].map(lambda tag: str(tag).replace('-', ' '))
so.sort_values(['so_tag_counts'], ascending=False).head()

Unnamed: 0,toolkit,so_tag_counts,so_question_count
0,tensorflow,16462.0,17641
1,keras,4098.0,4529
2,theano,2329.0,2935
3,caffe,2191.0,2636
4,torch,774.0,1848


In [32]:
# "chainer" is a common word and is therefore over-counted in question
# bodies; the count for "chainer framework" is used instead.
chainer_rows = so['toolkit'] == 'chainer'
chainer_count = gso.get_body_count('chainer framework')
so.loc[chainer_rows, 'so_question_count'] = chainer_count

In [33]:
so.to_csv("../data/DL_toolkits_results_stackoverflow.csv", index=False)

### google search results stats

In [34]:
from googleapiclient.discovery import build

In [35]:
# Read the Google API key and the custom search engine id kept outside
# version control. Both are stripped so that a trailing newline in the file
# does not become part of the request parameters.
with open("../code/secrets/google_token.nogit", "r") as f:
    my_api_key = f.read().strip()

with open("../code/secrets/cse_token.nogit", "r") as f:
    my_cse_id = f.read().strip()

In [51]:
def _google_total_results(search_term, api_key, cse_id, date_restrict):
    """Return 'totalResults' of a "deep learning" custom search as an int.

    The search is restricted to pages containing *search_term* exactly
    (exactTerms) and to the period given by *date_restrict* (e.g. 'y5').
    """
    # requests builds and escapes the query string, so search_term is passed
    # with its spaces intact instead of being patched into the URL by hand.
    r = requests.get('https://www.googleapis.com/customsearch/v1',
                     params={'q': '"deep learning"',
                             'alt': 'json',
                             'cx': cse_id,
                             'c2coff': 1,
                             'dateRestrict': date_restrict,
                             'exactTerms': search_term,
                             'rc': 1,
                             'key': api_key})
    # Surface HTTP errors here instead of as a KeyError on the lookups below.
    r.raise_for_status()
    res = r.json()['queries']['request'][0]
    return int(res['totalResults'])


def google_search_results_count(search_term, api_key, cse_id):
    """Count "deep learning" search results mentioning *search_term*.

    Args:
        search_term: toolkit name; must appear exactly in the results.
        api_key: Google API key used for the request.
        cse_id: custom search engine id used for the request.

    Returns:
        {'toolkit': search_term, 'search_results': <int>} for the last
        five years (dateRestrict=y5).

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    total = _google_total_results(search_term, api_key, cse_id, 'y5')
    return {'toolkit': search_term, 'search_results': total}


def google_quarterly_growth_rate(search_term, api_key, cse_id):
    """Compare the last three months of search results with the three before.

    Args:
        search_term: toolkit name; must appear exactly in the results.
        api_key: Google API key used for the requests.
        cse_id: custom search engine id used for the requests.

    Returns:
        {'toolkit': search_term, 'growth_rate': <float>} where growth_rate is
        (current_quarter - last_quarter) / last_quarter. It is NaN when
        last_quarter is 0, because the rate is undefined in that case.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    # Count for the last 6 months (dateRestrict=m6).
    six_months = _google_total_results(search_term, api_key, cse_id, 'm6')
    # Count for the last 3 months (dateRestrict=m3).
    current_quarter = _google_total_results(search_term, api_key, cse_id, 'm3')

    # The previous quarter is whatever the 6-month window holds beyond the
    # most recent 3 months.
    last_quarter = six_months - current_quarter
    if last_quarter == 0:
        # Avoid ZeroDivisionError; the growth rate is undefined here.
        growth_rate = float('nan')
    else:
        growth_rate = (float(current_quarter)-float(last_quarter))/float(last_quarter)
    return {'toolkit': search_term, 'growth_rate': growth_rate}

In [47]:
# Ask Google for the result count of every toolkit, dropping None answers.
results = []
for toolkit_name in toolkits:
    count_row = google_search_results_count(toolkit_name, my_api_key, my_cse_id)
    if count_row is not None:
        results.append(count_row)

result_columns = ['toolkit', 'search_results']
resultsDF = pd.DataFrame(results)[result_columns]

In [49]:
resultsDF.sort_values(['search_results'], ascending=False).head()

Unnamed: 0,toolkit,search_results
19,tensorflow,98500
2,caffe,46300
9,keras,30800
21,theano,28400
12,mxnet,18400


In [38]:
resultsDF.sort_values(['search_results'], ascending=False).head()

Unnamed: 0,toolkit,search_results
22,torch,374000
19,tensorflow,178000
2,caffe,67800
9,keras,43900
18,sonnet,34000


In [52]:
# Compute the quarterly growth rate of every toolkit, dropping None answers.
growth_rate = []
for toolkit_name in toolkits:
    growth_row = google_quarterly_growth_rate(toolkit_name, my_api_key, my_cse_id)
    if growth_row is not None:
        growth_rate.append(growth_row)

growth_columns = ['toolkit', 'growth_rate']
growthDF = pd.DataFrame(growth_rate)[growth_columns]

In [53]:
growthDF.sort_values(['growth_rate'], ascending=False).head()

Unnamed: 0,toolkit,growth_rate
18,sonnet,2.724409
17,pytorch,2.36036
19,tensorflow,1.280576
5,cntk,1.23622
7,dlib,1.209486


In [54]:
# Join growth rates and result counts on the toolkit name.
googleDF = pd.merge(growthDF, resultsDF, on='toolkit', copy=False)

In [55]:
googleDF.to_csv("../data/DL_toolkits_results_google.csv", index=False)

## Combine all data

In [56]:
# Join the github, stack overflow and google tables on the toolkit name.
dltkDF = (github
          .merge(so, on='toolkit', copy=False)
          .merge(googleDF, on='toolkit', copy=False))

In [57]:
dltkDF.head()

Unnamed: 0,toolkit,full_name,forks,stars,so_tag_counts,so_question_count,growth_rate,search_results
0,apache singa,apache/incubator-singa,253,1036,,1,0.6,233
1,bigdl,intel-analytics/BigDL,401,1990,,3,0.791176,1490
2,caffe,BVLC/caffe,12371,20155,2191.0,2636,0.575758,46300
3,caffe2,caffe2/caffe2,1233,5628,26.0,34,1.08642,3690
4,chainer,chainer/chainer,772,2925,49.0,940,0.956522,6590


In [58]:
dltkDF.to_csv("../output/deep_learning_data.csv", index=False)