## Ranking Deep Learning Libraries

In [94]:
import requests
import logging
import pandas as pd
import numpy as np
import urllib2
import dill
import time
import json
from datetime import date, timedelta
from bs4 import BeautifulSoup
import unicodedata
import csv
import sys
import get_stack_overflow_data as gso
sys.path.append('../code')
sys.path.append('../data')

### Create list of libraries

### 1 https://svds.com/understanding-ai-toolkits/

In [95]:
r1 = requests.get('https://svds.com/understanding-ai-toolkits/')
soup1 = BeautifulSoup(r1.text, 'lxml')
toolkit_list1 = []
for litags in soup1.findAll('li', attrs={'class':None}):
    for atag in litags.find('a'):
        toolkit_list1.append(atag.encode('utf-8').lower())

In [96]:
toolkit_list1 = toolkit_list1[0:8]

In [97]:
toolkit_list1

['tensorflow',
 'mxnet',
 'deeplearning4j',
 'torch',
 'pytorch',
 'ctnk',
 'caffe',
 'theano']

### 2 https://en.wikipedia.org/wiki/Comparison_of_deep_learning_software

In [98]:
r2 = requests.get('https://en.wikipedia.org/wiki/Comparison_of_deep_learning_software')
soup2 = BeautifulSoup(r2.text, 'lxml')
toolkit_list2 = []

In [99]:
for tr in soup2.findAll('tr'):
    if tr.find('td'):
        toolkit_list2.append(tr.find('td').text.encode('utf-8').lower())

In [100]:
toolkit_list2

['apache singa',
 'bigdl',
 'caffe',
 'deeplearning4j',
 'dlib',
 'keras',
 'matconvnet',
 'microsoft cognitive toolkit',
 'mxnet',
 'neural designer',
 'opennn',
 'tensorflow',
 'theano',
 'torch',
 'wolfram mathematica']

### 3 https://www.packtpub.com/books/content/top-10-deep-learning-frameworks

In [101]:
#by hand, request 403 Forbidden 
toolkit_list3 = ['tensorflow','theano','keras','caffe','torch','deeplearning4j','mxnet','microsoft cognitive toolkit','lasagne','bigdl'] 

In [102]:
toolkit_list3

['tensorflow',
 'theano',
 'keras',
 'caffe',
 'torch',
 'deeplearning4j',
 'mxnet',
 'microsoft cognitive toolkit',
 'lasagne',
 'bigdl']

### 4 https://twitter.com/fchollet/status/882995652233371648

In [103]:
#by hand from jpgs
toolkit_list4 = ['tensorflow','keras','mxnet','caffe2','pytorch','caffe','paddle','cntk',
                'deeplearning4j','tflearn','dlib','theano','chainer','digits','dynet', 'sonnet']

In [104]:
toolkit_list4

['tensorflow',
 'keras',
 'mxnet',
 'caffe2',
 'pytorch',
 'caffe',
 'paddle',
 'cntk',
 'deeplearning4j',
 'tflearn',
 'dlib',
 'theano',
 'chainer',
 'digits',
 'dynet',
 'sonnet']

### 5 https://svds.com/wp-content/uploads/2017/02/Deep_learning_ratings_final-1024x563.png

In [105]:
# by hand from png
toolkit_list5 = ['theano', 'tensorflow', 'torch', 'caffe', 'mxnet', 'neon', 'cntk']

In [106]:
toolkit_list5

['theano', 'tensorflow', 'torch', 'caffe', 'mxnet', 'neon', 'cntk']

### combine and edit lists

In [107]:
toolkits = list(set(toolkit_list1 + toolkit_list2 + toolkit_list3 + toolkit_list4 + toolkit_list5))

In [108]:
print toolkits

['sonnet', 'torch', 'neon', 'neural designer', 'tflearn', 'opennn', 'deeplearning4j', 'keras', 'paddle', 'caffe', 'tensorflow', 'ctnk', 'bigdl', 'cntk', 'pytorch', 'matconvnet', 'dlib', 'digits', 'microsoft cognitive toolkit', 'mxnet', 'caffe2', 'wolfram mathematica', 'lasagne', 'chainer', 'dynet', 'theano', 'apache singa']


In [109]:
toolkits.remove('ctnk')
toolkits.remove('microsoft cognitive toolkit')
# ctnk is typo, microsoft cognitive toolkit is cntk

toolkits.remove('digits')
toolkits.append('nvidia digits')

toolkits.remove('neon')
toolkits.append('nervana neon')

toolkits.remove('paddle')
toolkits.append('paddlepaddle')

toolkits.remove('neural designer')
toolkits.remove('wolfram mathematica')
#neural designer and wolfram mathematica are proprietary with no github repositories 

In [110]:
with open("../data/dl_toolkits_final.txt", "w") as f:
    f.writelines(" ".join(toolkits))

In [111]:
toolkits.sort()
print toolkits

['apache singa', 'bigdl', 'caffe', 'caffe2', 'chainer', 'cntk', 'deeplearning4j', 'dlib', 'dynet', 'keras', 'lasagne', 'matconvnet', 'mxnet', 'nervana neon', 'nvidia digits', 'opennn', 'paddlepaddle', 'pytorch', 'sonnet', 'tensorflow', 'tflearn', 'theano', 'torch']


### get metrics

In [112]:
logging.basicConfig(level=logging.INFO)

### github stats

In [113]:
with open("../code/secrets/github-token.nogit", "rb") as f:
    token = f.read()

headers = {'Authorization': 'token %s' % token}

def get_data_from_search(query):
    """Use github search to return stars, forks for top query result"""
    
    r = requests.get('https://api.github.com/search/repositories?q='+\
                             query, 
                     headers=headers)
    r.raise_for_status()
    try:
        res = r.json()['items'][0]
        return {'toolkit': query, 'full_name': res['full_name'],
                'stars': res['stargazers_count'], 'forks': res['forks_count']}
    except:
        return None

In [114]:
data = [res for res in (get_data_from_search(q) for q in toolkits)
        if res is not None]

In [115]:
github = pd.DataFrame(data)[['toolkit', 'full_name', 'forks', 'stars']]

In [116]:
github

Unnamed: 0,toolkit,full_name,forks,stars
0,apache singa,apache/incubator-singa,252,1034
1,bigdl,intel-analytics/BigDL,397,1972
2,caffe,BVLC/caffe,12294,20035
3,caffe2,caffe2/caffe2,1223,5580
4,chainer,chainer/chainer,764,2898
5,cntk,Microsoft/CNTK,3160,12285
6,deeplearning4j,deeplearning4j/deeplearning4j,3538,7132
7,dlib,davisking/dlib,928,2678
8,dynet,clab/dynet,431,1692
9,keras,fchollet/keras,7003,19318


In [117]:
#github search returned wrong repo for nvidia digits, torch, paddlepaddle, and intel bigdl- so they are changed manually below
r = requests.get('https://api.github.com/repos/NVIDIA/DIGITS', headers=headers)
res = r.json()
github.loc[github['toolkit'] == 'nvidia digits', 'full_name'] = 'NVIDIA/DIGITS'
github.loc[github['toolkit'] == 'nvidia digits', 'forks'] = res['forks_count']
github.loc[github['toolkit'] == 'nvidia digits', 'stars'] = res['stargazers_count']

r = requests.get('https://api.github.com/repos/torch/torch7', headers=headers)
res = r.json()
github.loc[github['toolkit'] == 'torch', 'full_name'] = 'torch/torch7'
github.loc[github['toolkit'] == 'torch', 'forks'] = res['forks_count']
github.loc[github['toolkit'] == 'torch', 'stars'] = res['stargazers_count']

r = requests.get('https://api.github.com/repos/PaddlePaddle/Paddle', headers=headers)
res = r.json()
github.loc[github['toolkit'] == 'paddlepaddle', 'full_name'] = 'PaddlePaddle/Paddle'
github.loc[github['toolkit'] == 'paddlepaddle', 'forks'] = res['forks_count']
github.loc[github['toolkit'] == 'paddlepaddle', 'stars'] = res['stargazers_count']

In [118]:
github

Unnamed: 0,toolkit,full_name,forks,stars
0,apache singa,apache/incubator-singa,252,1034
1,bigdl,intel-analytics/BigDL,397,1972
2,caffe,BVLC/caffe,12294,20035
3,caffe2,caffe2/caffe2,1223,5580
4,chainer,chainer/chainer,764,2898
5,cntk,Microsoft/CNTK,3160,12285
6,deeplearning4j,deeplearning4j/deeplearning4j,3538,7132
7,dlib,davisking/dlib,928,2678
8,dynet,clab/dynet,431,1692
9,keras,fchollet/keras,7003,19318


In [125]:
github.to_csv("../data/DL_toolkits_results_github.csv", index=False)

### stackoverflow stats

In [126]:
tag_list = [toolkit.replace(' ','-') for toolkit in toolkits]
tag_counts = gso.get_tag_counts(tag_list)
df_tags = pd.DataFrame(tag_counts)[['name', 'count']]
df_tags.columns = ['toolkit', 'so_tag_counts']
df_tags

Unnamed: 0,toolkit,so_tag_counts
0,tensorflow,16195
1,keras,4023
2,theano,2316
3,caffe,2179
4,torch,776
5,dlib,413
6,cntk,295
7,tflearn,240
8,lasagne,225
9,pytorch,214


In [127]:
body_counts = {toolkit: gso.get_body_count(toolkit) for toolkit in tag_list}

In [128]:
df_questions = pd.DataFrame.from_dict(body_counts, orient='index')
df_questions.reset_index(inplace=True)
df_questions.columns = ['toolkit', 'so_question_count']

In [129]:
so = df_tags.merge(df_questions, on='toolkit', how='outer')

In [130]:
so['toolkit'] = so['toolkit'].apply(lambda x: str(x).replace('-',' '))
so

Unnamed: 0,toolkit,so_tag_counts,so_question_count
0,tensorflow,16195.0,17343
1,keras,4023.0,4443
2,theano,2316.0,2920
3,caffe,2179.0,2623
4,torch,776.0,1845
5,dlib,413.0,614
6,cntk,295.0,310
7,tflearn,240.0,366
8,lasagne,225.0,351
9,pytorch,214.0,238


In [131]:
##chainer is over counted in questions because chainer is a common word, opting for using chainer framework instead
so.loc[so['toolkit'] == 'chainer', 'so_question_count'] = gso.get_body_count('chainer framework')

In [132]:
so.to_csv("../data/DL_toolkits_results_stackoverflow.csv", index=False)

### google search results stats

In [133]:
from googleapiclient.discovery import build

In [134]:
with open("../code/secrets/google_token.nogit", "rb") as f:
    my_api_key = f.read()
    
with open("../code/secrets/cse_token.nogit", "rb") as f:
    my_cse_id = f.read()

In [135]:
def google_search_results_count(search_term, api_key, cse_id):
    toolkit = search_term
    search_term = search_term.replace(' ','+')
    r= requests.get('https://www.googleapis.com/customsearch/v1?q="'+search_term+'"+"deep+learning"+&alt=json&cx='+my_cse_id+
                    '&rc=1&key='+my_api_key)
    res = r.json()['searchInformation']
    return {'toolkit': toolkit, 'search_term': "'" + search_term + "' + 'deep learning'", 'search_results': res['totalResults']}

In [136]:
data = [res for res in (google_search_results_count(q, my_api_key, my_cse_id) for q in toolkits)
        if res is not None]

In [137]:
google = pd.DataFrame(data)[['toolkit', 'search_term', 'search_results']]

In [138]:
google

Unnamed: 0,toolkit,search_term,search_results
0,apache singa,'apache+singa' + 'deep learning',278
1,bigdl,'bigdl' + 'deep learning',1520
2,caffe,'caffe' + 'deep learning',49000
3,caffe2,'caffe2' + 'deep learning',4370
4,chainer,'chainer' + 'deep learning',7560
5,cntk,'cntk' + 'deep learning',8580
6,deeplearning4j,'deeplearning4j' + 'deep learning',9480
7,dlib,'dlib' + 'deep learning',2760
8,dynet,'dynet' + 'deep learning',843
9,keras,'keras' + 'deep learning',35700


In [139]:
#google search for apache signa is recalculated using only signa
res = google_search_results_count('signa', my_api_key, my_cse_id)
google.loc[google['toolkit'] == 'apache singa', 'search_term'] = res['search_term']
google.loc[google['toolkit'] == 'apache singa', 'search_results'] = res['search_results']

In [140]:
google

Unnamed: 0,toolkit,search_term,search_results
0,apache singa,'signa' + 'deep learning',2910
1,bigdl,'bigdl' + 'deep learning',1520
2,caffe,'caffe' + 'deep learning',49000
3,caffe2,'caffe2' + 'deep learning',4370
4,chainer,'chainer' + 'deep learning',7560
5,cntk,'cntk' + 'deep learning',8580
6,deeplearning4j,'deeplearning4j' + 'deep learning',9480
7,dlib,'dlib' + 'deep learning',2760
8,dynet,'dynet' + 'deep learning',843
9,keras,'keras' + 'deep learning',35700


In [141]:
google.to_csv("../data/DL_toolkits_results_google.csv", index=False)

## Combine all data

In [142]:
dltkDF = github.merge(so, on='toolkit', copy = False)
dltkDF = dltkDF.merge(google, on='toolkit', copy = False)

In [145]:
dltkDF.sort_values('forks')

Unnamed: 0,toolkit,full_name,forks,stars,so_tag_counts,so_question_count,search_term,search_results
15,opennn,Artelnics/OpenNN,135,363,1.0,13,'opennn' + 'deep learning',636
0,apache singa,apache/incubator-singa,252,1034,,1,'signa' + 'deep learning',2910
1,bigdl,intel-analytics/BigDL,397,1972,,3,'bigdl' + 'deep learning',1520
8,dynet,clab/dynet,431,1692,,2,'dynet' + 'deep learning',843
11,matconvnet,vlfeat/matconvnet,573,825,58.0,99,'matconvnet' + 'deep learning',3120
18,sonnet,deepmind/sonnet,645,5267,1.0,46,'sonnet' + 'deep learning',2340
13,nervana neon,NervanaSystems/neon,702,3201,3.0,4,'nervana+neon' + 'deep learning',239
4,chainer,chainer/chainer,764,2898,47.0,938,'chainer' + 'deep learning',7560
10,lasagne,Lasagne/Lasagne,902,3218,225.0,351,'lasagne' + 'deep learning',3780
14,nvidia digits,NVIDIA/DIGITS,907,2455,69.0,91,'nvidia+digits' + 'deep learning',2540


In [144]:
dltkDF.to_csv("../output/deep_learning_data.csv", index=False)