## Ranking Deep Learning and Artificial Intelligence Libraries

In [None]:
import requests
from retrying import retry
import logging
import pandas as pd
import numpy as np
import urllib2
import dill
import time
import json
from datetime import date, timedelta
from bs4 import BeautifulSoup
import unicodedata
import csv
import sys
import get_stack_overflow_data as gso
sys.path.append('../code')
sys.path.append('../data')

#function that adds a delay before running a function, to be used as a decorator.
def sleeper(secs):
    """Return a decorator that sleeps ``secs`` seconds before every call.

    The decorated function is invoked with the caller's positional and
    keyword arguments unchanged, and its return value is passed through.
    """
    import functools  # local import so the file's import cell does not need to change

    def realsleeper(func):
        # wraps() copies __name__/__doc__ from func so the decorated function
        # still identifies itself correctly (e.g. in tracebacks and help()).
        @functools.wraps(func)
        def wrapper(*args,**kwargs):
            time.sleep(secs)
            return func(*args,**kwargs)
        return wrapper
    return realsleeper

### Create list of libraries

### 1 https://svds.com/understanding-ai-toolkits/

In [None]:
# Scrape the toolkit names from the article: for every <li> without a class
# attribute, collect the lower-cased, utf-8 encoded children of its first <a>.
r1 = requests.get('https://svds.com/understanding-ai-toolkits/')
soup1 = BeautifulSoup(r1.text, 'lxml')
toolkit_list1 = []
for litags in soup1.findAll('li', attrs={'class':None}):
    anchor = litags.find('a')
    if anchor is None:
        # find() returns None when the <li> holds no link; iterating None
        # would raise TypeError and abort the whole scrape.
        continue
    for atag in anchor:
        toolkit_list1.append(atag.encode('utf-8').lower())

In [None]:
toolkit_list1 = toolkit_list1[0:8]

In [None]:
toolkit_list1

### 2 https://en.wikipedia.org/wiki/Comparison_of_deep_learning_software

In [None]:
r2 = requests.get('https://en.wikipedia.org/wiki/Comparison_of_deep_learning_software')
soup2 = BeautifulSoup(r2.text, 'lxml')
toolkit_list2 = []

In [None]:
# The first <td> of each table row holds the software name; rows without any
# <td> (e.g. header rows made of <th>) are skipped.
for tr in soup2.findAll('tr'):
    first_cell = tr.find('td')
    if first_cell:
        name = first_cell.text.encode('utf-8').lower()
        toolkit_list2.append(name)

In [None]:
toolkit_list2

### 3 https://www.packtpub.com/books/content/top-10-deep-learning-frameworks

In [None]:
#by hand, request 403 Forbidden 
toolkit_list3 = ['tensorflow','theano','keras','caffe','torch','deeplearning4j','mxnet','microsoft cognitive toolkit','lasagne','bigdl'] 

In [None]:
toolkit_list3

### 4 https://twitter.com/fchollet/status/882995652233371648

In [None]:
#by hand from jpgs
toolkit_list4 = ['tensorflow','keras','mxnet','caffe2','pytorch','caffe','paddle','cntk',
                'deeplearning4j','tflearn','dlib','theano','chainer','digits','dynet', 'sonnet']

In [None]:
toolkit_list4

### 5 https://svds.com/wp-content/uploads/2017/02/Deep_learning_ratings_final-1024x563.png

In [None]:
# by hand from png
toolkit_list5 = ['theano', 'tensorflow', 'torch', 'caffe', 'mxnet', 'neon', 'cntk']

In [None]:
toolkit_list5

### 6 https://www.quora.com/Which-are-the-artificial-intelligence-libraries-in-Python

In [None]:
# by hand
toolkit_list6 = ['tensorflow','scikit-learn','theano','nupic','pybrain','pyevolve','pattern','caffe','shogun','mlpy','scikit-image','opencv','nltk','pynlpl','pymc','pgmpy','libpgm','deeppy','nolearn','hebel']

### 7 https://wiki.python.org/moin/PythonForArtificialIntelligence

In [None]:
toolkit_list7 = ['aima','pydatalog','simpleai','easyai','graphlab','featureforge','orange','pybrain','pyml','mlpy','milk','scikit-learn','shogun','mdp-toolkit','libsvm','weka','monte','som','yalign','nltk','gensim','quepy','neurolab','ffnet','fann','pyann','pyrenn']

### combine and edit lists

In [None]:
# Concatenate the source lists and de-duplicate them through a set (the
# resulting list order is therefore arbitrary).
# NOTE(review): toolkit_list7 is defined above but is not part of this union -
# confirm whether leaving it out is intentional.
toolkits = list(set(toolkit_list1 + toolkit_list2 + toolkit_list3 + toolkit_list4 + toolkit_list5 + toolkit_list6))

In [None]:
print sorted(toolkits)

In [None]:
# Manual clean-up of the combined list, applied in order. Each entry is
# (name to remove, replacement name or None when the entry is simply dropped).
_manual_edits = [
    # ctnk is typo, microsoft cognitive toolkit is cntk
    ('ctnk', None),
    ('microsoft cognitive toolkit', None),
    # use the more specific names for these three
    ('digits', 'nvidia digits'),
    ('neon', 'nervana neon'),
    ('paddle', 'paddlepaddle'),
    #neural designer and wolfram mathematica are proprietary with no github repositories
    ('neural designer', None),
    ('wolfram mathematica', None),
    #mxnet is already in list as apache mxnet
    ('mxnet', None),
]

for _old_name, _new_name in _manual_edits:
    # list.remove raises ValueError when the name is not in the list
    toolkits.remove(_old_name)
    if _new_name is not None:
        toolkits.append(_new_name)

In [None]:
# Save the final toolkit names as one space-separated line.
# NOTE(review): some names contain spaces themselves (e.g. 'nvidia digits'),
# so the file cannot be split back into names unambiguously - confirm how it
# is consumed.
with open("../data/dlai_toolkits_final.txt", "w") as f:
    toolkits_line = " ".join(toolkits)
    f.write(toolkits_line)

In [None]:
toolkits.sort()
print toolkits

### get metrics

In [None]:
logging.basicConfig(level=logging.INFO)

### github stats

In [None]:
# Read the GitHub access token from the untracked secrets file. The content
# is stripped because a trailing newline (which most editors add) would
# otherwise become part of the Authorization header value.
with open("../code/secrets/github-token.nogit", "rb") as f:
    token = f.read().strip()

# Header sent with every GitHub API request made in this notebook.
headers = {'Authorization': 'token %s' % token}

#github will temporarily block requests from a user that makes more than 30 requests within a 60 second period.
#To be safe, use a 3 second pause between requests to limit rate to 20 requests per minute.
@sleeper(3)
#in case the request raises, make at most 3 attempts with exponential backoff
#(multiplier 3000 ms, capped at 12000 ms).
@retry(wait_exponential_multiplier=3000,wait_exponential_max=12000,stop_max_attempt_number=3)
def get_data_from_search_helper(query):
    """Run a GitHub repository search for ``query`` and return the response.

    The query is passed through ``params`` so that requests URL-encodes it;
    concatenating it into the URL by hand breaks for terms that contain
    characters such as '&', '#' or '+'.

    Returns the ``requests`` response object unchanged. HTTP error statuses
    are not raised here, so the retry decorator only reacts to exceptions
    raised by the request itself (e.g. connection errors).
    """
    r = requests.get('https://api.github.com/search/repositories',
                     params={'q': query},
                     headers=headers)
    return r

def get_data_from_search(query):
    """Use github search to return stars, forks for top query result

    Returns a dict with the keys 'toolkit' (the query itself), 'full_name',
    'stars' and 'forks' taken from the first search hit, or None when the
    response cannot be used (body is not JSON, has no 'items', has no hits,
    or a hit lacks the expected fields).
    """

    r = get_data_from_search_helper(query)
    # HTTP error statuses are deliberately not raised: an error payload has no
    # usable 'items' entry and therefore ends up in the None branch below.
    try:
        res = r.json()['items'][0]
        return {'toolkit': query, 'full_name': res['full_name'],
                'stars': int(res['stargazers_count']), 'forks': int(res['forks_count'])}
    except (KeyError, IndexError, TypeError, ValueError):
        # Only data-shape problems are swallowed; a bare except would also
        # hide KeyboardInterrupt/SystemExit while the long scrape is running.
        return None

In [None]:
# Query GitHub once per toolkit and keep only the searches that produced a
# usable top result.
data = []
for q in toolkits:
    res = get_data_from_search(q)
    if res is not None:
        data.append(res)

In [None]:
github = pd.DataFrame(data)[['toolkit', 'full_name', 'forks', 'stars']]

In [None]:
github

In [None]:
#github search returned wrong repo for mxnet, nvidia digits, torch, paddlepaddle, and intel bigdl- so they are changed manually below
# Each entry is (toolkit name as stored in the frame, correct repository).
# NOTE(review): intel bigdl is mentioned above but has no entry here - confirm.
_repo_overrides = [
    ('apache mxnet', 'apache/incubator-mxnet'),
    ('nvidia digits', 'NVIDIA/DIGITS'),
    ('torch', 'torch/torch7'),
    ('paddlepaddle', 'PaddlePaddle/Paddle'),
]

for _toolkit, _full_name in _repo_overrides:
    # Fetch the repository record directly instead of going through search.
    r = requests.get('https://api.github.com/repos/' + _full_name, headers=headers)
    res = r.json()
    _rows = github['toolkit'] == _toolkit
    github.loc[_rows, 'full_name'] = _full_name
    github.loc[_rows, 'forks'] = res['forks_count']
    github.loc[_rows, 'stars'] = res['stargazers_count']

In [None]:
github.sort_values(['stars'], ascending=False).head()

In [None]:
github.to_csv("../data/DLAI_toolkits_results_github.csv", index=False)

### stackoverflow stats

In [None]:
tag_list = [toolkit.replace(' ','-') for toolkit in toolkits]
#tag_counts = gso.get_tag_counts(tag_list)

#function for getting stack overflow tag counts
#pause 3 seconds before every call, in an attempt to avoid throttling
@sleeper(3)
#in case the call raises, make at most 3 attempts with exponential backoff
#(multiplier 60000 ms, capped at 240000 ms).
@retry(wait_exponential_multiplier=60000,wait_exponential_max=240000,stop_max_attempt_number=3)
def tag_counts_builder_helper(list_to_build,list_entry):
    """Extend ``list_to_build`` in place with the tag counts for ``list_entry``.

    The counts come from ``gso.get_tag_counts``, which is called with a
    one-element list containing ``list_entry``. Nothing is returned.
    """
    counts_for_entry = gso.get_tag_counts([list_entry])
    list_to_build.extend(counts_for_entry)

#build list of tag counts
tag_counts = []

for x in tag_list:
    tag_counts_builder_helper(tag_counts,x)

df_tags = pd.DataFrame(tag_counts)[['name', 'count']]
df_tags.columns = ['toolkit', 'so_tag_counts']
df_tags

In [None]:
#pause 3 seconds before every call, in an attempt to avoid throttling
@sleeper(3)
#in case the call raises, make at most 3 attempts with exponential backoff
#(multiplier 60000 ms, capped at 240000 ms).
@retry(wait_exponential_multiplier=60000,wait_exponential_max=240000,stop_max_attempt_number=3)
#function used in building a dict each of whose values is the body count of the corresponding key
def body_counts_builder_helper(dict_to_build,key):
    """Store the body count for ``key`` under ``dict_to_build[key]``.

    The count is whatever ``gso.get_body_count`` returns when called with a
    one-element list containing ``key``. Nothing is returned.
    """
    count_for_key = gso.get_body_count([key])
    dict_to_build[key] = count_for_key

#build dict of body counts
body_counts = {}
i = 0

# i is a 1-based progress counter, printed before each (slow) lookup
for i, x in enumerate(tag_list, 1):
    print(i)
    body_counts_builder_helper(body_counts,x)

In [None]:
# One row per tag: the dict keys become the index, which is then moved into a
# regular column so both columns can be named.
df_questions = pd.DataFrame.from_dict(body_counts, orient='index').reset_index()
df_questions.columns = ['toolkit', 'so_question_count']

In [None]:
so = df_tags.merge(df_questions, on='toolkit', how='outer')

In [None]:
# Turn the Stack Overflow tags back into the toolkit names they were built
# from. Blindly replacing every '-' with ' ' also rewrites names that really
# contain a hyphen (e.g. 'scikit-learn' -> 'scikit learn'), which then no
# longer match the 'toolkit' values of the other data sets when merging.
# The tag -> name lookup restores the exact original; the plain replacement
# is kept only as a fallback for values that are not in the lookup.
tag_to_toolkit = dict((toolkit.replace(' ', '-'), toolkit) for toolkit in toolkits)
so['toolkit'] = so['toolkit'].apply(lambda x: tag_to_toolkit.get(x, str(x).replace('-',' ')))
so.sort_values(['so_tag_counts'], ascending=False).head()

In [None]:
##chainer is over counted in questions because chainer is a common word, opting for using chainer framework instead
# NOTE(review): get_body_count receives a plain string here, while
# body_counts_builder_helper passes a one-element list - confirm that gso
# accepts both forms.
so.loc[so['toolkit'] == 'chainer', 'so_question_count'] = gso.get_body_count('chainer framework')

##likewise with pattern
so.loc[so['toolkit'] == 'pattern', 'so_question_count'] = gso.get_body_count('pattern web mining')


In [None]:
so.to_csv("../data/DLAI_toolkits_results_stackoverflow.csv", index=False)

### google search results stats

In [None]:
from googleapiclient.discovery import build

In [None]:
# Both values are concatenated into request URLs further down, so they are
# stripped: a trailing newline in either file would otherwise end up inside
# the URL.
with open("../code/secrets/google_token.nogit", "rb") as f:
    my_api_key = f.read().strip()

with open("../code/secrets/cse_token.nogit", "rb") as f:
    my_cse_id = f.read().strip()

In [None]:
def search_term_modifier(search_term):
    """Return ``search_term`` in the form used inside the search URL.

    Every space is replaced with '+', indicating to search for all words.
    The single term 'pattern' is special-cased to 'pattern+web+mining':
    pattern is a word that appears commonly in deep learning and ai pages
    unrelated to the library called pattern, so the search is made more
    specific with the term web mining, which is expected to appear in most
    pages relevant to the library. This is not a perfect fix, as unrelated
    pages can still match. Other common words such as shogun and sonnet do
    not need this, since the custom search already requires one of the terms
    "deep learning", "artificial intelligence" or "machine learning".
    """
    plus_joined = '+'.join(search_term.split(' '))
    if plus_joined == 'pattern':
        return 'pattern+web+mining'
    return plus_joined

In [None]:
#function for getting number of google search results
def google_search_results_count(search_term, api_key, cse_id):
    """Return the Google result count for ``search_term`` over the last 5 years.

    search_term -- toolkit name; it is passed through search_term_modifier
                   and used as the exactTerms restriction of the query.
    api_key     -- Google API key sent as the ``key`` parameter.
    cse_id      -- custom search engine id sent as the ``cx`` parameter.

    Returns {'toolkit': <original search_term>, 'search_results': <int>}.
    Raises (KeyError, ValueError, ...) when the response does not have the
    expected JSON shape.
    """
    toolkit = search_term
    search_term = search_term_modifier(search_term)
    # Fixes relative to the earlier version: the api_key/cse_id arguments are
    # used instead of the module-level globals, and the misspelt
    # "artifical intelligence" phrase in the query is corrected.
    r= requests.get('https://www.googleapis.com/customsearch/v1?q="deep+learning"+OR+"artificial+intelligence"+OR+"machine learning"&alt=json&cx='+
                    cse_id+'&c2coff=1&dateRestrict=y5&exactTerms='+search_term+'&rc=1&key='+api_key)
    res = r.json()['queries']['request'][0]
    return {'toolkit': toolkit, 'search_results': int(res['totalResults'])}

#helper: result count Google reports for an already-modified search term within one dateRestrict window
def _google_total_results(search_term, date_restrict, api_key, cse_id):
    r= requests.get('https://www.googleapis.com/customsearch/v1?q="deep+learning"+OR+"artificial+intelligence"+OR+"machine learning"&alt=json&cx='+
                    cse_id+'&c2coff=1&dateRestrict='+date_restrict+'&exactTerms='+search_term+'&rc=1&key='+api_key)
    res = r.json()['queries']['request'][0]
    return int(res['totalResults'])

#function for getting growth rate of google search results
def google_quarterly_growth_rate(search_term, api_key, cse_id):
    """Return the quarter-over-quarter growth of Google results for ``search_term``.

    search_term -- toolkit name; it is passed through search_term_modifier
                   and used as the exactTerms restriction of the query.
    api_key     -- Google API key sent as the ``key`` parameter.
    cse_id      -- custom search engine id sent as the ``cx`` parameter.

    Two counts are fetched: the last 6 months (dateRestrict=m6) and the last
    3 months (dateRestrict=m3). The previous quarter is their difference and
    the growth rate is (current - previous) / previous, or NaN when the
    previous quarter has no results.

    Returns {'toolkit': <original search_term>, 'growth_rate': <float>}.

    Fixes relative to the earlier version: the api_key/cse_id arguments are
    used instead of the module-level globals, and the misspelt
    "artifical intelligence" phrase in the query is corrected.
    """
    toolkit = search_term
    search_term = search_term_modifier(search_term)
    ##get count for last 6 months--- dateRestrict=m6
    six_months = _google_total_results(search_term, 'm6', api_key, cse_id)
    ##get count for last 3 months--- dateRestrict=m3
    current_quarter = _google_total_results(search_term, 'm3', api_key, cse_id)

    last_quarter = six_months - current_quarter
    if (last_quarter == 0):#for handling the divide by 0 case
        growth_rate = float('NaN')
    else:
        growth_rate = (float(current_quarter)-float(last_quarter))/float(last_quarter)
    return {'toolkit': toolkit, 'growth_rate': growth_rate}

In [None]:
#pause 2 seconds before every call, in an attempt to avoid throttling
@sleeper(2)
#in case the call raises, make at most 3 attempts with exponential backoff
#(multiplier 2000 ms, capped at 8000 ms).
@retry(wait_exponential_multiplier=2000,wait_exponential_max=8000,stop_max_attempt_number=3)
#function used in building a list each of whose values is the google search results count
def google_results_builder_helper(list_to_build,query):
    """Append the Google result-count record for ``query`` to ``list_to_build``.

    The record comes from google_search_results_count, called with the
    module-level my_api_key and my_cse_id; a None record is not appended.
    """
    res = google_search_results_count(query, my_api_key, my_cse_id)
    if res is None:
        return
    list_to_build.append(res)
        
results = []
for q in toolkits:
    google_results_builder_helper(results,q)
    
resultsDF = pd.DataFrame(results)[['toolkit', 'search_results']]

In [None]:
resultsDF.sort_values(['search_results'], ascending=False).head()

In [None]:
resultsDF.sort_values(['search_results'], ascending=False).head()

In [None]:
@sleeper(8)#pause 8 seconds before every call, in an attempt to avoid throttling
@retry(wait_exponential_multiplier=8000,wait_exponential_max=32000,stop_max_attempt_number=3)#in case the call raises, make at most 3 attempts with exponential backoff (multiplier 8000 ms, capped at 32000 ms).
#function used in building a list each of whose values is the google quarterly growth rate
def growth_rate_builder_helper(list_to_build,query):
    """Append the growth-rate record for ``query`` to ``list_to_build``.

    The record comes from google_quarterly_growth_rate, called with the
    module-level my_api_key and my_cse_id; a None record is not appended.
    """
    res = google_quarterly_growth_rate(query, my_api_key, my_cse_id)
    if res is not None:
        list_to_build.append(res)

#build list of growth rates
growth_rate = []
i = 0
for q in toolkits:
    # progress output: the toolkit name followed by its 0-based position
    print(q)
    print(i)
    i += 1
    growth_rate_builder_helper(growth_rate,q)

# keep just the two columns of interest, in a fixed order
growth_columns = ['toolkit', 'growth_rate']
growthDF = pd.DataFrame(growth_rate)[growth_columns]

In [None]:
growthDF.sort_values(['growth_rate'], ascending=False).head()

In [None]:
googleDF = growthDF.merge(resultsDF, on='toolkit', copy = False)

In [None]:
googleDF.to_csv("../data/DLAI_toolkits_results_google.csv", index=False)

## Combine all data

In [None]:
# Join the three data sets on the toolkit name. merge() defaults to an inner
# join, so only toolkits present in all three frames are kept.
dltkDF = (github
          .merge(so, on='toolkit', copy = False)
          .merge(googleDF, on='toolkit', copy = False))

In [None]:
dltkDF.head()

In [None]:
dltkDF.to_csv("../output/deep_learning_and_ai_data.csv", index=False)