# How to identify "top R packages for Machine Learning"

1. get a list of R packages for ML
1. use this list to get metrics from SO, github
1. use the metrics to rank the package list

# Load stuff

In [14]:
import pandas as pd

from code import utils as ut
from code import get_stack_overflow_data as gsod

# Get a list of R packages for ML

From the ["CRAN Task View: Machine Learning & Statistical Learning"](https://cran.r-project.org/web/views/MachineLearning.html)

In [2]:
package_list = ut.read_package_txt("package-list-from-cran-task-view.txt")

In [3]:
len(package_list)

87

Here is a list of R packages used for machine learning:

In [96]:
print package_list

['ahaz', 'arules', 'BayesTree', 'biglasso', 'bigRR', 'bmrm', 'Boruta', 'bst', 'C50', 'caret', 'CORElearn', 'CoxBoost', 'Cubist', 'e1071', 'earth', 'elasticnet', 'ElemStatLearn', 'evclass', 'evtree', 'FCNN4R', 'frbs', 'GAMBoost', 'gamboostLSS', 'gbm', 'glmnet', 'glmpath', 'GMMBoost', 'gmum.r', 'grplasso', 'grpreg', 'h2o', 'hda', 'hdi', 'hdm', 'ipred', 'kernlab', 'klaR', 'lars', 'lasso2', 'LiblineaR', 'LogicForest', 'LogicReg', 'maptree', 'mboost', 'mlr', 'ncvreg', 'nnet', 'oblique.tree', 'OneR', 'pamr', 'party', 'partykit', 'penalized', 'penalizedLDA', 'penalizedSVM', 'quantregForest', 'randomForest', 'randomForestSRC', 'ranger', 'rattle', 'Rborist', 'rda', 'rdetools', 'REEMtree', 'relaxo', 'rgenoud', 'rgp', 'Rmalschains', 'rminer', 'rnn', 'ROCR', 'RoughSets', 'rpart', 'RPMM', 'RSNNS', 'RWeka', 'RXshrink', 'sda', 'SIS', 'spa', 'stabs', 'SuperLearner', 'svmpath', 'tgp', 'tree', 'varSelRF', 'vcrpart']


# Get usage statistics on the ML packages

In [8]:
tag_counts = gsod.get_tag_counts(package_list)

In [9]:
question_body_counts = { item: gsod.get_body_count(item) for item in package_list}

In [29]:
tag = 'r'
question_body_counts_r = { item: gsod.get_body_count(item, tag=tag) for item in package_list}

In [10]:
def count_dict_to_pd(mydict, kind):
    df = pd.DataFrame.from_dict(mydict, orient='index')
    df.reset_index(inplace=True)
    df.columns = [['name', 'count']]
    df['kind'] = kind
    df.sort_values(by='count', inplace=True, ascending=False)
    return df

## TAG counts on SO

In [45]:
df_tag = pd.DataFrame(tag_counts)
# filter to remove synonyms, which SO seems to add to return
df_tag = df_tag[df_tag['name'].isin(package_list)]
df_tag['kind'] = 'tag'
df_tag['rank'] = df_tag['count'].rank(ascending=False)
df_tag = df_tag[['name', 'count', 'rank', 'kind']]

In [46]:
df_tag.head(20)

Unnamed: 0,name,count,rank,kind
0,tree,10536,1.0,tag
3,caret,559,2.0,tag
4,h2o,261,3.0,tag
5,rpart,226,4.0,tag
6,glmnet,193,5.0,tag
7,arules,175,6.0,tag
8,gbm,132,7.0,tag
9,party,115,8.0,tag
10,nnet,83,9.0,tag
12,kernlab,62,10.0,tag


## Question body counts on SO

In [122]:
df_body = count_dict_to_pd(question_body_counts, kind="body")
df_body['rank'] = df_body['count'].rank(ascending=False)

df_body.head(20)

Unnamed: 0,name,count,kind,rank
22,OneR,2541420,body,1.0
62,tree,123352,body,2.0
80,party,74708,body,3.0
50,caret,12829,body,4.0
2,earth,10916,body,5.0
74,spa,6275,body,6.0
81,bst,4009,body,7.0
55,sda,2331,body,8.0
43,stabs,1533,body,9.0
76,randomForest,873,body,10.0


## Question body + 'R' tag, on SO

In [123]:
df_body_r = count_dict_to_pd(question_body_counts_r, kind="body_r_tag")
df_body_r['rank'] = df_body_r['count'].rank(ascending=False)


df_body_r.head(20)

Unnamed: 0,name,count,kind,rank
22,OneR,37941,body_r_tag,1.0
62,tree,1883,body_r_tag,2.0
50,caret,1066,body_r_tag,3.0
76,randomForest,639,body_r_tag,4.0
13,rpart,513,body_r_tag,5.0
80,party,433,body_r_tag,6.0
25,e1071,376,body_r_tag,7.0
83,glmnet,280,body_r_tag,8.0
24,nnet,279,body_r_tag,9.0
36,arules,248,body_r_tag,10.0


## CONCAT

In [174]:
df_concat = pd.concat([df_tag, df_body_r, df_body])
df_concat['rank'] = df_concat['rank'].astype(object)

In [179]:
df_pivot = df_concat.pivot(index='name', columns='kind', values='rank')
#df_pivot.reset_index(inplace=True)
df_pivot.sort_values(by='body_r_tag', inplace=True)
df_pivot.reset_index(inplace=True)
df_pivot = df_pivot[['name', 'body_r_tag', 'tag', 'body']]

## Stack overflow rankings

Ranks based on 3 types of Stack searches are below. "OneR" has "none" for tag due to SO search API considering it a typo. Its "body" results are therefore also unreliable.

In [180]:
df_pivot.head(20)

kind,name,body_r_tag,tag,body
0,OneR,1.0,,1
1,tree,2.0,1.0,2
2,caret,3.0,2.0,4
3,randomForest,4.0,,10
4,rpart,5.0,4.0,14
5,party,6.0,8.0,3
6,e1071,7.0,,20
7,glmnet,8.0,5.0,22
8,nnet,9.0,9.0,19
9,arules,10.0,6.0,23


# Visualize this

In [None]:
from bokeh.io import output_notebook, show
from bokeh.charts import Bar
from bokeh.charts.attributes import cat

In [59]:
output_notebook()

In [82]:
df = df_merged.sort_values('count', ascending=False)

In [83]:
df = df.query("kind=='tag'")
df = df.head(20)

In [84]:
df

Unnamed: 0,name,count,kind
0,tree,10530,tag
1,binary-search-tree,3219,tag
2,single-page-application,2353,tag
3,caret,556,tag
4,h2o,261,tag
5,rpart,226,tag
6,glmnet,192,tag
7,arules,174,tag
8,gbm,132,tag
9,party,115,tag


In [85]:
p = Bar(df, label=cat('name', sort=False), values='count', group='kind',
        title="PKGS FTW!", legend='top_right')

show(p)