# How to identify "top R packages for Machine Learning"

1. Get a list of R packages for ML.
1. Use this list to collect usage metrics from Stack Overflow (SO) and GitHub.
1. Use the metrics to rank the packages in the list.

# Get a list of R packages for ML

The package list is taken from the ["CRAN Task View: Machine Learning & Statistical Learning"](https://cran.r-project.org/web/views/MachineLearning.html).

In [1]:
import requests
import re
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup

In [2]:
# CRAN Task View page that the ML package list is scraped from.
URL = 'https://cran.r-project.org/web/views/MachineLearning.html'

In [3]:
def get_package_list(url):
    """Scrape the package names listed on a CRAN Task View page.

    The page at ``url`` is downloaded, its first ``<h3>`` element is located,
    and the text of the element that follows it is split into lines, one
    entry per line. Any parenthesised alphabetic word preceded by a space
    (presumably the task view's "(core)" marker) is removed from each entry.

    Parameters
    ----------
    url : str
        Address of the task-view HTML page to scrape.

    Returns
    -------
    list
        The cleaned entries, in page order. Blank lines are skipped.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    ValueError
        If the page has no ``<h3>`` element or nothing follows it.
    """
    # Fetch the page that was asked for (the module-level URL constant must
    # not be used here, otherwise the parameter is silently ignored).
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'lxml')
    heading = soup.find('h3')
    listing = heading.find_next_sibling() if heading is not None else None
    if listing is None:
        raise ValueError("no package listing found after the first <h3> of %s" % url)

    # Raw string so the escaped parentheses reach the regex engine untouched;
    # [A-Za-z] instead of [A-z], which also matches punctuation such as '_' and '^'.
    annotation = re.compile(r" \([A-Za-z]+\)")

    entries = (annotation.sub("", line).strip() for line in listing.text.strip().split("\n"))
    return [entry for entry in entries if entry]

In [4]:
# Scrape the task view once; the cells below reuse this list.
package_list = get_package_list(URL)

In [5]:
# Sanity check: number of entries that were scraped.
len(package_list)

87

In [6]:
# Call form of print: valid on Python 3 and prints the same on Python 2.
print(package_list)

[u'ahaz', u'arules', u'BayesTree', u'biglasso', u'bigRR', u'bmrm', u'Boruta', u'bst', u'C50', u'caret', u'CORElearn', u'CoxBoost', u'Cubist', u'e1071', u'earth', u'elasticnet', u'ElemStatLearn', u'evclass', u'evtree', u'FCNN4R', u'frbs', u'GAMBoost', u'gamboostLSS', u'gbm', u'glmnet', u'glmpath', u'GMMBoost', u'gmum.r', u'grplasso', u'grpreg', u'h2o', u'hda', u'hdi', u'hdm', u'ipred', u'kernlab', u'klaR', u'lars', u'lasso2', u'LiblineaR', u'LogicForest', u'LogicReg', u'maptree', u'mboost', u'mlr', u'ncvreg', u'nnet', u'oblique.tree', u'OneR', u'pamr', u'party', u'partykit', u'penalized', u'penalizedLDA', u'penalizedSVM', u'quantregForest', u'randomForest', u'randomForestSRC', u'ranger', u'rattle', u'Rborist', u'rda', u'rdetools', u'REEMtree', u'relaxo', u'rgenoud', u'rgp', u'Rmalschains', u'rminer', u'rnn', u'ROCR', u'RoughSets', u'rpart', u'RPMM', u'RSNNS', u'RWeka', u'RXshrink', u'sda', u'SIS', u'spa', u'stabs', u'SuperLearner', u'svmpath', u'tgp', u'tree', u'varSelRF', u'vcrpart']


# Get usage statistics on the ML packages

In [7]:
from code import get_stack_overflow_data as gsod

In [9]:
# Collect tag information for the scraped package names.
# NOTE(review): get_tag_counts lives in the local helper module and is not
# shown here -- presumably it queries Stack Overflow over the network; confirm.
tag_counts = gsod.get_tag_counts(package_list)

In [10]:
# One get_body_count lookup per package name, keyed by that name
# (no tag restriction on this pass).
question_body_counts = {
    package: gsod.get_body_count(package)
    for package in package_list
}

In [14]:
# Same per-package lookup as before, this time passing tag='r' to the helper.
tag = 'r'
question_body_counts_r = {
    package: gsod.get_body_count(package, tag=tag)
    for package in package_list
}

In [30]:
def count_dict_to_pd(mydict, kind):
    """Turn a ``{name: count}`` mapping into a DataFrame sorted by count.

    Parameters
    ----------
    mydict : dict
        Maps each name to its count.
    kind : str
        Label written to the ``kind`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``count`` and ``kind``, ordered by ``count`` from
        highest to lowest. The index keeps each row's position in ``mydict``.
        An empty mapping yields an empty frame with the same three columns.
    """
    # Build the frame with flat column labels from the start. Assigning a
    # nested list ([['name', 'count']]) to df.columns produces a MultiIndex
    # on current pandas, after which the columns are no longer plain strings.
    df = pd.DataFrame(list(mydict.items()), columns=['name', 'count'])
    df['kind'] = kind
    # Return the sorted copy rather than sorting in place.
    return df.sort_values(by='count', ascending=False)

## SO Tag counts

In [33]:
# Tabulate the tag lookup results and preview the first rows.
# NOTE(review): the preview below contains tags that are not in package_list
# (e.g. binary-search-tree, single-page-application) and generic ones such as
# tree -- presumably tag synonyms or name collisions; filter before ranking.
df_tag = pd.DataFrame(tag_counts)

df_tag.head(20)

Unnamed: 0,count,has_synonyms,is_moderator_only,is_required,name
0,10530,True,False,False,tree
1,3219,True,False,False,binary-search-tree
2,2353,True,False,False,single-page-application
3,556,False,False,False,caret
4,261,False,False,False,h2o
5,226,False,False,False,rpart
6,192,False,False,False,glmnet
7,174,False,False,False,arules
8,132,False,False,False,gbm
9,115,False,False,False,party


## SO Question body counts

In [31]:
# Question-body counts (no tag filter) as a frame, highest count first.
# NOTE(review): names that are also ordinary words or substrings dominate the
# preview below (OneR, tree, party, earth) -- presumably the search is not
# matching the package name exactly; verify before using these counts.
df_body = count_dict_to_pd(question_body_counts, kind="in_question_body")

df_body.head(20)

Unnamed: 0,name,count,kind
22,OneR,2540219,in_question_body
62,tree,123297,in_question_body
80,party,74667,in_question_body
50,caret,12819,in_question_body
2,earth,10913,in_question_body
74,spa,6271,in_question_body
81,bst,4006,in_question_body
55,sda,2329,in_question_body
43,stabs,1533,in_question_body
76,randomForest,872,in_question_body


In [32]:
# Question-body counts from the tag='r' lookup as a frame, highest count first.
# NOTE(review): OneR still leads the preview below by a wide margin -- likely
# the same name-collision issue as in the unfiltered counts; confirm.
df_body_r = count_dict_to_pd(question_body_counts_r, kind="in_question_body_r")

df_body_r.head(20)

Unnamed: 0,name,count,kind
22,OneR,37911,in_question_body_r
62,tree,1881,in_question_body_r
50,caret,1064,in_question_body_r
76,randomForest,638,in_question_body_r
13,rpart,513,in_question_body_r
80,party,433,in_question_body_r
25,e1071,376,in_question_body_r
24,nnet,279,in_question_body_r
83,glmnet,278,in_question_body_r
36,arules,246,in_question_body_r


# Visualize this