In [1]:
from sklearn import preprocessing
import numpy as np
import pandas as pd
from collections import defaultdict

In [2]:
# Columns read from the scraped module data: the package name plus the six
# popularity metrics that feed the ranking.
fields = [
    'module',
    'stars',
    'forks',
    'tag_count',
    'question_count',
    'downloads',
    'growth_rates',
]
DF = pd.read_csv(filepath_or_buffer="../output/D3_modules_Data.csv", usecols=fields)

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
DF.head(n=5)

Unnamed: 0,module,stars,forks,tag_count,question_count,downloads,growth_rates
0,d3-hcg,9.0,3.0,0,1,1021.0,-0.166517
1,d3-table,4.0,1.0,0,1434,211.0,0.06444
2,mpld3,1490.0,259.0,85,127,,
3,d3-timelines,770.0,236.0,0,136,246.0,
4,d3-horizon-chart,38.0,7.0,0,14,2330.0,0.084458


In [4]:
# Split the packages by completeness: rows with at least one missing metric go
# to nanDF, and DF keeps only the rows where every value is present.
has_missing = pd.isnull(DF).any(axis=1)
nanDF = DF[has_missing]

DF = DF[~has_missing]

In [5]:
# Standardise every metric to mean 0 / std 1, using statistics computed from
# the complete rows (DF) only. The incomplete rows (nanDF) are scaled with the
# very same means and stds so both sets live on one common scale; missing
# values simply stay NaN.
metrics = ['stars','forks','tag_count','question_count','downloads','growth_rates']
metric_means = DF[metrics].mean()
metric_stds = DF[metrics].std()

scaled_DF = (DF[metrics] - metric_means) / metric_stds
scaled_NaN = (nanDF[metrics] - metric_means) / metric_stds

# Carry the package name along so the scores can be labelled later.
scaled_DF['module'] = DF['module']
scaled_NaN['module'] = nanDF['module']

In [6]:
# Collapse the six scaled metrics into three group scores, each the mean of
# its two metrics:
#   Github         = forks, stars
#   Stack Overflow = tag_count, question_count
#   Downloads      = downloads, growth_rates
# Overall is the sum of those three group scores; Rank orders packages by
# Overall, highest first.
github_score = scaled_DF[['forks', 'stars']].mean(axis=1)
stack_overflow_score = scaled_DF[['tag_count', 'question_count']].mean(axis=1)
downloads_score = scaled_DF[['downloads','growth_rates']].mean(axis=1)

final_DF = pd.DataFrame(
    {
        'Package': scaled_DF['module'],
        'Github': github_score,
        'Stack Overflow': stack_overflow_score,
        'Downloads': downloads_score,
    },
    columns=['Package', 'Rank', 'Overall', 'Github', 'Stack Overflow', 'Downloads'],
)
final_DF['Overall'] = final_DF[['Github','Stack Overflow','Downloads']].sum(axis=1)
final_DF['Rank'] = final_DF['Overall'].rank(ascending=False).astype(int)

In [7]:
# Combine the metrics of the incomplete packages (nanDF) so they can be added
# to the bottom of the ranked table.
# Each group score is the row-wise mean of its two scaled metrics; pandas'
# mean skips NaN by default, so a group with one missing metric falls back to
# the metric that is present, and a group with both missing stays NaN.
# No 'Rank' or 'Overall' is computed for these packages, so those columns end
# up NaN for them after concatenation.
final_nan_DF = pd.DataFrame(columns=['Package', 'Github', 'Stack Overflow', 'Downloads'])
final_nan_DF['Package'] = scaled_NaN['module']
final_nan_DF['Github'] = scaled_NaN[['forks', 'stars']].mean(axis=1)
final_nan_DF['Stack Overflow'] = scaled_NaN[['tag_count', 'question_count']].mean(axis=1)
final_nan_DF['Downloads'] = scaled_NaN[['downloads','growth_rates']].mean(axis=1)

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and, like append, keeps each row's original index label.
ranked_data = pd.concat([final_DF, final_nan_DF])

In [8]:
# Order packages from best to worst overall score. Rows whose 'Overall' is NaN
# (the packages with missing metrics) are placed last, which is pandas'
# default NaN position when sorting.
ranked_data = ranked_data.sort_values('Overall', ascending=False)

# Fix the column order for display/export. DataFrame.reindex_axis was removed
# in pandas 1.0; reindex(columns=...) is the supported equivalent.
ranked_data = ranked_data.reindex(
    columns=['Package','Rank','Overall','Github','Stack Overflow','Downloads']
)

In [9]:
# Render the ranking with a CSS rule that hides the row-index cells
# (the '.row_heading' and '.blank' selectors), leaving only the data columns.
hide_index_rule = {'selector': '.row_heading, .blank', 'props': [('display', 'none;')]}
ranked_data.style.set_table_styles([hide_index_rule])

Unnamed: 0,Package,Rank,Overall,Github,Stack Overflow,Downloads
120,d3-cloud,1.0,6.3851,7.01115,-0.267668,-0.358378
186,d3plus,2.0,5.51154,2.29387,3.35328,-0.135606
182,dagre-d3,3.0,5.48656,2.43727,3.58486,-0.535575
168,react-d3,4.0,4.46758,2.81276,1.96318,-0.308364
175,d3-scale,5.0,4.3885,1.3875,1.43168,1.56933
15,d3-shape,6.0,3.84412,2.21814,0.146789,1.47919
151,d3-queue,7.0,3.02291,2.06998,-0.225462,1.17839
70,d3-path,8.0,2.82459,-0.2831,1.75099,1.3567
22,d3-components,9.0,2.81364,2.19532,-0.150389,0.768708
21,d3.chart,10.0,2.77845,1.10594,2.45052,-0.778018


In [10]:
# Persist the ranked table; the row index is not written to the file.
ranked_data.to_csv(path_or_buf='../output/d3_modules_Rankings.csv', index=False)