In [2]:
from sklearn import preprocessing
import numpy as np
import pandas as pd
from collections import defaultdict

In [6]:
#read only the columns named in `fields` from the raw JS visualization data
fields = [
    'package',
    'forks',
    'stars',
    'so_tag_counts',
    'so_question_count',
    'downloads',
    'growth_rates',
]
DF = pd.read_csv('../output/JS_VIZ_data.csv', usecols=fields)

In [None]:
#preview the rows with the highest Stack Overflow question counts
#(the column loaded from the CSV is 'so_question_count'; any other spelling raises KeyError)
DF.sort_values(['so_question_count'], ascending=False).head()

In [7]:
#split off every package that has at least one missing value into nanDF ...
has_missing = DF.isnull().any(axis=1)
nanDF = DF[has_missing]

#... and keep only the fully populated rows in DF (modified in place)
DF.dropna(axis='index', how='any', inplace=True)

In [9]:
#scale the values, such that for each metric: mean = 0 and std = 1
#use the same scaling for the nanDF dataset
metrics = ['forks','stars','so_tag_counts','so_question_count','downloads','growth_rates']

#scaling parameters are taken from DF only; nanDF is scaled with those same
#parameters so both frames end up on one common scale
metric_means = DF[metrics].mean()
metric_stds = DF[metrics].std()

#vectorized z-score: DataFrame/Series arithmetic aligns the Series index with the
#column labels, so every metric is shifted and divided by its own mean and std
scaled_DF = (DF[metrics] - metric_means) / metric_stds
scaled_NaN = (nanDF[metrics] - metric_means) / metric_stds

#carry the package names along (assignment aligns on the row index)
scaled_DF['package'] = DF['package']
scaled_NaN['package'] = nanDF['package']

In [10]:
#collapse the six scaled metrics into three category scores, each the row-wise mean of a pair:
#  Github         <- forks, stars
#  Stack Overflow <- so_tag_counts, so_question_count
#  Downloads      <- downloads, growth_rates
category_names = ['Github', 'Stack Overflow', 'Downloads']
category_scores = pd.DataFrame(
    {
        'Github': scaled_DF[['forks', 'stars']].mean(axis=1),
        'Stack Overflow': scaled_DF[['so_tag_counts', 'so_question_count']].mean(axis=1),
        'Downloads': scaled_DF[['downloads', 'growth_rates']].mean(axis=1),
    },
    columns=category_names,
)

#Overall is the sum of the three category scores; Rank 1 goes to the highest Overall
overall_score = category_scores.sum(axis=1)

final_DF = pd.DataFrame(
    {
        'Package': scaled_DF['package'],
        'Rank': overall_score.rank(ascending=False).astype(int),
        'Overall': overall_score,
        'Github': category_scores['Github'],
        'Stack Overflow': category_scores['Stack Overflow'],
        'Downloads': category_scores['Downloads'],
    },
    columns=['Package', 'Rank', 'Overall', 'Github', 'Stack Overflow', 'Downloads'],
)

In [11]:
#combine nanDF metrics to be added to the bottom of the DF
#mean(axis=1) skips NaN by default, so each category score is built from whichever
#of its two metrics are present (it is NaN only when both are missing)
final_nan_DF = pd.DataFrame(columns=['Package', 'Github', 'Stack Overflow', 'Downloads'])
final_nan_DF['Package'] = scaled_NaN['package']
final_nan_DF['Github'] = scaled_NaN[['forks', 'stars']].mean(axis=1)
final_nan_DF['Stack Overflow'] = scaled_NaN[['so_tag_counts', 'so_question_count']].mean(axis=1)
final_nan_DF['Downloads'] = scaled_NaN[['downloads','growth_rates']].mean(axis=1)

#DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
#final_nan_DF has no 'Rank'/'Overall' columns, so those values are NaN for its rows
ranked_data = pd.concat([final_DF, final_nan_DF])

In [12]:
#order by Overall, best first; rows whose Overall is NaN are placed last
#(sort_values defaults to na_position='last')
ranked_data = ranked_data.sort_values(['Overall'], axis=0, ascending=False)

#put the columns in presentation order
#DataFrame.reindex_axis was removed in pandas 1.0; reindex(columns=...) is the replacement
ranked_data = ranked_data.reindex(columns=['Package','Rank','Overall','Github','Stack Overflow','Downloads'])

In [13]:
#CSS rule applied to the '.row_heading' and '.blank' cells of the rendered table,
#setting them to display: none (presumably to hide the index column - confirm in the rendered output)
hidden_index_rule = {
    'selector': '.row_heading, .blank',
    'props': [('display', 'none;')],
}
ranked_data.style.set_table_styles([hidden_index_rule])

Unnamed: 0,Package,Rank,Overall,Github,Stack Overflow,Downloads
24,d3,1.0,16.3416,7.50237,4.84197,3.99722
10,chart.js,2.0,4.93473,3.2727,0.453536,1.20849
48,highcharts,3.0,3.95795,0.446472,2.68054,0.83094
81,plottable,4.0,3.73661,-0.225459,2.88095,1.08112
80,plotly.js,5.0,3.18098,0.159345,3.16288,-0.141242
61,leaflet,6.0,2.92287,1.46342,0.784361,0.675093
4,britecharts,7.0,2.41148,-0.220147,-0.230052,2.86168
33,echarts,8.0,2.31238,2.16394,-0.211544,0.359985
6,c3,9.0,0.90838,0.288898,0.510305,0.109176
47,graphael,10.0,0.704229,-0.392196,-0.199538,1.29596


In [None]:
#write the final rankings to CSV, leaving out the DataFrame index
rankings_csv = '../output/js_viz_final_Rankings.csv'
ranked_data.to_csv(rankings_csv, index=False)