In [1]:
from sklearn import preprocessing
import numpy as np
import pandas as pd
from collections import defaultdict

In [2]:
# Only these columns are read from the CSV; any other column in the file is skipped.
fields = [
    'package',
    'forks', 'stars',
    'so_tag_count', 'so_question_count',
    'search_results', 'growth_rate',
]
DF = pd.read_csv(
    filepath_or_buffer='../output/distributed_computing_data.csv',
    usecols=fields,
)

In [3]:
# Discard rows that have no search-result count or no growth rate.
# Reassigning (instead of inplace=True) keeps the cell free of hidden in-place mutation.
DF = DF.dropna(subset=['search_results', 'growth_rate'])

In [4]:
# fillna() returns a NEW frame; without assigning it back the NaNs stay in DF
# and every later cell would still see the missing values.
DF = DF.fillna(0)
# Show only a preview rather than the whole table.
DF.head(10)

Unnamed: 0,package,forks,stars,so_tag_count,so_question_count,search_results,growth_rate
0,apache storm,3314,4496,2086,2209,44000,-0.205128
1,stratio crossdata,40,154,0,0,83,4.000000
2,kite,203,301,9,330,217,0.444444
3,twitter gizzard,208,2144,0,4,46,-1.000000
4,apache knox,62,37,20,33,1640,-0.280769
5,apache reef,88,72,0,2,266,-0.125000
6,apache tajo,95,97,7,12,1610,-0.539007
7,pachyderm,206,2159,0,6,58,-0.600000
8,nextflow,77,379,0,4,1730,-0.004082
9,apache sqoop,293,334,256,397,6740,-0.108776


In [5]:
#scale the values, such that for each metric: mean = 0 and std = 1
metrics = ['forks','stars','so_tag_count','so_question_count','search_results', 'growth_rate']

# Vectorized z-score: subtract each column's mean and divide by its standard
# deviation (pandas' default sample std, ddof=1). Column-wise broadcasting
# replaces the per-column loop and per-element lambda.
metric_values = DF[metrics]
scaled_DF = (metric_values - metric_values.mean()) / metric_values.std()

# Carry the package name along; it is aligned on the same row index as DF.
scaled_DF['package'] = DF['package']

In [6]:
# Preview the standardized metrics; head() avoids dumping the entire frame into the output.
scaled_DF.head(10)

Unnamed: 0,forks,stars,so_tag_count,so_question_count,search_results,growth_rate,package
0,2.197530,2.162668,0.285872,0.195479,1.233819,-0.349045,apache storm
1,-0.277179,-0.367623,-0.204319,-0.255571,-0.241263,5.405144,stratio crossdata
2,-0.153973,-0.281959,-0.202204,-0.188189,-0.236762,0.539814,kite
3,-0.150193,0.792045,-0.204319,-0.254754,-0.242506,-1.436727,twitter gizzard
4,-0.260550,-0.435804,-0.199619,-0.248833,-0.188967,-0.452550,apache knox
5,-0.240897,-0.415408,-0.204319,-0.255162,-0.235117,-0.239399,apache reef
6,-0.235606,-0.400839,-0.202674,-0.253121,-0.189974,-0.805916,apache tajo
7,-0.151705,0.800786,-0.204319,-0.254346,-0.242103,-0.889377,pachyderm
8,-0.249212,-0.236504,-0.204319,-0.254754,-0.185944,-0.073938,nextflow
9,-0.085944,-0.262728,-0.144161,-0.174508,-0.017668,-0.217199,apache sqoop


In [7]:
#merge forks and stars to one GitHub metric, Tags and Questions to Stack Overflow
# Each combined score is the row-wise mean of its scaled input columns;
# 'Search Results' averages the scaled search_results and growth_rate columns.
final_DF = pd.DataFrame({
    'Package': scaled_DF['package'],
    'Github': scaled_DF[['forks', 'stars']].mean(axis=1),
    'Stack Overflow': scaled_DF[['so_tag_count', 'so_question_count']].mean(axis=1),
    'Search Results': scaled_DF[['search_results', 'growth_rate']].mean(axis=1),
})

# Overall score is the plain sum of the three combined scores.
final_DF['Overall'] = final_DF[['Github', 'Stack Overflow', 'Search Results']].sum(axis=1)

# Highest Overall gets rank 1. method='min' yields whole-number ranks even when
# scores tie; the default 'average' method produces fractional ranks (e.g. 3.5)
# that astype(int) would silently truncate.
final_DF['Rank'] = final_DF['Overall'].rank(ascending=False, method='min').astype(int)

# Keep the presentation column order.
final_DF = final_DF[['Package', 'Rank', 'Overall', 'Github', 'Stack Overflow', 'Search Results']]

In [8]:
# Order the table from highest to lowest Overall score.
final_DF = final_DF.sort_values(['Overall'], axis=0, ascending=False)
# DataFrame.reindex_axis was removed in pandas 1.0; reindex(columns=...) is the
# supported way to put the columns in a fixed order.
final_DF = final_DF.reindex(columns=['Package','Rank','Overall','Github','Stack Overflow','Search Results'])
# Render the table with the row-index cells hidden through a CSS rule.
final_DF.style.set_table_styles([
    {'selector': '.row_heading, .blank', 'props': [('display', 'none;')]}
])

Unnamed: 0,Package,Rank,Overall,Github,Stack Overflow,Search Results
85,apache spark,1,20.1132,8.88036,6.76704,4.46584
122,apache hadoop,2,13.3312,2.02512,8.20096,3.10511
91,apache kafka,3,5.38997,2.83868,1.17682,1.37447
0,apache storm,4,2.86316,2.1801,0.240675,0.442387
129,presto,5,2.48102,2.31167,-0.170952,0.3403
1,stratio crossdata,6,2.02959,-0.322401,-0.229945,2.58194
88,apache couchdb,7,1.89437,0.821726,1.00734,0.0653108
100,apache flink,8,1.8485,1.11443,0.044611,0.689458
67,twitter heron,9,1.52012,0.655072,-0.22923,1.09427
42,hazelcast,10,1.39984,0.612041,0.106827,0.680967


In [9]:
# Write the ranking table to disk as CSV; index=False leaves the row labels out of the file.
final_DF.to_csv(
    path_or_buf='../output/DC_packages_final_Rankings.csv',
    index=False,
)