## Analysing the [Public Git Archive](https://pga.sourced.tech) published by [source{d}](https://sourced.tech/)

### Necessary imports 

In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
from collections import Counter

### Reading the dataset

In [2]:
data = pd.read_csv("pga-data.csv")

### Exploring dataset's shape and columns

In [3]:
data.shape
data.columns

Index(['URL', 'SIVA_FILENAMES', 'FILE_COUNT', 'LANGS', 'LANGS_BYTE_COUNT',
       'LANGS_LINES_COUNT', 'LANGS_FILES_COUNT', 'COMMITS_COUNT',
       'BRANCHES_COUNT', 'FORK_COUNT', 'EMPTY_LINES_COUNT', 'CODE_LINES_COUNT',
       'COMMENT_LINES_COUNT', 'LICENSE'],
      dtype='object')

### Exploring the first 20 rows

In [4]:
data.head(20)

Unnamed: 0,URL,SIVA_FILENAMES,FILE_COUNT,LANGS,LANGS_BYTE_COUNT,LANGS_LINES_COUNT,LANGS_FILES_COUNT,COMMITS_COUNT,BRANCHES_COUNT,FORK_COUNT,EMPTY_LINES_COUNT,CODE_LINES_COUNT,COMMENT_LINES_COUNT,LICENSE
0,https://github.com/powmedia/buildify,4a14cc02da0a9280538cd3f3242365601d72f241.siva,12,"JSON,JavaScript,Markdown,Text",6531861146691094,2984921622,1514,38,25,0,0190670,284741480,016100,"JSON:0.757,MIT:0.905,MIT-feh:0.479"
1,https://github.com/leon/play-salat,ee02fdfc0942ff17949f456d6ee3a27b976b7141.siva,53,"CSS,HTML,INI,Markdown,Scala,YAML",12981692381238834281132,114304343105616,13219191,104,74,0,040791564,11325024876611,00001020,
2,https://github.com/adammark/Markup.js,00e71912d4ec215337d5d959a34ab0ce44fdd3b4.siva,22,"CSS,HTML,JSON,JavaScript,Markdown,Shell",6537114146781822033080791,8349230560012045,1211311,295,30,0,29108573470,803952937268570,040100800,
3,https://github.com/heroku/heroku-buildpack-scala,eb7aa1e50236c65bf44529ebb9a75fae68e1d6b0.siva,33,"JSON,Markdown,Ruby,Scala,Shell,Text,YAML",58555286850494695951070711,231602292224251032,12521812,560,206,0,058192001,22100112180029,0080000,"JSON:0.747,MIT:0.914"
4,https://github.com/brandonwamboldt/utilphp,"993cc5b5ca2603f06a82555df151fbb398114543.siva,...",12,"JSON,Markdown,PHP,XML,YAML",72313409157618714342,2533739862623,15311,267,112,0,09154804,242422505018,0093000,"JSON:0.667,MIT:0.817"
5,https://github.com/tias/xinput_calibrator,ecb5809790cb86afa70e958bf19d2b968a8981c7.siva,41,"C++,M4Sugar,Makefile,Roff,SVG,Shell,Text,XPM,d...",13150635325289385659761363216765956300,372910414512113741204226812,2216113111,201,72,0,407014000000,1945048000000,575077000000,
6,https://github.com/substack/node-mkdirp,284bc24eaafca75786c7e86f1da66ad61adacdb7.siva,23,"JSON,JavaScript,Markdown,Text,YAML",5561697820951456132,286141013511,118121,82,158,0,0753700,2747063010,018000,"JSON:0.591,MIT:0.735"
7,https://github.com/CakeDC/recaptcha,f4c4682073fcc4c67b38583594d73a880d648934.siva,24,"Gettext Catalog,JSON,Markdown,PHP,Text,YAML",99007415861254081243936,2462817810062446,616811,114,53,0,005412307,027120397038,00047900,MIT:0.773
8,https://github.com/luislavena/rake-compiler,28e938cf837c6859197000cc647f84f92668f0b6.siva,46,"Gherkin,RDoc,Ruby,Text,YAML",10680156828790514012802,258439278343343,813023,432,116,1,29046701,22101970039,0024100,"JSON:0.769,MIT:0.911,MIT-feh:0.497"
9,https://github.com/be9/acl9,b54c2cb6363f42394b25028d3dbde6930ad8c7ed.siva,130,"Markdown,Ruby,Text,YAML",1648711510210721524,48742161073,311313,192,81,0,14773507,3372734034,0440029,"JSON:0.780,MIT:0.942"


### Check for NaN Values

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 182014 entries, 0 to 182013
Data columns (total 14 columns):
URL                    182014 non-null object
SIVA_FILENAMES         182014 non-null object
FILE_COUNT             182014 non-null int64
LANGS                  181900 non-null object
LANGS_BYTE_COUNT       181900 non-null object
LANGS_LINES_COUNT      181900 non-null object
LANGS_FILES_COUNT      181900 non-null object
COMMITS_COUNT          182014 non-null int64
BRANCHES_COUNT         182014 non-null int64
FORK_COUNT             182014 non-null int64
EMPTY_LINES_COUNT      181900 non-null object
CODE_LINES_COUNT       181900 non-null object
COMMENT_LINES_COUNT    181900 non-null object
LICENSE                102897 non-null object
dtypes: int64(4), object(10)
memory usage: 19.4+ MB


Above it seems that a lot of Licence data are missing 

### Which and how many repositories do not containt any license

In [6]:
count = 0
no_license_repos = []
for i in range(len(data)):
    row = data.iloc[i]
    if row.LICENSE is np.nan:
        count = count + 1
        no_license_repos.append(row.URL)
#print(no_license_repos)

### Discover in which repository a specific language is used

In [7]:
input_language = "Java"
languages = []
count = 0
for i in range(len(data)):
    row = data.iloc[i]
    try:
        if input_language in row.LANGS:
            languages.append(row.URL)
            count = count + 1
    except TypeError:
        continue
print(count)
#print(languages)

89185


### How many times an organisation is contained in the dataset, meaning that has repositories with more than 50 stars


In [8]:
urls = data.URL
account_name = []
for row in urls:
    token = row.split('/')[3]
    account_name.append(token)
print(Counter(account_name))



### Which repos have the most commits? eg. top let's see the top 10

In [9]:
threshold = 10
data.nlargest(threshold, 'COMMITS_COUNT')

Unnamed: 0,URL,SIVA_FILENAMES,FILE_COUNT,LANGS,LANGS_BYTE_COUNT,LANGS_LINES_COUNT,LANGS_FILES_COUNT,COMMITS_COUNT,BRANCHES_COUNT,FORK_COUNT,EMPTY_LINES_COUNT,CODE_LINES_COUNT,COMMENT_LINES_COUNT,LICENSE
181821,https://github.com/altera-opensource/linux-soc...,1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.siva,62861,"ASN.1,Assembly,Awk,C,C++,CSS,CSV,Clojure,GDB,G...","9958,72707,52660,705190432,4284576,2258,10001,...","457,1581,2131,22961376,134826,90,174,72,687,47...","7,4,13,45898,668,1,4,24,3,5,1,8,5,1,208,7,25,1...",735972,998,16,"0,50163,171,2568172,287,18,0,0,0,0,28,0,669,0,...","0,239774,1397,12758709,1837,44,0,0,0,0,157,0,5...","0,127568,157,2564188,78,27,0,0,0,0,49,0,1,0,0,...","GPL-2.0-only:0.955,GPL-2.0-or-later:0.955,Linu..."
181605,https://github.com/OpenChannelSSD/linux,1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.siva,61290,"ASN.1,Assembly,Awk,C,C++,CSS,CSV,Clojure,GDB,G...","7817,5470,56755,698796333,4258807,2258,9798,14...","279,196,2290,22538003,134478,90,168,72,687,477...","6,3,14,44830,657,1,3,24,3,5,1,8,3,1,183,7,24,1...",708228,42,16,"0,49337,185,2506321,287,18,0,0,0,0,28,0,577,0,...","0,236435,1519,12442802,1837,44,0,0,0,0,155,0,4...","0,126431,179,2523504,78,27,0,0,0,0,49,0,1,0,0,...","GPL-2.0-only:0.955,GPL-2.0-or-later:0.955,Linu..."
181539,https://github.com/google/capsicum-linux,1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.siva,58017,"ASN.1,Assembly,Awk,C,C++,CSS,CSV,Clojure,GDB,G...","7817,8671,56432,596605961,4047870,1829,10134,1...","279,313,2281,20625821,129896,73,167,53,687,477...","6,5,14,42212,624,1,3,22,3,5,1,6,3,1,157,7,24,1...",664239,61,16,"0,49242,185,2358353,287,14,0,0,0,0,28,0,537,0,...","0,237373,1519,11732672,1837,35,0,0,0,0,153,0,4...","0,126485,171,2408097,72,23,0,0,0,0,49,0,1,0,0,...","GPL-2.0-only:0.955,GPL-2.0-or-later:0.955,Linu..."
181522,https://github.com/ljalves/linux_media,1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.siva,57316,"ASN.1,Assembly,Awk,C,C++,CSS,CSV,Clojure,GDB,G...","7817,8671,47507,590363137,3888412,1829,10134,9...","279,313,1905,20404514,124864,73,167,51,687,477...","6,5,13,41828,611,1,3,21,3,5,1,6,2,1,137,7,24,1...",648041,36,16,"0,48738,142,2330780,287,14,0,0,0,0,28,0,458,0,...","0,235237,1222,11591078,1837,35,0,0,0,0,153,0,3...","0,125951,136,2385209,72,23,0,0,0,0,49,0,0,0,0,...","GPL-2.0-only:0.955,GPL-2.0-or-later:0.955,Linu..."
182011,https://github.com/rockchip-linux/kernel,1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.siva,62283,"ASN.1,Assembly,Awk,C,C++,Clojure,Filebench WML...","8195,8671,34295,757416364,6556611,945,146954,1...","301,313,1410,25712592,190991,51,1793,687,4775,...","9,5,11,47457,893,21,1,3,5,1,9,2,11,7,24,1,2596...",606698,26,16,"0,47712,132,2837819,231,0,0,0,0,24,0,58,0,0,0,...","0,311201,1137,14153897,1578,0,0,0,0,121,0,378,...","0,124681,130,2687293,59,0,0,0,0,33,0,0,0,0,0,0...","GPL-2.0-only:0.955,GPL-2.0-or-later:0.955,Linu..."
179647,https://github.com/libos-nuse/net-next-nuse,1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.siva,54529,"ASN.1,Assembly,Awk,C,C++,Clojure,GDB,Gettext C...","8065,8671,43832,564209456,3409620,971,18113,12...","296,313,1800,19528433,111797,51,687,4775,228,1...","8,5,12,40260,581,21,3,5,1,5,4,1,7,24,1,2422,2,...",604331,56,16,"0,47451,132,2228531,231,0,0,0,28,0,509,0,0,0,0...","0,230002,1139,11094710,1580,0,0,0,150,0,4284,0...","0,122537,130,2307569,59,0,0,0,49,0,0,0,0,0,0,7...","AGPL-1.0:0.237,GPL-2.0-only:0.805,GPL-2.0-or-l..."
181969,https://github.com/mjg59/linux,1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.siva,53618,"ASN.1,Assembly,Awk,C,C++,Clojure,GDB,Gettext C...","8065,8671,43850,553845156,3409206,945,18113,12...","296,313,1800,19225169,111633,51,687,4775,220,1...","8,5,12,39690,569,21,3,5,1,5,3,1,7,24,1,2385,1,...",588180,442,16,"0,47219,132,2194343,231,0,0,0,28,0,398,0,0,0,0...","0,229551,1139,10941645,1580,0,0,0,142,0,2933,0...","0,121839,130,2284314,59,0,0,0,49,0,2,0,0,0,0,7...","GPL-2.0-only:0.955,GPL-2.0-or-later:0.955,Linu..."
181597,https://github.com/google/ktsan,1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.siva,50827,"ASN.1,Assembly,Awk,C,C++,Clojure,GDB,Gettext C...","6262,8671,23041,525505668,3382792,864,18113,12...","229,313,947,18285889,108690,47,687,4775,172,16...","5,5,9,38007,547,19,3,5,1,5,2,1,7,23,1,2252,98,...",535479,100,16,"0,47291,96,2084055,209,0,0,0,24,0,58,0,0,0,0,7...","0,229083,753,10404156,1531,0,0,0,114,0,378,0,0...","0,121978,89,2202572,58,0,0,0,33,0,0,0,0,0,0,75...","GPL-2.0-only:0.955,GPL-2.0-or-later:0.955,Linu..."
181881,https://github.com/NextThingCo/CHIP-linux,1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.siva,50836,"ASN.1,Assembly,Awk,C,C++,Clojure,GDB,Gettext C...","6262,8671,23041,525295748,3367813,864,18113,12...","229,313,947,18278154,108035,47,687,4775,172,16...","5,5,9,38015,548,19,3,5,1,5,2,1,7,23,1,2252,98,...",533155,136,16,"0,47279,96,2083015,209,0,0,0,24,0,58,0,0,0,0,7...","0,229063,753,10400270,1531,0,0,0,114,0,378,0,0...","0,121898,89,2201940,58,0,0,0,33,0,0,0,0,0,0,75...","GPL-2.0-only:0.955,GPL-2.0-or-later:0.955,Linu..."
175905,https://github.com/ARM-software/linux,1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.siva,49660,"ASN.1,Assembly,Awk,C,C++,Clojure,GDB,Gettext C...","6155,8671,23041,493706941,3306902,695,18113,12...","223,313,947,17565719,105788,40,687,4775,172,34...","4,5,9,37301,533,17,3,5,1,7,2,1,7,21,1,2205,97,...",520275,43,16,"0,47024,96,2038540,209,0,0,0,24,0,58,0,0,0,0,7...","0,228642,753,10170064,1531,0,0,0,114,0,378,0,0...","0,121394,89,2169406,58,0,0,0,33,0,0,0,0,0,0,74...","AGPL-1.0:0.237,GPL-2.0-only:0.805,GPL-2.0-or-l..."


### Which repos have the most branches? eg. top let's see the top 10

In [10]:
threshold = 10
data.nlargest(threshold, 'BRANCHES_COUNT')

Unnamed: 0,URL,SIVA_FILENAMES,FILE_COUNT,LANGS,LANGS_BYTE_COUNT,LANGS_LINES_COUNT,LANGS_FILES_COUNT,COMMITS_COUNT,BRANCHES_COUNT,FORK_COUNT,EMPTY_LINES_COUNT,CODE_LINES_COUNT,COMMENT_LINES_COUNT,LICENSE
181505,https://github.com/JetBrains/intellij-plugins,ba67a1023bb29d61980469c433f25f41a9e423d4.siva,14080,"ActionScript,Ant Build System,Batchfile,CSS,Co...","1417236,10566,7194,62871,102823,29042,1052573,...","52519,339,278,3379,4105,872,36771,282,14,521,2...","1875,10,6,107,417,4,675,2,3,67,7,8,464,102,1,3...",13395,59321,0,"8164,0,0,569,5,0,3692,0,0,57,0,174,438,0,4,0,0...","34321,0,0,2553,16,0,19878,0,0,387,0,1130,12781...","6237,0,0,185,0,0,11532,0,0,1,0,27,87,0,1,0,0,0...",
182013,https://github.com/openstack/cinder,"0689bc785fdd59ad6dda13c17e830c47948c804b.siva,...",2349,"Gettext Catalog,HTML,INI,JSON,Makefile,Markdow...","2810148,557,15100,143453,621,8759,8757,42836,2...","81875,18,436,5239,22,235,193,892,66,477127,273...",111632013229110905841382511252,14789,48515,0,0203225000067841800005630,"0,15,0,4087,19,182,0,0,0,343453,18,0,0,0,26,10...",00000000064567240000870,Apache-2.0:0.790
171589,https://github.com/openstack/horizon,"8028fe572b33f2a2683eb91512d88c8c87f29763.siva,...",2696,"CSV,Gettext Catalog,HTML,INI,JSON,JavaScript,P...","1195,9287217,557517,7547,87259,2296041,4890827...","17,301889,15497,282,1986,70686,124787,6097,302...",2114586424552922100656514399,13771,44321,1,00108401068407204380000260,0013734018604281284624000013070,00129001891618770000030,"Apache-2.0:0.635,ECL-2.0:0.377"
181768,https://github.com/servo/servo,ce30d4520d67f2c6ef960571a9b3e450c5dcbebe.siva,74508,"ApacheConf,Batchfile,C,CMake,CSS,Common Lisp,D...","6682,22661,157769,4037,421392,86,12139,8883,46...","228,830,5298,124,7589,6,317,315,134640,340,8,2...","11,9,16,2,432,1,9,4,2815,3,1,48087,2,7035,588,...",31529,43022,0,"0,0,202,14,923,0,0,0,2,0,0,126376,25,0,333,0,3...","0,0,3167,101,5405,0,0,0,0,0,0,1321598,137,0,76...","0,0,1303,7,703,0,0,0,2,0,0,56279,5,0,0,0,18,81...","MPL-2.0:0.965,MPL-2.0-no-copyleft-exception:0.965"
172180,https://github.com/openstack/keystone,"419c2cb95f5ce0c515fd8636d90065dfcf784c8c.siva,...",1314,"CSS,Gettext Catalog,HTML,INI,JSON,Makefile,PHP...","638,608050,665,30138,214752,5753,2794,5184773,...","47,15009,22,913,6684,160,53,128026,88,598,437,...",1131122441163815187165160,12622,38795,0,110003625019721000372670,360220616512908115300034855760,00000502510900032620,"Apache-2.0:0.635,ECL-2.0:0.377"
182012,https://github.com/google/angle,"0003f3aac941c7cf052dc510aa146fbfcd8987a0.siva,...",1665,"Batchfile,C,C++,Diff,GLSL,GN,HLSL,INI,JSON,Lex...","15633,1854915,16249597,23522,693,83516,34698,1...","246,40900,454739,692,31,2956,1273,37,6517,1081...",315612644310711623028147271112,7929,38327,2,"0,508,51497,0,5,0,177,0,76,0,365,0,319,0,802,0...","0,2448,288899,0,14,0,1060,0,6425,0,1641,0,1514...","0,206,22020,0,10,0,29,0,0,0,0,0,125,0,2136,0,0...","BSD-2-Clause:0.802,BSD-3-Clause:0.858"
181468,https://github.com/openstack/heat,"3ab800fe7e93f3b3c113d72b67ce59a132b529c8.siva,...",1239,"ApacheConf,Gettext Catalog,INI,JSON,Makefile,M...","163,2793347,23099,54309,5753,2606,9136098,6086...","4,75214,570,1627,160,78,235692,1991,347,3381,1...",11796112853207108133,14461,37602,1,0002252933728001150,0001559129471781080031480,0000502280800230,"Apache-2.0:0.635,ECL-2.0:0.377"
181487,https://github.com/openstack/tempest,"9f432ae99c6475d6922687f00cbe237e25a1721e.siva,...",1100,"ApacheConf,INI,Python,Shell,Text,YAML,reStruct...",629156421120867921213160018132459,228310326620223518633409,138173620053,11794,36089,0,001445200730,00657960015830,002220100110,Apache-2.0:0.790
181628,https://github.com/rust-lang/rust,c01efc669f09508b55eced32d3c88702578a7c3e.siva,11691,"Awk,Batchfile,C,C++,CSS,Diff,Dockerfile,HTML,I...","647,756,351872,109330,46367,5021,50980,72,110,...","12,21,12662,3381,2445,141,1898,2,7,370,3948,24...","1,2,72,8,5,4,47,1,1,3,27,12,1,2,178,180,3,7,3,...",74126,35094,2,"0,0,1563,416,365,0,0,0,0,0,0,224,0,0,386,3547,...","10,0,7038,2574,1996,0,0,0,0,0,3921,1933,0,0,17...","1,0,1594,254,79,0,0,0,0,0,0,287,0,0,328,0,0,0,...","Apache-2.0:1.000,ECL-2.0:0.875,JSON:0.933,MIT:..."
181592,https://github.com/deadlyvipers/dojo_rules,"04f7258e99088e4098a9e3b362b885d23e8e6695.siva,...",4,"Markdown,Ruby",29070,154,22,10,31920,0,20,110,02,


### Which repos have the most forks? eg. top let's see the top 10

In [11]:
threshold = 10
data.nlargest(threshold, 'FORK_COUNT')

Unnamed: 0,URL,SIVA_FILENAMES,FILE_COUNT,LANGS,LANGS_BYTE_COUNT,LANGS_LINES_COUNT,LANGS_FILES_COUNT,COMMITS_COUNT,BRANCHES_COUNT,FORK_COUNT,EMPTY_LINES_COUNT,CODE_LINES_COUNT,COMMENT_LINES_COUNT,LICENSE
170253,https://github.com/aosp-mirror/platform_develo...,"000a569bfaab838e9aba04669653a622bf872553.siva,...",9155,"Ant Build System,Batchfile,C,C++,CMake,CSS,CSV...","6552,7212,147270,643935,22038,11631,60274,1825...","162,273,3624,19515,245,674,1164,472,73,191,393...","3,3,28,88,2,5,4,3,1,9,69,47,6,2087,6,108,9,14,...",23710,6042,1843,"0,0,62,1266,12,94,0,0,0,0,499,0,1,34749,319,3,...","0,0,318,9372,201,542,0,0,0,0,3366,0,70,152909,...","0,0,193,1421,30,34,0,0,0,0,24,0,0,68318,91,55,...",
180848,https://github.com/android/platform_development,"000a569bfaab838e9aba04669653a622bf872553.siva,...",9155,"Ant Build System,Batchfile,C,C++,CMake,CSS,CSV...","6552,7212,147270,643935,22038,11631,60274,1825...","162,273,3624,19515,245,674,1164,472,73,191,393...","3,3,28,88,2,5,4,3,1,9,69,47,6,2087,6,108,9,14,...",23710,6042,1843,"0,0,62,1266,12,94,0,0,0,0,499,0,1,34749,319,3,...","0,0,318,9372,201,542,0,0,0,0,3366,0,70,152909,...","0,0,193,1421,30,34,0,0,0,0,24,0,0,68318,91,55,...",
3075,https://github.com/SpringSource/spring-social,"171918e41aab85c4d6609dfaabfc8abbef44dbd9.siva,...",361,"AsciiDoc,Batchfile,CSS,Gradle,HTML,INI,JSON,Ja...","119727,2260,9541,19190,335,810,1894,800107,704...","2674,85,806,572,13,32,72,19966,224,13,32,603,1...",315414102173123015631,1728,317,83,5460160000027317501000191370,"2125,0,625,0,12,0,70,10779,146,0,26,0,0,0,83,5...",0017000062410050008800,"Apache-2.0:0.986,ECL-2.0:0.748"
80671,https://github.com/dangdangdotcom/dubbox,"131444e3cb807fd6bbf235664c258e7f8f5c8c7e.siva,...",1895,"Batchfile,CSS,INI,JFlex,Java,JavaScript,Markdo...","3753,18582,90741,2077,5784523,135322,13421,250...","109,1107,1835,69,173161,2257,290,6863,265,233,...",541811360102731143149,1919,220,83,0440024489143640000593,0101100109432170222400003607,048003861040700000466,"Apache-2.0:0.985,ECL-2.0:0.748"
123574,https://github.com/yjmyzz/dubbox,"131444e3cb807fd6bbf235664c258e7f8f5c8c7e.siva,...",1665,"Batchfile,CSS,INI,JFlex,Java,JavaScript,Markdo...","3238,18582,86329,2077,5273289,135322,10627,181...","106,1107,1699,69,156542,2257,214,4574,18,256,2...",24111121910158153194,1945,60,83,04400214431435500000355,0101100987511702159000002231,0480035846407000000301,"Apache-2.0:0.985,ECL-2.0:0.748"
141448,https://github.com/hutai123/dubbox,"131444e3cb807fd6bbf235664c258e7f8f5c8c7e.siva,...",1907,"Batchfile,CSS,INI,JFlex,Java,JavaScript,Markdo...","3753,18582,91586,2077,5799386,135322,14070,263...","109,1107,1854,69,173606,2257,297,7154,265,233,...",541911365102751143151,1922,52,83,0440024568143650000597,0101100109733170223000003611,048003867140700000469,"Apache-2.0:0.985,ECL-2.0:0.748"
170999,https://github.com/SpringSource/spring-framework,"171918e41aab85c4d6609dfaabfc8abbef44dbd9.siva,...",7979,"AsciiDoc,AspectJ,Batchfile,CSS,FreeMarker,GAP,...","2142608,43278,6877,54217,15915,6137,66698,3806...","56669,1187,209,1364,482,269,1942,1605,1223,2,1...","39,13,2,24,4,1,29,21,28,2,88,1,6771,4,15,51,3,...",15990,3186,83,"12615,0,0,274,0,0,0,473,122,0,0,0,157437,0,13,...","44016,0,0,962,0,0,0,2616,1080,0,0,5,586328,0,1...","0,0,0,98,0,0,0,370,2,0,0,0,321329,0,133,1704,0...",
173650,https://github.com/spring-projects/spring-fram...,"171918e41aab85c4d6609dfaabfc8abbef44dbd9.siva,...",7979,"AsciiDoc,AspectJ,Batchfile,CSS,FreeMarker,GAP,...","2142608,43278,6877,54217,15915,6137,66698,3806...","56669,1187,209,1364,482,269,1942,1605,1223,2,1...","39,13,2,24,4,1,29,21,28,2,88,1,6771,4,15,51,3,...",15994,3188,83,"12615,0,0,274,0,0,0,473,122,0,0,0,157456,0,13,...","44016,0,0,962,0,0,0,2616,1080,0,0,5,586510,0,1...","0,0,0,98,0,0,0,370,2,0,0,0,321365,0,133,1704,0...",
173658,https://github.com/spring-projects/spring-social,"171918e41aab85c4d6609dfaabfc8abbef44dbd9.siva,...",361,"AsciiDoc,Batchfile,CSS,Gradle,HTML,INI,JSON,Ja...","119727,2260,9541,19190,335,810,1894,800107,704...","2674,85,806,572,13,32,72,19966,224,13,32,603,1...",315414102173123015631,1728,317,83,5460160000027317501000191370,"2125,0,625,0,12,0,70,10779,146,0,26,0,0,0,83,5...",0017000062410050008800,"Apache-2.0:0.986,ECL-2.0:0.748"
35825,https://github.com/SpringSource/spring-petclinic,"171918e41aab85c4d6609dfaabfc8abbef44dbd9.siva,...",91,"Batchfile,HTML,INI,Java,Less,Markdown,Maven PO...","5186,15285,2330,83577,8583,6902,8524,1930,1785...","146,497,85,2539,417,120,268,65,66,108,9158,235...",11293641111221212,576,285,82,07203620330030000000,041701447086002050000611,02069400000000000,


In [12]:
data_new = data[data['LANGS_BYTE_COUNT'].notnull()]


In [13]:
data_new.head(10)

Unnamed: 0,URL,SIVA_FILENAMES,FILE_COUNT,LANGS,LANGS_BYTE_COUNT,LANGS_LINES_COUNT,LANGS_FILES_COUNT,COMMITS_COUNT,BRANCHES_COUNT,FORK_COUNT,EMPTY_LINES_COUNT,CODE_LINES_COUNT,COMMENT_LINES_COUNT,LICENSE
0,https://github.com/powmedia/buildify,4a14cc02da0a9280538cd3f3242365601d72f241.siva,12,"JSON,JavaScript,Markdown,Text",6531861146691094,2984921622,1514,38,25,0,190670,284741480,16100,"JSON:0.757,MIT:0.905,MIT-feh:0.479"
1,https://github.com/leon/play-salat,ee02fdfc0942ff17949f456d6ee3a27b976b7141.siva,53,"CSS,HTML,INI,Markdown,Scala,YAML",12981692381238834281132,114304343105616,13219191,104,74,0,40791564,11325024876611,1020,
2,https://github.com/adammark/Markup.js,00e71912d4ec215337d5d959a34ab0ce44fdd3b4.siva,22,"CSS,HTML,JSON,JavaScript,Markdown,Shell",6537114146781822033080791,8349230560012045,1211311,295,30,0,29108573470,803952937268570,40100800,
3,https://github.com/heroku/heroku-buildpack-scala,eb7aa1e50236c65bf44529ebb9a75fae68e1d6b0.siva,33,"JSON,Markdown,Ruby,Scala,Shell,Text,YAML",58555286850494695951070711,231602292224251032,12521812,560,206,0,58192001,22100112180029,80000,"JSON:0.747,MIT:0.914"
4,https://github.com/brandonwamboldt/utilphp,"993cc5b5ca2603f06a82555df151fbb398114543.siva,...",12,"JSON,Markdown,PHP,XML,YAML",72313409157618714342,2533739862623,15311,267,112,0,9154804,242422505018,93000,"JSON:0.667,MIT:0.817"
5,https://github.com/tias/xinput_calibrator,ecb5809790cb86afa70e958bf19d2b968a8981c7.siva,41,"C++,M4Sugar,Makefile,Roff,SVG,Shell,Text,XPM,d...",13150635325289385659761363216765956300,372910414512113741204226812,2216113111,201,72,0,407014000000,1945048000000,575077000000,
6,https://github.com/substack/node-mkdirp,284bc24eaafca75786c7e86f1da66ad61adacdb7.siva,23,"JSON,JavaScript,Markdown,Text,YAML",5561697820951456132,286141013511,118121,82,158,0,753700,2747063010,18000,"JSON:0.591,MIT:0.735"
7,https://github.com/CakeDC/recaptcha,f4c4682073fcc4c67b38583594d73a880d648934.siva,24,"Gettext Catalog,JSON,Markdown,PHP,Text,YAML",99007415861254081243936,2462817810062446,616811,114,53,0,5412307,27120397038,47900,MIT:0.773
8,https://github.com/luislavena/rake-compiler,28e938cf837c6859197000cc647f84f92668f0b6.siva,46,"Gherkin,RDoc,Ruby,Text,YAML",10680156828790514012802,258439278343343,813023,432,116,1,29046701,22101970039,24100,"JSON:0.769,MIT:0.911,MIT-feh:0.497"
9,https://github.com/be9/acl9,b54c2cb6363f42394b25028d3dbde6930ad8c7ed.siva,130,"Markdown,Ruby,Text,YAML",1648711510210721524,48742161073,311313,192,81,0,14773507,3372734034,440029,"JSON:0.780,MIT:0.942"


In [14]:
pd.options.mode.chained_assignment = None
data_new['TOTAL_BYTE_COUNT'] = data_new['LANGS_BYTE_COUNT'].apply(lambda x: sum(map(int, x.split(',')))) 

In [15]:
data_new.head(10)

Unnamed: 0,URL,SIVA_FILENAMES,FILE_COUNT,LANGS,LANGS_BYTE_COUNT,LANGS_LINES_COUNT,LANGS_FILES_COUNT,COMMITS_COUNT,BRANCHES_COUNT,FORK_COUNT,EMPTY_LINES_COUNT,CODE_LINES_COUNT,COMMENT_LINES_COUNT,LICENSE,TOTAL_BYTE_COUNT
0,https://github.com/powmedia/buildify,4a14cc02da0a9280538cd3f3242365601d72f241.siva,12,"JSON,JavaScript,Markdown,Text",6531861146691094,2984921622,1514,38,25,0,190670,284741480,16100,"JSON:0.757,MIT:0.905,MIT-feh:0.479",25027
1,https://github.com/leon/play-salat,ee02fdfc0942ff17949f456d6ee3a27b976b7141.siva,53,"CSS,HTML,INI,Markdown,Scala,YAML",12981692381238834281132,114304343105616,13219191,104,74,0,40791564,11325024876611,1020,,60512
2,https://github.com/adammark/Markup.js,00e71912d4ec215337d5d959a34ab0ce44fdd3b4.siva,22,"CSS,HTML,JSON,JavaScript,Markdown,Shell",6537114146781822033080791,8349230560012045,1211311,295,30,0,29108573470,803952937268570,40100800,,231730
3,https://github.com/heroku/heroku-buildpack-scala,eb7aa1e50236c65bf44529ebb9a75fae68e1d6b0.siva,33,"JSON,Markdown,Ruby,Scala,Shell,Text,YAML",58555286850494695951070711,231602292224251032,12521812,560,206,0,58192001,22100112180029,80000,"JSON:0.747,MIT:0.914",84833
4,https://github.com/brandonwamboldt/utilphp,"993cc5b5ca2603f06a82555df151fbb398114543.siva,...",12,"JSON,Markdown,PHP,XML,YAML",72313409157618714342,2533739862623,15311,267,112,0,9154804,242422505018,93000,"JSON:0.667,MIT:0.817",172806
5,https://github.com/tias/xinput_calibrator,ecb5809790cb86afa70e958bf19d2b968a8981c7.siva,41,"C++,M4Sugar,Makefile,Roff,SVG,Shell,Text,XPM,d...",13150635325289385659761363216765956300,372910414512113741204226812,2216113111,201,72,0,407014000000,1945048000000,575077000000,,215508
6,https://github.com/substack/node-mkdirp,284bc24eaafca75786c7e86f1da66ad61adacdb7.siva,23,"JSON,JavaScript,Markdown,Text,YAML",5561697820951456132,286141013511,118121,82,158,0,753700,2747063010,18000,"JSON:0.591,MIT:0.735",21217
7,https://github.com/CakeDC/recaptcha,f4c4682073fcc4c67b38583594d73a880d648934.siva,24,"Gettext Catalog,JSON,Markdown,PHP,Text,YAML",99007415861254081243936,2462817810062446,616811,114,53,0,5412307,27120397038,47900,MIT:0.773,44089
8,https://github.com/luislavena/rake-compiler,28e938cf837c6859197000cc647f84f92668f0b6.siva,46,"Gherkin,RDoc,Ruby,Text,YAML",10680156828790514012802,258439278343343,813023,432,116,1,29046701,22101970039,24100,"JSON:0.769,MIT:0.911,MIT-feh:0.497",129081
9,https://github.com/be9/acl9,b54c2cb6363f42394b25028d3dbde6930ad8c7ed.siva,130,"Markdown,Ruby,Text,YAML",1648711510210721524,48742161073,311313,192,81,0,14773507,3372734034,440029,"JSON:0.780,MIT:0.942",134185


In [16]:
data_new.nlargest(100, 'TOTAL_BYTE_COUNT')

Unnamed: 0,URL,SIVA_FILENAMES,FILE_COUNT,LANGS,LANGS_BYTE_COUNT,LANGS_LINES_COUNT,LANGS_FILES_COUNT,COMMITS_COUNT,BRANCHES_COUNT,FORK_COUNT,EMPTY_LINES_COUNT,CODE_LINES_COUNT,COMMENT_LINES_COUNT,LICENSE,TOTAL_BYTE_COUNT
181503,https://github.com/benbalter/dc-maps,716cf0facc0f6547c8838ceac21577eb2787fd01.siva,852,"JSON,Markdown,Ruby",11109107433644693681,890884166,84813,24,13,0,0918,0874107,000,"Info-ZIP:0.500,MS-PL:0.500,MS-RL:0.500",11109175583
172767,https://github.com/jamesrobertlloyd/gp-structu...,45070ea1ee6a34ffa24e5c6da3845954933f8084.siva,7381,"C,C++,CSS,CSV,Fortran,HTML,Makefile,Markdown,M...","1284,42466,77,1240678,141064,51261,2229,4012,9...","36,1376,4,66885,4441,1111,66,62,320,11492,298,...",1161109121111281182214802,655,4,0,813410015515260000181520940,1950320095429350000620574450,813300002100000122522320,MIT:0.802,6752715481
181492,https://github.com/mirror/dd-wrt,"114440b0ddf314d050cf0bd43b161e9e7653138b.siva,...",617795,"AGS Script,ASN.1,ASP,Ada,Ant Build System,Apac...","21876,112777,1001137,22012578,8300,1681,140535...","896,3988,23883,601358,225,81,4115,213708,18237...","6,42,218,1498,8,5,25,934,190,204,1,383833,153,...",31729,23,0,"0,0,0,113681,0,0,676,218549,1217,0,0,14486399,...","0,0,0,310889,0,0,3414,1321946,10010,0,0,736371...","0,0,0,166099,0,0,0,526008,2423,0,0,15916460,11...",,6602332086
171575,https://github.com/jswanner/RailsDiff,4dd64fafe2a1890bc8e4e017fa406f19a4b6c4fe.siva,22935,"CSS,Diff,HTML,Haml,JSON,JavaScript,Markdown,Ru...","5526,1570006725,3058312059,4720,34775,25126,99...","92,48584877,49144824,125,2,80,45,399,2150,97,1...",210349124966111465231,163,8,0,5010287270017127031300,25048027756016132560162007,6007584500102024900,,4629040286
181815,https://github.com/DiseaseOntology/HumanDiseas...,ec799f4cbfbcd7faa00a085a87fc8d59abc8f3ac.siva,2355,"CSV,JSON,Makefile,Markdown,Perl,SPARQL,Text,We...","6760878,430596228,15123,12463,281,2616,2922959...","121488,11857680,333,291,10,90,35006,47744710,1...",794221672079181,562,89,0,007996000006,09796641157193900013918,00950000034,CC0-1.0:1.000,4527018037
172515,https://github.com/petewarden/openheatmap,c7c9c54fb0ca4707e692f8f4b9d7c0b46e4c2bec.siva,4356,"ActionScript,C,C++,CSS,CSV,HTML,INI,JSON,JavaS...","11809,487130,7462,39244,382042883,68950,161,41...","409,14353,239,785,7404792,1948,8,5945102,9639,...",33213614142118132471179101814261,269,4,0,"80,2053,19,63,0,428,0,5738,1684,53,1973,22,348...","307,7572,178,628,0,1475,0,5939254,7250,154,727...","22,3883,41,93,0,32,0,0,698,14,1118,89,37,103,0...",,3645694471
175487,https://github.com/Soren-Nordstrom/userscript,a8a499f2c381e7498413709ea19d952733654c8f.siva,129107,"JavaScript,Markdown,Shell",3574567221888419,635087161617,12910411,27,4,0,561693820,48504214130,731415700,,3574568528
172315,https://github.com/slashbeast/grsecurity-scrape,8c5734e23a8ede0682c9d6f57887e64ce8524dab.siva,1307,"Diff,Perl,Text",352367259625571549816,958712079628284,81311,712,2,0,0210,0740,000,,3525224969
95371,https://github.com/franksouza183/EvolvereSuit,7988641fd2f71e0cbc82ae793efb10f6c0ad0535.siva,957966,"CSS,HTML,Markdown,SVG,Shell,Text,XML,desktop",165202326416093287877771149113926518644024,55410514445685898652982031132,21319411911114111,253,4,0,902641900000,4467232400000,1628000000,,3289318194
170896,https://github.com/RMerl/asuswrt-merlin.382,0d61b4f16f3bb361a02f2d293946a2e29f85d510.siva,273037,"1C Enterprise,AGS Script,ASN.1,ASP,ActionScrip...","28362,21859,170224,4804451,102191,1359726,1751...","688,913,6000,121740,2961,34422,475,27627,14717...","1,7,43,300,8,187,2,105,139,159,166385,254,3812...",491,21,0,"0,0,0,0,339,2921,0,114172,806,0,5642000,6300,3...","0,0,0,0,2303,14150,0,547404,7969,0,29702973,38...","0,0,0,0,312,7739,0,260486,1887,0,6793233,19147...","AGPL-1.0:0.659,GPL-2.0-only:0.411,GPL-2.0-or-l...",2992403756
