# Ranking python packages

In [1]:
import requests
import logging
import pandas as pd
import urllib
import dill
import time

In [2]:
import sys
sys.path.append('../code')

import utils as utils
import get_stack_overflow_data as gso

# Create package list

## 1
https://github.com/rasbt/pattern_classification/blob/master/resources/python_data_libraries.md

In [3]:
# Load the first package list from disk. The first line of the file is kept
# as the source URL; every remaining line is treated as one package name,
# stripped of surrounding whitespace and lower-cased.
with open("../data/python_pkgs1.txt", 'rb') as f:
    lines = f.readlines()
    src_url1 = lines[0].strip()
    pkgs1 = [p.strip().lower() for p in lines[1:]]

In [4]:
src_url1

'https://github.com/rasbt/pattern_classification/blob/master/resources/python_data_libraries.md'

In [5]:
print pkgs1

['ipython-notebook', 'numpy', 'pandas', 'scipy', 'sympy', 'statsmodels', 'scikit-learn', 'shogun', 'pybrain', 'pylearn2', 'pymc', 'bokeh', 'd3py', 'ggplot', 'matplotlib', 'plotly', 'prettyplotlib', 'seaborn', 'csvkit', 'pytables', 'sqlite3']


## 2
https://www.upwork.com/hiring/data/15-python-libraries-data-science/

In [6]:
from bs4 import BeautifulSoup

In [7]:
URL = 'https://www.upwork.com/hiring/data/15-python-libraries-data-science/'

def scrape_package_list(url):
    """Scrape package names from the ordered lists of a web page.

    Fetches `url` and collects the text of every <strong> tag found inside
    any <ol> element, lower-cased, in document order.

    Args:
        url: address of the page to scrape.

    Returns:
        A flat list of lower-cased strings, one per <strong> tag found.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    r = requests.get(url)
    # Fail loudly on an error response instead of scraping an error page and
    # silently returning an empty or meaningless list.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'lxml')
    # One flat comprehension replaces building a list of lists and then
    # flattening it with the quadratic sum(list_of_lists, []).
    return [str(item.text.lower())
            for ol in soup.find_all('ol')
            for item in ol.find_all('strong')]

In [8]:
pkgs2 = scrape_package_list(URL)

In [9]:
print pkgs2

['numpy', 'scipy', 'pandas', 'ipython', 'matplotlib', 'scikit-learn', 'theano', 'tensorflow', 'scrapy', 'nltk', 'pattern', 'seaborn', 'bokeh', 'basemap', 'networkx']


## 3
http://www.datasciencecentral.com/profiles/blogs/9-python-analytics-libraries-1

In [10]:
# only one extra
pkgs3 = ['mlpy']

## 4 FIX!
Manually add a few more packages (note: ggplot is now ggpy) and rewrite any two-word names as dash-separated.

`plotly.py` is included because it is the name used for the GitHub search.

In [11]:
pkgs4 = ['ggpy', 'plotly.py', 'ipython', 'jupyter', 'jupyter-notebook', 'ipython-notebook']

## Combine list

In [12]:
print map(len, [pkgs1, pkgs2, pkgs3, pkgs4])

[21, 15, 1, 6]


In [13]:
# Merge the four source lists into one list of unique package names.
# Lower-case *before* de-duplicating so that names differing only in case
# collapse into a single entry.
pkgs = list({p.lower() for p in pkgs1 + pkgs2 + pkgs3 + pkgs4})
pkgs.remove('sqlite3')  # base python
print len(pkgs)

33


In [14]:
pkgs

['plotly.py',
 'networkx',
 'jupyter-notebook',
 'seaborn',
 'pattern',
 'scrapy',
 'ggpy',
 'bokeh',
 'sympy',
 'numpy',
 'pandas',
 'pybrain',
 'shogun',
 'pylearn2',
 'matplotlib',
 'ggplot',
 'csvkit',
 'scipy',
 'tensorflow',
 'ipython',
 'prettyplotlib',
 'jupyter',
 'pymc',
 'pytables',
 'scikit-learn',
 'mlpy',
 'plotly',
 'statsmodels',
 'theano',
 'd3py',
 'ipython-notebook',
 'basemap',
 'nltk']

In [15]:
# Persist the final package list as a single space-separated line.
with open("../data/python_pkgs_final.txt", "w") as f:
    # write() is the right call for one string; writelines() would iterate
    # the string and emit it one character at a time.
    f.write(" ".join(pkgs))

# Now, get metrics and save to csv

In [16]:
logging.basicConfig(level=logging.INFO)

# Get Github stats

In [17]:
# Read the GitHub API token from a file kept out of version control.
with open("../code/secrets/github-token.nogit", "rb") as f:
    # Strip surrounding whitespace: a trailing newline in the file would
    # otherwise end up inside the Authorization header value.
    token = f.read().strip()

In [18]:
headers = {'Authorization': 'token %s' % token}

In [19]:
def get_data_from_search(query):
    """Use github search to return stars, forks for top query result.

    Sends `query` to the GitHub repository search endpoint, using the
    module-level `headers` for authorization, and summarises the first hit.

    Args:
        query: search text, sent as the `q` parameter.

    Returns:
        A dict with keys 'package' (the query itself), 'full_name', 'stars'
        and 'forks' taken from the first search result, or None when the
        response contains no usable first result.

    Raises:
        requests.HTTPError: if GitHub answers with a 4xx/5xx status.
    """
    # Pass the query through `params` so requests URL-encodes it; building
    # the URL by string concatenation breaks on characters such as '&' or '#'.
    r = requests.get('https://api.github.com/search/repositories',
                     params={'q': query},
                     headers=headers)
    r.raise_for_status()
    try:
        res = r.json()['items'][0]
        return {'package': query, 'full_name': res['full_name'],
                'stars': res['stargazers_count'], 'forks': res['forks_count']}
    except (ValueError, KeyError, IndexError, TypeError):
        # Body is not JSON, has no results, or lacks the expected fields.
        # Still best-effort, but no longer swallows KeyboardInterrupt or
        # unrelated programming errors the way a bare `except:` did.
        return None

In [20]:
res = get_data_from_search('jupyter notebook')
res

{'forks': 559,
 'full_name': u'jupyter/notebook',
 'package': 'jupyter notebook',
 'stars': 1877}

In [21]:
# If this cell fails with "403 Client Error: Forbidden for url:",
# wait 60 seconds and then run the cell again.

# The inner generator calls get_data_from_search once per package; the outer
# comprehension only filters out the None results, so no query is repeated.
# First batch: the first 28 packages.
data1 = [res for res in (get_data_from_search(q) for q in pkgs[:28])
        if res is not None]

time.sleep(61)      # pause between batches -- API limit with token: 30 api calls/min

# Second batch: every package after the first 28.
data2 = [res for res in (get_data_from_search(q) for q in pkgs[28:])
        if res is not None]

print "DONE"

DONE


In [22]:
# use plotly.py result
github = pd.DataFrame(data1 + data2)[['package', 'full_name', 'forks', 'stars']]
github

Unnamed: 0,package,full_name,forks,stars
0,plotly.py,plotly/plotly.py,441,1822
1,networkx,networkx/networkx,819,2662
2,jupyter-notebook,jupyter/notebook,559,1877
3,seaborn,mwaskom/seaborn,542,3656
4,pattern,clips/pattern,950,5109
5,scrapy,scrapy/scrapy,4954,18025
6,ggpy,yhat/ggpy,399,2852
7,bokeh,bokeh/bokeh,1260,5328
8,sympy,sympy/sympy,1787,3546
9,numpy,numpy/numpy,1864,3823


In [23]:
github.to_csv("../data/python_results_github.csv", index=False)

# Get Stack Overflow

In [24]:
# check
# returns ipython-notebook
gso.get_tag_counts(['jupyter-notebook'])

[{u'count': 1917,
  u'has_synonyms': False,
  u'is_moderator_only': False,
  u'is_required': False,
  u'name': u'ipython-notebook'}]

In [25]:
tag_counts = gso.get_tag_counts(pkgs)

In [26]:
df_tags = pd.DataFrame(tag_counts)[['name', 'count']]
df_tags.columns = ['package', 'so_tag_counts']
df_tags.head()

Unnamed: 0,package,so_tag_counts
0,pandas,35647
1,numpy,31915
2,design-patterns,23043
3,matplotlib,21080
4,ggplot2,15142


In [27]:
body_counts = {pkg: gso.get_body_count(pkg) for pkg in pkgs}
print "DONE"

DONE


In [28]:
df_questions = pd.DataFrame.from_dict(body_counts, orient='index')
df_questions.reset_index(inplace=True)
df_questions.columns = ['package', 'so_question_count']

In [29]:
so = df_tags.merge(df_questions, on='package', how='outer')
so.head()

Unnamed: 0,package,so_tag_counts,so_question_count
0,pandas,35647.0,41472.0
1,numpy,31915.0,55381.0
2,design-patterns,23043.0,
3,matplotlib,21080.0,24728.0
4,ggplot2,15142.0,


In [30]:
so.to_csv("../data/python_results_so.csv", index=False)

# Get download counts

**NOTE**: this step reads a log file (`../data/python_pypi_counts.log`) that was created separately, outside this notebook.

In [31]:
def extract_count(line):
    """Parse one "<prefix>:<package> - <count>" line.

    Returns a (package, count) tuple, where package is the text after the
    last ':' of the left-hand part and count is the right-hand part as an
    int with commas removed. Returns None when the line does not split into
    exactly two parts on " - ".
    """
    fields = line.split(" - ")
    if len(fields) != 2:
        return None
    label, raw_count = fields
    # The package name is whatever follows the last colon (or the whole
    # label when there is no colon).
    package = label.rsplit(":", 1)[-1]
    # Drop thousands separators before converting.
    count = int(raw_count.replace(",", ""))
    return package, count

def get_pypi_counts(path="../data/python_pypi_counts.log"):
    """Read a download-count log and return the parsed entries.

    Blank lines are skipped. Each remaining line is stripped and passed to
    extract_count; lines it cannot parse (it returns None) are dropped.

    Args:
        path: log file to read. Defaults to the location originally
            hard-coded here, so existing zero-argument calls are unchanged.

    Returns:
        A list of the non-None results of extract_count, in file order.
    """
    with open(path, "r") as f:
        stripped = (line.strip() for line in f)
        # Parse each line exactly once (the previous version called
        # extract_count twice per line: once to filter, once for the value).
        parsed = [extract_count(line) for line in stripped if line]
    return [entry for entry in parsed if entry is not None]

counts = get_pypi_counts()

pypi = pd.DataFrame(counts, columns=['package', 'downloads'])
pypi.head()

Unnamed: 0,package,downloads
0,jupyter,398702
1,notebook,791071
2,networkx,3838384
3,seaborn,414400
4,Pattern,302116


In [32]:
pypi.drop_duplicates(inplace=True)

In [33]:
pypi.to_csv("../data/python_results_pypi.csv", index=False)

# Combine all data

In [34]:
# do this in /code/python-analysis.R

In [35]:
df1 = github.merge(so, on='package', how='outer')
df = df1.merge(pypi, on='package', how='outer')
df.head()

Unnamed: 0,package,full_name,forks,stars,so_tag_counts,so_question_count,downloads
0,plotly.py,plotly/plotly.py,441.0,1822.0,,22.0,
1,networkx,networkx/networkx,819.0,2662.0,1447.0,1899.0,3838384.0
2,jupyter-notebook,jupyter/notebook,559.0,1877.0,,2858.0,
3,seaborn,mwaskom/seaborn,542.0,3656.0,1021.0,1304.0,414400.0
4,pattern,clips/pattern,950.0,5109.0,,204535.0,


In [36]:
def extract_count(line):
    """Split a "<prefix>:<package> - <count>" line into (package, count).

    The line must contain the separator " - " exactly once; otherwise None
    is returned. The package is the text after the last ':' on the left of
    the separator, and the count is the right side converted to int after
    removing commas.
    """
    separator = " - "
    # Exactly one separator is the same as splitting into exactly two parts.
    if line.count(separator) != 1:
        return None
    prefix, _, number = line.partition(separator)
    return prefix.split(":")[-1], int(number.replace(",", ""))

def get_pypi_counts(path="results.txt"):
    """Read a download-count results file and return the parsed entries.

    Blank lines are skipped. Each remaining line is stripped and passed to
    extract_count; lines it cannot parse (it returns None) are dropped.

    Args:
        path: file to read. Defaults to the name originally hard-coded here,
            so existing zero-argument calls are unchanged.

    Returns:
        A list of the non-None results of extract_count, in file order.
    """
    with open(path, "r") as f:
        stripped = (line.strip() for line in f)
        # Parse each line exactly once (the previous version called
        # extract_count twice per line: once to filter, once for the value).
        parsed = [extract_count(line) for line in stripped if line]
    return [entry for entry in parsed if entry is not None]

counts = get_pypi_counts()

pypi = pd.DataFrame(counts, columns=['package', 'downloads'])
pypi.head()

Unnamed: 0,package,downloads
0,ipython,9219641
1,jupyter,398702
2,Theano,608210
3,tensorflow,35501


In [37]:
# sxt