In [1]:
from traitlets.config.manager import BaseJSONConfigManager
path = '/Users/jmk/anaconda2/envs/data601/etc/jupyter/nbconfig'
cm = BaseJSONConfigManager(config_dir=path)
cm.update('livereveal', {
              'theme': 'night',
              'scroll': True,
              #'transition': 'zoom',
              'start_slideshow_at': 'selected',
})

# theme names: biege, blood, default, moon, night, serif, simple, sky, solarized

{'scroll': True, 'start_slideshow_at': 'selected', 'theme': 'night'}

# Numerical Python

NumPy, short for Numerical Python, is the fundamental package required
for high performance scientific computing and data analysis. It is the
foundation on which nearly all of the higher-level tools in this book
are built. Here are some of the things it provides:ndarray, a fast and

* space-efficient multidimensional array providing vectorized arithmetic
* operations and sophisticated broadcasting capabilities
* Standard mathematical functions for fast operations on entire
* arrays of data without having to write loops
* Tools for reading / writing array data to disk and working with memory-mapped files
* Linear algebra, random number generation, and Fourier transform capabilities
* Tools for integrating code written in C, C++, and Fortran


# Our Focus

An understanding of NumPy arrays and array-oriented computing will
help you use tools like pandas much more effectively


* Fast vectorized array operations for data mungingand cleaning,
* subsetting and filtering, transformation, and any other kinds of computations
* Common array algorithms like sorting, unique, and set operations
* Efficient descriptive statistics and aggregating/summarizing data
* Data alignment and relational data manipulations for merging and joining together heterogeneous data sets
* Expressing conditional logic as array expressions instead of loops with if-elif-else branches
* Group-wise data manipulations (aggregation, transformation, function application).


# ndarray Basics

* has a dtype
* has a shape



In [3]:
import numpy as np   #  This is so traditional it's in almost every tutorial _ever_

data = [4,5,12,13,1,7]
arr = np.array(data)
print("data's type is: ", arr.dtype)
print("data's shape is: ", arr.shape)
print(arr)

data's type is:  int64
data's shape is:  (6,)
[ 4  5 12 13  1  7]


In [4]:
arr = arr.reshape((2,3,))
print("data's type is: ", arr.dtype)
print("data's shape is: ", arr.shape)
print(arr)

data's type is:  int64
data's shape is:  (2, 3)
[[ 4  5 12]
 [13  1  7]]


# Making Arrays

There are some shortcuts...

In [5]:
import numpy as np 
np.zeros(10)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [6]:
np.ones(10)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [7]:
np.random.randint(1, 100, (10,))

array([37, 95, 39, 71,  2, 27, 22, 17, 51,  6])

In [9]:
#  Can also create an array with an explicit type
arr = np.array([1,2,3], dtype=np.float64)
print(arr, 'of type', arr.dtype)

#  And some shorthand for dtypes like ...
print(np.ones(12, dtype='f8').dtype)

# ... where the letter is the type and the number is how many bytes (e.g. u4 == uint32).  Why?

[1. 2. 3.] of type float64
float64


In [10]:
grid = np.array([1,2,3,4,5,6,7,8,9]).reshape((3,3,))
print(grid)

print(grid * 1.2)
print(grid * grid)
print(grid - grid)

[[1 2 3]
 [4 5 6]
 [7 8 9]]
[[ 1.2  2.4  3.6]
 [ 4.8  6.   7.2]
 [ 8.4  9.6 10.8]]
[[ 1  4  9]
 [16 25 36]
 [49 64 81]]
[[0 0 0]
 [0 0 0]
 [0 0 0]]


In [11]:
#  NumPy gives us a nice consisten way to get pieces of an array:  slicing
print(grid[1:3])
print(grid[0:1] * 2)

[[4 5 6]
 [7 8 9]]
[[2 4 6]]


In [12]:
print(grid)
print(grid[1:3,2:3])

[[1 2 3]
 [4 5 6]
 [7 8 9]]
[[6]
 [9]]


# Interlude:  Object-oriented Python

Object-oriented programming is a way to bundle together data structures (like dicts and arrays) that represent something specific along with the functions that work with them.

* This is convenient, but not necessarily more expressive than procedural programming.
* Python, as with many languages, uses the class as the basis of its OOP features.
* It's useful to know, since we'll get lots of objects out of libraries
* As a data scientist, you may not define your own classes often

In [13]:
import pandas as pd
from pandas import DataFrame, Series
import requests

class DataSets(object):
    def __init__(self, dataset_url_dict):
        self.dataset_urls = dataset_url_dict
        self.dataframes = {}
        for (label, url) in self.dataset_urls.items():
            self.dataframes[label] = pd.read_csv(url)
    def get(self, label):
        return self.dataframes[label]
    
#  We can initialize a single object that lets us fetch our datasets as dataframes.
ds = DataSets({'planets': 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/planets.csv'})

#  ... so now we don't have to remember how to load them, we can just load them.
planets = ds.get('planets')
print(ds.dataset_urls)

{'planets': 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/planets.csv'}


In [14]:
date = {}
date['year'] = 2017
date['month'] = 9
date['day'] = 27
date

{'day': 27, 'month': 9, 'year': 2017}

In [15]:
class DayOfYear(object):
    def __init__(self, month, day, year):
        self.month = month
        self.day = day
        self.year = year
    def __str__(self):
        return '%d/%d/%d' % (self.month, self.day, self.year)
    def __repr__(self):
        print('repr called')
        return str(self)

today = DayOfYear(9, 27, 2017)
today.year -= 1
print(repr(today))
print('%r' % today)

repr called
9/27/2016
repr called
9/27/2016


Unfortunately, this isn't ideal, since it loads all the datasets at once, not when we need them.  If our list gets big, it'll take a lot of memory.

In [16]:
class DataSetsOnDemand(object):
    def __init__(self, dataset_url_dict):
        self.dataset_urls = dataset_url_dict
        self.dataframes = {}
    def get(self, label):
        if label not in self.dataframes:
            self.dataframes[label] = pd.read_csv(self.dataset_urls[label])
        return self.dataframes[label]
    
#  We can initialize a single object that lets us fetch our datasets as dataframes.
ds = DataSetsOnDemand({'planets': 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/planets.csv'})

#  ... so now we don't have to remember how to load them, we can just load them.
planets = ds.get('planets')
planets.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


Note that we've been able to change _how_ we did it without users of the object being any the wiser.  This is called _encapsulation_ and it makes it much easier to separate _what_ things do from _how_ they do them.

#  Now, back to pandas!

In [18]:
#  Note that slices are not copies.  They are "views" onto the array, so we can modify them too.
grid[1:3,2:3] = [[42],[42]]   #  The value to put into the slice has to be the same shape as the slice
print(grid)

[[ 1  2  3]
 [ 4  5 42]
 [ 7  8 42]]


In [20]:
#  Or be something numpy can "broadcast" into that shape.
grid[1:3,2:3] = 86

print(grid)

[[ 1  2  3]
 [ 4  5 86]
 [ 7  8 86]]


In [21]:
#  Linspace is another shortcut for making an array.  In this case it gives us 9 evenly spaced samples between 0 and 1.
a = np.linspace(0, 1, 9)
print(a)

#  We can index backwards in python (from the right end of the array)
print(a[-4])

#  And we can slice backwards too
print(a[-4:])

[0.    0.125 0.25  0.375 0.5   0.625 0.75  0.875 1.   ]
0.625
[0.625 0.75  0.875 1.   ]


# Broadcasting

* The smaller array is “broadcast” across the larger array so that they have compatible shapes. 
* Provides a means of vectorizing array operations so that looping occurs in C instead of Python. 
* It does this without making needless copies of data and usually leads to efficient algorithm implementations. 

In [23]:
import numpy as np

#  Let's say we have an array of the fats, protein, and carbs in grams
macros = np.array([
  [0.3, 2.5, 3.5],
  [2.9, 27.5, 0],
  [0.4, 1.3, 23.9],
  [14.4, 6, 2.3]])

#  ... and we know that to compute calories, we scale that by 9 cal/g of fat, 4 cal/g of protein and carbs.
cal_per_macro = np.array([9, 4, 4])

print(macros * cal_per_macro)

[[  2.7  10.   14. ]
 [ 26.1 110.    0. ]
 [  3.6   5.2  95.6]
 [129.6  24.    9.2]]


##  So what happened here?
Broadcasting replicates the array along the mismatched dimension

In [24]:
# Use the 'tile' function to replicate cal_per_macro over the number
# of rows 'macros' has (rows is the first element of the shape tuple for
# a 2-D array).
cal_per_macro_stretch = np.tile(cal_per_macro, (macros.shape[0], 1))

print(cal_per_macro_stretch)


[[9 4 4]
 [9 4 4]
 [9 4 4]
 [9 4 4]]


In [25]:
macros * cal_per_macro_stretch == macros * cal_per_macro

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]])

## Heh.  Also broadcasting.  :)

We can use `np.array_equal(a1, a2)` to get a single True/False value for are the arrays (a) the same size and (b) have equal values for each element.

In [26]:
#  Also ...
print((macros * cal_per_macro_stretch == macros * cal_per_macro).all())

True


# WHAT?!

`.all()` is `True` iff all elements are `True`.  It's similar to `reduce(lambda x, y: x and y, a)`

`.any()` is like that except it performs a reduce by logical `OR`.

In [27]:
# Other useful operations on numpy arrays
import numpy as np

a = np.array([1,2,3,4])

#  Some basic stats...
print("max:", a.max())
print("min:", a.min())
print("mean:", a.mean())
print("std:", a.std())

#  We can also find the index of the min and max (the so-called argmin and argmax)
print("argmin:", a.argmin())
print("argmax:", a.argmax())

max: 4
min: 1
mean: 2.5
std: 1.118033988749895
argmin: 0
argmax: 3


In [28]:
#  Let's take a closer look at 2D arrays
grid = np.arange(36)
grid.resize((6,6))
grid

array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11],
       [12, 13, 14, 15, 16, 17],
       [18, 19, 20, 21, 22, 23],
       [24, 25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34, 35]])

In [29]:
grid[2,2]

14

In [30]:
grid[0, 3:6]

array([3, 4, 5])

In [31]:
# We can use the full start:stop:stride syntax at each level.  Here we get elements from [3,5], but only every other.
grid[0, 3:6:2]

array([3, 5])

In [32]:
# We can even use the stride argument for some odd effects
grid[0, ::-1]

array([5, 4, 3, 2, 1, 0])

In [33]:
#  NumPy supports conditional indexing too.  You can use the name of the array as a stand-in for each element.
grid[grid % 2 == 0]

array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32,
       34])

... which is equivalent to something kind of like `filter(lambda x: x % 2 == 0, grid)` but generalizes to higher dimensions.

In [34]:
grid[grid > 2]

array([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
       20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35])

In [35]:
#  Recall that when we make a boolean expression with a numpy array, we end up with an array of booleans
grid > 2

array([[False, False, False,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True]])

It turns out that this array-of-booleans output makes a _mask_ that we can use to index a NumPy array.

In [36]:
from datetime import datetime, timedelta

def days_ago(n=0):
    return (datetime.today() - timedelta(days=n)).date()

exercises = np.array(['pushups', 'pullups', 'pushups', 'pullups'])
data = np.array([[days_ago(0), 20],
                 [days_ago(0), 10],
                 [days_ago(1), 18],
                 [days_ago(1), 9]
               ])

data[exercises == 'pushups']

array([[datetime.date(2018, 4, 4), 20],
       [datetime.date(2018, 4, 3), 18]], dtype=object)

What happens if we don't use an `np.array()` for the names?

In [37]:
from datetime import datetime, timedelta

def days_ago(n=0):
    return (datetime.today() - timedelta(days=n)).date()

exercises = np.array(['pushups', 'pullups', 'pushups', 'pullups'])
data = np.array([[days_ago(0), 20],
                 [days_ago(0), 10],
                 [days_ago(1), 18],
                 [days_ago(1), 9]
               ])

#  This boolean expression isn't broadcast because both operands are not np.array types.  Why does that matter?
data[exercises == 'pushups']

array([[datetime.date(2018, 4, 4), 20],
       [datetime.date(2018, 4, 3), 18]], dtype=object)

## There's actually a ton more to numpy

* Axis swapping
* Universal Functions (ufuncs)
* Sorting
* Statistical functions
* Serialization (in a binary format that's much more efficient than JSON)
* Linear Algebra
* Uniqueness and set logic

Much of this we'll see through the lens of Pandas, the swiss army knife of data management in python.

# Pandas

"Python has long been great for data munging and preparation, but less so for data analysis and modeling. pandas helps fill this gap, enabling you to carry out your entire data analysis workflow in Python without having to switch to a more domain specific language like R."

- Quote from pandas.pydata.org


Pandas has several major data structures worth knowing:
* Series, which is like a labelled one-dimensional NumPy array
* DataFrame, which is like a `dict` of `Series`  
  * Conceptually this is like a spreadsheet or a database table
  * You can see where this would be useful.

In [38]:
import pandas as pd
from pandas import Series, DataFrame

s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])

In [39]:
#  ... which you can slice by index name
s['a':'c']

a   -0.151254
b    0.428503
c   -0.460047
dtype: float64

In [40]:
# ... and also stride by 2, etc.
s[::2]

a   -0.151254
c   -0.460047
e   -0.266020
dtype: float64

In [41]:
#  We can make a dataframe from a dictionary of Series
d = {'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
     'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}
df = pd.DataFrame(d)
df.head()

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [42]:
#  You can also get each column from a dataframe as a Series, 
#  indexing either like a named dict key or as if it was an attribute of an object.
df['one']  # or df.one

a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

In [43]:
# If we try to check for equality, it gets a little weird.  Recall that pandas objects LOVE to broadcast.
df.one == df['one']

a     True
b     True
c     True
d    False
Name: one, dtype: bool

In [44]:
#  Curiously, Nan != NaN apparently.
#
#  If we want to check to see if two dataframes or series are equal,
#  we can call .equals()
df['one'].equals(df.one)

True

In [45]:
#  We can add whole new series via broadcasting
df['three'] = 3.0
df.head()

Unnamed: 0,one,two,three
a,1.0,1.0,3.0
b,2.0,2.0,3.0
c,3.0,3.0,3.0
d,,4.0,3.0


In [46]:
#  Or explicitly, if we've got data of the right shape.
df.four = [4, 4, 4, 4]
df.head()

  


Unnamed: 0,one,two,three
a,1.0,1.0,3.0
b,2.0,2.0,3.0
c,3.0,3.0,3.0
d,,4.0,3.0


In [47]:
# Reindexing lets us reorder the existing rows...
df.reindex(['a', 'c', 'b', 'd'])

Unnamed: 0,one,two,three
a,1.0,1.0,3.0
c,3.0,3.0,3.0
b,2.0,2.0,3.0
d,,4.0,3.0


In [48]:
#  And when we reindex, if we _stretch_ the series, we can "Fill"
#  missing values based on either a forward fill (ffill) or 
#  backward fill (bfill)
obj3 = Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3.head()

0      blue
2    purple
4    yellow
dtype: object

In [49]:
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [50]:
#  We can also pick out specific columns
first_two = df[['one', 'two']]
first_two.head()

df.head()

Unnamed: 0,one,two,three
a,1.0,1.0,3.0
b,2.0,2.0,3.0
c,3.0,3.0,3.0
d,,4.0,3.0
