In [2]:
%run talktools.py

# Importing modules

import pandas as pd
import numpy as np
import time

### Performance of Pandas by Example

#### *Numpy arrays:*

- efficient implementation of array operations

- point directly to data in memory


![alt tag](array_vs_list.png)
https://jakevdp.github.io/blog/2014/05/09/why-python-is-slow/

#### *Why Pandas?*


When working with data, we need to deal with

* labeled arrays
* mixed data types
* missing values

---
*Pandas DataFrames have column and row labels:*

In [3]:
df = pd.DataFrame(np.ones((3,3)), index = ['a','b','c'],columns = ['A','B','C'])

In [4]:
df

Unnamed: 0,A,B,C
a,1,1,1
b,1,1,1
c,1,1,1


---
*Pandas DataFrames can have mixed types across rows ... and across columns!*

In [5]:
df['C'] = ['s','s','s']
df

Unnamed: 0,A,B,C
a,1,1,s
b,1,1,s
c,1,1,s


In [6]:
df.loc['a','A'] = 'k'
df

Unnamed: 0,A,B,C
a,k,1,s
b,1,1,s
c,1,1,s


In [7]:
type(df.loc['a','A'])

str

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, a to c
Data columns (total 3 columns):
A    3 non-null object
B    3 non-null float64
C    3 non-null object
dtypes: float64(1), object(2)
memory usage: 96.0+ bytes


---
*Pandas DataFrames are smart about dealing with missing values:*

In [9]:
df.loc['a','B'] = np.nan
df.loc['c','B'] = None
df.loc['a','C'] = np.nan

In [9]:
df

Unnamed: 0,A,B,C
a,k,,
b,1,1.0,s
c,1,,s


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, a to c
Data columns (total 3 columns):
A    3 non-null object
B    1 non-null float64
C    2 non-null object
dtypes: float64(1), object(2)
memory usage: 96.0+ bytes


---
*Pandas DataFrames are just easy to work with:*

In [11]:
# Reading a csv file
df = pd.read_csv('file.csv')

In [12]:
# Reading a csv file without pandas
import csv

# The context manager closes the file even if parsing fails part-way through.
with open('file.csv') as f:
    csv_f = csv.reader(f)
    # Every row comes back as a list of strings; no type conversion happens.
    rows = list(csv_f)

# convert to numpy array only if we know all values are numeric
# (the fields are still strings at this point, so the array is not numeric yet)
df = np.array(rows)

---
### Performance evaluation

In [13]:
# `__version__` is the attribute that exists on both old and new pandas releases.
pd.__version__  # my pandas version

'0.16.2'

In [14]:
np.__version__  # my numpy version

'1.9.2'

#### Evaluating elements:

In [15]:
N = 10000

In [17]:
# creating a numpy array
ones = np.ones((1,N))
ones

array([[ 1.,  1.,  1., ...,  1.,  1.,  1.]])

In [19]:
# creating a DataFrame from this array
df_ones = pd.DataFrame(ones)
df_ones.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [21]:
# creating an index of 100 random values
indx = np.random.choice(df_ones.columns,100, replace=False)
indx

array([9288, 1311,  544, 3405, 2580, 7593, 4914, 1302, 8563,  558, 3243,
       1097, 5721, 5510, 4794, 5394, 5948, 4197, 2306, 5304, 3217, 6879,
       9427, 2089, 9246, 3204, 9568, 7295, 7020, 8593, 3780, 2123, 4492,
       6113, 9563,  322, 5612, 4261, 4959, 4574, 1806, 7328, 6191, 2830,
       2827, 4818, 2404, 9182, 4923, 8343, 7063, 4173, 4201, 8373, 6609,
       1582, 9597, 1208, 2199, 8597,  313, 1117, 9578, 2457, 7358, 9484,
       8251, 5797, 8885,  164, 4149, 6974, 2995,   12, 3358, 4506, 6259,
       1019, 2240, 6117, 8664, 6542, 5635, 9264,  598, 6371,  488, 7038,
       2915, 4611, 6437, 4936, 7411, 6747, 2609,  300, 4840, 5509, 8847,
       9844])

In [22]:
# subsetting those values from the numpy array
%timeit ones[:,indx]

The slowest run took 2524.77 times longer than the fastest. This could mean that an intermediate result is being cached 
100000 loops, best of 3: 2.56 µs per loop


In [24]:
# subsetting those values from the DataFrame using the column names
%timeit df_ones[indx]

1000 loops, best of 3: 361 µs per loop


In [26]:
df_ones.values

array([[ 1.,  1.,  1., ...,  1.,  1.,  1.]])

In [27]:
# subsetting those values from the DataFrame using the numerical indices
%timeit df_ones.iloc[:,indx]

1000 loops, best of 3: 414 µs per loop


Evaluating a Numpy array is faster than evaluating a Pandas DataFrame!

In [28]:
# converting to a vertical format
df_ones = (df_ones).transpose()

In [29]:
# subsetting the rows using the numerical index
%timeit df_ones.iloc[indx,:]

1000 loops, best of 3: 335 µs per loop


*Subsetting rows and subsetting columns take about the same time here (335 µs vs. 361–414 µs), with row selection slightly ahead in this run.*

---
Subsetting bigger multidimensional arrays

In [31]:
print('Subsetting across numeric values:')
# creating a NxN dataframe of numeric values
df_ones = pd.DataFrame(np.ones((N,N)))
        
%timeit df_ones.iloc[indx,:100]

Subsetting across numeric values:
100 loops, best of 3: 4.38 ms per loop


In [32]:
print('Subsetting across types:')
# setting every other column to a column of strings
a = ['a' for j in range(N)]
for i in np.arange(0,100,2):
    df_ones[i] = a
df_ones.head()
    
%timeit df_ones.iloc[indx,:100]
df_ones.head()

Subsetting across types:
100 loops, best of 3: 14.5 ms per loop


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,a,1,a,1,a,1,a,1,a,1,...,1,1,1,1,1,1,1,1,1,1
1,a,1,a,1,a,1,a,1,a,1,...,1,1,1,1,1,1,1,1,1,1
2,a,1,a,1,a,1,a,1,a,1,...,1,1,1,1,1,1,1,1,1,1
3,a,1,a,1,a,1,a,1,a,1,...,1,1,1,1,1,1,1,1,1,1
4,a,1,a,1,a,1,a,1,a,1,...,1,1,1,1,1,1,1,1,1,1


In [33]:
print('Subsetting across strings:')
# setting every other column to a column of strings
for i in np.arange(0,100,1):
    df_ones[i] = a
df_ones.head()
    
%timeit df_ones.iloc[indx,:100]
df_ones.head()

Subsetting across strings:
100 loops, best of 3: 14.6 ms per loop


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,a,a,a,a,a,a,a,a,a,a,...,1,1,1,1,1,1,1,1,1,1
1,a,a,a,a,a,a,a,a,a,a,...,1,1,1,1,1,1,1,1,1,1
2,a,a,a,a,a,a,a,a,a,a,...,1,1,1,1,1,1,1,1,1,1
3,a,a,a,a,a,a,a,a,a,a,...,1,1,1,1,1,1,1,1,1,1
4,a,a,a,a,a,a,a,a,a,a,...,1,1,1,1,1,1,1,1,1,1


*Subsetting within numeric values is faster than subsetting within string values or within mixed-type values (in the timings above the latter two are nearly identical: 14.6 ms vs. 14.5 ms).*

![alt tag](Tratner_block_slide.png)
http://www.jeffreytratner.com/slides/pandas-under-the-hood-pydata-seattle-2015.pdf

---
#### Doing arithmetic

In [34]:
N = 10000
ones = np.ones((N,N))
df_ones = pd.DataFrame(ones)

In [37]:
%timeit -n 10 temp1 = ones*ones

10 loops, best of 3: 504 ms per loop


In [38]:
%timeit -n 10 temp2 = df_ones*df_ones

10 loops, best of 3: 403 ms per loop


Pandas DataFrames first need to align indices:

In [39]:
df1 = pd.DataFrame([[1,2,3],[4,5,6],[7,8,9]], index = ['a','b','c'],columns = ['A','B','C'])
df2 = pd.DataFrame([[1,2,3],[4,5,6],[7,8,9]], index = ['c','b','a'],columns = ['A','B','C'])

In [40]:
df1

Unnamed: 0,A,B,C
a,1,2,3
b,4,5,6
c,7,8,9


In [41]:
df2

Unnamed: 0,A,B,C
c,1,2,3
b,4,5,6
a,7,8,9


In [42]:
df1+df2

Unnamed: 0,A,B,C
a,8,10,12
b,8,10,12
c,8,10,12


---
#### Applying functions

In [43]:
N = 1000
df_ones = pd.DataFrame(np.ones((N,N)))

In [44]:
# built-in functions

In [45]:
%timeit df_ones.apply(sum,axis = 0)

10 loops, best of 3: 129 ms per loop


In [46]:
%timeit df_ones.apply(sum,axis = 1)

10 loops, best of 3: 119 ms per loop


In [81]:
# lambda functions

In [47]:
%timeit -n 10 df_ones.apply(lambda x:len(x),axis = 0)

10 loops, best of 3: 18.7 ms per loop


In [48]:
%timeit -n 10 df_ones.apply(lambda x:len(x),axis = 1)

10 loops, best of 3: 10.3 ms per loop


In [84]:
# user-defined functions

In [49]:
# creating a slow function to calculate the sum
def myFun(x):
    """Return the sum of the elements of ``x`` using an explicit Python loop.

    Intentionally slow: elements are fetched one at a time with ``x[k]`` and
    accumulated in pure Python, to serve as a baseline for ``apply`` timings.

    Parameters
    ----------
    x : sequence supporting ``len(x)`` and ``x[k]`` for ``k`` in ``0..len(x)-1``

    Returns
    -------
    The accumulated total; ``0`` when ``x`` is empty.
    """
    total = 0
    count = len(x)
    k = 0
    # Index-based access is kept on purpose; it is what makes this slow.
    while k < count:
        total = total + x[k]
        k += 1
    return total

In [50]:
%timeit -n 1 df_ones.apply(myFun,axis = 0)

1 loops, best of 3: 6.99 s per loop


In [51]:
%timeit -n 1 df_ones.apply(myFun,axis = 1)

1 loops, best of 3: 6.94 s per loop


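In these timings, applying a function row-wise (axis = 1) was faster than applying it column-wise (axis = 0) in all three cases, though mostly by a small margin.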

---
#### Attaching rows/columns

Appending rows:

In [55]:
N = 1000
df_ones = pd.DataFrame(np.ones((1,N)))
df_ones.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [56]:
row = df_ones.iloc[0,:]

In [57]:
%%time
# Grow the frame by one row per iteration. The result of each `append` call
# is rebound to `df_ones`, so the frame being appended to gets longer on
# every pass of the loop.
# NOTE(review): `DataFrame.append` was deprecated in pandas 1.4 and removed in
# 2.0, so this cell needs an older pandas such as the version reported earlier
# in this notebook.
print('Appending rows:')
for i in range(N):
    df_ones = df_ones.append(row)

Appending rows:
CPU times: user 4.54 s, sys: 1.88 s, total: 6.42 s
Wall time: 6.42 s


In [58]:
%%time
print('Preallocating the DataFrame:')
start_time = time.time()
# Preallocating the data frame
df_ones = pd.DataFrame(np.nan, index=range(N), columns=df_ones.columns)

for i in range(N):
    df_ones.iloc[i,] = row

Preallocating the DataFrame:
CPU times: user 204 ms, sys: 2.53 ms, total: 207 ms
Wall time: 206 ms


Appending columns:

In [59]:
df_ones = pd.DataFrame(np.ones((N,1)))

In [60]:
column = df_ones.iloc[:,0]

In [61]:
%%time
print('Appending columns:')
for i in range(N):
    df_ones[i] = column

Appending columns:
CPU times: user 256 ms, sys: 29.9 ms, total: 286 ms
Wall time: 288 ms


In [62]:
df_ones = pd.DataFrame(np.ones((N,1)))

In [63]:
%%time
print('Preallocating the DataFrame:')
# Preallocating the data frame
df_ones = pd.DataFrame(np.nan, index = df_ones.index, columns=range(N))

for i in range(N):
    df_ones.iloc[:,i] = column

Preallocating the DataFrame:
CPU times: user 184 ms, sys: 883 µs, total: 185 ms
Wall time: 184 ms


*Appending columns is much faster than appending rows!*

Can we do even better?

In [None]:
df_ones = pd.DataFrame(np.ones((N,1)))

In [66]:
%%time
l = []
for i in range(N):
    l.append(row)
temp = pd.concat(l,axis = 0)

CPU times: user 36.6 ms, sys: 12.9 ms, total: 49.5 ms
Wall time: 42.2 ms


*Appending data frames is slow!!!*
- it requires copying the whole dataframe.
- it requires aligning all the labels.
- it requires checking if all types match.

Alternatives:
* preallocating the DataFrames
* concatenating many small DataFrames

More Details:

[Jake VanderPlas'  Blog Post](https://jakevdp.github.io/blog/2014/05/09/why-python-is-slow/)

[Jeffrey Tratner's talk at PyData-Seattle 2015](https://www.youtube.com/watch?v=DpyhdO4aM04)

[Jeff Reback's talk at PyData-London 2015](https://www.youtube.com/watch?v=xUBoPK6FGIU)