In [1]:
%run talktools.py

# Importing modules

import pandas as pd
import numpy as np
import time

### Performance of Pandas by Example

#### *Numpy arrays:*

- efficient implementation of array operations

- point directly to data in memory


![alt tag](array_vs_list.png)
https://jakevdp.github.io/blog/2014/05/09/why-python-is-slow/

#### *Why Pandas?*


When working with data, we need to deal with

* labeled arrays
* mixed data types
* missing values

---
*Pandas DataFrames have column and row labels:*

In [2]:
# 3x3 frame of ones; rows are labelled a-c and columns A-C
df = pd.DataFrame(
    data=np.ones((3, 3)),
    index=list('abc'),
    columns=list('ABC'),
)

In [3]:
df

Unnamed: 0,A,B,C
a,1,1,1
b,1,1,1
c,1,1,1


---
*Pandas DataFrames can have mixed types across rows ... and across columns!*

In [4]:
df['C'] = ['s','s','s']
df

Unnamed: 0,A,B,C
a,1,1,s
b,1,1,s
c,1,1,s


In [5]:
# Column A starts out as float64. Cast it to object first so that it can hold
# a string next to the numbers: recent pandas versions warn about (or reject)
# assigning a value whose type does not fit the column's dtype.
df['A'] = df['A'].astype(object)
df.loc['a','A'] = 'k'
df

Unnamed: 0,A,B,C
a,k,1,s
b,1,1,s
c,1,1,s


In [6]:
type(df.loc['a','A'])

str

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, a to c
Data columns (total 3 columns):
A    3 non-null object
B    3 non-null float64
C    3 non-null object
dtypes: float64(1), object(2)
memory usage: 96.0+ bytes


---
*Pandas DataFrames are smart about dealing with missing values:*

In [8]:
# NaN assigned into the float column B is stored as a missing value
df.loc['a','B'] = np.nan
# None assigned into the float column B also ends up as a missing value
# (df.info() afterwards reports B as float64 with 1 non-null entry)
df.loc['c','B'] = None
# NaN can be stored in the object column C as well
df.loc['a','C'] = np.nan

In [9]:
df

Unnamed: 0,A,B,C
a,k,,
b,1,1.0,s
c,1,,s


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, a to c
Data columns (total 3 columns):
A    3 non-null object
B    1 non-null float64
C    2 non-null object
dtypes: float64(1), object(2)
memory usage: 96.0+ bytes


---
*Pandas DataFrames are just easy to work with:*

In [14]:
# Reading a csv file
df = pd.read_csv('file.csv')

In [15]:
# Reading a csv file without pandas
import csv

# The context manager closes the file even if parsing raises. newline='' is
# what the csv module asks for so that quoted fields containing line breaks
# are parsed correctly.
with open('file.csv', newline='') as f:
    rows = list(csv.reader(f))

# convert to numpy array only if we know all values are numeric
df = np.array(rows)

---
### Performance evaluation

In [16]:
pd.__version__  # my pandas version

'0.16.2'

In [17]:
np.__version__  # my numpy version

'1.9.2'

#### Evaluating elements:

In [18]:
N = 10000

In [19]:
# creating a numpy array
ones = np.ones((1,N))

In [20]:
# creating a DataFrame from this array
df_ones = pd.DataFrame(ones)

In [21]:
# creating an index of 100 random column labels; a dedicated, seeded generator
# keeps the selection identical across kernel restarts
rng = np.random.RandomState(0)
indx = rng.choice(df_ones.columns, 100, replace=False)

In [22]:
# subsetting those value from the numpy array
%timeit ones[:,indx]

The slowest run took 14.92 times longer than the fastest. This could mean that an intermediate result is being cached 
100000 loops, best of 3: 2.54 µs per loop


In [23]:
# subsetting those values from the DataFrame using the column names
%timeit df_ones[indx]

The slowest run took 5.07 times longer than the fastest. This could mean that an intermediate result is being cached 
1000 loops, best of 3: 356 µs per loop


In [24]:
# subsetting those values from the DataFrame using the numerical indices
%timeit df_ones.iloc[:,indx]

1000 loops, best of 3: 397 µs per loop


Evaluating a Numpy array is faster than evaluating a Pandas DataFrame!

In [25]:
# converting to a vertical format
df_ones = (df_ones).transpose()

In [26]:
# subsetting the columns using the numerical index
%timeit df_ones.iloc[indx,:]

1000 loops, best of 3: 334 µs per loop


*In the run above, subsetting rows (334 µs) was a bit faster than subsetting columns (397 µs), though the two timings are close.*

---
Subsetting bigger multidimensional arrays

In [None]:
print('Subsetting across numeric values:')
# creating a NxN dataframe of numeric values
df_ones = pd.DataFrame(np.ones((N,N)))
        
%timeit df_ones.iloc[indx,:100]

Subsetting across numeric values:
100 loops, best of 3: 5.92 ms per loop


In [None]:
print('Subsetting across types:')
# setting every other column to a column of strings
a = ['a' for j in range(N)]
for i in np.arange(0,100,2):
    df_ones[i] = a
df_ones.head()
    
%timeit df_ones.iloc[indx,:100]
df_ones.head()

In [None]:
print('Subsetting across strings:')
# setting every other column to a column of strings
for i in np.arange(0,100,1):
    df_ones[i] = a
df_ones.head()
    
%timeit df_ones.iloc[indx,:100]
df_ones.head()

*Subsetting within numeric values is faster than subsetting within string values, which is faster than subsetting within mixed-type values.*

![alt tag](Tratner_block_slide.png)
http://www.jeffreytratner.com/slides/pandas-under-the-hood-pydata-seattle-2015.pdf

---
#### Doing arithmetic

In [None]:
N = 10000
ones = np.ones((N,N))
df_ones = pd.DataFrame(ones)

In [None]:
%timeit -n 10 temp1 = ones*ones

In [None]:
%timeit -n 10 temp2 = df_ones*df_ones

Pandas DataFrames first need to align their indices:

In [None]:
# Two frames with identical values and column labels; df2 carries its row
# labels in reverse order (c, b, a) compared with df1 (a, b, c).
df1 = pd.DataFrame(
    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
    index=list('abc'),
    columns=list('ABC'),
)
df2 = pd.DataFrame(
    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
    index=list('cba'),
    columns=list('ABC'),
)

In [None]:
df1

In [None]:
df2

In [None]:
df1+df2

---
#### Applying functions

In [None]:
# built-in functions

In [None]:
%timeit df.apply(np.sum,axis = 0)

In [None]:
%timeit df.apply(np.sum,axis = 1)

In [None]:
# lambda functions

In [None]:
%timeit -n 10 df.apply(lambda x:len(x),axis = 0)

In [None]:
%timeit -n 10 df.apply(lambda x:len(x),axis = 1)

In [None]:
# user-defined functions

In [None]:
# creating a slow function to calculate the sum
def myFun(x):
    """Sum the elements of ``x`` with an explicit (deliberately slow) Python loop.

    Parameters
    ----------
    x : sequence, numpy array or pandas Series
        Values to add up; must support ``len``.

    Returns
    -------
    The running total, starting from the integer ``0``, so an empty input
    returns ``0``.
    """
    # A Series is read by position via .iloc: plain x[i] on a Series looks i
    # up as a label, which fails or picks the wrong element when the labels
    # are not exactly 0..n-1.
    values = x.iloc if hasattr(x, 'iloc') else x
    s = 0
    for i in range(len(x)):
        s = s + values[i]
    return s

In [None]:
%timeit -n 1 df.apply(myFun,axis = 0)

In [None]:
%timeit -n 1 df.apply(myFun,axis = 1)

It is faster to apply a function to columns than to rows.

---
#### Attaching rows/columns

Appending rows:

In [None]:
N = 10000
df_ones = pd.DataFrame(np.ones((1,N)))
df_ones.head()

In [None]:
row = df_ones.iloc[0,:]

In [None]:
%%time
print('Appending rows:')
# DataFrame.append is deprecated and no longer available in current pandas;
# pd.concat with a one-row frame is the equivalent call. Every iteration still
# copies the whole frame, which is the cost this cell is meant to show.
row_frame = row.to_frame().T
for i in range(N):
    df_ones = pd.concat([df_ones, row_frame])

In [None]:
%%time
print('Preallocating the DataFrame:')
# Preallocating the data frame (the %%time magic at the top of the cell does
# the timing, so no manual timestamp is needed)
df_ones = pd.DataFrame(np.nan, index=range(N), columns=df_ones.columns)

# fill the preallocated frame one row at a time
for i in range(N):
    df_ones.iloc[i, :] = row

Appending columns:

In [None]:
df_ones = pd.DataFrame(np.ones((N,1)))

In [None]:
column = df_ones.iloc[:,0]

In [None]:
%%time
print('Appending columns:')
for i in range(N):
    df_ones[i] = column

In [None]:
df_ones = pd.DataFrame(np.ones((N,1)))

In [None]:
%%time
print('Preallocating the DataFrame:')
# Preallocating the data frame
df_ones = pd.DataFrame(np.nan, index = df_ones.index, columns=range(N))

for i in range(N):
    df_ones.iloc[:,i] = column

*Appending columns is much faster than appending rows!*

Can we do even better?

In [None]:
df_ones = pd.DataFrame(np.ones((N,1)))

In [None]:
# store rows in a list and concatenate in the end
%%time
l = []
for i in range(N):
    l.append(row)
temp = pd.concat(l,axis = 0)

*Appending data frames is slow!!!*
- it requires copying the whole dataframe.
- it requires aligning all the labels.
- it requires checking if all types match.

Alternatives:
* preallocating the DataFrames
* concatenating many small DataFrames

More Details:

[Jake VanderPlas'  Blog Post](https://jakevdp.github.io/blog/2014/05/09/why-python-is-slow/)

[Jeffrey Tratner's talk at PyData-Seattle 2015](https://www.youtube.com/watch?v=DpyhdO4aM04)

[Jeff Reback's talk at PyData-London 2015](https://www.youtube.com/watch?v=xUBoPK6FGIU)