# Data Munging
## Lecture 2: pandas

# pandas
* A powerful data science tool that extends NumPy
* High-level (pandas takes care of details for you)
* Can handle time series and non-time series, as well as missing data
* Two key data structures:
    * Series
    * DataFrame

In [1]:
import numpy as np
import pandas as pd

# Series
* Basically a one-dimensional array
* Each element in a series has an index (or label)

In [2]:
ser = pd.Series([8, 3, 6, -6, 1])
ser  # left column shows the index of each element

0    8
1    3
2    6
3   -6
4    1
dtype: int64

In [3]:
# values and indices can be accessed separately
ser.values

array([ 8,  3,  6, -6,  1])

In [4]:
ser.index

RangeIndex(start=0, stop=5, step=1)

In [5]:
# you can assign your own indices
ser = pd.Series([8, 3, 6, -6, 1], index=['a', 'b', 'c', 'd', 'e'])
ser

a    8
b    3
c    6
d   -6
e    1
dtype: int64

In [6]:
# series are like dictionaries in that values can be accessed using the index
ser['c']

6

In [7]:
ser[['d', 'b']]

d   -6
b    3
dtype: int64

In [8]:
ser[ser < 0]

d   -6
dtype: int64

In [9]:
'a' in ser

True

In [10]:
6 in ser  # this kind of search is for index, not element values

False

In [11]:
# based on the similarity with dict,
# you can guess that a dict can easily be converted to a series
pt_age = {'John': 38, 'Nancy': 23, 'Kate': 50}
ser2 = pd.Series(pt_age)
ser2

John     38
Nancy    23
Kate     50
dtype: int64

In [12]:
# you can also give a separate index list 
ser3 = pd.Series(pt_age, index=['John', 'Kate', 'Nancy', 'Peter'])
ser3  # note that NaN (not a number) is assigned to the missing index

John     38.0
Kate     50.0
Nancy    23.0
Peter     NaN
dtype: float64

In [13]:
# NaN values can be checked as follows
ser3.isnull()

John     False
Kate     False
Nancy    False
Peter     True
dtype: bool

In [14]:
ser3.notnull()

John      True
Kate      True
Nancy     True
Peter    False
dtype: bool

In [15]:
# many operations on series implement automatic alignment based on index 
ser2 + ser3

John      76.0
Kate     100.0
Nancy     46.0
Peter      NaN
dtype: float64

In [16]:
# you can give names to the series object as well as index
ser3.name = 'patient age'
ser3.index.name = 'name'
ser3

name
John     38.0
Kate     50.0
Nancy    23.0
Peter     NaN
Name: patient age, dtype: float64

# DataFrame
* A two-dimensional table
* Columns can be of different data types
* R has a similar data structure too

In [17]:
# a dataframe can be built in many different ways
# a frequent one is to start from a dict that maps
# each column name to the list of values in that column
data = dict(
    name=['John', 'Kate', 'Nancy'],
    age=[38, 50, 23],
    gender=['M', 'F', 'F'],
)
pt_df = pd.DataFrame(data)
pt_df  # see how nice the output format is!

Unnamed: 0,name,age,gender
0,John,38,M
1,Kate,50,F
2,Nancy,23,F


In [18]:
# you can order the columns and set the index for each row
pt_df2 = pd.DataFrame(data, columns=['patient id', 'name', 'gender', 'age'], index=['one', 'two', 'three'])
pt_df2

Unnamed: 0,patient id,name,gender,age
one,,John,M,38
two,,Kate,F,50
three,,Nancy,F,23


In [19]:
# each column can be sliced out as a series
pt_df2['name']

one       John
two       Kate
three    Nancy
Name: name, dtype: object

In [20]:
# columns can be accessed as object attributes as well
pt_df2.age

one      38
two      50
three    23
Name: age, dtype: int64

In [21]:
# rows can be retrieved too
pt_df2.loc['two']

patient id     NaN
name          Kate
gender           F
age             50
Name: two, dtype: object

In [22]:
# data in a column can be modified
pt_df2['patient id'] = range(3)
pt_df2

Unnamed: 0,patient id,name,gender,age
one,0,John,M,38
two,1,Kate,F,50
three,2,Nancy,F,23


In [23]:
# you can delete a column
del pt_df2['name']
pt_df2

Unnamed: 0,patient id,gender,age
one,0,M,38
two,1,F,50
three,2,F,23


In [25]:
# you can also transpose a dataframe
pt_df2.T

Unnamed: 0,one,two,three
patient id,0,1,2
gender,M,F,F
age,38,50,23


In [None]:
# axes can be named
pt_df2.index.name = 'row id'
pt_df2.columns.name = 'variables'
pt_df2

In [None]:
# data contained in a dataframe can be retrieved
# as a 2d array
pt_df2.values

# Index Operations

In [None]:
# let's work with the dataframe pt_df
pt_df

In [None]:
# you can reindex as follows:
# forward filling is requested only while reindexing the rows; asking for it
# together with `columns` would also try to fill along the column labels,
# which are not sorted here, and pandas would raise a ValueError
(pt_df.reindex(index=range(5), method='ffill')  # interpolation method is forward filling
      .reindex(columns=['name', 'age', 'gender', 'diagnosis']))  # new column is all NaN

In [None]:
# here is another way: pass explicit label lists for both axes
# (the old .ix indexer was removed in pandas 1.0, and .loc raises a KeyError
# for labels that do not exist, so reindex is how you ask for new labels)
# labels missing from pt_df (rows 3, 4 and 'diagnosis') are filled with NaN
pt_df.reindex(index=[0, 1, 2, 3, 4],
              columns=['name', 'age', 'gender', 'diagnosis'])

In [None]:
# you can drop entries from either axis of a dataframe
pt_df.drop([2, 1])

In [None]:
pt_df.drop(['age', 'gender'], axis=1)

In [None]:
# indexing is similar to ndarray indexing
# big difference is index labels can be used
pt_df2

In [None]:
pt_df2[['patient id', 'age']]

In [None]:
# select rows
pt_df2[:2]

In [None]:
# select based on a condition on a column
pt_df2[pt_df2['gender'] == 'M']

In [None]:
# loc is useful for label-based indexing (it replaces the removed .ix)
# note that label slices include both end points
pt_df2.loc['two':, :'gender']

# Working with Series and DataFrames

In [None]:
# arithmetics with data alignment
pt_df2 + pt_df2

In [None]:
# arithmetic operations are possible
# between a series and a dataframe
# iloc selects the row by position (the second row here);
# the row labels are strings, so .loc[1] would not work and .ix is gone
ser = pt_df2.iloc[1]
ser

In [None]:
pt_df2 + ser

In [None]:
# build one more dataframe, this time filled with random normal draws
n_rows, n_cols = 5, 4
col_labels = ['var {}'.format(k) for k in range(1, n_cols + 1)]  # 'var 1' ... 'var 4'
df = pd.DataFrame(np.random.randn(n_rows, n_cols),
                  index=range(n_rows),
                  columns=col_labels)
df

In [None]:
# you can apply a custom function to the dataframe
df.apply(lambda x: np.mean(x))

In [None]:
# along the other axis
df.apply(lambda x: np.mean(x), axis=1)

In [None]:
# column-wise standardization: each element is rescaled by its column's mean and std
df.apply(lambda x: (x - np.mean(x)) / np.std(x))

# Sorting and Ranking

In [None]:
# let's sort the series below
ser2

In [None]:
# below sorts by index
ser2.sort_index()

In [None]:
# below sorts by values
ser2.sort_values()

In [None]:
# now let's rank this series
# ranking sorts the series and spits out ranks
ser2.rank()  

In [None]:
# how about sorting this dataframe?
pt_df2

In [None]:
pt_df2.sort_index()

In [None]:
# along the other axis
pt_df2.sort_index(axis=1)

In [None]:
# sort by column values
pt_df2.sort_values(by=['age', 'gender'])

In [None]:
# ranking can be done for each column
pt_df2.rank(ascending=False)  # note the rank of 2.5

In [None]:
# use a different tie breaking method
pt_df2.rank(ascending=False, method='max')

# Descriptive Statistics
* pandas makes it easy to compute summary statistics
* Missing data are excluded

In [None]:
# let's add a few missing data to df
# loc addresses a cell by (row label, column label); the .ix indexer
# used for this in old pandas versions is no longer available
df.loc[0, 'var 4'] = np.nan  # this is how you insert a NaN
df.loc[2, 'var 1'] = np.nan
df.loc[3, 'var 2'] = np.nan
df

In [None]:
df.sum()

In [None]:
# aggregate over columns and include missing data
df.mean(axis=1, skipna=False)

In [None]:
# get index of maximum value
df.idxmax()

In [None]:
# describe() is a useful method that shows summary stats
df.describe()

In [None]:
# pairwise correlation between columns
df.corr()

In [None]:
# covariance matrix can be calculated too
df.cov()

# Handling Missing Data
* pandas makes it easy to work with missing data
* Missing data are prevalent in health data in general
* "None", "NaN", and "NA" are all considered missing data

In [None]:
# drop missing data
# by default, below drops rows with at least one missing value
df.dropna()

In [None]:
# drop rows that are all missing
df.dropna(how='all')

In [None]:
# drop columns with missing data
df.dropna(axis=1)

In [None]:
# possible to fill in missing data
df.fillna(0)

In [None]:
# forward filling: each NaN takes the last valid value above it in its column
# (a NaN in the first row stays NaN because there is nothing above it)
# fillna(method='ffill') is deprecated in recent pandas; ffill() is the equivalent
df.ffill()

In [None]:
# fill with column means
df.fillna(df.mean())

# Hierarchical Indexing
* You can have multiple indexing levels on an axis
* This makes it easy to dissect the data from various angles

In [None]:
# give a series two levels of indexing:
# passing a list of two label lists creates a two-level index
outer_labels = ['a', 'a', 'b', 'b', 'c']
inner_labels = [1, 2, 1, 2, 1]
serh = pd.Series(np.random.randn(5), index=[outer_labels, inner_labels])
serh

In [None]:
# partial indexing is possible
serh['b':'c']

In [None]:
serh[:, 2]

In [None]:
# it's easy to convert to a dataframe
serh.unstack()

In [None]:
# next, build a dataframe that has two index levels on each axis
# the data represent multiple hospital admissions per patient
patient_names = ['Charles', 'Charles', 'Ann', 'Mike']
admission_seq = [1, 2, 1, 1]
value_types = ['First Value', 'First Value', 'Last Value']
var_names = ['Var 1', 'Var 2', 'Var 2']
dfh = pd.DataFrame(np.random.randn(4, 3),
                   index=[patient_names, admission_seq],
                   columns=[value_types, var_names])
dfh

In [None]:
# name the hierarchical levels
dfh.index.names = ['Patient Name', 'Adm Seq']
dfh.columns.names = ['Type', 'Var Name']
dfh

In [None]:
# swap levels
dfh.swaplevel('Type', 'Var Name', axis=1)

In [None]:
# sort data based on a level
# level=1 is the second row level ('Adm Seq'); ties are broken by the other level
# (sortlevel was removed from pandas; sort_index with level= replaces it)
dfh.sort_index(level=1)

In [None]:
# summary stats by level
# group the rows by the 'Adm Seq' index level and average within each group
# (the level= argument of mean() was removed in pandas 2.0)
dfh.groupby(level='Adm Seq').mean()

In [None]:
# columns can be turned into the index
# recall the dataframe pt_df
pt_df

In [None]:
# use the set_index method
pt_df.set_index(['name', 'gender'])