# Getting Started with pandas

In [2]:
import pandas as pd

In [2]:
from pandas import Series, DataFrame

In [3]:
import numpy as np
# Seed NumPy's global random generator so the np.random.randn frame built later is reproducible.
np.random.seed(12345)
import matplotlib.pyplot as plt
# Default size for any figure drawn in this notebook.
plt.rc('figure', figsize=(10, 6))
# Remember the display limit before overriding it.
# NOTE(review): nothing visible in this notebook restores PREVIOUS_MAX_ROWS afterwards.
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20
# Print NumPy arrays with 4 decimal places and without scientific notation.
np.set_printoptions(precision=4, suppress=True)

## Introduction to pandas Data Structures

### Series

In [4]:
# Build a Series from a plain list; with no index given, pandas numbers the rows 0..3.
obj = pd.Series(data=[4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [None]:
# Inspect the underlying values of the Series through its .values attribute
obj.values

In [6]:
# Inspect the row labels through .index; no index was passed at construction,
# so this is the default RangeIndex
obj.index

RangeIndex(start=0, stop=4, step=1)

### DataFrame

In [7]:
# Column-oriented input: each dict key becomes a column, each list holds that column's values.
data = {
    'state': ['Ohio'] * 3 + ['Nevada'] * 3,
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data=data)

In [8]:
# Display the full six-row frame built from the dict
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [10]:
# First five rows; n=5 is head()'s default, written out explicitly here
frame.head(n=5)

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [11]:
# Show the last five rows of frame (the counterpart of frame.head())
# [hint] tail
frame.tail(n=5)

Unnamed: 0,state,year,pop
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [12]:
# Show only the first 2 rows of frame
frame.head(n=2)

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7


In [13]:
# Print the 'state' column values, in two ways:
# (1) dict-like notation -- frame['state'], used in this cell
# (2) attribute notation -- frame.state, used in the next cell
frame['state']

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object

In [14]:
# (2) attribute access: shows the same Series as frame['state']
frame.state

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object

In [15]:
# Add a new column named 'dept'; the scalar 16.5 is assigned to every row

frame['dept'] = 16.5

In [16]:
# Confirm that the 'dept' column is now part of frame
frame

Unnamed: 0,state,year,pop,dept
0,Ohio,2000,1.5,16.5
1,Ohio,2001,1.7,16.5
2,Ohio,2002,3.6,16.5
3,Nevada,2001,2.4,16.5
4,Nevada,2002,2.9,16.5
5,Nevada,2003,3.2,16.5


In [17]:
# Swap the axes: rows become columns and columns become rows
# [hint] transpose -- the .T attribute is shorthand for this method
frame.transpose()

Unnamed: 0,0,1,2,3,4,5
state,Ohio,Ohio,Ohio,Nevada,Nevada,Nevada
year,2000,2001,2002,2001,2002,2003
pop,1.5,1.7,3.6,2.4,2.9,3.2
dept,16.5,16.5,16.5,16.5,16.5,16.5


### Index Objects

In [18]:
# A Series with explicit string labels instead of the default integer index
obj = pd.Series(data=range(3), index=list('abc'))
obj

a    0
b    1
c    2
dtype: int64

In [19]:
# Keep a reference to the Series' Index object and display it
index = obj.index
index

Index(['a', 'b', 'c'], dtype='object')

In [20]:
# An Index supports positional lookup: position 1 holds the label 'b'
index[1]

'b'

In [21]:
# Index objects are immutable, so item assignment raises TypeError.
# The error is the point of this cell: catch it and print the message so the
# notebook still survives Restart & Run All instead of halting here.
try:
    index[1] = 'd'
except TypeError as err:
    print(f'TypeError: {err}')

TypeError: Index does not support mutable operations

index[1] = 'd'  # TypeError

In [22]:
# Membership test against the column labels
'state' in frame.columns

True

In [23]:
# Membership test against the row labels
3 in frame.index

True

## Essential Functionality

### Reindexing

In [24]:
# Float Series whose labels are deliberately out of alphabetical order
obj = pd.Series(data=[4.5, 7.2, -5.3, 3.6], index=list('dbac'))
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [25]:
# Conform obj to a new label order; 'e' has no value in obj, so it comes back as NaN
obj2 = obj.reindex(index=list('abcde'))
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

### Dropping Entries from an Axis

In [29]:
# 4x4 frame holding the integers 0..15, with named rows and columns
data = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [31]:
# Drop the row labelled 'Ohio' (returns a new frame)
data.drop(index='Ohio')

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [32]:
# Drop the two rows labelled 'Colorado' and 'Ohio'
data.drop(index=['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [34]:
# Drop the columns 'two' and 'four'
# [hint] axis=1 or axis='columns'
data.drop(['two', 'four'], axis='columns')

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


In [35]:
# The drop() calls above returned new objects; data itself still has every row and column
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [36]:
# Drop the column 'one' with inplace=True: this time data itself is modified
data.drop(columns='one', inplace=True)

data

Unnamed: 0,two,three,four
Ohio,1,2,3
Colorado,5,6,7
Utah,9,10,11
New York,13,14,15


### Indexing, Selection, and Filtering

In [37]:
# Float Series 0.0..3.0 labelled 'a'..'d'
obj = pd.Series(data=np.arange(4.0), index=list('abcd'))
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [38]:
# Look up a single value by its label
obj['b']

1.0

In [39]:
# Assign 5 to the elements labelled 'b' through 'd'.
# Caution: unlike ordinary Python slices, a label slice includes its end label 'd'.
obj['b':'d']=5
obj

a    0.0
b    5.0
c    5.0
d    5.0
dtype: float64

In [40]:
# Open-ended label slice: every element from 'b' to the end is set to 3
obj['b':] = 3
obj

a    0.0
b    3.0
c    3.0
d    3.0
dtype: float64

In [41]:
# Rebuild the original 4x4 frame (the earlier in-place drop removed column 'one')
data = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    columns=['one', 'two', 'three', 'four'],
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [42]:
# A single column label returns that column as a Series
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32

In [43]:
# Select the columns 'three' and 'one'; a list of labels returns them in the listed order
data[['three','one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [44]:
# Select the rows up to and including Colorado (a positional slice of the first two rows)
#           one  two  three  four
# Ohio        0    1      2     3
# Colorado    4    5      6     7
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [45]:
# Keep only the rows where the 'three' column is greater than 5 (boolean mask on rows)
data[data['three']>5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [46]:
# Set every value less than 5 to zero; this modifies data in place,
# so later cells see the zeroed values
data[data<5]=0
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


#### Selection with loc and iloc

In [47]:
# data as left by the previous section (values below 5 already replaced by 0)
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [50]:
# Select the row 'Colorado' and the columns ['two', 'three']
# (1) using loc  -- label-based, shown in this cell
# (2) using iloc -- position-based, shown in the next cell
data.loc['Colorado',['two','three']]

two      5
three    6
Name: Colorado, dtype: int32

In [51]:
# (2) the same selection by integer position: row 1, columns 1 and 2
data.iloc[1,[1,2]]

two      5
three    6
Name: Colorado, dtype: int32

In [55]:
# Select the rows ['Colorado', 'Utah'] with all columns
# (1) using loc  -- by label, shown in this cell
# (2) using iloc -- by position, shown in the next two cells
data.loc[['Colorado','Utah'],:]

Unnamed: 0,one,two,three,four
Colorado,0,5,6,7
Utah,8,9,10,11


In [58]:
# (2) the same rows by integer position, given as a list
data.iloc[[1,2],:]

Unnamed: 0,one,two,three,four
Colorado,0,5,6,7
Utah,8,9,10,11


In [59]:
# The same rows again as a position slice; the stop position 3 is excluded
data.iloc[1:3,:]

Unnamed: 0,one,two,three,four
Colorado,0,5,6,7
Utah,8,9,10,11


### Integer Indexes

In [60]:
# Float Series with the default integer index, used to contrast [] / loc / iloc slicing
ser = pd.Series(data=np.arange(3.0))
ser

0    0.0
1    1.0
2    2.0
dtype: float64

In [61]:
# The labels here are integers too (a RangeIndex), which is what makes slicing ambiguous
ser.index

RangeIndex(start=0, stop=3, step=1)

In [65]:
# Guess the result of each of the three cases before running them
# (1) ser[:1]
# (2) ser.loc[:1]
# (3) ser.iloc[:1]
# (1) plain [] slicing with integers: only the first element is returned here
ser[:1]

0    0.0
dtype: float64

In [66]:
# (2) loc slices by label and includes the end label, so labels 0 and 1 are returned
ser.loc[:1]


0    0.0
1    1.0
dtype: float64

In [67]:
# (3) iloc slices by position and excludes the stop, so only position 0 is returned
ser.iloc[:1]

0    0.0
dtype: float64

### Arithmetic and Data Alignment

In [68]:
# Two float frames of different shapes: df2 has an extra row (3) and an extra column ('e')
df1 = pd.DataFrame(data=np.arange(12.0).reshape(3, 4),
                   columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(data=np.arange(20.0).reshape(4, 5),
                   columns=['a', 'b', 'c', 'd', 'e'])
# Knock out a single cell so the arithmetic examples also meet a missing value
df2.at[1, 'b'] = np.nan

In [69]:
# df1: 3 rows, columns a-d
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [70]:
# df2: 4 rows, columns a-e, with one missing value at row 1, column 'b'
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [71]:
# Add the two frames in two equivalent ways:
# (1) the + operator (this cell)
# (2) the .add method (next cell)
# Labels present in only one frame give NaN in the result

df1+df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [72]:
# (2) the method form; without fill_value the result matches df1 + df2
df1.add(df2)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [73]:
# Pass fill_value=0 to add(): where only one of the frames has a value,
# the missing one is treated as 0 instead of producing NaN
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [75]:
# The same with fill_value=100: cells present in only one frame come out offset by 100
df1.add(df2, fill_value=100)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,104.0
1,9.0,105.0,13.0,15.0,109.0
2,18.0,20.0,22.0,24.0,114.0
3,115.0,116.0,117.0,118.0,119.0


### Function Application and Mapping

In [76]:
# 4x3 frame of random normal draws (repeatable because the global seed is set at the top)
frame = pd.DataFrame(data=np.random.randn(4, 3),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'],
                     columns=['b', 'd', 'e'])
frame

Unnamed: 0,b,d,e
Utah,-0.204708,0.478943,-0.519439
Ohio,-0.55573,1.965781,1.393406
Texas,0.092908,0.281746,0.769023
Oregon,1.246435,1.007189,-1.296221


In [77]:
# Make every value positive; NumPy ufuncs such as np.abs work element-wise on a DataFrame
# [hint] np.abs()
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.204708,0.478943,0.519439
Ohio,0.55573,1.965781,1.393406
Texas,0.092908,0.281746,0.769023
Oregon,1.246435,1.007189,1.296221


In [78]:
# Read the documentation for frame.apply (the trailing ? is IPython help syntax, not plain Python)
frame.apply?

In [79]:
# Spread (max minus min) of a set of values.
# Written as a function so it can be handed to apply(), which by default
# calls it once per column of the frame.
# [hint] x.max() - x.min()

def f(x):
    """Return the difference between the largest and smallest value of x."""
    return x.max() - x.min()

# One result per column: f receives each column in turn
frame.apply(f)

b    1.802165
d    1.684034
e    2.689627
dtype: float64

In [80]:
# Now compute max - min within each row: axis=1 passes each row to f
frame.apply(f,axis=1)

Utah      0.998382
Ohio      2.521511
Texas     0.676115
Oregon    2.542656
dtype: float64

In [81]:
# axis='columns' is the named spelling of axis=1; the result is identical
frame.apply(f, axis='columns')

Utah      0.998382
Ohio      2.521511
Texas     0.676115
Oregon    2.542656
dtype: float64

In [None]:
# Read the documentation for frame.applymap (IPython ? help syntax)
frame.applymap?

In [82]:
# The unformatted frame, for comparison with the formatted version below
frame

Unnamed: 0,b,d,e
Utah,-0.204708,0.478943,-0.519439
Ohio,-0.55573,1.965781,1.393406
Texas,0.092908,0.281746,0.769023
Oregon,1.246435,1.007189,-1.296221


In [83]:
# Render every value as a string with two decimal places ('%.2f' % x).
# The formatter is a named function called fmt_2f rather than `format`, so the
# builtin format() is not shadowed for the rest of the notebook.
# NOTE(review): newer pandas releases deprecate applymap in favour of
# DataFrame.map -- confirm against the installed version before switching.
def fmt_2f(x):
    return '%.2f' % x

frame.applymap(fmt_2f)


Unnamed: 0,b,d,e
Utah,-0.2,0.48,-0.52
Ohio,-0.56,1.97,1.39
Texas,0.09,0.28,0.77
Oregon,1.25,1.01,-1.3


### Sorting and Ranking

In [84]:
# Small frame whose row and column labels are both out of order, for the sorting examples
frame = pd.DataFrame(data=np.arange(8).reshape(2, 4),
                     columns=list('dabc'),
                     index=['three', 'one'])
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [85]:
# Sort the rows by their index labels with .sort_index(); 'one' sorts before 'three'
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [86]:
# Sort the columns by their labels using sort_index
# [hint] axis=1 or axis='columns'
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [87]:
# Sort the column labels in descending order
# [hint] ascending=False
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [88]:
# Integer Series used for the sort-by-value example
obj = pd.Series(data=[4, 7, -3, 2])
obj

0    4
1    7
2   -3
3    2
dtype: int64

In [90]:
# Sort by the values rather than the labels; each value keeps its original label
# [hint] sort_values()
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [93]:
# Two-column frame for sorting by one or several columns
frame = pd.DataFrame(data=dict(b=[4, 7, -3, 2], a=[0, 1, 0, 1]))
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [94]:
# Sort the rows by the values in column 'b'
frame.sort_values(by='b')

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [95]:
# Sort by 'a' first, then by 'b' within equal 'a' values (two sort keys)
frame.sort_values(by=['a','b'])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


In [5]:
# Rank the values from smallest (rank 1) upward; method='first' breaks ties
# by order of appearance, so the two 7s get ranks 6 and 7
obj = pd.Series(data=[7, -5, 7, 4, 2, 0, 4])
obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [97]:
# rank() returned a new Series; obj itself is unchanged
obj

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

## Summarizing and Computing Descriptive Statistics

In [98]:
# Small frame with missing values, to show how the reductions treat NaN
df = pd.DataFrame(
    data=[
        [1.4, np.nan],
        [7.1, -4.5],
        [np.nan, np.nan],
        [0.75, -1.3],
    ],
    index=list('abcd'),
    columns=['one', 'two'],
)
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [99]:
# Column totals; NaN entries are skipped rather than propagated
df.sum()

one    9.25
two   -5.80
dtype: float64

In [100]:
# Sum across the columns of each row; NaN is skipped, so the all-NaN row 'c' sums to 0.0
df.sum(axis='columns')

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [101]:
# Row label at which each column reaches its maximum
df.idxmax()

one    b
two    d
dtype: object

In [102]:
# Running total down each column; NaN entries stay NaN but do not reset the total
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [103]:
# Several summary statistics per column in one call (count, mean, std, min, quartiles, max)
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


### Unique Values, Value Counts, and Membership

In [104]:
# Series of repeated single-letter strings for the unique / value_counts / isin examples
obj = pd.Series(data=list('cadaabbcc'))
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [105]:
# Distinct values, returned in order of first appearance
uniques = obj.unique()
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [106]:
# How many times each value occurs, most frequent first
obj.value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [None]:
# Read the documentation for obj.isin (IPython ? help syntax)
obj.isin?

In [107]:
# Keep only the values that are in ['b', 'c']
# [hint] mask = obj.isin(...), then obj[mask]

# Boolean Series: True where the value is 'b' or 'c'
mask = obj.isin(['b','c'])
mask


0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [109]:
# Use the boolean mask to keep only the matching elements (original labels are preserved)
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object