In [2]:
import pandas as pd
import numpy as np

### reindex

In [3]:
# Float Series whose labels are deliberately out of alphabetical order.
obj = pd.Series(
    [4.5, 6.2, 9.1, 8.8],
    index=['d', 'c', 'a', 'b'],
)
obj

d    4.5
c    6.2
a    9.1
b    8.8
dtype: float64

In [4]:
# Rearrange obj to the given label order; labels it lacks ('e') receive fill_value.
obj.reindex(index=['a', 'b', 'c', 'd', 'e'], fill_value=0)

a    9.1
b    8.8
c    6.2
d    4.5
e    0.0
dtype: float64

In [5]:
# Integer labels 0, 2, 4 leave gaps that reindex has to fill.
obj_2 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
# method='ffill' carries the previous value forward into the new labels;
# method='bfill' would carry the next value backward instead.
obj_2.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [6]:
# 3x3 grid of the integers 0..8; row label 'b' is intentionally absent.
frame = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    index=['a', 'c', 'd'],
    columns=['Wuhan', 'Beijing', 'Guangzhou'],
)
frame

Unnamed: 0,Wuhan,Beijing,Guangzhou
a,0,1,2
c,3,4,5
d,6,7,8


In [7]:
# Reindex the rows; the new row 'b' is back-filled from the next existing row ('c').
frame.reindex(index=['a', 'b', 'c', 'd'], method='bfill')

Unnamed: 0,Wuhan,Beijing,Guangzhou
a,0,1,2
b,3,4,5
c,3,4,5
d,6,7,8


In [8]:
# Reorder the columns; 'Hangzhou' is not in frame, so it appears as an all-missing column.
frame.reindex(
    columns=['Wuhan', 'Guangzhou', 'Beijing', 'Hangzhou'],
)

Unnamed: 0,Wuhan,Guangzhou,Beijing,Hangzhou
a,0,2,1,
c,3,5,4,
d,6,8,7,


### drop

In [9]:
# drop returns a new Series without the given label; obj itself is left unmodified.
obj.drop(labels='c')

d    4.5
a    9.1
b    8.8
dtype: float64

In [10]:
# On a DataFrame, drop removes row labels by default (axis 0).
frame.drop(labels='c', axis=0)

Unnamed: 0,Wuhan,Beijing,Guangzhou
a,0,1,2
d,6,7,8


In [11]:
# Drop along the column axis instead of the row axis.
frame.drop(labels=['Beijing', 'Guangzhou'], axis='columns')

Unnamed: 0,Wuhan
a,0
c,3
d,6


### indexing, selection, filtering

In [12]:
# Reindex rows and columns in one call; every cell that did not exist in
# frame (row 'b', column 'Hangzhou') is filled with 0.
frame_2 = frame.reindex(
    index=['a', 'b', 'c', 'd'],
    columns=['Wuhan', 'Beijing', 'Guangzhou', 'Hangzhou'],
    fill_value=0,
)
frame_2

Unnamed: 0,Wuhan,Beijing,Guangzhou,Hangzhou
a,0,1,2,0
b,0,0,0,0
c,3,4,5,0
d,6,7,8,0


In [13]:
# Select a subset of columns by label, keeping every row.
frame_2.loc[:, ['Wuhan', 'Beijing']]

Unnamed: 0,Wuhan,Beijing
a,0,1
b,0,0
c,3,4
d,6,7


In [14]:
# Select rows by label, keeping every column.
frame_2.loc[['a', 'b'], :]

Unnamed: 0,Wuhan,Beijing,Guangzhou,Hangzhou
a,0,1,2,0
b,0,0,0,0


In [15]:
# Select the first two rows by integer position, keeping every column.
frame_2.iloc[0:2, :]

Unnamed: 0,Wuhan,Beijing,Guangzhou,Hangzhou
a,0,1,2,0
b,0,0,0,0


In [16]:
# Boolean filtering: keep only the rows whose 'Wuhan' value equals 0.
frame_2.loc[frame_2['Wuhan'].eq(0)]

Unnamed: 0,Wuhan,Beijing,Guangzhou,Hangzhou
a,0,1,2,0
b,0,0,0,0


In [17]:
# Overwrite every zero cell with 9. This mutates frame_2 in place, so the
# frame_2 tables displayed by earlier cells no longer reflect its contents.
frame_2[frame_2.eq(0)] = 9
frame_2

Unnamed: 0,Wuhan,Beijing,Guangzhou,Hangzhou
a,9,1,2,9
b,9,9,9,9
c,3,4,5,9
d,6,7,8,9


In [18]:
# Label-based slice: rows up to and including 'b', restricted to two columns.
frame_2.loc[slice(None, 'b'), ['Wuhan', 'Beijing']]

Unnamed: 0,Wuhan,Beijing
a,9,1
b,9,9


### Axis indexes with duplicate values

In [19]:
# Series whose index repeats the labels 'a' and 'b'.
obj_3 = pd.Series(range(5), index=list('aabbc'))
# is_unique reports whether every index label occurs exactly once.
obj_3.index.is_unique

False

In [20]:
# Looking up a duplicated label returns every matching entry as a Series.
obj_3.loc['a']

a    0
a    1
dtype: int64

### Summarizing and Computing Descriptive Statistics

In [21]:
# Small float frame with missing values, used by the statistics examples below.
frame_3 = pd.DataFrame(
    [
        [1.4, np.nan],
        [7.1, -4.5],
        [np.nan, np.nan],
        [0.75, -1.3],
    ],
    index=list('abcd'),
    columns=['one', 'two'],
)
frame_3

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [22]:
# Column totals; NaN entries are skipped by default.
frame_3.sum(axis=0)

one    9.25
two   -5.80
dtype: float64

In [23]:
# Row totals: sum across the columns of each row.
frame_3.sum(axis='columns')

a    1.40
b    2.60
c     NaN
d   -0.55
dtype: float64

In [24]:
# With skipna=False a single NaN poisons the row: (1.4 + NaN) / 2 = NaN
frame_3.mean(axis='columns', skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [25]:
# Default skipna=True averages only the values that are present: 1.4 / 1 = 1.4
frame_3.mean(axis='columns')

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

In [26]:
# For each column, return the index label at which the maximum value occurs.
frame_3.idxmax(axis=0)

one    b
two    d
dtype: object

In [27]:
# Running total down each column.
frame_3.cumsum(axis=0)

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [28]:
# Summary statistics per column; count only includes the non-missing entries.
frame_3.describe()  # incredibly useful in ProbStat

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [29]:
# Sixteen string values: the pattern a, a, c, d repeated four times.
obj_4 = pd.Series(list('aacd') * 4)
obj_4

0     a
1     a
2     c
3     d
4     a
5     a
6     c
7     d
8     a
9     a
10    c
11    d
12    a
13    a
14    c
15    d
dtype: object

In [30]:
obj_4.describe()  # describe() on non-numeric data reports count, unique, top and freq

count     16
unique     3
top        a
freq       8
dtype: object

### Function application and mapping

In [32]:
# Draw the demo values from a fixed-seed generator so this frame, and every
# output derived from it below, is identical on each Restart & Run All.
frame_4 = pd.DataFrame(np.random.RandomState(42).randn(4, 3),
                       columns=list('bde'),
                       index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame_4

Unnamed: 0,b,d,e
Utah,-0.097746,-1.530026,1.798382
Ohio,-1.809261,-0.829743,1.243681
Texas,1.92484,0.884638,-0.64727
Oregon,-0.672027,-0.334146,2.22736


In [35]:
# NumPy ufuncs such as np.abs work element-wise on a DataFrame and keep its labels.
np.abs(frame_4)

Unnamed: 0,b,d,e
Utah,0.097746,1.530026,1.798382
Ohio,1.809261,0.829743,1.243681
Texas,1.92484,0.884638,0.64727
Oregon,0.672027,0.334146,2.22736


In [36]:
def f(x):
    """Return the spread (maximum minus minimum) of the values in x."""
    return x.max() - x.min()


# By default apply hands each column to f, giving one result per column.
frame_4.apply(f, axis=0)

b    3.734101
d    2.414663
e    2.874630
dtype: float64

In [37]:
# apply can work along either axis; here each row is handed to f, giving one result per row.
frame_4.apply(f, axis='columns')

Utah      3.328407
Ohio      3.052942
Texas     2.572111
Oregon    2.899387
dtype: float64

In [42]:
def format_2dp(x):
    """Render a single number as a string with two decimal places."""
    return '%.2f' % x


# The formatter is deliberately NOT called `format`: binding that name would
# shadow Python's builtin format() for the rest of the notebook session.
# applymap will be applied to every element of the DataFrame
# (pandas >= 2.1 renames this method to DataFrame.map).
frame_4.applymap(format_2dp)

Unnamed: 0,b,d,e
Utah,-0.1,-1.53,1.8
Ohio,-1.81,-0.83,1.24
Texas,1.92,0.88,-0.65
Oregon,-0.67,-0.33,2.23


### Sorting and ranking

In [None]:
# sort_index(), sort indexes in Series and DataFrames
# sort_values(), sort values in Series and DataFrames (replaces the old Series.order(), which no longer exists in pandas)
# rank(), return values' rank 

### Correlation and Covariance

In [61]:
import pandas_datareader.data as web

# Fetch quotes for each ticker between 1/1/2000 and 1/1/2010 via pandas_datareader.
all_data = {
    ticker: web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')
    for ticker in ['AAPL', 'IBM', 'MSFT', 'AMZN']
}

# One column per ticker, taken from each download's 'Adj Close' and 'Volume' columns.
price = pd.DataFrame({symbol: quotes['Adj Close'] for symbol, quotes in all_data.items()})
volume = pd.DataFrame({symbol: quotes['Volume'] for symbol, quotes in all_data.items()})

In [76]:
# Row-over-row fractional change of each ticker's price; display the last five rows.
returns = price.pct_change()
returns.tail(5)

Unnamed: 0_level_0,AAPL,AMZN,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2009-12-24,0.03434,-0.003383,0.004384,0.002587
2009-12-28,0.012294,0.006066,0.013326,0.005484
2009-12-29,-0.011861,0.000718,-0.003477,0.007058
2009-12-30,0.012147,-0.020945,0.005461,-0.013699
2009-12-31,-0.0043,-0.014433,-0.012597,-0.015504


In [77]:
# Correlation between two specific columns of returns.
returns['MSFT'].corr(returns['IBM'])

0.49598006790672183

In [78]:
# Covariance between the same two columns.
returns['MSFT'].cov(returns['IBM'])

0.00021595768182070335

In [None]:
# corrwith(), pairwise correlation between a DataFrame's columns (or rows) and another Series or DataFrame