In [3]:
import pandas as pd
import numpy as np

### reindex

In [3]:
# Float Series whose labels are deliberately out of alphabetical order.
obj = pd.Series(
    data=[4.5, 6.2, 9.1, 8.8],
    index=['d', 'c', 'a', 'b'],
)
obj

d    4.5
c    6.2
a    9.1
b    8.8
dtype: float64

In [4]:
# Reorder to the requested labels; 'e' does not exist in obj, so it is
# created and filled with 0 instead of NaN.
obj.reindex(index=['a', 'b', 'c', 'd', 'e'], fill_value=0)

a    9.1
b    8.8
c    6.2
d    4.5
e    0.0
dtype: float64

In [5]:
# Integer labels 0, 2, 4 leave gaps when reindexed to 0..5.
obj_2 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
# method='ffill' fills each gap forward from the previous label;
# method='bfill' would fill backward from the next label instead.
obj_2.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [6]:
# 3x3 integer frame holding 0..8; row label 'b' is intentionally absent.
frame = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    index=['a', 'c', 'd'],
    columns=['Wuhan', 'Beijing', 'Guangzhou'],
)
frame

Unnamed: 0,Wuhan,Beijing,Guangzhou
a,0,1,2
c,3,4,5
d,6,7,8


In [7]:
# Row 'b' is missing from frame; bfill copies it from the next existing
# label, 'c'.
frame.reindex(index=['a', 'b', 'c', 'd'], method='bfill')

Unnamed: 0,Wuhan,Beijing,Guangzhou
a,0,1,2
b,3,4,5
c,3,4,5
d,6,7,8


In [8]:
# Reorder the columns; 'Hangzhou' does not exist yet, so it is added as an
# all-missing column.
frame.reindex(columns=['Wuhan', 'Guangzhou', 'Beijing', 'Hangzhou'])

Unnamed: 0,Wuhan,Guangzhou,Beijing,Hangzhou
a,0,2,1,
c,3,5,4,
d,6,8,7,


### drop

In [10]:
obj.drop('c')  # returns a new Series without 'c'; obj itself is not modified

d    4.5
a    9.1
b    8.8
dtype: float64

In [13]:
frame.drop('c')  # DataFrame drop: 'c' is matched against the row labels by default

Unnamed: 0,Wuhan,Beijing,Guangzhou
a,0,1,2
d,6,7,8


In [14]:
# Drop by column label; only 'Wuhan' remains.
frame.drop(columns=['Beijing', 'Guangzhou'])

Unnamed: 0,Wuhan
a,0
c,3
d,6


### indexing, selection, filtering

In [19]:
# Add the missing row 'b' and the missing column 'Hangzhou' in one call,
# using 0 (rather than NaN) for every newly created cell.
frame_2 = frame.reindex(
    index=['a', 'b', 'c', 'd'],
    columns=['Wuhan', 'Beijing', 'Guangzhou', 'Hangzhou'],
    fill_value=0,
)
frame_2

Unnamed: 0,Wuhan,Beijing,Guangzhou,Hangzhou
a,0,1,2,0
b,0,0,0,0
c,3,4,5,0
d,6,7,8,0


In [27]:
# Indexing with a list of column names selects just those columns.
frame_2[['Wuhan', 'Beijing']]

Unnamed: 0,Wuhan,Beijing
a,0,1
b,0,0
c,3,4
d,6,7


In [28]:
# .loc with a list of labels selects the matching rows.
frame_2.loc[['a', 'b']]

Unnamed: 0,Wuhan,Beijing,Guangzhou,Hangzhou
a,0,1,2,0
b,0,0,0,0


In [43]:
# .iloc slices by integer position: the first two rows.
frame_2.iloc[:2]

Unnamed: 0,Wuhan,Beijing,Guangzhou,Hangzhou
a,0,1,2,0
b,0,0,0,0


In [32]:
# Boolean mask on one column: keep only the rows where 'Wuhan' is 0.
frame_2[frame_2['Wuhan'] == 0]

Unnamed: 0,Wuhan,Beijing,Guangzhou,Hangzhou
a,0,1,2,0
b,0,0,0,0


In [47]:
# Assigning through a boolean DataFrame mask overwrites every cell equal to 0
# with 9. This mutates frame_2 in place, so cells run afterwards see the 9s.
frame_2[frame_2 == 0] = 9
frame_2

Unnamed: 0,Wuhan,Beijing,Guangzhou,Hangzhou
a,9,1,2,9
b,9,9,9,9
c,3,4,5,9
d,6,7,8,9


In [54]:
# Label slices in .loc include the end label: rows up to and including 'b',
# restricted to the two listed columns.
frame_2.loc[:'b', ['Wuhan', 'Beijing']]

Unnamed: 0,Wuhan,Beijing
a,9,1
b,9,9


### Axis indexes with duplicate values

In [7]:
# Index labels may repeat; is_unique reports whether every label is distinct.
obj_3 = pd.Series(
    data=range(5),
    index=['a', 'a', 'b', 'b', 'c'],
)
obj_3.index.is_unique

False

In [8]:
# A label that occurs more than once returns every matching entry as a Series.
obj_3['a']

a    0
a    1
dtype: int64

### Summarizing and Computing Descriptive Statistics

In [16]:
# Small float frame with missing values: row 'a' lacks 'two' and row 'c' is
# entirely NaN, which the statistics cells below rely on.
frame_3 = pd.DataFrame(
    data=[
        [1.4, np.nan],
        [7.1, -4.5],
        [np.nan, np.nan],
        [0.75, -1.3],
    ],
    index=['a', 'b', 'c', 'd'],
    columns=['one', 'two'],
)
frame_3

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [15]:
# Column-wise sums; NaN entries are skipped.
frame_3.sum()

one    9.25
two   -5.80
dtype: float64

In [12]:
# Row-wise sums (NaN entries are skipped). min_count=1 keeps an all-NaN row
# such as 'c' as NaN; since pandas 0.22 the default (min_count=0) would
# return 0.0 for that row instead.
frame_3.sum(axis=1, min_count=1)

a    1.40
b    2.60
c     NaN
d   -0.55
dtype: float64

In [17]:
# With skipna=False a single NaN makes the whole row mean NaN:
# (1.4 + NaN) / 2 = NaN
frame_3.mean(axis='columns', skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [18]:
# By default NaN entries are skipped, so only the present values are
# averaged: 1.4 / 1 = 1.4
frame_3.mean(axis='columns')

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

In [20]:
frame_3.idxmax()  # return the index label of the max value in each column

one    b
two    d
dtype: object

In [21]:
# Running total down each column: NaN inputs stay NaN in the result, and the
# accumulation continues past them.
frame_3.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [23]:
frame_3.describe()  # per-column summary statistics; incredibly useful in ProbStat

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [26]:
# 16 string values: the pattern a, a, c, d repeated four times.
obj_4 = pd.Series(['a', 'a', 'c', 'd'] * 4)
obj_4

0     a
1     a
2     c
3     d
4     a
5     a
6     c
7     d
8     a
9     a
10    c
11    d
12    a
13    a
14    c
15    d
dtype: object

In [27]:
obj_4.describe()  # describe for non-numeric data: count, unique, top, freq

count     16
unique     3
top        a
freq       8
dtype: object