In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [4]:
# Four integers labelled 'A'..'D'; the indexing and reindexing cells below use it.
ser = Series(data=[1, 2, 3, 4], index=list('ABCD'))

In [5]:
ser.index

Index(['A', 'B', 'C', 'D'], dtype='object')

In [10]:
# An Index is immutable: item assignment raises TypeError. The error is
# caught and printed so that "Run All" continues past this demonstration.
try:
    ser.index[0] = 'Z'
except TypeError as exc:
    print("TypeError:", exc)

TypeError: Index does not support mutable operations

# re-index

In [11]:
from numpy.random import randn

In [28]:
# Reindex onto labels F, C, Q; labels missing from ser ('F', 'Q') take fill_value.
ser2 = ser.reindex(index=['F', 'C', 'Q'], fill_value=0)
ser2

F    0
C    3
Q    0
dtype: int64

In [33]:
# Sparse integer labels (1, 5, 8) so the forward-fill example has gaps to fill.
snum = Series(list('abc'), index=[1, 5, 8])

In [34]:
# Forward-fill: each new label takes the value of the nearest earlier label in snum.
# Label 0 precedes every existing label, so it stays NaN.
ser3 = snum.reindex(index=range(10), method='ffill')
ser3

0    NaN
1      a
2      a
3      a
4      a
5      b
6      b
7      b
8      c
9      c
dtype: object

In [37]:
# 3x3 frame of standard-normal draws: rows 'A'-'C', columns 'c1'-'c3'.
dframe = DataFrame(
    randn(3, 3),
    index=list('ABC'),
    columns=['c1', 'c2', 'c3'],
)
dframe

Unnamed: 0,c1,c2,c3
A,0.39736,-1.663429,0.661484
B,0.484215,-0.513835,0.343586
C,-0.74039,1.061558,0.639161


In [39]:
dframe.reindex(index=['A','C','D'])

Unnamed: 0,c1,c2,c3
A,0.39736,-1.663429,0.661484
C,-0.74039,1.061558,0.639161
D,,,


In [40]:
dframe.reindex(columns=['c1','c2','c5'])

Unnamed: 0,c1,c2,c5
A,0.39736,-1.663429,
B,0.484215,-0.513835,
C,-0.74039,1.061558,


### label index vs. positional index
* loc works on labels in the index.
* iloc works on the positions in the index (so it only takes integers).

In [83]:
dframe.loc[['A'],['c1']]

Unnamed: 0,c1
A,0.39736


In [84]:
dframe.iloc[[0],[0]]

Unnamed: 0,c1
A,0.39736


In [80]:
# Select rows/columns by label where some labels ('D', 'c5') do not exist.
# .loc with a list containing missing labels raises KeyError in pandas >= 1.0;
# reindex is the supported way to get NaN-filled rows/columns for them.
dframe.reindex(index=['A', 'C', 'D'], columns=['c1', 'c2', 'c5'])

Unnamed: 0,c1,c2,c5
A,0.39736,-1.663429,
C,-0.74039,1.061558,
D,,,


In [87]:
# .iloc is strictly positional: dframe has no row position 3 and no column
# position 4, so this raises IndexError. The error is caught and printed so
# that "Run All" continues past this demonstration.
try:
    dframe.iloc[[0, 1, 3], [0, 1, 4]]
except IndexError as exc:
    print("IndexError:", exc)

IndexError: positional indexers are out-of-bounds

# drop-entry

In [92]:
# Values 0..2 labelled 'a'..'c'; used to demonstrate drop on a Series.
serr = Series(range(3), index=list('abc'))
serr

a    0
b    1
c    2
dtype: int32

In [93]:
serr.drop('a')

b    1
c    2
dtype: int32

In [109]:
# 3x3 frame holding 0..8 row-major: rows labelled 'a'-'c', columns numbered 0-2.
dff = DataFrame(
    np.arange(9).reshape(3, 3),
    index=list('abc'),
    columns=range(3),
)

In [106]:
dff

Unnamed: 0,0,1,2
a,0,1,2
b,3,4,5
c,6,7,8


In [110]:
dff.drop('a')

Unnamed: 0,0,1,2
b,3,4,5
c,6,7,8


In [116]:
dff.drop(0,axis=1)
# axis=1: the label 0 is looked up among the columns, so column 0 is removed
# rows => axis = 0
# columns => axis = 1

Unnamed: 0,1,2
a,1,2
b,4,5
c,7,8


In [118]:
dff.drop('a',axis=0)
# axis=0: the label 'a' is looked up among the rows, so row 'a' is removed
# (same result as dff.drop('a') above, where axis was left at its default)
# rows => axis = 0
# columns => axis = 1

Unnamed: 0,0,1,2
b,3,4,5
c,6,7,8


# select-entry

In [127]:
# Values 0..2 labelled 'A'..'C'; used by the selection examples that follow.
serrr = Series(range(3), index=list('ABC'))
serrr

A    0
B    1
C    2
dtype: int32

In [128]:
# Scalar arithmetic is applied element-wise; the index is unchanged.
# NOTE: this rebinds serrr, so re-running the cell multiplies by 5 again.
serrr = serrr * 5
serrr

A     0
B     5
C    10
dtype: int32

In [129]:
serrr['A']

0

In [130]:
serrr[['A','B']]

A    0
B    5
dtype: int32

In [131]:
# Positional slice: the first two entries, regardless of their labels.
serrr.iloc[:2]

A    0
B    5
dtype: int32

In [136]:
# Boolean-mask assignment: overwrite, in place, every entry that is >= 5.
at_least_five = serrr >= 5
serrr[at_least_five] = 0
serrr

A    0
B    0
C    0
dtype: int32

In [163]:
# 5x5 frame holding 0..24 row-major; rows labelled 'a'-'e', default integer columns.
dfff = DataFrame(np.arange(25).reshape(5, 5), index=list('abcde'))
dfff

Unnamed: 0,0,1,2,3,4
a,0,1,2,3,4
b,5,6,7,8,9
c,10,11,12,13,14
d,15,16,17,18,19
e,20,21,22,23,24


In [164]:
dfff[[0,1]]

Unnamed: 0,0,1
a,0,1
b,5,6
c,10,11
d,15,16
e,20,21


In [165]:
# Boolean row filter: keep the rows whose value in column 1 exceeds 11.
dfff.loc[dfff[1] > 11]

Unnamed: 0,0,1,2,3,4
d,15,16,17,18,19
e,20,21,22,23,24


In [166]:
dfff > 10

Unnamed: 0,0,1,2,3,4
a,False,False,False,False,False
b,False,False,False,False,False
c,False,True,True,True,True
d,True,True,True,True,True
e,True,True,True,True,True


### label index

In [169]:
dfff.loc[['b']]

Unnamed: 0,0,1,2,3,4
b,5,6,7,8,9


In [171]:
dfff.iloc[[1]]

Unnamed: 0,0,1,2,3,4
b,5,6,7,8,9


# data-alignment

In [3]:
# series + series: values are matched by label, not by position.
# Labels present in only one operand ('D' and 'F') come out as NaN.
sa = Series([1, 2, 3, 4], index=list('ABCD'))
sb = Series([1] * 4, index=list('ABCF'))
sa.add(sb)

A    2.0
B    3.0
C    4.0
D    NaN
F    NaN
dtype: float64

In [8]:
# dframe + dframe: two frames whose row and column labels only partly overlap
# (they share row 'A' and column 'a').
dfa = DataFrame(np.arange(4).reshape(2, 2), index=['A', 'B'], columns=['a', 'b'])
dfb = DataFrame(np.arange(9).reshape(3, 3), index=['A', 'D', 'E'], columns=['a', 'd', 'e'])
for frame in (dfa, dfb):
    print(frame)

   a  b
A  0  1
B  2  3
   a  d  e
A  0  1  2
D  3  4  5
E  6  7  8


In [9]:
dfa + dfb
# The result carries the union of both frames' row and column labels (like an
# outer join on the labels); a cell is non-NaN only where both frames have a value.

Unnamed: 0,a,b,d,e
A,0.0,,,
B,,,,
D,,,,
E,,,,


In [None]:
dfa.add(dfb,fill_value=0)
# Same union of labels as dfa + dfb, but a value missing from only one frame is
# treated as 0 before adding; cells missing from both frames stay NaN.

In [26]:
# Subtract the first row from every row; the row Series is aligned on column labels.
dfb.sub(dfb.iloc[0], axis='columns')

Unnamed: 0,a,d,e
A,0,0,0
D,3,3,3
E,6,6,6


# !important: `iloc[[0]]` returns a DataFrame, `iloc[0]` returns a Series

In [30]:
type(dfb.iloc[[0]])

pandas.core.frame.DataFrame

In [31]:
type(dfb.iloc[0])

pandas.core.series.Series

# rank & sort

In [37]:
# Deliberately unordered labels and values, to contrast sort_index with sort_values.
ss = Series([2, 1, 4, 3], index=list('BCAD'))

In [38]:
ss.sort_index()

A    4
B    2
C    1
D    3
dtype: int64

In [39]:
ss.sort_values()

C    1
B    2
D    3
A    4
dtype: int64

In [40]:
from numpy.random import randn

In [52]:
# Ten standard-normal draws; no seed is set in the visible cells, so the values
# (and the ranks displayed below) differ from run to run.
ss2 = Series(randn(10),index=range(10))

In [54]:
ss2.rank()
# each value is replaced by its rank in ascending order (1 = smallest);
# the original index order is kept, nothing is re-sorted

0    10.0
1     2.0
2     9.0
3     8.0
4     1.0
5     7.0
6     5.0
7     4.0
8     6.0
9     3.0
dtype: float64

In [57]:
ss2 = ss2.sort_values()

In [58]:
ss2.rank()

4     1.0
1     2.0
9     3.0
7     4.0
6     5.0
8     6.0
5     7.0
3     8.0
2     9.0
0    10.0
dtype: float64

# Summary Statistics 

In [68]:
# 2x3 float frame with one missing value per row; the summary statistics
# shown below skip the NaN entries.
dfss = DataFrame(
    np.array([[1, 2, np.nan],
              [np.nan, 3, 4]]),
    index=['A', 'B'],
    columns=['One', 'Two', 'Three'],
)
dfss

Unnamed: 0,One,Two,Three
A,1.0,2.0,
B,,3.0,4.0


In [71]:
dfss.sum(axis=0)

One      1.0
Two      5.0
Three    4.0
dtype: float64

In [70]:
dfss.sum(axis=1)

A    3.0
B    7.0
dtype: float64

In [73]:
dfss.min()

One      1.0
Two      2.0
Three    4.0
dtype: float64

In [74]:
dfss.idxmin()

One      A
Two      A
Three    B
dtype: object

### describe

In [75]:
dfss.describe()

Unnamed: 0,One,Two,Three
count,1.0,2.0,1.0
mean,1.0,2.5,4.0
std,,0.707107,
min,1.0,2.0,4.0
25%,1.0,2.25,4.0
50%,1.0,2.5,4.0
75%,1.0,2.75,4.0
max,1.0,3.0,4.0
