In [1]:
import pandas as pd
import numpy as np

In [4]:
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
df = pd.DataFrame(np.random.randn(8, 3), 
                  index=pd.date_range('1/1/2000', periods=8),
                  columns=['A', 'B', 'C'])
pn = pd.Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'],
                  major_axis=pd.date_range('1/1/2000', periods=5),
                  minor_axis=['A', 'B', 'C', 'D'])

In [63]:
df_type = pd.DataFrame({'string': list('abc'),
                       'int64': list(range(1, 4)),
                       'uint8': np.arange(3, 6).astype('u1'),
                       'float64': np.arange(4.0, 7.0),
                       'bool1': [True, False, True],
                      'bool2': [False, True, False],
                       'dates': pd.date_range('now', periods=3).values,
                       'category': pd.Series(list("ABC")).astype('category')})

# shape & dtypes

In [30]:
df.shape

(8, 3)

In [65]:
df_type.dtypes

bool1                 bool
bool2                 bool
category          category
dates       datetime64[ns]
float64            float64
int64                int64
string              object
uint8                uint8
dtype: object

 # add(), sub(), mul(), div()

In [6]:
row = df.iloc[1]
df.sub(row, axis='columns')

Unnamed: 0,A,B,C
2000-01-01,-0.696112,-0.024372,0.388314
2000-01-02,0.0,0.0,0.0
2000-01-03,-0.406965,0.18421,-0.038673
2000-01-04,-0.489983,-1.993651,-1.11193
2000-01-05,-2.962181,0.014699,1.091395
2000-01-06,0.039262,-0.700019,-0.089849
2000-01-07,0.001134,-0.012976,-0.897864
2000-01-08,-0.995972,-0.427924,0.365856


In [8]:
column = df['A']
df.sub(column, axis=0)

Unnamed: 0,A,B,C
2000-01-01,0.0,0.527066,0.086887
2000-01-02,0.0,-0.144675,-0.997539
2000-01-03,0.0,0.446501,-0.629247
2000-01-04,0.0,-1.648343,-1.619486
2000-01-05,0.0,2.832204,3.056037
2000-01-06,0.0,-0.883956,-1.12665
2000-01-07,0.0,-0.158784,-1.896537
2000-01-08,0.0,0.423373,0.364289


# 残缺值

#  empty, any(), all(), and bool()

In [9]:
(df > 0).all()

A    False
B    False
C    False
dtype: bool

In [10]:
(df > 0).any()

A    True
B    True
C    True
dtype: bool

In [11]:
df.empty

False

In [12]:
pd.Series([True]).bool()

True

# combine_first()

In [13]:
df1 = pd.DataFrame({'A' : [1., np.nan, 3., 5., np.nan],
                        'B' : [np.nan, 2., 3., np.nan, 6.]})
df2 = pd.DataFrame({'A' : [5., 2., 4., np.nan, 3., 7.],
                        'B' : [np.nan, np.nan, 3., 4., 6., 8.]})

In [14]:
df1

Unnamed: 0,A,B
0,1.0,
1,,2.0
2,3.0,3.0
3,5.0,
4,,6.0


In [15]:
df2

Unnamed: 0,A,B
0,5.0,
1,2.0,
2,4.0,3.0
3,,4.0
4,3.0,6.0
5,7.0,8.0


In [16]:
df1.combine_first(df2)

Unnamed: 0,A,B
0,1.0,
1,2.0,2.0
2,3.0,3.0
3,5.0,4.0
4,3.0,6.0
5,7.0,8.0


# idxmax(), idxmin()

In [21]:
s.idxmin(), s.idxmax()

('d', 'a')

# value_counts()

In [23]:
pd.Series(np.random.randint(0, 7, size=50)).value_counts()

2    16
0    11
6     6
3     6
5     5
4     3
1     3
dtype: int64

# cut(), qcut()

In [24]:
pd.cut(s, 4)

a      (1.111, 2.044]
b     (-0.755, 0.178]
c      (0.178, 1.111]
d    (-1.692, -0.755]
e     (-0.755, 0.178]
dtype: category
Categories (4, interval[float64]): [(-1.692, -0.755] < (-0.755, 0.178] < (0.178, 1.111] < (1.111, 2.044]]

In [25]:
pd.qcut(s, 4)

a      (0.932, 2.044]
b    (-0.722, -0.254]
c     (-0.254, 0.932]
d    (-1.689, -0.722]
e    (-1.689, -0.722]
dtype: category
Categories (4, interval[float64]): [(-1.689, -0.722] < (-0.722, -0.254] < (-0.254, 0.932] < (0.932, 2.044]]

# apply(), transform(), agg(), applymap()

In [26]:
df.apply(np.exp)

Unnamed: 0,A,B,C
2000-01-01,1.299032,2.200503,1.41695
2000-01-02,2.60578,2.254791,0.960975
2000-01-03,1.734583,2.710865,0.92452
2000-01-04,1.596398,0.307096,0.316087
2000-01-05,0.134735,2.288178,2.862194
2000-01-06,2.710124,1.119674,0.878398
2000-01-07,2.608736,2.225722,0.391539
2000-01-08,0.962482,1.46981,1.385483


In [27]:
df.apply(lambda x: x.idxmax())

A   2000-01-06
B   2000-01-03
C   2000-01-05
dtype: datetime64[ns]

In [28]:
df.agg(['sum', 'mean'])

Unnamed: 0,A,B,C
sum,2.15104,3.544426,-0.611207
mean,0.26888,0.443053,-0.076401


In [29]:
df.agg({'A': ['mean', 'min'], 'B': 'sum'})

Unnamed: 0,A,B
mean,0.26888,
min,-2.004448,
sum,,3.544426


In [34]:
from functools import partial
q_25 = partial(pd.Series.quantile, q=0.25)
q_75 = partial(pd.Series.quantile, q=0.75)
df.agg(['count', 'mean', 'std', 'min', q_25, 'median', q_75, 'max'])

Unnamed: 0,A,B,C
count,8.0,8.0,8.0
mean,0.26888,0.443053,-0.076401
std,0.990105,0.715889,0.708841
min,-2.004448,-1.180593,-1.151737
quantile,0.186655,0.317109,-0.33166
median,0.509258,0.794384,-0.059144
quantile,0.958016,0.816732,0.331664
max,0.996994,0.997268,1.051589


In [35]:
df.transform([np.abs, lambda x: x+1])

Unnamed: 0_level_0,A,A,B,B,C,C
Unnamed: 0_level_1,absolute,<lambda>,absolute,<lambda>,absolute,<lambda>
2000-01-01,0.26162,1.26162,0.788686,1.788686,0.348507,1.348507
2000-01-02,0.957732,1.957732,0.813057,1.813057,0.039807,0.960193
2000-01-03,0.550767,1.550767,0.997268,1.997268,0.07848,0.92152
2000-01-04,0.46775,1.46775,1.180593,-0.180593,1.151737,-0.151737
2000-01-05,2.004448,-1.004448,0.827756,1.827756,1.051589,2.051589
2000-01-06,0.996994,1.996994,0.113038,1.113038,0.129656,0.870344
2000-01-07,0.958866,1.958866,0.800082,1.800082,0.937671,0.062329
2000-01-08,0.03824,0.96176,0.385133,1.385133,0.326049,1.326049


In [37]:
df

Unnamed: 0,A,B,C
2000-01-01,0.26162,0.788686,0.348507
2000-01-02,0.957732,0.813057,-0.039807
2000-01-03,0.550767,0.997268,-0.07848
2000-01-04,0.46775,-1.180593,-1.151737
2000-01-05,-2.004448,0.827756,1.051589
2000-01-06,0.996994,0.113038,-0.129656
2000-01-07,0.958866,0.800082,-0.937671
2000-01-08,-0.03824,0.385133,0.326049


# reindex(), 

In [38]:
s

a    2.044497
b   -0.254059
c    0.932380
d   -1.688171
e   -0.722452
dtype: float64

In [39]:
s.reindex(['e', 'b', 'f', 'd'])

e   -0.722452
b   -0.254059
f         NaN
d   -1.688171
dtype: float64

In [40]:
df

Unnamed: 0,A,B,C
2000-01-01,0.26162,0.788686,0.348507
2000-01-02,0.957732,0.813057,-0.039807
2000-01-03,0.550767,0.997268,-0.07848
2000-01-04,0.46775,-1.180593,-1.151737
2000-01-05,-2.004448,0.827756,1.051589
2000-01-06,0.996994,0.113038,-0.129656
2000-01-07,0.958866,0.800082,-0.937671
2000-01-08,-0.03824,0.385133,0.326049


In [41]:
df.reindex(index=['c', 'f', 'b'], columns=['three', 'two', 'one'])

Unnamed: 0,three,two,one
c,,,
f,,,
b,,,


In [42]:
ts = pd.Series(np.random.randn(8), index=pd.date_range('1/3/2000', periods=8))
ts

2000-01-03   -2.503591
2000-01-04   -1.746592
2000-01-05   -0.470917
2000-01-06   -0.665066
2000-01-07   -0.997386
2000-01-08    0.508193
2000-01-09   -0.954390
2000-01-10    0.837603
Freq: D, dtype: float64

In [43]:
ts[[0, 3, 6]].reindex(ts.index)

2000-01-03   -2.503591
2000-01-04         NaN
2000-01-05         NaN
2000-01-06   -0.665066
2000-01-07         NaN
2000-01-08         NaN
2000-01-09   -0.954390
2000-01-10         NaN
Freq: D, dtype: float64

# drop(), rename()

# iterrows(), itertuples(), iteritems()

# .dt

In [45]:
ss = pd.Series(pd.date_range('20130101 09:10:12', periods=4))

In [46]:
ss

0   2013-01-01 09:10:12
1   2013-01-02 09:10:12
2   2013-01-03 09:10:12
3   2013-01-04 09:10:12
dtype: datetime64[ns]

In [48]:
ss.dt.hour

0    9
1    9
2    9
3    9
dtype: int64

# .str

In [49]:
s3 = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s3.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

# sort()

In [52]:
dfdf = pd.DataFrame({'one':[2,1,1,1],'two':[1,3,2,4],'three':[5,4,3,2]})
dfdf

Unnamed: 0,one,three,two
0,2,5,1
1,1,4,3
2,1,3,2
3,1,2,4


In [53]:
dfdf.sort_values(by='two')

Unnamed: 0,one,three,two
0,2,5,1
2,1,3,2
1,1,4,3
3,1,2,4


#  nsmallest() and nlargest()

In [60]:
s4 = pd.Series(np.random.permutation(10))
s4

0    6
1    1
2    0
3    7
4    5
5    9
6    3
7    4
8    8
9    2
dtype: int32

In [61]:
s4.nsmallest(3)

2    0
1    1
9    2
dtype: int32

In [62]:
s4.nlargest(3)

5    9
8    8
3    7
dtype: int32