# [10 minutes to pandas](https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html)

In [1]:
import numpy as np
import pandas as pd

## 对象生成

In [4]:
# Build a Series from a plain Python list; pandas supplies the default integer index.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [5]:
# Six consecutive daily timestamps starting at 2013-01-01.
dates = pd.date_range(start='20130101', periods=6, freq='D')
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [6]:
# Build a 6x4 frame of standard-normal samples, indexed by `dates`, with
# columns A-D. The generator is seeded so the values are identical on every
# Restart & Run All. The outputs saved in this notebook came from an unseeded
# run and will be refreshed the next time the notebook is executed.
df = pd.DataFrame(
    np.random.default_rng(seed=0).standard_normal((6, 4)),
    index=dates,
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
2013-01-01,0.757046,-0.663341,2.241457,0.035135
2013-01-02,-0.5111,-0.573211,1.680816,-0.895496
2013-01-03,-1.681077,0.369396,1.660315,2.152265
2013-01-04,0.936792,0.537822,-0.036998,-0.12066
2013-01-05,-0.542985,1.628279,-0.155603,0.365178
2013-01-06,-2.07761,0.846518,0.793445,1.21606


In [7]:
# One DataFrame mixing several column dtypes; the scalar entries (A, B, F)
# are broadcast to the length of the array-like columns.
df2 = pd.DataFrame(dict(
    A=1.0,
    B=pd.Timestamp('20130102'),
    C=pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
    D=np.full(4, 3, dtype='int32'),
    E=pd.Categorical(["test", "train"] * 2),
    F='foo',
))
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [8]:
# Each column of df2 keeps its own dtype.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

## 查看数据

In [9]:
# First five rows (five is what head() returns by default).
df.head(n=5)

Unnamed: 0,A,B,C,D
2013-01-01,0.757046,-0.663341,2.241457,0.035135
2013-01-02,-0.5111,-0.573211,1.680816,-0.895496
2013-01-03,-1.681077,0.369396,1.660315,2.152265
2013-01-04,0.936792,0.537822,-0.036998,-0.12066
2013-01-05,-0.542985,1.628279,-0.155603,0.365178


In [10]:
# Last three rows.
df.tail(n=3)

Unnamed: 0,A,B,C,D
2013-01-04,0.936792,0.537822,-0.036998,-0.12066
2013-01-05,-0.542985,1.628279,-0.155603,0.365178
2013-01-06,-2.07761,0.846518,0.793445,1.21606


In [11]:
# Inspect the row index.
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [13]:
# Inspect the column labels.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [14]:
# NumPy representation of the data; index and column labels are not included.
df.to_numpy()

array([[ 0.75704593, -0.66334064,  2.24145663,  0.03513538],
       [-0.51109983, -0.5732109 ,  1.68081605, -0.89549623],
       [-1.68107721,  0.369396  ,  1.66031471,  2.1522651 ],
       [ 0.93679226,  0.53782222, -0.03699796, -0.12065963],
       [-0.54298458,  1.6282792 , -0.15560259,  0.36517819],
       [-2.07760964,  0.84651807,  0.7934451 ,  1.21606037]])

In [15]:
# df2's columns have different dtypes, so the result is an object array.
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [16]:
# describe() shows a quick statistical summary of the data.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.519822,0.357577,1.030572,0.458747
std,1.227024,0.871208,0.988795,1.076979
min,-2.07761,-0.663341,-0.155603,-0.895496
25%,-1.396554,-0.337559,0.170613,-0.081711
50%,-0.527042,0.453609,1.22688,0.200157
75%,0.440009,0.769344,1.675691,1.00334
max,0.936792,1.628279,2.241457,2.152265


In [17]:
# Transpose: the dates become columns and A-D become rows.
df.transpose()

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,0.757046,-0.5111,-1.681077,0.936792,-0.542985,-2.07761
B,-0.663341,-0.573211,0.369396,0.537822,1.628279,0.846518
C,2.241457,1.680816,1.660315,-0.036998,-0.155603,0.793445
D,0.035135,-0.895496,2.152265,-0.12066,0.365178,1.21606


In [18]:
# Sort along the column axis by label, descending (D, C, B, A).
df.sort_index(axis='columns', ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,0.035135,2.241457,-0.663341,0.757046
2013-01-02,-0.895496,1.680816,-0.573211,-0.5111
2013-01-03,2.152265,1.660315,0.369396,-1.681077
2013-01-04,-0.12066,-0.036998,0.537822,0.936792
2013-01-05,0.365178,-0.155603,1.628279,-0.542985
2013-01-06,1.21606,0.793445,0.846518,-2.07761


In [19]:
# Sort rows by the values in column B, smallest first.
df.sort_values(by='B', ascending=True)

Unnamed: 0,A,B,C,D
2013-01-01,0.757046,-0.663341,2.241457,0.035135
2013-01-02,-0.5111,-0.573211,1.680816,-0.895496
2013-01-03,-1.681077,0.369396,1.660315,2.152265
2013-01-04,0.936792,0.537822,-0.036998,-0.12066
2013-01-06,-2.07761,0.846518,0.793445,1.21606
2013-01-05,-0.542985,1.628279,-0.155603,0.365178


## 选择 
### 获取

In [20]:
# Select a single column by label; the result is a Series.
df['A']

2013-01-01    0.757046
2013-01-02   -0.511100
2013-01-03   -1.681077
2013-01-04    0.936792
2013-01-05   -0.542985
2013-01-06   -2.077610
Freq: D, Name: A, dtype: float64

In [21]:
# Attribute access returns the same column as df['A'].
df.A

2013-01-01    0.757046
2013-01-02   -0.511100
2013-01-03   -1.681077
2013-01-04    0.936792
2013-01-05   -0.542985
2013-01-06   -2.077610
Freq: D, Name: A, dtype: float64

In [22]:
# Positional slice of rows: the first three.
df[:3]

Unnamed: 0,A,B,C,D
2013-01-01,0.757046,-0.663341,2.241457,0.035135
2013-01-02,-0.5111,-0.573211,1.680816,-0.895496
2013-01-03,-1.681077,0.369396,1.660315,2.152265


In [23]:
# Slice rows by date label; both endpoints are included.
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,-0.5111,-0.573211,1.680816,-0.895496
2013-01-03,-1.681077,0.369396,1.660315,2.152265
2013-01-04,0.936792,0.537822,-0.036998,-0.12066


### 由标签选择 

In [27]:
# Rows for the first two labels in `dates`.
df.loc[dates[:2]]

Unnamed: 0,A,B,C,D
2013-01-01,0.757046,-0.663341,2.241457,0.035135
2013-01-02,-0.5111,-0.573211,1.680816,-0.895496


In [25]:
# All rows, columns A and B, selected by label.
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,0.757046,-0.663341
2013-01-02,-0.5111,-0.573211
2013-01-03,-1.681077,0.369396
2013-01-04,0.936792,0.537822
2013-01-05,-0.542985,1.628279
2013-01-06,-2.07761,0.846518


In [28]:
# Label slice on the rows (both endpoints included) combined with a column list.
df.loc['20130102':'20130104', ['A', 'B']]

Unnamed: 0,A,B
2013-01-02,-0.5111,-0.573211
2013-01-03,-1.681077,0.369396
2013-01-04,0.936792,0.537822


In [29]:
# Scalar lookup by row label and column label.
df.loc[dates[0], 'A']

0.7570459269930234

In [30]:
# Same scalar lookup through .at, the scalar-only label accessor.
df.at[dates[0], 'A']

0.7570459269930234

### 由位置选择 

In [31]:
# Row at integer position 3, returned as a Series.
df.iloc[3]

A    0.936792
B    0.537822
C   -0.036998
D   -0.120660
Name: 2013-01-04 00:00:00, dtype: float64

In [32]:
# Rows at positions 3-4 and columns at positions 0-1 (slice ends are exclusive).
df.iloc[3:5, :2]

Unnamed: 0,A,B
2013-01-04,0.936792,0.537822
2013-01-05,-0.542985,1.628279


In [33]:
# Explicit lists of integer positions: rows 1, 2, 4 and columns 0, 2.
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2013-01-02,-0.5111,1.680816
2013-01-03,-1.681077,1.660315
2013-01-05,-0.542985,-0.155603


In [34]:
# All rows, columns at positions 1 and 2.
df.iloc[:, 1:3]

Unnamed: 0,B,C
2013-01-01,-0.663341,2.241457
2013-01-02,-0.573211,1.680816
2013-01-03,0.369396,1.660315
2013-01-04,0.537822,-0.036998
2013-01-05,1.628279,-0.155603
2013-01-06,0.846518,0.793445


In [35]:
# Scalar at row position 1, column position 1.
df.iloc[1, 1]

-0.5732109001186042

In [36]:
# Same scalar through .iat, the scalar-only positional accessor.
df.iat[1, 1]

-0.5732109001186042

### 布尔索引

In [37]:
# Keep only the rows whose A value is positive.
df[df['A'].gt(0)]

Unnamed: 0,A,B,C,D
2013-01-01,0.757046,-0.663341,2.241457,0.035135
2013-01-04,0.936792,0.537822,-0.036998,-0.12066


In [38]:
# Keep the positive cells; every other cell becomes NaN.
df.where(df > 0)

Unnamed: 0,A,B,C,D
2013-01-01,0.757046,,2.241457,0.035135
2013-01-02,,,1.680816,
2013-01-03,,0.369396,1.660315,2.152265
2013-01-04,0.936792,0.537822,,
2013-01-05,,1.628279,,0.365178
2013-01-06,,0.846518,0.793445,1.21606


In [41]:
# Copy of df with an extra string column E (df itself is left untouched).
df2 = df.assign(E=['one', 'one', 'two', 'three', 'four', 'three'])
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,0.757046,-0.663341,2.241457,0.035135,one
2013-01-02,-0.5111,-0.573211,1.680816,-0.895496,one
2013-01-03,-1.681077,0.369396,1.660315,2.152265,two
2013-01-04,0.936792,0.537822,-0.036998,-0.12066,three
2013-01-05,-0.542985,1.628279,-0.155603,0.365178,four
2013-01-06,-2.07761,0.846518,0.793445,1.21606,three


In [46]:
# Keep only the rows whose E label is 'two' or 'four'; df2 is rebound to the
# filtered frame.
df2 = df2.loc[df2['E'].isin(['two', 'four'])]
df2

Unnamed: 0,A,B,C,D,E
2013-01-03,-1.681077,0.369396,1.660315,2.152265,two
2013-01-05,-0.542985,1.628279,-0.155603,0.365178,four


### 设置 

In [52]:
# A Series indexed by six daily dates starting 2013-01-02, i.e. one day later
# than `dates`.
# NOTE(review): the original comment described setting this as a new column
# aligned on the index, but this cell never assigns s1 to df.
s1 = pd.Series(list(range(1, 7)), index=pd.date_range(start='20130102', periods=6))
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [56]:
# Set a single value by label.
df.at[dates[0], 'A'] = 0
df

Unnamed: 0,A,B,C,D
2013-01-01,0.0,-0.663341,2.241457,0.035135
2013-01-02,-0.5111,-0.573211,1.680816,-0.895496
2013-01-03,-1.681077,0.369396,1.660315,2.152265
2013-01-04,0.936792,0.537822,-0.036998,-0.12066
2013-01-05,-0.542985,1.628279,-0.155603,0.365178
2013-01-06,-2.07761,0.846518,0.793445,1.21606


In [60]:
# Set a single value by integer position (row 0, column 1).
df.iat[0, 1] = 0
df

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,2.241457,0.035135
2013-01-02,-0.5111,-0.573211,1.680816,-0.895496
2013-01-03,-1.681077,0.369396,1.660315,2.152265
2013-01-04,0.936792,0.537822,-0.036998,-0.12066
2013-01-05,-0.542985,1.628279,-0.155603,0.365178
2013-01-06,-2.07761,0.846518,0.793445,1.21606


In [66]:
# Replace column D with an integer array of 5s.
# Assigning through df['D'] swaps in the new array, so D takes the array's
# integer dtype. `df.loc[:, 'D'] = ...` writes into the existing float column
# on pandas >= 2.0, which would leave D as 5.0 instead of the 5 shown below.
df['D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,2.241457,5
2013-01-02,-0.5111,-0.573211,1.680816,5
2013-01-03,-1.681077,0.369396,1.660315,5
2013-01-04,0.936792,0.537822,-0.036998,5
2013-01-05,-0.542985,1.628279,-0.155603,5
2013-01-06,-2.07761,0.846518,0.793445,5


In [67]:
# A where operation with setting: on a copy of df, every positive cell is
# replaced by its negation, so no value in df2 ends up positive.
df2 = df.copy()
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,-2.241457,-5
2013-01-02,-0.5111,-0.573211,-1.680816,-5
2013-01-03,-1.681077,-0.369396,-1.660315,-5
2013-01-04,-0.936792,-0.537822,-0.036998,-5
2013-01-05,-0.542985,-1.628279,-0.155603,-5
2013-01-06,-2.07761,-0.846518,-0.793445,-5


## 缺失数据

In [68]:
# Reindex to the first four dates and add a new, initially empty column E,
# then fill E for the first two dates only.
df1 = df.reindex(index=dates[:4], columns=[*df.columns, 'E'])
df1.loc[dates[0]:dates[1], 'E'] = 1
df1

Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.0,2.241457,5,1.0
2013-01-02,-0.5111,-0.573211,1.680816,5,1.0
2013-01-03,-1.681077,0.369396,1.660315,5,
2013-01-04,0.936792,0.537822,-0.036998,5,


In [69]:
# Drop every row that has at least one missing value.
df1.dropna(axis=0, how='any')

Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.0,2.241457,5,1.0
2013-01-02,-0.5111,-0.573211,1.680816,5,1.0


In [70]:
# Replace missing values with 5.
df1.fillna(5)

Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.0,2.241457,5,1.0
2013-01-02,-0.5111,-0.573211,1.680816,5,1.0
2013-01-03,-1.681077,0.369396,1.660315,5,5.0
2013-01-04,0.936792,0.537822,-0.036998,5,5.0


In [71]:
# Boolean mask marking which cells are missing.
df1.isna()

Unnamed: 0,A,B,C,D,E
2013-01-01,False,False,False,False,False
2013-01-02,False,False,False,False,False
2013-01-03,False,False,False,False,True
2013-01-04,False,False,False,False,True


## 运算
### 统计 

In [72]:
# Mean of each column.
df.mean(axis=0)

A   -0.645997
B    0.468134
C    1.030572
D    5.000000
dtype: float64

In [73]:
# Mean of each row (axis named explicitly rather than passed positionally).
df.mean(axis=1)

2013-01-01    1.810364
2013-01-02    1.399126
2013-01-03    1.337158
2013-01-04    1.609404
2013-01-05    1.482423
2013-01-06    1.140588
Freq: D, dtype: float64

In [78]:
# Six values, one of them missing, indexed by `dates`.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8], index=dates)
s

2013-01-01    1.0
2013-01-02    3.0
2013-01-03    5.0
2013-01-04    NaN
2013-01-05    6.0
2013-01-06    8.0
Freq: D, dtype: float64

In [79]:
# Shift the values two positions down the index; the first two slots become NaN.
# The Series is rebuilt from its literal data instead of `s = s.shift(2)`, so
# re-running this cell cannot shift an already-shifted s a second time.
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
s

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [80]:
# TODO: study further.
# Subtract s from every column of df, matching on the row index; rows where
# s is NaN come out as NaN.
df.sub(s, axis=0)

Unnamed: 0,A,B,C,D
2013-01-01,,,,
2013-01-02,,,,
2013-01-03,-2.681077,-0.630604,0.660315,4.0
2013-01-04,-2.063208,-2.462178,-3.036998,2.0
2013-01-05,-5.542985,-3.371721,-5.155603,0.0
2013-01-06,,,,


### Apply

In [81]:
# Apply np.cumsum to each column: running totals down the rows.
df.apply(np.cumsum, axis=0)

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,2.241457,5
2013-01-02,-0.5111,-0.573211,3.922273,10
2013-01-03,-2.192177,-0.203815,5.582587,15
2013-01-04,-1.255385,0.334007,5.545589,20
2013-01-05,-1.798369,1.962287,5.389987,25
2013-01-06,-3.875979,2.808805,6.183432,30


In [82]:
# Redisplay df: the apply() call returned a new frame and left df unchanged.
df

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,2.241457,5
2013-01-02,-0.5111,-0.573211,1.680816,5
2013-01-03,-1.681077,0.369396,1.660315,5
2013-01-04,0.936792,0.537822,-0.036998,5
2013-01-05,-0.542985,1.628279,-0.155603,5
2013-01-06,-2.07761,0.846518,0.793445,5


In [83]:
# Range (max minus min) of each column.
df.apply(lambda col: col.max() - col.min())

A    3.014402
B    2.201490
C    2.397059
D    0.000000
dtype: float64

### Histogramming

In [85]:
# Ten integers drawn from 0..6 (the upper bound 7 is exclusive).
# A seeded generator makes the draw, and any counts computed from it,
# repeatable. The outputs saved in this notebook came from an unseeded run
# and will be refreshed the next time the notebook is executed.
s = pd.Series(np.random.default_rng(seed=0).integers(0, 7, size=10))
s

0    5
1    2
2    1
3    3
4    3
5    3
6    3
7    2
8    2
9    1
dtype: int64

In [86]:
# Count how many times each distinct value occurs.
s.value_counts()

3    4
2    3
1    2
5    1
dtype: int64

### 字符串方法 

In [87]:
# Strings of mixed case plus one missing entry, used to demonstrate .str methods.
s = pd.Series(data=['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s

0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object

In [88]:
# Lower-case every string through the .str accessor; the missing entry stays NaN.
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object