### pandas 基本介绍

In [8]:
import pandas as pd
import numpy as np

# A one-dimensional Series; the printed dtype is float64 because of the NaN entry.
s = pd.Series(data=[1, 3, 6, np.nan, 22, 33])
print(s)

# Six consecutive daily timestamps starting at 2022-08-29.
dates = pd.date_range(start='20220829', periods=6)
print(dates)

# 6x4 frame of standard-normal samples: one row per date, columns 'a'..'d'.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list('abcd'),
)

0     1.0
1     3.0
2     6.0
3     NaN
4    22.0
5    33.0
dtype: float64
DatetimeIndex(['2022-08-29', '2022-08-30', '2022-08-31', '2022-09-01',
               '2022-09-02', '2022-09-03'],
              dtype='datetime64[ns]', freq='D')


In [9]:
# Display the random 6x4 frame built above. No seed is set in the visible
# cells, so the numbers shown will differ from run to run.
df

Unnamed: 0,a,b,c,d
2022-08-29,-0.19918,1.78225,-0.373382,1.070211
2022-08-30,1.404726,0.409733,0.826053,0.04419
2022-08-31,-1.983693,-0.095073,-0.213718,2.413526
2022-09-01,1.84245,-0.628111,0.276365,-1.078278
2022-09-02,0.778712,-2.067941,-1.184949,0.521206
2022-09-03,2.95559,-0.885913,-1.010588,0.789547


In [20]:
# Build a frame from a mapping of column label -> column content.
# Scalars ('A', 'B', 'F') are repeated on every row; 'C', 'D' and 'E'
# each supply four values of an explicitly chosen dtype.
df2 = pd.DataFrame(dict(
    A=1,
    B=pd.Timestamp('20220829'),
    C=pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
    D=np.full(4, 3, dtype=np.int32),
    E=pd.Categorical(['test', 'train'] * 2),
    F='foo',
))

In [21]:
# Display the frame: the scalar entries 'A', 'B' and 'F' appear on all four rows.
df2

Unnamed: 0,A,B,C,D,E,F
0,1,2022-08-29,1.0,3,test,foo
1,1,2022-08-29,1.0,3,train,foo
2,1,2022-08-29,1.0,3,test,foo
3,1,2022-08-29,1.0,3,train,foo


In [22]:
# One dtype per column; 'C', 'D' and 'E' keep the float32 / int32 / category
# dtypes they were constructed with.
df2.dtypes

A             int64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [23]:
# Row labels of the frame (0..3 here).
df2.index

Int64Index([0, 1, 2, 3], dtype='int64')

In [24]:
# Column labels of the frame ('A'..'F' here).
df2.columns

Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

In [25]:
# Convert the frame to a NumPy array. The columns have mixed dtypes, so the
# result is an object array. to_numpy() is the accessor the pandas
# documentation recommends over the older .values attribute; the array
# produced for this frame is the same.
df2.to_numpy()

array([[1, Timestamp('2022-08-29 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1, Timestamp('2022-08-29 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1, Timestamp('2022-08-29 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1, Timestamp('2022-08-29 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [26]:
# Summary statistics (count, mean, std, min, quartiles, max). Only the
# numeric columns A, C and D appear in the result shown below.
df2.describe()

Unnamed: 0,A,C,D
count,4.0,4.0,4.0
mean,1.0,1.0,3.0
std,0.0,0.0,0.0
min,1.0,1.0,3.0
25%,1.0,1.0,3.0
50%,1.0,1.0,3.0
75%,1.0,1.0,3.0
max,1.0,1.0,3.0


In [27]:
# Transpose: the column labels A..F become the rows and the row labels 0..3
# become the columns.
df2.T

Unnamed: 0,0,1,2,3
A,1,1,1,1
B,2022-08-29 00:00:00,2022-08-29 00:00:00,2022-08-29 00:00:00,2022-08-29 00:00:00
C,1.0,1.0,1.0,1.0
D,3,3,3,3
E,test,train,test,train
F,foo,foo,foo,foo


In [31]:
# axis=1 sorts by column label; descending order gives F, E, D, C, B, A.
df2.sort_index(axis=1, ascending=False)

Unnamed: 0,F,E,D,C,B,A
0,foo,test,3,1.0,2022-08-29,1
1,foo,train,3,1.0,2022-08-29,1
2,foo,test,3,1.0,2022-08-29,1
3,foo,train,3,1.0,2022-08-29,1


In [32]:
# axis=0 sorts by row label; descending order gives 3, 2, 1, 0.
df2.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C,D,E,F
3,1,2022-08-29,1.0,3,train,foo
2,1,2022-08-29,1.0,3,test,foo
1,1,2022-08-29,1.0,3,train,foo
0,1,2022-08-29,1.0,3,test,foo


In [33]:
# Order the rows by the values in column 'E': the 'test' rows (0, 2) come
# before the 'train' rows (1, 3).
df2.sort_values(by='E')

Unnamed: 0,A,B,C,D,E,F
0,1,2022-08-29,1.0,3,test,foo
2,1,2022-08-29,1.0,3,test,foo
1,1,2022-08-29,1.0,3,train,foo
3,1,2022-08-29,1.0,3,train,foo


### 选择数据

In [37]:
# Rebuild `dates` and `df` for the selection examples: the integers 0..23
# laid out as 6 rows x 4 columns, one row per day starting 2022-08-29.
dates = pd.date_range(start='20220829', periods=6)
df = pd.DataFrame(
    data=np.arange(6 * 4).reshape((6, 4)),
    index=dates,
    columns=list('ABCD'),
)

In [38]:
# Display the frame: the values 0..23 filled in row by row.
df

Unnamed: 0,A,B,C,D
2022-08-29,0,1,2,3
2022-08-30,4,5,6,7
2022-08-31,8,9,10,11
2022-09-01,12,13,14,15
2022-09-02,16,17,18,19
2022-09-03,20,21,22,23


In [49]:
# Two spellings of single-column selection; both print the same Series.
print(df['A'])  # bracket access by column label
print(df.A)  # attribute access to the same column

2022-08-29     0
2022-08-30     4
2022-08-31     8
2022-09-01    12
2022-09-02    16
2022-09-03    20
Freq: D, Name: A, dtype: int32
2022-08-29     0
2022-08-30     4
2022-08-31     8
2022-09-01    12
2022-09-02    16
2022-09-03    20
Freq: D, Name: A, dtype: int32


In [58]:
# Row slicing directly on the frame; both calls print the same three rows.
print(df[0:3])  # positional slice: rows 0, 1, 2 (the stop position is excluded)
print(df['20220829':'20220831'])  # label slice on the dates: the end label is included

            A  B   C   D
2022-08-29  0  1   2   3
2022-08-30  4  5   6   7
2022-08-31  8  9  10  11
            A  B   C   D
2022-08-29  0  1   2   3
2022-08-30  4  5   6   7
2022-08-31  8  9  10  11


### select by label: loc

In [75]:
# Select a single row by its date label; the result is a Series indexed by
# the column names.
df.loc['20220829']

A    0
B    1
C    2
D    3
Name: 2022-08-29 00:00:00, dtype: int32

In [53]:
# Select columns: every row (':'), but only the columns labelled 'A' and 'B'.
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2022-08-29,0,1
2022-08-30,4,5
2022-08-31,8,9
2022-09-01,12,13
2022-09-02,16,17
2022-09-03,20,21


In [59]:
# Combine both: one row label plus a list of column labels gives a Series
# holding the 'A' and 'B' entries of that row.
df.loc['20220829', ['A', 'B']]

A    0
B    1
Name: 2022-08-29 00:00:00, dtype: int32

### select by position: iloc

In [76]:
# Row at integer position 3 (the fourth row, labelled 2022-09-01).
df.iloc[3]

A    12
B    13
C    14
D    15
Name: 2022-09-01 00:00:00, dtype: int32

In [62]:
# Scalar lookup by position: row 3, column 1 (the value 13 in column 'B').
# The chained form df.iloc[3].iloc[1] reaches the same cell in two steps.
df.iloc[3, 1]

13

In [63]:
# Positional slices on both axes: rows 3-4 and columns 1-2 (stop positions
# are excluded).
df.iloc[3:5, 1:3]

Unnamed: 0,B,C
2022-09-01,13,14
2022-09-02,17,18


In [64]:
# An explicit list of row positions combined with a column slice.
df.iloc[[1, 3, 5], 1:3]

Unnamed: 0,B,C
2022-08-30,5,6
2022-09-01,13,14
2022-09-03,21,22


In [65]:
 df  # show the whole frame again as a reference for the boolean-indexing cells

Unnamed: 0,A,B,C,D
2022-08-29,0,1,2,3
2022-08-30,4,5,6,7
2022-08-31,8,9,10,11
2022-09-01,12,13,14,15
2022-09-02,16,17,18,19
2022-09-03,20,21,22,23


### boolean indexing

In [77]:
# Element-wise comparison producing a boolean Series with one entry per row;
# df.loc[:, 'A'] > 8 is an equivalent spelling.
df['A'] > 8

2022-08-29    False
2022-08-30    False
2022-08-31    False
2022-09-01     True
2022-09-02     True
2022-09-03     True
Freq: D, Name: A, dtype: bool

In [74]:
# Use the boolean Series as a row mask: only rows whose 'A' value exceeds 8
# are kept.
df[df['A'] > 8]

Unnamed: 0,A,B,C,D
2022-09-01,12,13,14,15
2022-09-02,16,17,18,19
2022-09-03,20,21,22,23
