In [2]:
import pandas as pd
import numpy as np

## Quick review of official "10 minutes to pandas"

#### 创建

Series 和 DataFrame

In [3]:
# Build a Series from a list; pandas assigns a default integer index.
# The NaN entry is what makes the resulting dtype float64 (see output).
s = pd.Series(data=[1, 3, 5, np.nan, 7])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    7.0
dtype: float64

In [6]:
# Create a DataFrame from a NumPy array: six daily timestamps as the row
# index and four columns labelled A-D, filled with random normal values.
dates = pd.date_range(start="20210101", periods=6)
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=["A", "B", "C", "D"],
)
df

Unnamed: 0,A,B,C,D
2021-01-01,0.627445,0.91957,-1.187726,-0.101283
2021-01-02,-0.018015,-1.452471,-0.897048,2.337127
2021-01-03,-0.550587,0.16776,-0.883465,-0.92323
2021-01-04,-0.15073,0.631339,-1.545795,0.008567
2021-01-05,0.752789,-0.506569,0.136119,0.62667
2021-01-06,0.476722,-1.062432,0.988346,0.543079


In [8]:
# A DataFrame can also be built from a dict of column-like values
# (original author's note: this approach is not recommended).
# Scalars are broadcast to the length of the array-like columns.
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp("20210101"),
        C=pd.Series(1, index=list(range(4)), dtype="float32"),
        D=np.array([3, 3, 3, 3], dtype="int32"),
        E=pd.Categorical(["test", "train", "test", "train"]),
        F="foo",
    )
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2021-01-01,1.0,3,test,foo
1,1.0,2021-01-01,1.0,3,train,foo
2,1.0,2021-01-01,1.0,3,test,foo
3,1.0,2021-01-01,1.0,3,train,foo


In [9]:
# Inspect the dtype of each column; every column of df2 has its own dtype.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

#### 查看数据


In [10]:
# Show the first five rows (five is the default for head()).
df.head(n=5)

Unnamed: 0,A,B,C,D
2021-01-01,0.627445,0.91957,-1.187726,-0.101283
2021-01-02,-0.018015,-1.452471,-0.897048,2.337127
2021-01-03,-0.550587,0.16776,-0.883465,-0.92323
2021-01-04,-0.15073,0.631339,-1.545795,0.008567
2021-01-05,0.752789,-0.506569,0.136119,0.62667


In [11]:
# Show the last three rows.
df.tail(n=3)

Unnamed: 0,A,B,C,D
2021-01-04,-0.15073,0.631339,-1.545795,0.008567
2021-01-05,0.752789,-0.506569,0.136119,0.62667
2021-01-06,0.476722,-1.062432,0.988346,0.543079


In [12]:
# The row labels of df (the DatetimeIndex built from `dates`).
df.index

DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
               '2021-01-05', '2021-01-06'],
              dtype='datetime64[ns]', freq='D')

In [13]:
# The column labels of df.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [14]:
# Use to_numpy() with care: when a frame holds non-numeric / mixed dtypes the
# whole result is converted to an object array, which can be expensive
# (author's note). Here every column is float64, so the result is a float array.
df.to_numpy()

array([[ 0.62744546,  0.91957012, -1.18772611, -0.10128255],
       [-0.01801508, -1.45247104, -0.89704784,  2.33712747],
       [-0.5505865 ,  0.16776019, -0.88346494, -0.92323035],
       [-0.15073028,  0.63133913, -1.54579518,  0.00856726],
       [ 0.75278865, -0.50656872,  0.13611883,  0.62666996],
       [ 0.4767224 , -1.06243176,  0.98834608,  0.54307858]])

In [15]:
# df2 mixes floats, timestamps, ints, categoricals and strings, so the
# resulting array falls back to dtype=object (see output).
df2.to_numpy()

array([[1.0, Timestamp('2021-01-01 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2021-01-01 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2021-01-01 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2021-01-01 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [16]:
# Quick statistical summary per column: count, mean, std, min, quartiles, max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.189604,-0.217134,-0.564928,0.415155
std,0.509536,0.947049,0.945079,1.09347
min,-0.550587,-1.452471,-1.545795,-0.92323
25%,-0.117551,-0.923466,-1.115057,-0.07382
50%,0.229354,-0.169404,-0.890256,0.275823
75%,0.589765,0.515444,-0.118777,0.605772
max,0.752789,0.91957,0.988346,2.337127


In [18]:
df.transpose()  # same as df.T: rows and columns are swapped

Unnamed: 0,2021-01-01,2021-01-02,2021-01-03,2021-01-04,2021-01-05,2021-01-06
A,0.627445,-0.018015,-0.550587,-0.15073,0.752789,0.476722
B,0.91957,-1.452471,0.16776,0.631339,-0.506569,-1.062432
C,-1.187726,-0.897048,-0.883465,-1.545795,0.136119,0.988346
D,-0.101283,2.337127,-0.92323,0.008567,0.62667,0.543079


In [20]:
# Sort the rows by their index labels, newest date first.
df.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C,D
2021-01-06,0.476722,-1.062432,0.988346,0.543079
2021-01-05,0.752789,-0.506569,0.136119,0.62667
2021-01-04,-0.15073,0.631339,-1.545795,0.008567
2021-01-03,-0.550587,0.16776,-0.883465,-0.92323
2021-01-02,-0.018015,-1.452471,-0.897048,2.337127
2021-01-01,0.627445,0.91957,-1.187726,-0.101283


In [21]:
# Sort the rows by the values in column "B" (ascending).
# `sort_index(by=...)` was deprecated and has since been removed from pandas;
# `sort_values(by=...)` is the supported way to sort by column values.
df.sort_values(by="B")

  """Entry point for launching an IPython kernel.


Unnamed: 0,A,B,C,D
2021-01-02,-0.018015,-1.452471,-0.897048,2.337127
2021-01-06,0.476722,-1.062432,0.988346,0.543079
2021-01-05,0.752789,-0.506569,0.136119,0.62667
2021-01-03,-0.550587,0.16776,-0.883465,-0.92323
2021-01-04,-0.15073,0.631339,-1.545795,0.008567
2021-01-01,0.627445,0.91957,-1.187726,-0.101283


#### 选择
这里讲的是如何定位想要的数据。
建议使用 `.at`、`.iat`、`.loc`、`.iloc` 这四个访问器，其它方式这里就不写了。

##### loc : Selection by label 名称

In [22]:
# Select a single row by its label; the result is a Series indexed by column name.
df.loc[dates[0]]

A    0.627445
B    0.919570
C   -1.187726
D   -0.101283
Name: 2021-01-01 00:00:00, dtype: float64

In [26]:
# Keep the axes in mind: axis 0 is rows, axis 1 is columns.
# `:` keeps every row; the list selects columns A and B by label.
df.loc[:, ["A", "B"]]

Unnamed: 0,A,B
2021-01-01,0.627445,0.91957
2021-01-02,-0.018015,-1.452471
2021-01-03,-0.550587,0.16776
2021-01-04,-0.15073,0.631339
2021-01-05,0.752789,-0.506569
2021-01-06,0.476722,-1.062432


In [27]:
# Label slicing works too; with .loc both endpoints are included
# (rows 2021-01-02 through 2021-01-05 in the output).
df.loc["20210102":"20210105", ["B", "C"]]


Unnamed: 0,B,C
2021-01-02,-1.452471,-0.897048
2021-01-03,0.16776,-0.883465
2021-01-04,0.631339,-1.545795
2021-01-05,-0.506569,0.136119


In [28]:
# Row label + column label gives a single scalar value.
df.loc[dates[0], "A"]

0.6274454575081495

In [32]:
# .at is the scalar-only accessor by label; it returns the same value as the
# .loc lookup in the previous cell.
df.at[dates[0], "A"]

0.6274454575081495

##### iloc: Selection by position 即整数位置（从 0 开始，不是索引标签）
同样还是记得axis这个概念。很重要。

In [33]:
# Select the row at integer position 3 (the fourth row, 2021-01-04).
df.iloc[3]

A   -0.150730
B    0.631339
C   -1.545795
D    0.008567
Name: 2021-01-04 00:00:00, dtype: float64

In [34]:
# Difference from .loc: .loc slices include both endpoints, whereas .iloc
# slices exclude the stop position (rows 3-4 and columns 0-1 here).
df.iloc[3:5, :2]

Unnamed: 0,A,B
2021-01-04,-0.15073,0.631339
2021-01-05,0.752789,-0.506569


In [36]:
# Lists of integer positions pick arbitrary rows and columns; the column
# order in the result follows the list (D first, then A).
df.iloc[[1,2,4], [3, 0]]

Unnamed: 0,D,A
2021-01-02,2.337127,-0.018015
2021-01-03,-0.92323,-0.550587
2021-01-05,0.62667,0.752789


In [37]:
# Rows at positions 1 and 2, all columns.
df.iloc[1:3]

Unnamed: 0,A,B,C,D
2021-01-02,-0.018015,-1.452471,-0.897048,2.337127
2021-01-03,-0.550587,0.16776,-0.883465,-0.92323


In [39]:
df.iat[1, 1]  # same scalar as df.iloc[1, 1]

-1.4524710433876127

##### 布尔型 indexing
通过判断条件来选择数据。
对于简单的数据，通过这种筛选可以更有效地看出一些初步的差异。


In [40]:
# Keep only the rows whose value in column A is positive.
df.loc[df["A"] > 0]

Unnamed: 0,A,B,C,D
2021-01-01,0.627445,0.91957,-1.187726,-0.101283
2021-01-05,0.752789,-0.506569,0.136119,0.62667
2021-01-06,0.476722,-1.062432,0.988346,0.543079


In [41]:
# Element-wise filter: keep positive entries, everything else becomes NaN.
df.where(df > 0)

Unnamed: 0,A,B,C,D
2021-01-01,0.627445,0.91957,,
2021-01-02,,,,2.337127
2021-01-03,,0.16776,,
2021-01-04,,0.631339,,0.008567
2021-01-05,0.752789,,0.136119,0.62667
2021-01-06,0.476722,,0.988346,0.543079


In [43]:
# Derive a new frame from df with an extra string column E; df itself is
# not modified.
df2 = df.assign(E=["one", "one", "two", "three", "four", "three"])
df2

Unnamed: 0,A,B,C,D,E
2021-01-01,0.627445,0.91957,-1.187726,-0.101283,one
2021-01-02,-0.018015,-1.452471,-0.897048,2.337127,one
2021-01-03,-0.550587,0.16776,-0.883465,-0.92323,two
2021-01-04,-0.15073,0.631339,-1.545795,0.008567,three
2021-01-05,0.752789,-0.506569,0.136119,0.62667,four
2021-01-06,0.476722,-1.062432,0.988346,0.543079,three


In [44]:
# isin() tests membership against several values at once; use the resulting
# boolean Series to keep the matching rows.
df2.loc[df2["E"].isin(["two", "four"])]

Unnamed: 0,A,B,C,D,E
2021-01-03,-0.550587,0.16776,-0.883465,-0.92323,two
2021-01-05,0.752789,-0.506569,0.136119,0.62667,four


##### Setting
设置数据。这同样是变换数据时必不可少的一步。

In [45]:
# Work on a copy of df so the assignment below does not touch df.
df2 = df.copy()
# Boolean-mask assignment: every positive entry is replaced by its negation,
# so all values in the displayed result are negative.
df2[df2>0] = -df2
df2

Unnamed: 0,A,B,C,D
2021-01-01,-0.627445,-0.91957,-1.187726,-0.101283
2021-01-02,-0.018015,-1.452471,-0.897048,-2.337127
2021-01-03,-0.550587,-0.16776,-0.883465,-0.92323
2021-01-04,-0.15073,-0.631339,-1.545795,-0.008567
2021-01-05,-0.752789,-0.506569,-0.136119,-0.62667
2021-01-06,-0.476722,-1.062432,-0.988346,-0.543079


#### Missing data
缺失数据的处理


In [46]:
# Reindex to the first four dates and add a new column E, which starts out
# entirely NaN because df has no such column.
df1 = df.reindex(index=dates[:4], columns=[*df.columns, "E"])
# Fill E for the first two dates only; the remaining two rows stay missing.
df1.loc[dates[:2], "E"] = 1
df1

Unnamed: 0,A,B,C,D,E
2021-01-01,0.627445,0.91957,-1.187726,-0.101283,1.0
2021-01-02,-0.018015,-1.452471,-0.897048,2.337127,1.0
2021-01-03,-0.550587,0.16776,-0.883465,-0.92323,
2021-01-04,-0.15073,0.631339,-1.545795,0.008567,


In [47]:
# Drop every row that contains at least one missing value.
df1.dropna(axis=0, how="any")

Unnamed: 0,A,B,C,D,E
2021-01-01,0.627445,0.91957,-1.187726,-0.101283,1.0
2021-01-02,-0.018015,-1.452471,-0.897048,2.337127,1.0


In [48]:
# Display df1 with missing values replaced by 5. The result is not assigned,
# so df1 itself keeps its NaN entries.
df1.fillna(5)

Unnamed: 0,A,B,C,D,E
2021-01-01,0.627445,0.91957,-1.187726,-0.101283,1.0
2021-01-02,-0.018015,-1.452471,-0.897048,2.337127,1.0
2021-01-03,-0.550587,0.16776,-0.883465,-0.92323,5.0
2021-01-04,-0.15073,0.631339,-1.545795,0.008567,5.0


In [51]:
# pd.isna returns a boolean mask that is True wherever df1 has a missing value.
# Column E still shows True for the last two rows: the fillna(value=5) call in
# the previous cell only displayed a filled result and was never assigned back,
# so df1 itself was left unchanged.
pd.isna(df1)

Unnamed: 0,A,B,C,D,E
2021-01-01,False,False,False,False,False
2021-01-02,False,False,False,False,False
2021-01-03,False,False,False,False,True
2021-01-04,False,False,False,False,True


#### Operations 操作

##### 统计
这里简单跟着官方文档写下。后边会深入学习。

In [52]:
# Mean along axis 0: one value per column.
df.mean(axis=0)

A    0.189604
B   -0.217134
C   -0.564928
D    0.415155
dtype: float64

In [53]:
# Mean along axis 1: one value per row (per date).
df.mean(axis=1)

2021-01-01    0.064502
2021-01-02   -0.007602
2021-01-03   -0.547380
2021-01-04   -0.264155
2021-01-05    0.252252
2021-01-06    0.236429
Freq: D, dtype: float64

In [54]:
# A Series on the same date index, shifted down by two positions: the first
# two entries become NaN and the last two original values fall off the end.
s = pd.Series(data=[1, 3, 5, np.nan, 9, 11], index=dates).shift(periods=2)
s

2021-01-01    NaN
2021-01-02    NaN
2021-01-03    1.0
2021-01-04    3.0
2021-01-05    5.0
2021-01-06    NaN
Freq: D, dtype: float64

In [60]:
# Subtract s from every column of df, aligning on the row index; rows where
# s is NaN come out as NaN.
df.sub(s, axis=0)

Unnamed: 0,A,B,C,D
2021-01-01,,,,
2021-01-02,,,,
2021-01-03,-1.550587,-0.83224,-1.883465,-1.92323
2021-01-04,-3.15073,-2.368661,-4.545795,-2.991433
2021-01-05,-4.247211,-5.506569,-4.863881,-4.37333
2021-01-06,,,,


##### Apply
算是最有意思的一个方法了。用于处理数据。

In [61]:
# Apply np.cumsum to each column: running total down the rows.
df.apply(np.cumsum, axis=0)


Unnamed: 0,A,B,C,D
2021-01-01,0.627445,0.91957,-1.187726,-0.101283
2021-01-02,0.60943,-0.532901,-2.084774,2.235845
2021-01-03,0.058844,-0.365141,-2.968239,1.312615
2021-01-04,-0.091886,0.266198,-4.514034,1.321182
2021-01-05,0.660902,-0.24037,-4.377915,1.947852
2021-01-06,1.137625,-1.302802,-3.389569,2.49093


In [62]:
# Per-column range (max minus min).
df.apply(lambda col: col.max() - col.min())

A    1.303375
B    2.372041
C    2.534141
D    3.260358
dtype: float64

#### Merge 合并
有两种方式：一个是 concat，一个是 join。
这部分也经常用到。


In [64]:
# Rebind df to a fresh 10x4 frame of random normal values with default
# integer row and column labels (this replaces the date-indexed df above).
df = pd.DataFrame(data=np.random.randn(10, 4))
df

Unnamed: 0,0,1,2,3
0,-2.182904,-1.950158,-0.860137,-1.059878
1,1.40091,-1.562209,0.354533,-0.701314
2,-0.125271,-0.261585,-0.975121,-0.396658
3,1.156702,0.233368,1.193686,0.601394
4,-0.223079,1.102356,0.465808,0.461869
5,1.935319,-0.432694,1.010228,-2.010836
6,0.589509,-0.113083,-0.260055,-1.000811
7,-0.461179,0.004107,1.596869,0.124233
8,0.626396,-0.246509,0.844803,-0.01773
9,0.65268,0.260238,-0.275766,-0.266441


In [65]:
# Split df into three consecutive row chunks (rows 0-2, 3-6 and 7-9).
pieces = [df.iloc[:3], df.iloc[3:7], df.iloc[7:]]
pieces

[          0         1         2         3
 0 -2.182904 -1.950158 -0.860137 -1.059878
 1  1.400910 -1.562209  0.354533 -0.701314
 2 -0.125271 -0.261585 -0.975121 -0.396658,
           0         1         2         3
 3  1.156702  0.233368  1.193686  0.601394
 4 -0.223079  1.102356  0.465808  0.461869
 5  1.935319 -0.432694  1.010228 -2.010836
 6  0.589509 -0.113083 -0.260055 -1.000811,
           0         1         2         3
 7 -0.461179  0.004107  1.596869  0.124233
 8  0.626396 -0.246509  0.844803 -0.017730
 9  0.652680  0.260238 -0.275766 -0.266441]

In [66]:
# Stack the chunks back together row-wise, reproducing the original frame.
pd.concat(pieces, axis=0)

Unnamed: 0,0,1,2,3
0,-2.182904,-1.950158,-0.860137,-1.059878
1,1.40091,-1.562209,0.354533,-0.701314
2,-0.125271,-0.261585,-0.975121,-0.396658
3,1.156702,0.233368,1.193686,0.601394
4,-0.223079,1.102356,0.465808,0.461869
5,1.935319,-0.432694,1.010228,-2.010836
6,0.589509,-0.113083,-0.260055,-1.000811
7,-0.461179,0.004107,1.596869,0.124233
8,0.626396,-0.246509,0.844803,-0.01773
9,0.65268,0.260238,-0.275766,-0.266441
