In [2]:
import pandas as pd

pd.__version__

'0.24.2'

In [3]:
pd.__file__

'E:\\setup\\Anaconda3\\lib\\site-packages\\pandas\\__init__.py'

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [5]:
# Limit how many rows pandas shows when an object is displayed in a cell
pd.options.display.max_rows = 10

In [6]:
pd?

# Pandas 对象简介

## Series对象

In [7]:
p0 = pd.Series([1,2,3])
p0

0    1
1    2
2    3
dtype: int64

In [8]:
p0.index

RangeIndex(start=0, stop=3, step=1)

In [9]:
n1 = p0.to_numpy()
n1

array([1, 2, 3], dtype=int64)

In [10]:
# Overwrite every element of the NumPy array in place.  The p0 displayed in the
# output below changes as well, i.e. n1 shares its data with p0 here.
n1[:] = -1
p0

0   -1
1   -1
2   -1
dtype: int64

### Series 是特殊的Numpy数组
NumPy数组使用隐式的整数索引, 而`Series`的索引不仅可以是整数, 还可以是任何可哈希(hashable)类型的值.

In [11]:
# Series whose labels are the strings 'a'..'d' instead of the default 0..3.
data = pd.Series(
    data=[0.1, 0.2, 0.3, 0.4],
    index=list("abcd"),
)
data

a    0.1
b    0.2
c    0.3
d    0.4
dtype: float64

In [12]:
data['c']

0.3

In [13]:
data['a':'c']

a    0.1
b    0.2
c    0.3
dtype: float64

In [14]:
data = pd.Series([0.2, 0.3, 0.7, 0.9],index=[2, 9, 10, 8])

In [15]:
data

2     0.2
9     0.3
10    0.7
8     0.9
dtype: float64

### Series也可以看成特殊的字典

In [16]:
# Subject -> score mapping; the dict keys become the labels of the Series.
p_dict = {
    "数学": 88,
    "物理": 99,
    "英语": 123,
    "历史": 83,
}
p = pd.Series(data=p_dict)

In [17]:
p

数学     88
物理     99
英语    123
历史     83
dtype: int64

In [18]:
p["数学":"英语"]

数学     88
物理     99
英语    123
dtype: int64

### 新建`Series`对象

新建`Series`对象的一般方法为:

```python
pd.Series(data, index=index)
```

`index`是可选参数, 省略时默认使用从0开始的整数索引(`RangeIndex`).

In [19]:
pd.Series([1,2,3])

0    1
1    2
2    3
dtype: int64

In [20]:
pd.Series({2:"a", 3:"d",5:3})

2    a
3    d
5    3
dtype: object

## `DataFrame`对象

### `DataFrame` 可以看成特殊的二维`Numpy array`


In [21]:
# Per-subject scores; the dict keys become the labels of the Series.
score_dict = {
    "数学": 88,
    "物理": 99,
    "英语": 123,
    "历史": 83,
}
score = pd.Series(data=score_dict)

In [22]:
# Per-subject counts, keyed by the same subject labels as the scores.
numbs_dict = {
    "数学": 100,
    "物理": 99,
    "英语": 200,
    "历史": 300,
}
nums = pd.Series(data=numbs_dict)

In [23]:
data = pd.DataFrame({"score": score, "nums": nums})

In [24]:
data

Unnamed: 0,score,nums
数学,88,100
物理,99,99
英语,123,200
历史,83,300


In [25]:
data.index

Index(['数学', '物理', '英语', '历史'], dtype='object')

In [26]:
data.columns

Index(['score', 'nums'], dtype='object')

`DataFrame`可以看成特殊的二维数组, 其行和列都带有各自的`Index`对象(即上面的`data.index`和`data.columns`).

### `DataFrame` 也可以看成特殊的字典

In [27]:
data["score"]

数学     88
物理     99
英语    123
历史     83
Name: score, dtype: int64

In [28]:
data.columns

Index(['score', 'nums'], dtype='object')

### 新建`DataFrame`

In [29]:
score

数学     88
物理     99
英语    123
历史     83
dtype: int64

In [30]:
# Turn the Series into a one-column DataFrame.  to_frame(name=...) sets the
# column label explicitly, so the result does not depend on whether `score`
# itself carries a name (the DataFrame constructor's `columns=` selects by
# name instead of renaming).
p0 = score.to_frame(name="scores")
p0

Unnamed: 0,scores
数学,88
物理,99
英语,123
历史,83


注意只有一列的`DataFrame`跟`Series`不同.

In [31]:
p0.iloc[:, 0]

数学     88
物理     99
英语    123
历史     83
Name: scores, dtype: int64

In [32]:
# 1000 rows of uniform random values in [0, 1), one column per coordinate.
pos_pd = pd.DataFrame(
    data=np.random.rand(1000, 3),
    columns=list("xyz"),
)

In [33]:
pos_pd

Unnamed: 0,x,y,z
0,0.700130,0.652299,0.802431
1,0.342385,0.440163,0.938838
2,0.343173,0.234538,0.425177
3,0.985094,0.067857,0.195173
4,0.386271,0.164564,0.567335
...,...,...,...
995,0.872783,0.432303,0.592761
996,0.658903,0.607457,0.415687
997,0.831074,0.493256,0.392594
998,0.625684,0.057871,0.141284


In [34]:
# Two columns of 10 standard-normal samples each, built from a dict of arrays
# (column "x" is drawn first, then "y").
p0 = pd.DataFrame({axis: np.random.randn(10) for axis in ("x", "y")})
p0

Unnamed: 0,x,y
0,-0.301555,0.062222
1,1.796111,-1.686084
2,0.896957,0.457312
3,-0.056979,1.325587
4,-0.409286,-0.295437
5,-0.269874,0.854741
6,0.001405,-0.888463
7,-0.040191,-0.324085
8,-1.116415,-0.212701
9,-0.967747,-0.886234


## `Index`对象

In [35]:
ind = pd.Index([1,3,5])
ind

Int64Index([1, 3, 5], dtype='int64')

Index 可以看成不可变的数组

In [36]:
ind[0]

1

In [37]:
ind

Int64Index([1, 3, 5], dtype='int64')

In [38]:
ind.size, ind.shape, ind.ndim, ind.dtype

(3, (3,), 1, dtype('int64'))

In [39]:
ind[::2]

Int64Index([1, 5], dtype='int64')

In [40]:
# Error: an Index is immutable, so assigning to one of its elements is not allowed
# ind[0] = 20  

# 数据索引与选择

## `Series` 数据选择方法

In [41]:
# Five values with string labels; note that the label 'd' appears twice.
data = pd.Series(
    data=[0.25, 0.5, 0.75, 1.0, 2.0],
    index=["a", "b", "c", "d", "d"],
)
data

a    0.25
b    0.50
c    0.75
d    1.00
d    2.00
dtype: float64

看成字典

In [42]:
data["b"], "a" in data

(0.5, True)

In [43]:
pd.Index([1,2,1])

Int64Index([1, 2, 1], dtype='int64')

In [44]:
data.index

Index(['a', 'b', 'c', 'd', 'd'], dtype='object')

In [45]:
# Walk the Series like a dict: items() yields (label, value) pairs.
for label, value in data.items():
    print(label, value)

a 0.25
b 0.5
c 0.75
d 1.0
d 2.0


看成一维数组

In [46]:
data

a    0.25
b    0.50
c    0.75
d    1.00
d    2.00
dtype: float64

In [47]:
data['a': 'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [48]:
data[['a', 'b']]  # fancy indexing

a    0.25
b    0.50
dtype: float64

In [49]:
data[(data>0.1) & (data<0.8)] # mask indexing

a    0.25
b    0.50
c    0.75
dtype: float64

Indexers: loc, iloc

如果`Series`的索引是整数(例如不连续的整数), 单个索引操作`data[1]`会使用显式索引(按标签), 而切片操作`data[1:3]`会使用隐式索引(按位置), 两者容易混淆.

In [50]:
# String values labelled with the non-consecutive integers 1, 3, 5.
data = pd.Series(
    data=["a", "b", "c"],
    index=[1, 3, 5],
)
data

1    a
3    b
5    c
dtype: object

In [51]:
# Indexing with a single integer uses the explicit index (the labels)
data[1]

'a'

In [52]:
# Slicing with integers uses the implicit, position-based index
data[1:3]

3    b
5    c
dtype: object

* `loc`索引器使用显式索引(explicit index), 即按标签选取, 切片包含终点
* `iloc`索引器使用隐式索引(implicit index), 即按整数位置选取, 切片不包含终点

In [53]:
data

1    a
3    b
5    c
dtype: object

In [54]:
data.loc[1], data[1]

('a', 'a')

In [55]:
data.loc[1:4]

1    a
3    b
dtype: object

In [56]:
data.loc[1]

'a'

In [57]:
data.loc[1:3]

1    a
3    b
dtype: object

In [58]:
data

1    a
3    b
5    c
dtype: object

In [59]:
data.iloc[1]

'b'

In [60]:
data.iloc[1:3]

3    b
5    c
dtype: object

## `DataFrame`数据选择方法

将`DataFrame`看成字典

In [61]:
# Two columns of 3 standard-normal samples each (column "x" is drawn first,
# then "y").
p0 = pd.DataFrame({axis: np.random.randn(3) for axis in ("x", "y")})
p0

Unnamed: 0,x,y
0,-1.444446,-1.349775
1,0.246592,-1.767261
2,-0.373661,-0.696685


In [62]:
p0["x"]

0   -1.444446
1    0.246592
2   -0.373661
Name: x, dtype: float64

In [63]:
p0.x

0   -1.444446
1    0.246592
2   -0.373661
Name: x, dtype: float64

In [64]:
p0["z"] = p0["x"] * p0["y"]
p0

Unnamed: 0,x,y,z
0,-1.444446,-1.349775,1.949678
1,0.246592,-1.767261,-0.435792
2,-0.373661,-0.696685,0.260324


将`DataFrame`看成二维数组

In [65]:
p0.values

array([[-1.44444586, -1.34977542,  1.94967752],
       [ 0.24659192, -1.76726106, -0.43579229],
       [-0.37366067, -0.69668541,  0.26032394]])

In [66]:
p0.T

Unnamed: 0,0,1,2
x,-1.444446,0.246592,-0.373661
y,-1.349775,-1.767261,-0.696685
z,1.949678,-0.435792,0.260324


In [67]:
p0.values[0]

array([-1.44444586, -1.34977542,  1.94967752])

In [68]:
p0["x"]

0   -1.444446
1    0.246592
2   -0.373661
Name: x, dtype: float64

In [69]:
# p0.loc[:3, :2]
p0.loc[:1, ["x", "z"]]

Unnamed: 0,x,z
0,-1.444446,1.949678
1,0.246592,-0.435792


In [70]:
p0.loc[p0.x>0.5, ["x", "y"]] # boolean mask picks the rows, the label list (fancy indexing) picks the columns

Unnamed: 0,x,y


其他约定: 直接对`DataFrame`使用`[]`时, 单个索引选取的是列, 而切片和布尔掩码选取的是行.

In [71]:
p0["x"]

0   -1.444446
1    0.246592
2   -0.373661
Name: x, dtype: float64

In [72]:
# p0[0]

In [73]:
p0[0:3]

Unnamed: 0,x,y,z
0,-1.444446,-1.349775,1.949678
1,0.246592,-1.767261,-0.435792
2,-0.373661,-0.696685,0.260324


In [74]:
p0[p0.x > 1.0]

Unnamed: 0,x,y,z


# 操作Pandas的数据

## 索引保留

In [75]:
# Uniform random inputs for the ufunc examples: a 5-element Series and a
# 5x3 DataFrame (the Series is drawn first).
ser = pd.Series(data=np.random.rand(5))
df = pd.DataFrame(
    data=np.random.rand(5, 3),
    columns=list("ABC"),
)

In [76]:
ser

0    0.508422
1    0.514642
2    0.987650
3    0.540840
4    0.586787
dtype: float64

In [77]:
np.exp(ser)

0    1.662665
1    1.673039
2    2.684917
3    1.717448
4    1.798201
dtype: float64

In [78]:
np.sin(df*np.pi/10)

Unnamed: 0,A,B,C
0,0.185141,0.049611,0.283516
1,0.010074,0.189691,0.292717
2,0.218103,0.202993,0.239184
3,0.285836,0.002708,0.30496
4,0.248573,0.012521,0.038095


## 索引对齐

In [79]:
# Two Series that share the labels "x" and "y" but otherwise differ,
# used to show how arithmetic aligns on labels.
s0 = pd.Series(
    data={"x": 3, "y": 2.3, "z": 2.1},
)
s1 = pd.Series(
    data={"x": 3, "y": 2.3, "m": 2.1, "n": 2.22},
)

In [80]:
s0

x    3.0
y    2.3
z    2.1
dtype: float64

In [81]:
s0/s1

m    NaN
n    NaN
x    1.0
y    1.0
z    NaN
dtype: float64

In [82]:
# Union of the two label sets.  Index.union is the explicit set operation;
# the `|` operator on Index objects is deprecated for this purpose and is not
# a set union in newer pandas releases.
s0.index.union(s1.index)

Index(['m', 'n', 'x', 'y', 'z'], dtype='object')

In [83]:
s0.divide(s1, fill_value=0.0)

m    0.0
n    0.0
x    1.0
y    1.0
z    inf
dtype: float64

In [84]:
# Two integer frames with different shapes and column order (A is drawn
# first), used to show alignment on both index and columns.
A = pd.DataFrame(
    data=np.random.randint(0, 20, size=(2, 2)),
    columns=["A", "B"],
)
B = pd.DataFrame(
    data=np.random.randint(0, 10, size=(3, 3)),
    columns=["B", "A", "C"],
)

In [85]:
A

Unnamed: 0,A,B
0,7,5
1,9,10


In [86]:
A+B

Unnamed: 0,A,B,C
0,9.0,9.0,
1,12.0,12.0,
2,,,


## `DataFrame`与`Series`之间的操作

In [87]:
A = np.random.randn(4, 3)

In [88]:
df = pd.DataFrame(A, columns=list("ABC"))
df - df.iloc[0]

Unnamed: 0,A,B,C
0,0.0,0.0,0.0
1,-3.007412,-1.380023,0.005782
2,-1.801334,-0.732527,-1.741462
3,-1.404232,0.573062,0.426464


In [89]:
df.sub(df["B"], axis=0)

Unnamed: 0,A,B,C
0,0.220532,0.0,-0.60737
1,-1.406857,0.0,0.778434
2,-0.848275,0.0,-1.616305
3,-1.756762,0.0,-0.753968


In [90]:
s3 = df.iloc[0, ::2]
s3

A    1.257174
C    0.429272
Name: 0, dtype: float64

In [91]:
# DataFrame.sub with a Series operand rejects fill_value and raises
# NotImplementedError.  Catch it and print the message so the failure is still
# shown while the notebook can run past this cell.
try:
    df.sub(s3, fill_value=0.0)
except NotImplementedError as err:
    print("NotImplementedError:", err)

NotImplementedError: fill_value 0.0 not supported.

In [92]:
df -s3

Unnamed: 0,A,B,C
0,0.0,,0.0
1,-3.007412,,0.005782
2,-1.801334,,-1.741462
3,-1.404232,,0.426464


# 处理缺失值

Pandas中用`NaN`(Not a Number)表示缺失值; 在`object`类型的`Series`中也可以用`None`表示.

In [93]:
p1 = pd.Series(['a', None, 'c'])
p1

0       a
1    None
2       c
dtype: object

In [94]:
p1[0] = np.nan
p1

0     NaN
1    None
2       c
dtype: object

In [95]:
None, np.nan

(None, nan)

## 处理缺失值的函数

* isnull
* notnull
* dropna
* fillna

In [96]:
p1 = pd.Series([1,2,3,None])
p1

0    1.0
1    2.0
2    3.0
3    NaN
dtype: float64

In [97]:
# Drop the missing entry; dropna is one of the missing-value helpers this
# section is about.
p1.dropna()

# 层级索引

层级索引的目的是用Series和DataFrame表示高维数据, 首先看一下如何手动用Series表示二维数据.

In [98]:
# (state, year) tuples used as plain labels: every state paired with both
# census years, in state-major order.
index = [
    (state, year)
    for state in ("California", "New York", "Texas")
    for year in (2000, 2010)
]
# Population figures, listed in the same order as the labels above.
populations = [
    33871648,
    37253956,
    18976457,
    19378102,
    20851820,
    25145561,
]
pop = pd.Series(data=populations, index=index)
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [99]:
# Keep only the entries whose (state, year) label has year 2010.
pop[[(state, year) for state, year in pop.index if year == 2010]]

(California, 2010)    37253956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64

In [100]:
index1 = pd.MultiIndex.from_tuples(index)
index1

MultiIndex(levels=[['California', 'New York', 'Texas'], [2000, 2010]],
           codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]])

In [101]:
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [102]:
pop1 = pop.reindex(index1)
pop1

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [103]:
pop1[:, 2010]

California    37253956
New York      19378102
Texas         25145561
dtype: int64

In [104]:
pop_df = pop1.unstack()
pop_df

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561
