# 青苗课 pandas

In [1]:
import numpy as np
import pandas as pd

### 一维数据框

In [4]:
# 随机产生
pan = pd.Series(np.random.randint(10,36,3))
pan

0    19
1    33
2    22
dtype: int32

In [9]:
# 指定索引,和数据类型
pan1 = pd.Series(np.random.randint(10,36,3),index=list("abc"),dtype=float)
pan1

a    25.0
b    22.0
c    32.0
dtype: float64

In [10]:
# 返回数据类型
pan1.dtype

dtype('float64')

In [11]:
# 返回索引
pan1.index

Index(['a', 'b', 'c'], dtype='object')

In [13]:
# 返回索引
pan1.keys()

Index(['a', 'b', 'c'], dtype='object')

In [15]:
# 返回值的
pan1.values

array([25., 22., 32.])

In [17]:
# 将值返回为list
pan1.tolist()

[25.0, 22.0, 32.0]

In [21]:
# 检验某个值是否存在
pan1.isin([25])

a     True
b    False
c    False
dtype: bool

In [150]:
pan2 = pd.Series(np.random.randint(10,36,5),index=list("abcde"),dtype=float)

In [29]:
# shape()函数
# 返回元素外形，元素个数
pan2.shape

(5,)

In [30]:
# size 函数
pan2.size

5

In [151]:
# 使用items()转为字典
list(pan2.items())

[('a', 23.0), ('b', 15.0), ('c', 34.0), ('d', 30.0), ('e', 32.0)]

In [152]:
# 判断是否为空
pan2.isnull()

a    False
b    False
c    False
d    False
e    False
dtype: bool

In [153]:
pan3 = pd.Series(np.random.randint(10,36,13),index=["a"+str(i) for i in range(0,13)])
pan3

a0     11
a1     18
a2     24
a3     30
a4     17
a5     34
a6     13
a7     19
a8     17
a9     16
a10    25
a11    30
a12    21
dtype: int32

In [154]:
# 条件
(pan3>10)&(pan3<15)

a0      True
a1     False
a2     False
a3     False
a4     False
a5     False
a6      True
a7     False
a8     False
a9     False
a10    False
a11    False
a12    False
dtype: bool

In [155]:
# 将pan3进行值的过滤

pan3[(pan3>10)&(pan3<15)]

a0    11
a6    13
dtype: int32

In [156]:
# 使用花式索引赋值为nan
pan3[[3,5,7]] = np.nan
pan3

a0     11.0
a1     18.0
a2     24.0
a3      NaN
a4     17.0
a5      NaN
a6     13.0
a7      NaN
a8     17.0
a9     16.0
a10    25.0
a11    30.0
a12    21.0
dtype: float64

In [157]:
# 进行判断是否为nan
# 并返回索引值
pan3[pan3.isnull()]

a3   NaN
a5   NaN
a7   NaN
dtype: float64

In [158]:
# 改变pan3的索引值
pan3.index = list("a"+str(i) for i in range(13))
pan3

a0     11.0
a1     18.0
a2     24.0
a3      NaN
a4     17.0
a5      NaN
a6     13.0
a7      NaN
a8     17.0
a9     16.0
a10    25.0
a11    30.0
a12    21.0
dtype: float64

In [159]:
# 一维框的选区

pan3["a5":"a8"]

a5     NaN
a6    13.0
a7     NaN
a8    17.0
dtype: float64

## 创建数据框

In [62]:
import numpy as np
import pandas as pd

In [70]:
# 创建单列的数据框
# 数据框都为二维
pd.DataFrame(np.random.randint(12,36,4))

Unnamed: 0,0
0,30
1,12
2,26
3,28


In [71]:
# 创建单行的数据框
pd.DataFrame([np.random.randint(12,36,5)])

Unnamed: 0,0,1,2,3,4
0,23,16,12,35,31


In [78]:
# 指定索引数据框
pd.DataFrame(np.random.randint(13,69,(5,5)),index=list("abcde"),columns=list("ABCDE"),dtype=float)

Unnamed: 0,A,B,C,D,E
a,63.0,57.0,54.0,67.0,37.0
b,24.0,35.0,66.0,25.0,27.0
c,22.0,46.0,44.0,62.0,33.0
d,64.0,65.0,23.0,35.0,18.0
e,62.0,27.0,19.0,41.0,27.0


In [82]:
# 数据框的计算
pan1 = pd.DataFrame(np.random.randint(13,69,(5,5)),index=list("abcde"),columns=list("ABCDE"))
pan1

Unnamed: 0,A,B,C,D,E
a,19,37,51,27,64
b,48,20,64,27,52
c,55,61,36,66,59
d,38,43,19,18,58
e,64,27,46,35,14


In [84]:
np.sin(pan1*2+1)

Unnamed: 0,A,B,C,D,E
a,0.963795,-0.387782,0.622989,-0.999755,-0.193473
b,0.379608,-0.158623,-0.193473,-0.999755,-0.970535
c,-0.864551,-0.459903,-0.676772,0.868966,-0.371404
d,0.99952,-0.821818,0.963795,-0.643538,-0.689698
e,-0.193473,-0.999755,-0.948282,0.951055,-0.663634


In [89]:
# 手动创建含有空值的数据框
arr = np.random.randint(10,36,(5,5)).astype(float)
arr[[0,2,4],[1,2,3]] = np.nan
arr

array([[33., nan, 21., 14., 22.],
       [25., 34., 32., 25., 32.],
       [15., 14., nan, 26., 33.],
       [27., 19., 30., 14., 13.],
       [11., 30., 27., nan, 25.]])

In [91]:
pan3 = pd.DataFrame(arr,index=list("abcde"),columns=["a"+str(i) for i in range(5)])
pan3

Unnamed: 0,a0,a1,a2,a3,a4
a,33.0,,21.0,14.0,22.0
b,25.0,34.0,32.0,25.0,32.0
c,15.0,14.0,,26.0,33.0
d,27.0,19.0,30.0,14.0,13.0
e,11.0,30.0,27.0,,25.0


In [94]:
# 返回行索引，列索引
pan3.index,pan3.columns

(Index(['a', 'b', 'c', 'd', 'e'], dtype='object'),
 Index(['a0', 'a1', 'a2', 'a3', 'a4'], dtype='object'))

In [95]:
# 数据框的键值作为列索引
pan3.keys()

Index(['a0', 'a1', 'a2', 'a3', 'a4'], dtype='object')

In [97]:
# 得出值
pan3.values

array([[33., nan, 21., 14., 22.],
       [25., 34., 32., 25., 32.],
       [15., 14., nan, 26., 33.],
       [27., 19., 30., 14., 13.],
       [11., 30., 27., nan, 25.]])

In [104]:
# 数据框具有超强的兼容性，每一行可以有不同的数据类型
# dtypes 函数
pan3.dtypes

a0    float64
a1    float64
a2    float64
a3    float64
a4    float64
dtype: object

In [107]:
# 找值
# isin[()] 函数
# any(1)  按行查找
pan3.isin([21,22]).any(1)

a     True
b    False
c    False
d    False
e    False
dtype: bool

In [109]:
# shape 外形
pan3.shape

(5, 5)

In [111]:
# size 返回外形
pan3.size

25

In [113]:
# items()
# list(pan3.items())

In [120]:
# 使用字典创建数据框
ser1 = {i:pd.Series(np.random.randint(12,36,5),index=["a"+str(j) for j in range(5)]) for i in list("cvbm")}
pd.DataFrame(ser1)

Unnamed: 0,c,v,b,m
a0,21,32,19,19
a1,33,22,22,29
a2,21,22,17,35
a3,24,24,15,17
a4,24,27,25,15


In [129]:
# 根据指定索引进行更新

pan4 = pd.DataFrame(np.random.randint(13,69,(5,5)),index=list("abcde"),columns=list("ABCDE"),dtype=float)
pan4

Unnamed: 0,A,B,C,D,E
a,41.0,36.0,60.0,35.0,20.0
b,18.0,32.0,45.0,45.0,29.0
c,52.0,35.0,43.0,54.0,60.0
d,65.0,64.0,21.0,63.0,43.0
e,56.0,47.0,37.0,42.0,61.0


In [130]:
pan4.update(pd.DataFrame(np.random.randint(-20,-10,(2,5)),index=["b","d"],columns=list("ABCDE")))
pan4

Unnamed: 0,A,B,C,D,E
a,41.0,36.0,60.0,35.0,20.0
b,-17.0,-14.0,-16.0,-15.0,-15.0
c,52.0,35.0,43.0,54.0,60.0
d,-17.0,-18.0,-12.0,-17.0,-19.0
e,56.0,47.0,37.0,42.0,61.0


In [132]:
# 以追加的方式进行更新

pan5 = pd.DataFrame(np.random.randint(13,69,(5,5)),index=list("abcde"),columns=list("QWERT"),dtype=float)
pan5

Unnamed: 0,Q,W,E,R,T
a,25.0,50.0,34.0,29.0,67.0
b,16.0,43.0,21.0,52.0,47.0
c,66.0,63.0,56.0,33.0,24.0
d,38.0,35.0,51.0,30.0,19.0
e,32.0,44.0,45.0,36.0,43.0


In [133]:
pan5.append(pd.DataFrame(np.random.randint(-20,-10,(3,5)),index=["f","g","h"],columns=list("QWERT")))

Unnamed: 0,Q,W,E,R,T
a,25.0,50.0,34.0,29.0,67.0
b,16.0,43.0,21.0,52.0,47.0
c,66.0,63.0,56.0,33.0,24.0
d,38.0,35.0,51.0,30.0,19.0
e,32.0,44.0,45.0,36.0,43.0
f,-18.0,-15.0,-13.0,-17.0,-18.0
g,-18.0,-12.0,-20.0,-12.0,-13.0
h,-11.0,-18.0,-19.0,-18.0,-15.0


In [140]:
# 对数据框nan进行补值
pan6 = pd.DataFrame(arr,index=list("abcde"),columns=["a"+str(i) for i in range(5)])
pan6

Unnamed: 0,a0,a1,a2,a3,a4
a,33.0,666.0,21.0,14.0,22.0
b,25.0,34.0,32.0,25.0,32.0
c,15.0,14.0,999.0,26.0,33.0
d,27.0,19.0,30.0,14.0,13.0
e,11.0,30.0,27.0,888.0,25.0


In [141]:
pan6[pan6.isnull()] = [999,666,888]
pan6

Unnamed: 0,a0,a1,a2,a3,a4
a,33.0,666.0,21.0,14.0,22.0
b,25.0,34.0,32.0,25.0,32.0
c,15.0,14.0,999.0,26.0,33.0
d,27.0,19.0,30.0,14.0,13.0
e,11.0,30.0,27.0,888.0,25.0


In [162]:
# 数据框的选区

pan6["a":"c"]

Unnamed: 0,a0,a1,a2,a3,a4
a,33.0,666.0,21.0,14.0,22.0
b,25.0,34.0,32.0,25.0,32.0
c,15.0,14.0,999.0,26.0,33.0


In [168]:
# 选区
pan6.loc["a":"c","a0":"a2"]

Unnamed: 0,a0,a1,a2
a,33.0,666.0,21.0
b,25.0,34.0,32.0
c,15.0,14.0,999.0


In [169]:
# 值选区一行数据
pan6.loc["a":"a","a0":"a4"]

Unnamed: 0,a0,a1,a2,a3,a4
a,33.0,666.0,21.0,14.0,22.0


In [170]:
# 只选取一列数据
pan6.loc["a":"e","a0":"a0"]

Unnamed: 0,a0
a,33.0
b,25.0
c,15.0
d,27.0
e,11.0


In [173]:
# 换行索引和列索引

pan6.index = list("ABCDE")
pan6.columns = ["b"+str(i) for i in range(3,8)]
pan6

Unnamed: 0,b3,b4,b5,b6,b7
A,33.0,666.0,21.0,14.0,22.0
B,25.0,34.0,32.0,25.0,32.0
C,15.0,14.0,999.0,26.0,33.0
D,27.0,19.0,30.0,14.0,13.0
E,11.0,30.0,27.0,888.0,25.0


In [194]:
arr2 = np.random.randint(20,50,(5,5)).astype(float)
arr2[[0,2,4],[1,3,4]] = np.nan
pan9 = pd.DataFrame(arr2,index=list("abcde"),columns=["a"+str(i) for i in range(5)])
# pan9

Unnamed: 0,a0,a1,a2,a3,a4
a,34.0,,40.0,48.0,34.0
b,30.0,23.0,25.0,29.0,35.0
c,27.0,42.0,39.0,,44.0
d,23.0,44.0,40.0,31.0,42.0
e,31.0,43.0,44.0,22.0,


In [226]:
# 将nan值进行填充
pan9.loc["a":"a","a1"] = [999]
pan9

Unnamed: 0,a0,a1,a2,a3,a4
a,34.0,999.0,40.0,48.0,34.0
b,30.0,23.0,25.0,29.0,35.0
c,27.0,42.0,39.0,,44.0
d,23.0,44.0,40.0,31.0,42.0
e,31.0,43.0,44.0,22.0,
