# Series和数据框的计算

In [2]:
import numpy as np
import pandas as pd

In [3]:
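# Series addition
# Values are added label by label after the two Series are aligned on their index;
# a label that exists in only one of the two Series yields NaN in the result.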

In [4]:
# DataFrame arithmetic: build two frames whose labels only partly overlap.
# pan1 is 4x5 (rows a-d, columns a0-a4); pan2 is 3x2 (rows c/b/a, columns a1/a0).
# Both are filled with random integers drawn from [10, 36).
pan1 = pd.DataFrame(
    np.random.randint(10, 36, size=(4, 5)),
    index=["a", "b", "c", "d"],
    columns=[f"a{i}" for i in range(5)],
)
pan2 = pd.DataFrame(
    np.random.randint(10, 36, size=(3, 2)),
    index=["c", "b", "a"],
    columns=["a1", "a0"],
)
# Show both frames with a separator line between them.
print(pan1, "---------------------", pan2, sep="\n")

   a0  a1  a2  a3  a4
a  23  16  35  14  35
b  12  25  13  29  14
c  21  13  16  28  27
d  31  26  22  24  27
---------------------
   a1  a0
c  27  25
b  12  31
a  31  18


In [5]:
# DataFrame-by-DataFrame arithmetic aligns on row AND column labels first.
# A cell is computed only where the (row, column) label pair exists in both
# frames; every other position in the result is NaN.
pan1.mul(pan2)

Unnamed: 0,a0,a1,a2,a3,a4
a,414.0,496.0,,,
b,372.0,300.0,,,
c,525.0,351.0,,,
d,,,,,


In [6]:
# Two 4x5 frames with identical labels (rows a-d, default integer columns 0-4),
# each filled with random integers from [10, 36).
pan3 = pd.DataFrame(np.random.randint(10, 36, size=(4, 5)), index=["a", "b", "c", "d"])
pan4 = pd.DataFrame(np.random.randint(10, 36, size=(4, 5)), index=["a", "b", "c", "d"])
print(pan3, pan4, sep="\n")

    0   1   2   3   4
a  34  15  18  28  29
b  30  23  13  16  28
c  14  16  24  12  17
d  16  10  16  14  30
    0   1   2   3   4
a  18  17  12  11  10
b  21  21  20  23  28
c  24  31  11  13  31
d  17  20  28  24  32


In [7]:
# Subtract, add, multiply and divide two frames with the method forms
# (sub / mul / div are the short names of subtract / multiply / divide).
# Original author's note: the axis argument makes no difference here,
# because both operands are DataFrames.
# Only the last expression (the division) is rendered as the cell output.
pan3.sub(pan4, axis=0)
pan3.add(pan4)
pan3.mul(pan4)
pan3.div(pan4)

Unnamed: 0,0,1,2,3,4
a,1.888889,0.882353,1.5,2.545455,2.9
b,1.428571,1.095238,0.65,0.695652,1.0
c,0.583333,0.516129,2.181818,0.923077,0.548387
d,0.941176,0.5,0.571429,0.583333,0.9375


In [8]:
# Operands for Series-vs-DataFrame arithmetic.
# pan5: 4x5 random integers from [10, 36), rows a-d, default column labels 0-4.
# ser3: 4 random integers from [4, 36) with the default integer index 0-3.
pan5 = pd.DataFrame(np.random.randint(10, 36, size=(4, 5)), index=["a", "b", "c", "d"])
ser3 = pd.Series(np.random.randint(4, 36, size=4))
print(pan5, ser3, sep="\n")

    0   1   2   3   4
a  28  16  20  34  28
b  18  35  12  19  13
c  23  30  27  25  32
d  17  20  31  10  28
0     9
1    12
2    35
3     7
dtype: int32


In [9]:
# Subtraction: the Series index (0-3) is matched against the frame's column
# labels (0-4), so ser3[k] is subtracted from column k in every row.
# Column 4 has no counterpart in ser3 and therefore comes out as NaN.
pan5.sub(ser3, axis="columns")

Unnamed: 0,0,1,2,3,4
a,19.0,4.0,-15.0,27.0,
b,9.0,23.0,-23.0,12.0,
c,14.0,18.0,-8.0,18.0,
d,8.0,8.0,-4.0,3.0,


In [10]:
# Compound expression: subtract the Series column-wise, then multiply the
# difference element-wise by the original frame.
# NaN produced by the subtraction stays NaN after the multiplication.
pan5.sub(ser3).mul(pan5)

Unnamed: 0,0,1,2,3,4
a,532.0,64.0,-300.0,918.0,
b,162.0,805.0,-276.0,228.0,
c,322.0,540.0,-216.0,450.0,
d,136.0,160.0,-124.0,30.0,


#### Series和数据框的排序与排名

In [11]:
# Five random integers from [10, 30) under a deliberately unsorted label index.
# Note: this rebinds ser3, replacing the Series used in the arithmetic cells.
ser3 = pd.Series(
    np.random.randint(10, 30, size=5),
    index=["b", "c", "a", "d", "e"],
)
ser3

b    12
c    17
a    13
d    23
e    13
dtype: int32

In [12]:
# sort_index() orders the Series by its index labels.
# `ascending` accepts True or False; True is the default, so it is omitted.
ser3.sort_index()

a    13
b    12
c    17
d    23
e    13
dtype: int32

In [13]:
# sort_values() orders the Series by its values (ascending, spelled out here).
ser3.sort_values(ascending=True)

b    12
a    13
e    13
c    17
d    23
dtype: int32

In [14]:
# rank() returns each value's rank position, keeping the original index order.
# The default method ("average") gives tied values the mean of the ranks they
# span, which is why two equal values can both show x.5.
ser3.rank()

b    1.0
c    4.0
a    2.5
d    5.0
e    2.5
dtype: float64

In [15]:
# 4x5 frame of random integers from [10, 36); the row labels are intentionally
# out of order so the sorting cells have something to reorder.
pan7 = pd.DataFrame(
    np.random.randint(10, 36, size=(4, 5)),
    index=["d", "a", "c", "b"],
    columns=[f"a{i}" for i in range(5)],
)
pan7

Unnamed: 0,a0,a1,a2,a3,a4
d,25,22,15,26,29
a,33,16,28,34,26
c,30,13,13,28,33
b,34,14,26,14,30


In [16]:
# sort_index() sorts by labels: axis=0 sorts the row labels, axis=1 the column
# labels. Here the column labels are sorted.
pan7.sort_index(axis="columns")

Unnamed: 0,a0,a1,a2,a3,a4
d,25,22,15,26,29
a,33,16,28,34,26
c,30,13,13,28,33
b,34,14,26,14,30


In [17]:
# Sort by the values of one column: with axis=0 the rows are reordered
# according to the values in column "a1".
pan7.sort_values(by="a1", axis="index")

Unnamed: 0,a0,a1,a2,a3,a4
c,30,13,13,28,33
b,34,14,26,14,30
a,33,16,28,34,26
d,25,22,15,26,29


In [18]:
# Sort by the values of one row: with axis=1 the columns are reordered
# according to the values in row "d".
# Whole columns move together, so each column's data stays intact.
pan7.sort_values(by="d", axis="columns")

Unnamed: 0,a2,a1,a0,a3,a4
d,15,22,25,26,29
a,28,16,33,34,26
c,13,13,30,28,33
b,26,14,34,14,30


In [19]:
# Multi-key sort, applied level by level: rows are ordered by "a1", ties are
# broken by "a2", and remaining ties by "a3" (three keys, like SQL's
# ORDER BY a1, a2, a3).
pan7.sort_values(by=["a1", "a2", "a3"], ascending=True)

Unnamed: 0,a0,a1,a2,a3,a4
c,30,13,13,28,33
b,34,14,26,14,30
a,33,16,28,34,26
d,25,22,15,26,29


In [20]:
# Frame used for the ranking examples: 4x5 random integers from [10, 36),
# rows a-d, columns a0-a4.
pan8 = pd.DataFrame(
    np.random.randint(10, 36, size=(4, 5)),
    index=["a", "b", "c", "d"],
    columns=[f"a{i}" for i in range(5)],
)
pan8

Unnamed: 0,a0,a1,a2,a3,a4
a,35,12,25,15,30
b,30,34,11,30,33
c,18,14,21,19,14
d,23,30,11,26,13


In [21]:
# Rank the values of the frame. With axis=0 ("index") each column is ranked
# independently, from smallest (rank 1) upward; tied values share the average
# of the ranks they span.
pan8.rank(axis="index", method="average", ascending=True)

Unnamed: 0,a0,a1,a2,a3,a4
a,4.0,1.0,4.0,1.0,3.0
b,3.0,4.0,1.5,4.0,4.0
c,1.0,2.0,3.0,2.0,2.0
d,2.0,3.0,1.5,3.0,1.0


In [37]:
# Pull out column "a0" as a Series (all rows, one column label).
pan8.loc[:, "a0"]

a    35
b    30
c    18
d    23
Name: a0, dtype: int32

#### 读取csv文件练习

In [22]:
# Read the whole comma-separated file, header row included, into one 2-D
# object array; every field is kept as text.
data_txt = np.loadtxt("datasql.csv", dtype=object, delimiter=",")
data_txt

array([['addtime', 'id', 'username', ..., 'sex', 'age', 'status'],
       ['2000/2/10', '14', '李四', ..., '男', '69', '正常'],
       ['2000/2/11', '15', '张三', ..., '男', '女', '异常'],
       ...,
       ['2000/5/26', '1258', '张三', ..., '男', '女', '异常'],
       ['2000/5/27', '1259', '王晓静', ..., '女', '36', '正常'],
       ['2000/5/28', '1260', '李四', ..., '男', '69', '正常']], dtype=object)

In [23]:
# Column labels: the header row with its first field ("addtime") dropped.
data_col = data_txt[0, 1:]
data_col

array(['id', 'username', 'password', 'phone', 'sex', 'age', 'status'],
      dtype=object)

In [24]:
# Row labels: the first column (the dates) of every row after the header row,
# i.e. rows 1..end, column 0.
data_row = data_txt[1:,0]
# data_row  (uncomment to inspect the labels)

In [25]:
# Data body: everything from the second row and the second column onward,
# i.e. the array without the header row and without the date column.
data_zone = data_txt[1:,1:]
# data_zone  (uncomment to inspect the body)

In [26]:
# Assemble the frame from the pieces cut out of the raw array: data_zone holds
# the cells, data_row the row labels (dates) and data_col the column labels.
# The label arrays are passed directly. Wrapping them in a list, as in
# index=[data_row], makes pandas read the argument as a list of level arrays
# and build a one-level MultiIndex on that axis instead of a flat Index.
pan_res = pd.DataFrame(data_zone, index=data_row, columns=data_col)
pan_res

Unnamed: 0,id,username,password,phone,sex,age,status
2000/2/10,14,李四,pbkdf2_sha256$100000$NqOrKGFhGFMF$/y94WUFnqHSD...,14522,男,69,正常
2000/2/11,15,张三,pbkdf2_sha256$100000$nXEtCddj15LB$QlUcmxzjRO1F...,14522,男,女,异常
2000/2/12,16,王晓静,pbkdf2_sha256$100000$PTGW6gEY3BXm$+git95NdWBaK...,3698,女,36,正常
2000/2/13,17,李四,pbkdf2_sha256$100000$NqOrKGFhGFMF$/y94WUFnqHSD...,14522,男,69,正常
2000/2/14,18,张三,pbkdf2_sha256$100000$nXEtCddj15LB$QlUcmxzjRO1F...,14522,男,女,异常
...,...,...,...,...,...,...,...
2000/5/24,1256,王晓静,pbkdf2_sha256$100000$PTGW6gEY3BXm$+git95NdWBaK...,3698,女,36,正常
2000/5/25,1257,李四,pbkdf2_sha256$100000$NqOrKGFhGFMF$/y94WUFnqHSD...,14522,男,69,正常
2000/5/26,1258,张三,pbkdf2_sha256$100000$nXEtCddj15LB$QlUcmxzjRO1F...,14522,男,女,异常
2000/5/27,1259,王晓静,pbkdf2_sha256$100000$PTGW6gEY3BXm$+git95NdWBaK...,3698,女,36,正常


In [27]:
# Label lookup on a non-unique index: .loc returns every row that carries the
# given label, which shows that the date index contains duplicates.
pan_res.loc["2000/1/10"]

Unnamed: 0,id,username,password,phone,sex,age,status
2000/1/10,297,张三,pbkdf2_sha256$100000$nXEtCddj15LB$QlUcmxzjRO1F...,14522,男,女,异常
2000/1/10,298,王晓静,pbkdf2_sha256$100000$PTGW6gEY3BXm$+git95NdWBaK...,3698,女,36,正常
2000/1/10,299,李四,pbkdf2_sha256$100000$NqOrKGFhGFMF$/y94WUFnqHSD...,14522,男,69,正常
2000/1/10,300,张三,pbkdf2_sha256$100000$nXEtCddj15LB$QlUcmxzjRO1F...,14522,男,女,异常
2000/1/10,301,王晓静,pbkdf2_sha256$100000$PTGW6gEY3BXm$+git95NdWBaK...,3698,女,36,正常
...,...,...,...,...,...,...,...
2000/1/10,1116,李四,pbkdf2_sha256$100000$NqOrKGFhGFMF$/y94WUFnqHSD...,14522,男,69,正常
2000/1/10,1117,张三,pbkdf2_sha256$100000$nXEtCddj15LB$QlUcmxzjRO1F...,14522,男,女,异常
2000/1/10,1118,王晓静,pbkdf2_sha256$100000$PTGW6gEY3BXm$+git95NdWBaK...,3698,女,36,正常
2000/1/10,1119,李四,pbkdf2_sha256$100000$NqOrKGFhGFMF$/y94WUFnqHSD...,14522,男,69,正常
