## 3.1 Pandas 数据结构的介绍
### 3.1.2 Series 数据类型的运算

In [None]:
from pandas import Series

# Two deliveries keyed by fruit name; "李子" and "西瓜" each occur in only one.
goods_in = Series(
    [30, 25, 20, 21, 15],
    index=["苹果", "梨", "香蕉", "桃", "李子"],
)
goods_other_in = Series(
    [10, 20, 15, 10, 50],
    index=["苹果", "梨", "香蕉", "桃", "西瓜"],
)

# "+" aligns both operands on their index labels before adding, so a label
# that is missing on either side ends up as NaN in the total.
goods_kucun = goods_in + goods_other_in

print(f"库存:\n[{goods_kucun}]")
print("注：一个 Series 中有索引而另一个没有，相加后结果为 NAN （表示空值）")

In [None]:
import numpy as np

# The last entry starts out missing so the effect of the assignment below
# is visible when the two printouts are compared.
goods = Series(
    data=[30, 25, 20, 21, np.nan],
    index=["苹果", "梨", "香蕉", "桃", "李子"],
)
print(f"original Series:\n{goods}")

# Overwrite the missing value through its index label.
goods["李子"] = 15
print(f"Series after assigning new values:\n{goods}")

### 3.1.3 DataFrame 数据结构

In [None]:
from pandas import DataFrame

# Column-oriented records for four paintings: one list per column.
data = {
    "字画名称": ["旭日东升", "富水长流", "招财进宝", "鸿运当头"],
    "字画底价": [2860, 498, 1068, 598],
    "字画拍卖加价": [1000, 2000, 500, 1500],
}

# Without `columns`, the frame keeps the dict's key order.
goods_in = DataFrame(data=data)
print(f"goods_in:\n{goods_in}")
print("-" * 20)

# With `columns`, the frame is laid out in the requested order instead.
goods_in1 = DataFrame(
    data=data,
    columns=["字画名称", "字画拍卖加价", "字画底价"],
)
print(f"goods_in1:\n{goods_in1}")

DataFrame最终是按照columns指定的顺序排列的。如果传入的列名在数据中是无法找到的,就会产生NaN值。  

In [None]:
# "字画所属人" is requested as a column but has no entry in `data`.
goods_in2 = DataFrame(
    data=data,
    columns=[
        "字画名称",
        "字画拍卖加价",
        "字画底价",
        "字画所属人",
    ],
)
print(f"goods_in2:\n{goods_in2}")

为DataFrame结构指定列名后,可以通过类似字典标记的方式将列获取为一个Series结构。  
代码中定义了字画的DataFrame之后,直接通过goods_in3["字画底价"]来访问“字画底价”这个维度的数据,获取的是一个Series结构的数据。  
返回的Series拥有与原DataFrame相同的索引,且其索引值也已经被相应地设置好了。

In [None]:
# Give every painting a readable row label instead of the default 0..3.
goods_in3 = DataFrame(
    data=data,
    index=["第一幅", "第二幅", "第三幅", "第四幅"],
)

# Dictionary-style access pulls a single column out as a Series.
data_price = goods_in3["字画底价"]
print(data_price)

可以通过标签（索引名称）或位置的方式访问行。  
使用 goods_in3.loc["第三幅"] 来获取行数据，其中 loc 是按标签访问的关键词（按位置访问则使用 iloc），“第三幅”就是索引的名称。


In [None]:
# Select one row by its index label.
data_3 = goods_in3.loc["第三幅"]

print(f"data_3:\n{data_3}")

利用标签列表（或标签切片）也可以同时获取多行和多列。  

In [None]:
# First list: row labels to keep; second list: column labels to keep.
data_4 = goods_in3.loc[
    ["第三幅", "第四幅"],
    ["字画名称", "字画底价"],
]
print(data_4)

对DataFrame数据的选取也可以通过布尔型数组实现。  

In [None]:
# Keep the rows whose reserve price is above 500.
data_5 = goods_in3.loc[goods_in3["字画底价"].gt(500), :]
print(f"data_5:\n{data_5}")
print("-" * 20)

# Combine two conditions element-wise with "&": reserve price above 500
# and bid increment above 1000.
data_6 = goods_in3.loc[
    goods_in3["字画底价"].gt(500) & goods_in3["字画拍卖加价"].gt(1000),
    :,
]
print(f"data_6:\n{data_6}")

### 3.1.4 DataFrame 数据的修改

In [None]:
from pandas import DataFrame

# Painting records; "字画所属人" is requested as a column below although
# `data` carries no values for it yet.
data = {
    "字画名称": ["旭日东升", "富水长流", "招财进宝", "鸿运当头"],
    "字画底价": [2860, 498, 1068, 598],
    "字画拍卖加价": [1000, 2000, 500, 1500],
}
goods_in = DataFrame(
    data=data,
    columns=["字画名称", "字画底价", "字画拍卖加价", "字画所属人"],
)
print(f"goods_in:\n{goods_in}")
print("-" * 20)

# Assigning a scalar sets the same owner on every row.
goods_in["字画所属人"] = "张三"
print(f"goods_in:\n{goods_in}")
print("-" * 20)

# Assigning a list sets one owner per row, in row order.
goods_in["字画所属人"] = ["张三", "李四", "王五", "赵六"]
print(f"goods_in:\n{goods_in}")

可通过del DataFrame名["列名"]格式来进行删除操作。

In [None]:
# Remove the owner column from the frame in place.
del goods_in["字画所属人"]

print(f"goods_in:\n{goods_in}")

### 3.1.5 DataFrame 中的索引对象
注意：索引对象不可修改。

### 3.1.6 层次化索引
层次化索引使用户能在一个轴上拥有多个索引级别，即能以低维度形式处理高维度数据。

In [None]:
from pandas import DataFrame

data = {
    "字画名称": ["旭日东升", "富水长流", "招财进宝", "鸿运当头"],
    "字画底价": [2860, 498, 1068, 598],
    "字画拍卖加价": [1000, 2000, 500, 1500],
}

# Passing two parallel label lists as `index` builds a two-level row index:
# the outer level is the auction site, the inner level the painting slot.
goods_in = DataFrame(
    data=data,
    index=[
        ["第一拍卖现场", "第一拍卖现场", "第二拍卖现场", "第二拍卖现场"],
        ["第一幅", "第二幅", "第一幅", "第二幅"],
    ],
)
print(f"层次化索引后的 goods_in:\n{goods_in}")
print("-" * 20)

# Show the index object itself.
goods_in_indexes = goods_in.index
print(f"goods_in_indexes:\n{goods_in_indexes}")

 选取数据子集。

In [None]:
# An outer-level label alone selects every row under that auction site.
goods_in_second = goods_in.loc["第二拍卖现场"]
print(f"goods_in_second:\n{goods_in_second}")
print("-" * 20)

# An (outer, inner) pair of labels narrows the selection to a single row.
goods_in_second_first = goods_in.loc[("第二拍卖现场", "第一幅")]
print(f"goods_in_second_first:\n{goods_in_second_first}")

 通过 unstack() 方法将二级行索引转换为二级列索引。
 还可通过 stack() 方法进行逆向操作。

In [None]:
print(f"original goods_in:\n{goods_in}")
print("-" * 20)

# unstack() pivots the innermost row-index level into the column labels.
goods_stack = goods_in.unstack()
print(f"unstacked goods_in:\n{goods_stack}")
print("-" * 20)

# stack() is the inverse operation. Apply it exactly once: the original code
# stacked the already re-stacked frame a second time while printing, which
# also folded the column names into the index and printed a Series instead
# of the restored frame.
goods_stack = goods_stack.stack()
print(f"stacked goods_in:\n{goods_stack}")

## 3.2 Pandas 数据结构中的基本数据操作
### 3.2.1 重新索引

In [None]:
from pandas import DataFrame

# Quotes for three car models. The highest-quote key is spelled "最高报价"
# (no embedded space) so the column name matches the other cells that
# build the same table.
data = {
    "车名": ["奥迪Q5L", "哈弗H6", "奔驰GLC"],
    "最低报价": [38.78, 9.80, 39.48],
    "最高报价": [49.80, 14.10, 58.78],
}
goods_in = DataFrame(data, index=["第一辆车", "第二辆车", "第三辆车"])
print(f"goods_in:\n{goods_in}")
print("-" * 20)

# reindex() returns a new frame whose rows follow the given label order;
# goods_in itself is left untouched.
other_goods = goods_in.reindex(["第三辆车", "第二辆车", "第一辆车"])
print(f"other_goods:\n{other_goods}")

缺少数据时的填充：可用 fill_value 指定固定值，也可用 method="ffill"（前向填充）或 method="bfill"（后向填充）。

In [None]:
# "第四辆车" is not in the original index; fill_value supplies the value
# stored in every cell of that new row.
other_goods = goods_in.reindex(
    index=["第三辆车", "第二辆车", "第一辆车", "第四辆车"],
    fill_value=7.9,
)
print(f"直接填充数字后的 other_goods:\n{other_goods}")
print("-" * 20)

# Rebuild the frame with an integer index for the ffill / bfill examples.
goods_in = DataFrame(data=data, index=[1, 2, 3])

# Forward fill: the new label 4 is filled from the row before it.
other_goods = goods_in.reindex(index=[1, 2, 3, 4], method="ffill")
print(f"使用 ffill 方法填充后的 other_goods:\n{other_goods}")
print("-" * 20)

# Backward fill: the new label 4 is filled from the row after it, if any.
other_goods = goods_in.reindex(index=[1, 2, 4, 3], method="bfill")
print(f"使用 bfill 方法填充后的 other_goods:\n{other_goods}")

### 3.2.2 删除指定轴上的项

In [None]:
from pandas import DataFrame

data = {
    "车名": ["奥迪Q5L", "哈弗H6", "奔驰GLC"],
    "最低报价": [38.78, 9.80, 39.48],
    "最高报价": [49.80, 14.10, 58.78],
}
goods_in = DataFrame(data=data, index=[1, 2, 3])
print(f"goods_in:\n{goods_in}")
print("-" * 20)

# drop() returns a new frame, so the result is bound back to goods_in.
# Remove the row labelled 2.
goods_in = goods_in.drop(index=2)
print(f"goods_in after dropping row 2:\n{goods_in}")
print("-" * 20)

# Remove the lowest-quote column.
goods_in = goods_in.drop(columns="最低报价")
print(f"goods_in after dropping column '最低报价':\n{goods_in}")

### 3.2.3 算术运算和数据对齐
非共有索引项会显示 NaN。

In [None]:
from pandas import DataFrame

# Nested dicts: outer keys become columns, inner keys become row labels.
# The second kindergarten has an extra class, "3班".
kindergarden1 = {
    "小朋友数目": {"1班": 32, "2班": 20},
    "小朋友睡床": {"1班": 40, "2班": 30},
    "上课教室": {"1班": 3, "2班": 2},
}
kindergarden2 = {
    "小朋友数目": {"1班": 10, "2班": 21, "3班": 15},
    "小朋友睡床": {"1班": 11, "2班": 21, "3班": 16},
    "上课教室": {"1班": 1, "2班": 2, "3班": 2},
}
kindergarden_dataframe1 = DataFrame(data=kindergarden1)
kindergarden_dataframe2 = DataFrame(data=kindergarden2)

# Plain "+" aligns on row and column labels; "3班" exists on one side only,
# so its cells come out as NaN.
kindergarden_all1 = kindergarden_dataframe1 + kindergarden_dataframe2
print(f"all kindergarden data using “+”:\n{kindergarden_all1}")
print("-" * 20)

# add(..., fill_value=0) treats a cell missing on one side as 0, so "3班"
# keeps the second kindergarten's numbers.
kindergarden_all2 = kindergarden_dataframe1.add(
    kindergarden_dataframe2,
    fill_value=0,
)
print(f"all kindergarden data using “add”:\n{kindergarden_all2}")

 定义数据时使用 DataFrame 和Series 类型，可直接使用 “+” 合并数据。

In [None]:
from pandas import DataFrame, Series

# Two rows of per-class figures ...
kindergarden1 = {
    "小朋友数目": [32, 20],
    "小朋友睡床": [40, 30],
    "上课教室": [3, 2],
}
# ... and one value per column to add to every row.
kindergarden2 = {
    "小朋友数目": 16,
    "小朋友睡床": 19,
    "上课教室": 2,
}
kindergarden_dataframe1 = DataFrame(data=kindergarden1)
kindergarden_series1 = Series(data=kindergarden2)

# The Series index is matched against the frame's column labels and the
# Series is added to each row.
kindergarden_all = kindergarden_dataframe1 + kindergarden_series1
print(f"combined dataframe:\n{kindergarden_all}")

## 3.3 数据处理
### 3.3.1 判断缺失数据

In [None]:
from pandas import DataFrame
import numpy as np

# Car quotes with one missing lowest quote and one missing highest quote.
data = {
    "车名": ["奥迪Q5L", "哈弗H6", "奔驰GLC"],
    "最低报价": [np.nan, 9.80, 20.00],
    "最高报价": [49.80, np.nan, 58.78],
}
goods_in = DataFrame(data=data, index=[1, 2, 3])
print(f"original data:\n{goods_in}")
print("-" * 20)

# Cell-by-cell boolean mask: True wherever a value is missing.
goods_in_isnull = goods_in.isna()
print(f"查找 nan 单元格:\n{goods_in_isnull}")
print("-" * 20)

# Use the mask of a single column to keep only the rows where it is missing.
goods_in_isnull = goods_in.loc[goods_in["最低报价"].isna()]
print(f"查找最低报价为 nan 的行:\n{goods_in_isnull}")

### 3.3.2 删除缺失数据

In [None]:
from pandas import DataFrame
import numpy as np

data = {
    "车名": ["奥迪Q5L", "哈弗H6", "奔驰GLC"],
    "最低报价": [np.nan, 9.80, 20.00],
    "最高报价": [49.80, np.nan, 58.78],
}
goods_in = DataFrame(data=data, index=[1, 2, 3])
print(f"original data:\n{goods_in}")
print("-" * 20)

# Drop every row that contains at least one NaN.
goods_in_nonull = goods_in.dropna(axis=0)
print(f"剔除 nan 值所在的行:\n{goods_in_nonull}")
print("-" * 20)

# Drop every column that contains at least one NaN.
goods_in_nonull = goods_in.dropna(axis="columns")
print(f"剔除 nan 值所在的列:\n{goods_in_nonull}")

### 3.3.3 填充缺失数据

In [None]:
from pandas import DataFrame
import numpy as np

data = {
    "车名": ["奥迪Q5L", "哈弗H6", "奔驰GLC"],
    "最低报价": [np.nan, 9.80, np.nan],
    "最高报价": [49.80, 23.10, np.nan],
}
goods_in = DataFrame(data=data, index=[1, 2, 3])

# One scalar: every missing cell receives the same replacement value.
goods_in_nonull = goods_in.fillna(value=999.99)
print(f"统一填充后的数据:\n{goods_in_nonull}")
print("-" * 20)

# A dict keyed by column name: each column gets its own replacement value.
goods_in_fill = goods_in.fillna(value={"最低报价": 0.00, "最高报价": 999.99})
print(f"按列分别填充后的数据:\n{goods_in_fill}")

### 3.3.4 移除重复数据
查找重复数据：与前面已出现的行完全相同的行显示为 True，否则显示为 False。  
可根据以上查找结果删除重复项，默认保留第一个出现的项，参数 keep="last" 则保留最后一个出现的项。

In [None]:
from pandas import DataFrame

# Rows 3 and 4 repeat rows 0 and 1 exactly.
data = {
    "车名": ["奥迪Q5L", "哈弗H6", "奔驰GLC", "奥迪Q5L", "哈弗H6"],
    "最低报价": [9.80, 14.35, 15.42, 9.80, 14.35],
    "最高报价": [49.80, 23.10, 60.45, 49.80, 23.10],
}
dataFrame = DataFrame(data=data)
print(f"original dataFrame:\n{dataFrame}")
print("-" * 20)

# Boolean Series flagging each row that repeats an earlier row.
dataFrameDuplicated = dataFrame.duplicated()
print(f"finding duplicated data in dataFrame:\n{dataFrameDuplicated}")
print("-" * 20)

# keep="last" retains the final occurrence of each repeated row.
dataFrameDropDuplicated = dataFrame.drop_duplicates(keep="last")
print(f"dropping duplicated data in dataFrame:\n{dataFrameDropDuplicated}")

### 3.3.5 替换数据

In [None]:
from pandas import DataFrame
import numpy as np

data = {
    "车名": ["奥迪Q5L", "哈弗H6", "奔驰GLC", "奥迪Q5L", "哈弗H6"],
    "最低报价": [9.80, 14.35, 15.42, 9.80, np.nan],
    "最高报价": [49.80, 23.10, np.nan, 49.80, 23.10],
}
dataFrame = DataFrame(data=data)
print(f"original dataFrame:\n{dataFrame}")
print("-" * 20)

# A dict maps each old value to its replacement: missing quotes become the
# text "无报价" and every 9.80 becomes 88.88.
dataFrameReplaced = dataFrame.replace(
    {
        np.nan: "无报价",
        9.80: 88.88,
    }
)
print(f"dataFrame after Replaced:\n{dataFrameReplaced}")

### 3.3.6 排列和随机采样
从 dataFrame 提取行(随机排列)。  
随机提取后采样。

In [None]:
from pandas import DataFrame
import numpy as np

data = {
    "车名": ["奥迪Q5L", "哈弗H6", "奔驰GLC", "奥迪Q5L", "哈弗H6"],
    "最低报价": [9.80, 14.35, 15.42, 9.80, 14.35],
    "最高报价": [49.80, 23.10, 60.45, 49.80, 23.10],
}
dataFrame = DataFrame(data, index=["A", "B", "C", "D", "E"])
print(f"original dataFrame:\n{dataFrame}")
print("-" * 20)

# Passing the frame itself to permutation() shuffles its rows but returns a
# plain NumPy array, so the row and column labels are lost.
dataFramePermutated = np.random.permutation(dataFrame)
print(f"dataFrame after permutation:\n{dataFramePermutated}")
print("-" * 20)

# Shuffling the row positions and feeding them to take() keeps the result a
# DataFrame with its labels intact.
dataFramePermutated1 = dataFrame.take(np.random.permutation(len(dataFrame)))
print(f"dataFrame after permutation:\n{dataFramePermutated1}")
print("-" * 20)

# Random sample of 2 rows without replacement: shuffle ALL row positions and
# keep the first two. permutation(2) would only ever yield positions 0 and 1,
# i.e. always rows "A" and "B".
dataFramePermutated2 = dataFrame.take(np.random.permutation(len(dataFrame))[:2])
print(f"dataFrame after permutation for 2 rows:\n{dataFramePermutated2}")


## 3.4 方法的应用与映射
以将数据归一化为例。

In [None]:
from pandas import DataFrame

data = {
    "车名": ["奥迪Q5L", "哈弗H6", "奔驰GLC"],
    "最低报价": [38.78, 9.80, 39.48],
    "最高报价": [49.80, 14.10, 58.78],
}
dataFrame = DataFrame(data=data, index=["A", "B", "C"])
print(f"原始数据:\n{dataFrame}")
print("-" * 20)


def f(x):
    """Min-max scale `x`: subtract its minimum, divide by (max - min)."""
    return (x - x.min()) / (x.max() - x.min())


# apply() calls f once per selected column; the scaled values are written
# back over the two quote columns while "车名" is left as it is.
price_columns = ["最低报价", "最高报价"]
dataFrame[price_columns] = dataFrame[price_columns].apply(f)
print(f"报价归一化:\n{dataFrame}")