## 3.1 Pandas 数据结构的介绍
### 3.1.2 Series 数据类型的运算

In [None]:
from pandas import Series

# Two deliveries keyed by fruit name; "李子" and "西瓜" each occur in only one.
first_delivery = {"苹果": 30, "梨": 25, "香蕉": 20, "桃": 21, "李子": 15}
second_delivery = {"苹果": 10, "梨": 20, "香蕉": 15, "桃": 10, "西瓜": 50}
goods_in = Series(first_delivery)
goods_other_in = Series(second_delivery)

# Element-wise addition; the two Series are matched up by index label.
goods_kucun = goods_in.add(goods_other_in)
print(f"库存:\n[{goods_kucun}]")
print("注：一个 Series 中有索引而另一个没有，相加后结果为 NAN （表示空值）")

In [None]:
import numpy as np

# Stock levels by fruit; the quantity for "李子" starts out as NaN.
fruit_names = ["苹果", "梨", "香蕉", "桃", "李子"]
stock_levels = [30, 25, 20, 21, np.nan]
goods = Series(stock_levels, index=fruit_names)
print(f"original Series:\n{goods}")

# Overwrite the missing entry, addressing it by its label.
goods.loc["李子"] = 15
print(f"Series after assigning new values:\n{goods}")

### 3.1.3 DataFrame 数据结构

In [None]:
from pandas import DataFrame

# Auction catalogue: one list per column, rows aligned by position.
data = {
    "字画名称": ["旭日东升", "富水长流", "招财进宝", "鸿运当头"],
    "字画底价": [2860, 498, 1068, 598],
    "字画拍卖加价": [1000, 2000, 500, 1500],
}

df = DataFrame(data)
print(f"原始数据:\n{df}")
print("-" * 20)

# Passing `columns` sets the column order of the resulting frame.
swapped_order = ["字画名称", "字画拍卖加价", "字画底价"]
df1 = DataFrame(data, columns=swapped_order)
print(f"交换列顺序:\n{df1}")

DataFrame最终是按照columns指定的顺序排列的。如果传入的列名在数据中是无法找到的,就会产生NaN值。  

In [None]:
# Ask for a column ("字画所属人") that has no entry in `data`.
columns_with_owner = ["字画名称", "字画拍卖加价", "字画底价", "字画所属人"]
df2 = DataFrame(data, columns=columns_with_owner)
print(f"插入无数据的新列:\n{df2}")

指定列名后,可以通过类似字典标记的方式将DataFrame中的某一列获取为一个Series结构。  
代码中定义了字画的DataFrame(df3)之后,直接通过df3["字画底价"]来访问“字画底价”这一列的数据,获取的是一个Series结构的数据。  
返回的Series拥有与原DataFrame相同的索引,且其索引值也已经被相应地设置好了。

In [None]:
# Give every painting a row label, then pull one column out by name.
df3 = DataFrame(data, index=["第一幅", "第二幅", "第三幅", "第四幅"])
data_price = df3["字画底价"]
# Label corrected from "低价" to "底价" so it matches the column being shown.
print(f"仅列出字画底价列:\n{data_price}")

可以通过位置和名称的方式访问行。  
使用df3.loc["第三幅"] 来获取行数据,其中loc是按标签(索引名称)选取的访问器,“第三幅”就是索引的名称;按整数位置选取则使用iloc。


In [None]:
# Fetch a single row of df3 through .loc, addressed by its index label.
third_label = "第三幅"
data_3 = df3.loc[third_label]
print(f"仅列出“第三幅”所在行的数据:\n{data_3}")

利用标签的切片运算也可以获取多行和多列。  

In [None]:
# Select several rows and several columns at once, both by label.
wanted_rows = ["第三幅", "第四幅"]
wanted_columns = ["字画名称", "字画底价"]
data_4 = df3.loc[wanted_rows, wanted_columns]
print(f"沿axis=1的方向切片:\n{data_4}")

对DataFrame数据的选取也可以通过布尔型数组实现。  

In [None]:
# A boolean mask built from a column comparison selects the matching rows.
reserve_above_500 = df3["字画底价"] > 500
data_5 = df3.loc[reserve_above_500, :]
print(f"筛选出字画底价大于500的作品:\n{data_5}")
print("-" * 20)

# "&" combines two masks element-wise, so both conditions must hold.
markup_above_1000 = df3["字画拍卖加价"] > 1000
data_6 = df3.loc[reserve_above_500 & markup_above_1000, :]
print(f"筛选出字画底价大于500，字画拍卖加价大于1000的作品:\n{data_6}")

### 3.1.4 DataFrame 数据的修改

In [None]:
from pandas import DataFrame

# Auction catalogue; "字画所属人" is requested as a column but has no data yet.
data = {
    "字画名称": ["旭日东升", "富水长流", "招财进宝", "鸿运当头"],
    "字画底价": [2860, 498, 1068, 598],
    "字画拍卖加价": [1000, 2000, 500, 1500],
}
df = DataFrame(
    data, columns=["字画名称", "字画底价", "字画拍卖加价", "字画所属人"]
)
print(f"原始数据:\n{df}")
print("-" * 20)

# Assigning a scalar fills the whole column with that value.
df["字画所属人"] = "张三"
# Label corrected: "所熟人" was a typo for "所属人".
print(f"填充“张三”为字画所属人:\n{df}")
print("-" * 20)

# Assigning a list sets one value per row; its length matches the row count.
df["字画所属人"] = ["张三", "李四", "王五", "赵六"]
print(f"分别为每条数据添加字画所属人:\n{df}")

可通过del DataFrame名["列名"]格式来进行删除操作。

In [None]:
# Remove the owner column from df in place with the del statement.
owner_column = "字画所属人"
del df[owner_column]
print(f"删除“字画所属人”列:\n{df}")

### 3.1.5 DataFrame 中的索引对象
注意：索引对象不可修改。

### 3.1.6 层次化索引
层次化索引使用户能在一个轴上拥有多个索引级别，即能以低维度形式处理高维度数据。

In [None]:
from pandas import DataFrame

data = {
    "字画名称": ["旭日东升", "富水长流", "招财进宝", "鸿运当头"],
    "字画底价": [2860, 498, 1068, 598],
    "字画拍卖加价": [1000, 2000, 500, 1500],
}

# Two parallel label lists give the rows a two-level index:
# (auction site, painting slot within that site).
auction_sites = [
    "第一拍卖现场",
    "第一拍卖现场",
    "第二拍卖现场",
    "第二拍卖现场",
]
painting_slots = ["第一幅", "第二幅", "第一幅", "第二幅"]
df = DataFrame(data, index=[auction_sites, painting_slots])
print(f"层次化索引后的 dataframe:\n{df}")
print("-" * 20)

# Show the index object that was built from the two lists.
df_indexes = df.index
print(f"goods_in_indexes:\n{df_indexes}")

 选取数据子集。

In [None]:
# A single outer-level label selects the rows filed under that auction site.
second_site = "第二拍卖现场"
df_second = df.loc[second_site]
print(f"仅选取第二拍卖现场的数据:\n{df_second}")
print("-" * 20)

# An (outer, inner) label pair narrows the selection to one painting.
site_and_slot = ("第二拍卖现场", "第一幅")
df_second_first = df.loc[site_and_slot]
print(f"仅选取第二拍卖现场第一幅画的数据:\n{df_second_first}")

 通过 unstack() 方法将二级行索引转换为二级列索引。
 还可通过 stack() 方法进行逆向操作。

In [None]:
print(f"原始数据:\n{df}")
print("-" * 20)

# unstack() pivots the innermost row-index level into the column labels.
df_stack = df.unstack()
print(f"转换索引后的数据:\n{df_stack}")
print("-" * 20)

# stack() is the reverse operation. The result is stored once and printed
# from the variable; the original assigned df_stack but then recomputed
# df.unstack().stack() inside the print call.
df_stack = df_stack.stack()
print(f"二次转换索引后的数据:\n{df_stack}")

## 3.2 Pandas 数据结构中的基本数据操作
### 3.2.1 重新索引

In [None]:
from pandas import DataFrame

# Price list for three cars; one list per column.
data = {
    "车名": ["奥迪Q5L", "哈弗H6", "奔驰GLC"],
    "最低报价": [38.78, 9.80, 39.48],
    "最高报价": [49.80, 14.10, 58.78],
}
row_labels = ["第一辆车", "第二辆车", "第三辆车"]
df = DataFrame(data, index=row_labels)
print(f"原始数据:\n{df}")
print("-" * 20)

# Reindex with the same labels in reverse order to rearrange the rows.
other_df = df.reindex(index=row_labels[::-1])
print(f"调整索引后的数据:\n{other_df}")

缺少数据时的填充（固定值填充、前向填充和后向填充）。

In [None]:
# "第四辆车" is not in the original index; fill_value supplies its cells.
labels_with_extra = ["第三辆车", "第二辆车", "第一辆车", "第四辆车"]
other_df = df.reindex(labels_with_extra, fill_value=7.9)
print(f"直接填充数字后的数据:\n{other_df}")
print("-" * 20)

# Rebuild the frame on an integer index for the method-based fills below.
df = DataFrame(data, index=[1, 2, 3])

forward_target = [1, 2, 3, 4]
other_df = df.reindex(forward_target, method="ffill")
print(f"使用 ffill 方法填充后的数据:\n{other_df}")
print("-" * 20)

backward_target = [1, 2, 4, 3]
other_df = df.reindex(backward_target, method="bfill")
print(f"使用 bfill 方法填充后的数据:\n{other_df}")

### 3.2.2 删除指定轴上的项

In [None]:
from pandas import DataFrame

data = {
    "车名": ["奥迪Q5L", "哈弗H6", "奔驰GLC"],
    "最低报价": [38.78, 9.80, 39.48],
    "最高报价": [49.80, 14.10, 58.78],
}
df = DataFrame(data, index=[1, 2, 3])
print(f"原始数据:\n{df}")
print("-" * 20)

# Drop the row labelled 2 (drop returns a new frame, so rebind df).
df = df.drop(labels=2, axis="index")
print(f"删除第二行后的数据:\n{df}")
print("-" * 20)

# Drop a column by naming the column axis.
df = df.drop(labels="最低报价", axis="columns")
print(f"删除\"最低报价\"列后的数据:\n{df}")

### 3.2.3 算术运算和数据对齐
非共有索引项会显示 NaN。

In [None]:
from pandas import DataFrame

# Nested dicts: outer keys are columns, inner keys are row labels (classes).
# The second kindergarten has a "3班" row that the first one lacks.
kindergarden1 = {
    "小朋友数目": {"1班": 32, "2班": 20},
    "小朋友睡床": {"1班": 40, "2班": 30},
    "上课教室": {"1班": 3, "2班": 2},
}
kindergarden2 = {
    "小朋友数目": {"1班": 10, "2班": 21, "3班": 15},
    "小朋友睡床": {"1班": 11, "2班": 21, "3班": 16},
    "上课教室": {"1班": 1, "2班": 2, "3班": 2},
}
kindergarden_dataframe1 = DataFrame(kindergarden1)
kindergarden_dataframe2 = DataFrame(kindergarden2)

# Plain "+" adds the two frames cell by cell.
kindergarden_all1 = kindergarden_dataframe1 + kindergarden_dataframe2
print(f"all kindergarden data using “+”:\n{kindergarden_all1}")
print("-" * 20)

# add() with fill_value=0 treats a cell missing on one side as 0.
kindergarden_all2 = kindergarden_dataframe1.add(
    kindergarden_dataframe2,
    fill_value=0,
)
print(f"all kindergarden data using “add”:\n{kindergarden_all2}")

 定义数据时使用 DataFrame 和Series 类型，可直接使用 “+” 合并数据。

In [None]:
from pandas import DataFrame, Series

# A two-row frame and a Series whose labels match the frame's column names.
kindergarden1 = {
    "小朋友数目": [32, 20],
    "小朋友睡床": [40, 30],
    "上课教室": [3, 2],
}
kindergarden2 = {"小朋友数目": 16, "小朋友睡床": 19, "上课教室": 2}
kindergarden_dataframe1 = DataFrame(kindergarden1)
kindergarden_series1 = Series(kindergarden2)

# Add the Series to the frame, matching its labels against the columns.
kindergarden_all = kindergarden_dataframe1.add(
    kindergarden_series1, axis="columns"
)
print(f"combined dataframe:\n{kindergarden_all}")

## 3.3 数据处理
### 3.3.1 判断缺失数据

In [None]:
from pandas import DataFrame
import numpy as np

# Price list with one missing low quote and one missing high quote.
car_names = ["奥迪Q5L", "哈弗H6", "奔驰GLC"]
low_quotes = [np.nan, 9.80, 20.00]
high_quotes = [49.80, np.nan, 58.78]
data = {"车名": car_names, "最低报价": low_quotes, "最高报价": high_quotes}
df = DataFrame(data, index=[1, 2, 3])
print(f"original data:\n{df}")
print("-" * 20)

# Cell-by-cell boolean frame marking the missing entries.
df_isnull = df.isnull()
print(f"查找 nan 单元格:\n{df_isnull}")
print("-" * 20)

# Use the per-row mask of one column to keep only the rows where it is missing.
low_quote_missing = df["最低报价"].isnull()
df_isnull = df[low_quote_missing]
print(f"查找最低报价为 nan 的行:\n{df_isnull}")

### 3.3.2 删除缺失数据

In [None]:
from pandas import DataFrame
import numpy as np

# Price list with one missing low quote and one missing high quote.
car_names = ["奥迪Q5L", "哈弗H6", "奔驰GLC"]
low_quotes = [np.nan, 9.80, 20.00]
high_quotes = [49.80, np.nan, 58.78]
data = {"车名": car_names, "最低报价": low_quotes, "最高报价": high_quotes}
df = DataFrame(data, index=[1, 2, 3])
print(f"original data:\n{df}")
print("-" * 20)

# Row-wise: discard every row that contains a NaN.
df_nonull = df.dropna(axis="index")
print(f"剔除 nan 值所在的行:\n{df_nonull}")
print("-" * 20)

# Column-wise: discard every column that contains a NaN.
df_nonull = df.dropna(axis="columns")
print(f"剔除 nan 值所在的列:\n{df_nonull}")

### 3.3.3 填充缺失数据

In [None]:
from pandas import DataFrame
import numpy as np

# Price list with several missing quotes.
car_names = ["奥迪Q5L", "哈弗H6", "奔驰GLC"]
low_quotes = [np.nan, 9.80, np.nan]
high_quotes = [49.80, 23.10, np.nan]
data = {"车名": car_names, "最低报价": low_quotes, "最高报价": high_quotes}
goods_in = DataFrame(data, index=[1, 2, 3])

# One scalar replaces every missing cell.
goods_in_nonull = goods_in.fillna(value=999.99)
print(f"统一填充后的数据:\n{goods_in_nonull}")
print("-" * 20)

# A dict keyed by column name gives each column its own fill value.
per_column_fill = {"最低报价": 0.00, "最高报价": 999.99}
goods_in_fill = goods_in.fillna(value=per_column_fill)
print(f"按列分别填充后的数据:\n{goods_in_fill}")

### 3.3.4 移除重复数据
查找重复数据，与前面已出现的行完全相同的行显示为 True，否则显示为 False。  
可根据以上查找结果删除重复项，默认保留第一个出现的项，参数 keep="last" 则保留最后一个出现的项。

In [None]:
from pandas import DataFrame

# Rows 3 and 4 repeat rows 0 and 1 exactly.
car_names = ["奥迪Q5L", "哈弗H6", "奔驰GLC", "奥迪Q5L", "哈弗H6"]
low_quotes = [9.80, 14.35, 15.42, 9.80, 14.35]
high_quotes = [49.80, 23.10, 60.45, 49.80, 23.10]
data = {"车名": car_names, "最低报价": low_quotes, "最高报价": high_quotes}
dataFrame = DataFrame(data)
print(f"original dataFrame:\n{dataFrame}")
print("-" * 20)

# Boolean Series, one flag per row.
dataFrameDuplicated = dataFrame.duplicated(keep="first")
print(f"finding duplicated data in dataFrame:\n{dataFrameDuplicated}")
print("-" * 20)

# keep="last" retains the final occurrence of each repeated row.
dataFrameDropDuplicated = dataFrame.drop_duplicates(keep="last")
print(f"dropping duplicated data in dataFrame:\n{dataFrameDropDuplicated}")

### 3.3.5 替换数据

In [None]:
from pandas import DataFrame
import numpy as np

car_names = ["奥迪Q5L", "哈弗H6", "奔驰GLC", "奥迪Q5L", "哈弗H6"]
low_quotes = [9.80, 14.35, 15.42, 9.80, np.nan]
high_quotes = [49.80, 23.10, np.nan, 49.80, 23.10]
data = {"车名": car_names, "最低报价": low_quotes, "最高报价": high_quotes}
dataFrame = DataFrame(data)
print(f"original dataFrame:\n{dataFrame}")
print("-" * 20)

# Mapping of old value -> new value, applied across the whole frame.
replacements = {np.nan: "无报价", 9.80: 88.88}
dataFrameReplaced = dataFrame.replace(to_replace=replacements)
print(f"dataFrame after Replaced:\n{dataFrameReplaced}")

### 3.3.6 排列和随机采样
从 dataFrame 提取行(随机排列)。  
随机提取后采样。

In [None]:
from pandas import DataFrame
import numpy as np

data = {
    "车名": ["奥迪Q5L", "哈弗H6", "奔驰GLC", "奥迪Q5L", "哈弗H6"],
    "最低报价": [9.80, 14.35, 15.42, 9.80, 14.35],
    "最高报价": [49.80, 23.10, 60.45, 49.80, 23.10],
}
dataFrame = DataFrame(data, index=["A", "B", "C", "D", "E"])
print(f"original dataFrame:\n{dataFrame}")
print("-" * 20)

# Passing the frame itself to np.random.permutation (kept from the original).
# NOTE(review): the result is a NumPy array of shuffled rows, not a DataFrame,
# so the row labels and column names are not part of the printed output.
dataFramePermutated = np.random.permutation(dataFrame)
print(f"dataFrame after permutation:\n{dataFramePermutated}")
print("-" * 20)

# Shuffle all rows: permute the positions 0..len-1, then take rows in that
# order. The result stays a DataFrame.
shuffled_positions = np.random.permutation(len(dataFrame))
dataFramePermutated1 = dataFrame.take(shuffled_positions)
print(f"dataFrame after permutation:\n{dataFramePermutated1}")
print("-" * 20)

# Random sample of 2 rows: permute every row position and keep the first two.
# The original used np.random.permutation(2), which only ever reorders
# positions 0 and 1, so rows "C", "D" and "E" could never be drawn.
sample_positions = np.random.permutation(len(dataFrame))[:2]
dataFramePermutated2 = dataFrame.take(sample_positions)
print(f"dataFrame after permutation for 2 rows:\n{dataFramePermutated2}")


## 3.4 方法的应用与映射
以将数据归一化为例。

In [None]:
from pandas import DataFrame

data = {
    "车名": ["奥迪Q5L", "哈弗H6", "奔驰GLC"],
    "最低报价": [38.78, 9.80, 39.48],
    "最高报价": [49.80, 14.10, 58.78],
}
dataFrame = DataFrame(data, index=["A", "B", "C"])
print(f"原始数据:\n{dataFrame}")
print("-" * 20)


def f(x):
    """Min-max scale *x*: its minimum maps to 0 and its maximum to 1.

    Written as a named function rather than a lambda bound to a name
    (PEP 8, E731). The formula is unchanged.
    """
    return (x - x.min()) / (x.max() - x.min())


# apply() calls f once per selected column; the scaled columns are written
# back over the originals.
price_columns = ["最低报价", "最高报价"]
dataFrame[price_columns] = dataFrame[price_columns].apply(f)
print(f"报价归一化:\n{dataFrame}")

### 3.4.1 排序和排名
可按 index 或列的数据进行排序。

In [None]:
from pandas import DataFrame

car_names = ["奥迪Q5L", "哈弗H6", "奔驰GLC"]
low_quotes = [38.78, 9.80, 39.48]
high_quotes = [49.80, 14.10, 58.78]
data = {"车名": car_names, "最低报价": low_quotes, "最高报价": high_quotes}
dataFrame = DataFrame(data, index=["L车", "K车", "D车"])
print(f"原始数据：\n{dataFrame}")
print("-" * 20)

# Order the rows by their index labels.
dataFrame = dataFrame.sort_index(axis=0)
print(f"按车名排序后的数据：\n{dataFrame}")
print("-" * 20)

# Order the rows by one column's values, largest first.
dataFrame = dataFrame.sort_values("最高报价", ascending=False)
print(f"按最高报价排序后的数据：\n{dataFrame}")
print("-" * 20)

# Replace the values with descending ranks; method="min" is the tie rule.
dataFrame = dataFrame.rank(ascending=False, method="min")
print(f"按最高报价排名后的数据：\n{dataFrame}")

### 3.4.2 带有重复值的轴索引

In [None]:
from pandas import DataFrame

car_names = ["奥迪Q5L", "哈弗H6", "奔驰GLC", "奔驰GLC", "奥迪Q5L"]
low_quotes = [38.78, 9.80, 39.48, 39.48, 38.78]
high_quotes = [49.80, 14.10, 58.78, 58.78, 49.80]
data = {"车名": car_names, "最低报价": low_quotes, "最高报价": high_quotes}

# The label "一辆车" is used for three different rows.
row_labels = ["一辆车", "某辆车", "一辆车", "又一辆车", "一辆车"]
dataFrame = DataFrame(data, index=row_labels)
print(f"原始数据:\n{dataFrame}")

row_index = dataFrame.index
dataFrameIsUnique = row_index.is_unique
print(f"是否唯一索引: {dataFrameIsUnique}")
dataFrameUnique = row_index.unique()
print(f"去重后的索引: {dataFrameUnique}")

### 3.4.3 汇总和计算描述统计

In [None]:
from pandas import DataFrame

data ={"地址":["北京市","大兴区","黄村镇","卫星城"], "购物车内每件商品价格":[38.78,9.80,39.48,39.48]}