# 9 Pandas分组与聚合

# 设置索引

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Individual index entries are immutable, but the index as a whole can be replaced.
# Demo frame: two integer columns, a two-valued label column and a letter column.
a = pd.DataFrame(
    {
        'a': range(7),
        'b': range(7, 0, -1),
        'c': ['one'] * 3 + ['two'] * 4,
        'd': list("hjklmno"),
    }
)
a

In [None]:
# Keep an independent snapshot so the original integer index stays visible.
c = a.copy()
# Assigning to a.index swaps out the whole index of a (one new label per row).
a.index = list('abcdefg')
# a.columns = list('ABCD') would replace the column labels in the same way.
# Print the untouched copy, a separator line, then the re-indexed frame.
print(c, '-' * 50, a, sep='\n')

In [None]:
# Shape of the ndarray underlying the copied frame, as (rows, columns).
c.values.shape

In [None]:
# Turn a column into the index: here column 'c' becomes the row index of a.
# Show the frame before the change, followed by a separator line.
print(a, '-' * 50, sep='\n')
# inplace=True modifies a itself and returns None, so nothing is reassigned.
# Note: re-running this cell fails because 'c' is then no longer a column.
a.set_index('c', inplace=True)
a

In [None]:
# Column labels of a; 'c' is no longer among them once it has been moved into the index.
a.columns

In [None]:
# The index of a, which holds the former 'c' column values after set_index.
a.index

### 时间序列

In [None]:
# With start and end given and no freq, date_range produces daily timestamps
# from 2019-01-01 through 2019-02-01.
pd.date_range(start="20190101", end="20190201")

In [None]:
# Ten business days (freq='B' skips weekends), counted from 2025-01-07.
pd.date_range(start="20250107",periods=10,freq='B')

In [None]:
# Ten month-end dates (freq='ME'), counted from 2019-01-01.
pd.date_range(start="20190101",periods=10,freq='ME')

In [None]:
# Ten month-start dates (freq='MS'), counted from 2019-01-01.
pd.date_range(start="20190101",periods=10,freq='MS')

In [None]:
pd.date_range(start="20230710",periods=10,freq='W')  # weekly frequency: one date per week, on Sundays

In [None]:
# Fifteen date strings: the same three month/day/year values repeated five times.
s = pd.Series(
    5 * [
        '3/11/2000',
        '3/12/2000',
        '3/13/2000',
    ]
)
s

In [None]:
# to_datetime converts the date strings into datetime values.
# (Prefix the call with the %timeit magic to measure how long the conversion takes.)
pd.to_datetime(s)

In [None]:
# Hands-on example: US 911 emergency-call data.
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np

df = pd.read_csv("./911.csv")
# info() prints its report itself and returns None, so wrapping it in print()
# would only add a stray "None" line to the output.
df.info()

# Parse the timestamp strings into datetimes so the column can act as a time index.
df["timeStamp"] = pd.to_datetime(df["timeStamp"])

# Add a category column: the text in front of ": " in each title
# (e.g. EMS, Fire, Traffic). The vectorised .str accessor stays aligned with
# df's rows and yields NaN for a missing title instead of raising.
df["cate"] = df["title"].str.split(": ").str[0]

# Use the timestamps as the index; inplace=True modifies df directly.
df.set_index("timeStamp", inplace=True)

df.head(10)

In [None]:
plt.figure(figsize=(20, 8), dpi=80)

# One line per call category on a shared figure.
# resample() works like a time-based groupby and needs the datetime index of
# df; "MS" buckets the rows by month start (downsampling).
for group_name, group_data in df.groupby(by="cate"):
    # Select the column first so only "title" is counted, not every column.
    count_by_month = group_data["title"].resample("MS").count()
    print(count_by_month)

    _x = count_by_month.index
    print(_x)
    _y = count_by_month.values  # number of calls in each month

    # Year-month-day strings put the months on a categorical x axis.
    _x = [i.strftime("%Y%m%d") for i in _x]

    plt.plot(_x, _y, label=group_name)

# The categorical axis already carries one tick per month label, so only the
# rotation is set here. Re-labelling the ticks from the last group's months
# would mislabel the axis whenever groups cover different months, and would
# raise NameError if there were no groups at all.
plt.xticks(rotation=45)
plt.legend(loc="best")
plt.show()

### 分组运算

In [None]:
import numpy as np

# Sample data for the group-by demos (grouped results later get a name prefix):
# two key columns plus two columns of random integers in the range 1..9.
dict_obj = {
    'key1': list('abababaa'),
    'key2': 'one one two three two two one three'.split(),
    'data1': np.random.randint(1, 10, size=8),
    'data2': np.random.randint(1, 10, size=8),
}
df_obj = pd.DataFrame(dict_obj)
# Print the frame followed by a separator line.
print(df_obj, '-' * 50, sep='\n')


In [None]:
# Print a summary of the sample frame (columns, dtypes, non-null counts).
df_obj.info()

In [None]:
# Group by key1 and take the mean of the numeric columns (data1, data2),
# then prefix the resulting column names with 'mean_'.
# The result has one row per key1 value; it is not merged back into df_obj here.
k1_sum = (
    df_obj
    .groupby('key1')
    .mean(numeric_only=True)
    .add_prefix('mean_')
)
print(k1_sum)

In [None]:
# Approach 2: transform() returns the group means in the shape of the input
# rows, so the result lines up with df_obj row by row.
k1_sum_tf = (
    df_obj[['key1', 'data1', 'data2']]
    .groupby('key1')
    .transform('mean')
    .add_prefix('mean_')
)
k1_sum_tf
# The prefixed columns could then be attached to the original frame:
# df_obj[k1_sum_tf.columns] = k1_sum_tf
# print(df_obj)

In [None]:
# Remove the key2 column from df_obj in place, leaving key1 and the numeric columns.
df_obj.drop(columns='key2', inplace=True)

In [None]:
# Broadcast each key1 group's column means back onto the rows of that group.
# The string alias 'mean' selects pandas' built-in group mean; passing the
# NumPy callable np.mean here is deprecated and emits a FutureWarning in
# recent pandas releases.
df_obj.groupby('key1').transform('mean')

In [None]:
# Display df_obj again: transform() returned a new frame, so only the earlier
# removal of key2 has changed it.
df_obj

In [None]:
# For groups a and b: find which values lie above and which lie below their group's mean.
def diff_mean(s):
    """Return each value's deviation from the mean of its group.

    ``s`` is the data of a single group, as passed in by ``groupby(...).transform``.
    Positive results are above the group mean, negative results below it.
    """
    group_mean = s.mean()
    return s - group_mean

# Apply diff_mean within each key1 group and print the per-row deviations.
print(df_obj.groupby('key1').transform(diff_mean))