# 6 Pandas的函数应用

In [10]:
import numpy as np
import pandas as pd

# numpy ufunc函数，randn跟的是维数
df = pd.DataFrame(np.random.randn(5, 4) - 1)
print(df)
print("-" * 100)
print(np.abs(df))
print("-" * 100)
# apply默认作用在列上，x是每一列，因为axis=0
print(df.apply(lambda x: x.max()))
print("-" * 100)
print(df.apply(lambda x: x.max(), axis=1))  # axis=1表示作用在行上
print("-" * 100)
print(df.applymap(lambda x: '%.2f' % x))  # applymap作用在每个元素上

          0         1         2         3
0 -2.234334 -2.094441  0.606715 -1.287387
1  0.314688 -1.407676 -2.420805 -1.193059
2 -2.077430 -2.568874 -0.582940 -2.359218
3 -2.177423 -1.868837 -0.436159 -1.172174
4 -1.228451 -3.575611 -0.433009 -2.772940
----------------------------------------------------------------------------------------------------
          0         1         2         3
0  2.234334  2.094441  0.606715  1.287387
1  0.314688  1.407676  2.420805  1.193059
2  2.077430  2.568874  0.582940  2.359218
3  2.177423  1.868837  0.436159  1.172174
4  1.228451  3.575611  0.433009  2.772940
----------------------------------------------------------------------------------------------------
0    0.314688
1   -1.407676
2    0.606715
3   -1.172174
dtype: float64
----------------------------------------------------------------------------------------------------
0    0.606715
1    0.314688
2   -0.582940
3   -0.436159
4   -0.433009
dtype: float64
-------------------------------------

# 6.4 索引排序（不重要）

In [20]:
# Series
print(np.random.randint(5, size=5))
print("-" * 100)
s4 = pd.Series(range(5), index=np.random.randint(5, size=5))
print(s4)
print("-" * 100)
# 索引排序，sort_index返回一个新的排好索引的Series
print(s4.sort_index())
print("-" * 100)
print(s4)
print("-" * 100)
print(s4.iloc[0:3])

[3 3 3 1 4]
----------------------------------------------------------------------------------------------------
2    0
0    1
3    2
0    3
0    4
dtype: int64
----------------------------------------------------------------------------------------------------
0    1
0    3
0    4
2    0
3    2
dtype: int64
----------------------------------------------------------------------------------------------------
2    0
0    1
3    2
0    3
0    4
dtype: int64
----------------------------------------------------------------------------------------------------
2    0
0    1
3    2
dtype: int64


In [24]:
# DataFrame
df4 = pd.DataFrame(np.random.randn(5, 5),
                   index=np.random.randint(5, size=5),
                   columns=np.random.randint(5, size=5))
print(df4)
print("-" * 100)
# 轴零时行索引排序
df4_isort = df4.sort_index(axis=0, ascending=False)  # ascending=False表示降序
print(df4_isort)
# 轴一时列索引排序
df4_csort = df4.sort_index(axis=1, ascending=False)
print(df4_csort)

          0         3         4         2         3
2 -0.173916  1.224009  0.458904  1.171089  0.217429
2 -0.444430  0.129921  0.285294 -0.860084 -0.171724
4  0.817999  1.644636  0.524371 -0.588175 -1.294124
2  1.081619  0.150042  0.803723 -1.313770  0.563771
0 -1.910546 -0.122703  0.097040  0.223301 -0.378628
----------------------------------------------------------------------------------------------------
          0         3         4         2         3
4  0.817999  1.644636  0.524371 -0.588175 -1.294124
2 -0.173916  1.224009  0.458904  1.171089  0.217429
2 -0.444430  0.129921  0.285294 -0.860084 -0.171724
2  1.081619  0.150042  0.803723 -1.313770  0.563771
0 -1.910546 -0.122703  0.097040  0.223301 -0.378628
          4         3         3         2         0
2  0.458904  1.224009  0.217429  1.171089 -0.173916
2  0.285294  0.129921 -0.171724 -0.860084 -0.444430
4  0.524371  1.644636 -1.294124 -0.588175  0.817999
2  0.803723  0.150042  0.563771 -1.313770  1.081619
0  0.097040 -0.

# # 6.5 按值排序（机器学习，深度学习不重要，数据分析才需要）

In [27]:
# 按值排序，by后是column的值
import random

l = [random.randint(0, 100) for i in range(24)]
df4 = pd.DataFrame(np.array(l).reshape(6, 4))
print(df4)
print("-" * 100)
df4_vsort = df4.sort_values(by=3, axis=0, ascending=False)  # 按第四列的值排序，降序
print(df4_vsort)
df4_vsort = df4.sort_values(by=2, axis=1, ascending=True)  # 按第三列的值排序，降序

    0   1   2   3
0  15  36  58  52
1   4  86  10  24
2   3  56  63  57
3  25  16  38  51
4  91  15  45  79
5  53  81  39  99
----------------------------------------------------------------------------------------------------
    0   1   2   3
5  53  81  39  99
4  91  15  45  79
2   3  56  63  57
0  15  36  58  52
3  25  16  38  51
1   4  86  10  24


# 6.6 处理缺失数据（重要）

In [37]:
df_data = pd.DataFrame([np.random.randn(3), [1, 2., np.nan], [np.nan, 4., np.nan], [1., 2., 3.]])
print(df_data)
print("-" * 100)
print(df_data.iloc[2, 0])
print("-" * 100)
print(df_data.isnull())
print("-" * 100)
# 计算df_data的缺失率
print(df_data.isnull().sum() / len(df_data))
print("-" * 100)
# 计算df_data的行的缺失率
print(df_data.isnull().sum(axis=1) / len(df_data.columns))

          0         1         2
0  0.000571 -1.220256 -1.153879
1  1.000000  2.000000       NaN
2       NaN  4.000000       NaN
3  1.000000  2.000000  3.000000
----------------------------------------------------------------------------------------------------
nan
----------------------------------------------------------------------------------------------------
       0      1      2
0  False  False  False
1  False  False   True
2   True  False   True
3  False  False  False
----------------------------------------------------------------------------------------------------
0    0.25
1    0.00
2    0.50
dtype: float64
----------------------------------------------------------------------------------------------------
0    0.000000
1    0.333333
2    0.666667
3    0.000000
dtype: float64


# 删除缺失数据

In [41]:
# 默认一个样本，任何一个特征缺失，就删除
# inplace=True表示直接修改df_data
# subset=[0]是按第一列来删除，第一列有空值就删除对应的行
print(df_data)
print("-" * 100)
print(df_data.dropna(subset=[0]))
print("-" * 100)
print(df_data.dropna(axis=1))

          0         1         2
0  0.000571 -1.220256 -1.153879
1  1.000000  2.000000       NaN
2       NaN  4.000000       NaN
3  1.000000  2.000000  3.000000
----------------------------------------------------------------------------------------------------
          0         1         2
0  0.000571 -1.220256 -1.153879
1  1.000000  2.000000       NaN
3  1.000000  2.000000  3.000000
----------------------------------------------------------------------------------------------------
          1
0 -1.220256
1  2.000000
2  4.000000
3  2.000000


# 填充缺失数据

In [46]:
# 均值，中位数，众数填充
# 给零列的空值填为-100，按特征（按列）去填充
print(df_data.iloc[:, 0].fillna(-100.))
print("-" * 100)
print(df_data)
print("-" * 100)
# 依次拿每一行
for i in df_data.columns:
    print(df_data.loc[:, i])

0      0.000571
1      1.000000
2   -100.000000
3      1.000000
Name: 0, dtype: float64
----------------------------------------------------------------------------------------------------
          0         1         2
0  0.000571 -1.220256 -1.153879
1  1.000000  2.000000       NaN
2       NaN  4.000000       NaN
3  1.000000  2.000000  3.000000
----------------------------------------------------------------------------------------------------
0    0.000571
1    1.000000
2         NaN
3    1.000000
Name: 0, dtype: float64
0   -1.220256
1    2.000000
2    4.000000
3    2.000000
Name: 1, dtype: float64
0   -1.153879
1         NaN
2         NaN
3    3.000000
Name: 2, dtype: float64


In [48]:
# df_data.iloc[:, 0].fillna(-100., inplace=True) # inplace = True表示直接修改df_data，但是inplace不推荐使用

df_data.iloc[:, 2] = df_data.iloc[:, 2].fillna(df_data.iloc[:, 2].mean())  # 用均值填充空值
print(df_data)

          0         1         2
0  0.000571 -1.220256 -1.153879
1  1.000000  2.000000  0.923061
2       NaN  4.000000  0.923061
3  1.000000  2.000000  3.000000
