# 6 Pandas的函数应用

In [1]:
import pandas as pd
import numpy as np
# Numpy ufunc 函数，randn跟的是维数
df = pd.DataFrame(np.random.randn(5,4) - 1)
print(df)

print(np.abs(df)) #绝对值

          0         1         2         3
0 -0.552244 -0.865683 -0.712708 -1.785448
1  0.211144 -1.888578 -1.038154  0.157401
2 -0.595638  0.130912 -1.801095 -1.876519
3 -0.964640 -2.504971 -0.990287 -0.918304
4 -1.366872 -1.012090 -0.457426 -2.765415
          0         1         2         3
0  0.552244  0.865683  0.712708  1.785448
1  0.211144  1.888578  1.038154  0.157401
2  0.595638  0.130912  1.801095  1.876519
3  0.964640  2.504971  0.990287  0.918304
4  1.366872  1.012090  0.457426  2.765415


In [2]:
#apply默认作用在列上,x是每一列,因为axis=0，指定axis=1就是行
#apply是DataFrame 的方法，用于将指定的函数应用到每一列（默认 axis=0）或每一行（axis=1）上。
print(df.apply(lambda x : x.max()))

0    0.211144
1    0.130912
2   -0.457426
3    0.157401
dtype: float64


In [3]:
#apply作用在行上
print(df.apply(lambda x : x.max(), axis=1))

0   -0.552244
1    0.211144
2    0.130912
3   -0.918304
4   -0.457426
dtype: float64


In [7]:
# 使用applymap应用到每个数据,map()用于将函数逐元素应用到整个 DataFrame 的每一个单元格上
print(df.map(lambda x : '%.2f' % x)) #每个元素格式化为保留两位小数的字符串
df.dtypes

       0      1      2      3
0  -0.55  -0.87  -0.71  -1.79
1   0.21  -1.89  -1.04   0.16
2  -0.60   0.13  -1.80  -1.88
3  -0.96  -2.50  -0.99  -0.92
4  -1.37  -1.01  -0.46  -2.77


0    float64
1    float64
2    float64
3    float64
dtype: object

In [9]:
type('%.2f' % 1.3456)#该操作会将数字转化为字符串

str

## 6.4 索引排序（不重要）

In [8]:
# Series
print(np.random.randint(5, size=5))
print('-'*50)
s4 = pd.Series(range(10, 15), index = np.random.randint(5, size=5)) #索引随机生成
print(s4)
print('-'*50)
# 索引排序,sort_index返回一个新的排好索引的series
print(s4.sort_index())
print(s4)
# s4.loc[0:3]  loc索引值不唯一时直接报错
print(s4.iloc[0:3])
s4[0:3]  #默认用的位置索引

[2 2 0 2 1]
--------------------------------------------------
1    10
2    11
0    12
2    13
4    14
dtype: int64
--------------------------------------------------
0    12
1    10
2    11
2    13
4    14
dtype: int64
1    10
2    11
0    12
2    13
4    14
dtype: int64
1    10
2    11
0    12
dtype: int64


1    10
2    11
0    12
dtype: int64

In [8]:
# s4.loc[1:2] #loc索引值唯一时可以切片

In [9]:
# DataFrame
df4 = pd.DataFrame(np.random.randn(5, 5),
                   index=np.random.randint(5, size=5),
                   columns=np.random.randint(5, size=5))
print(df4)
#轴零是行索引排序
df4_isort = df4.sort_index(axis=0, ascending=False)
print(df4_isort)


          2         3         1         3         0
2  0.481626  0.267296 -0.951306 -0.167853 -0.215942
0 -0.590575 -0.441858  0.380532  1.001047  1.564096
3 -1.641073 -1.698706  0.205363 -0.206155 -0.668905
3  0.716457  0.461260  0.742063 -1.594768 -0.449151
4  1.401914 -0.229332 -1.285855 -0.761922  0.502977
          2         3         1         3         0
4  1.401914 -0.229332 -1.285855 -0.761922  0.502977
3 -1.641073 -1.698706  0.205363 -0.206155 -0.668905
3  0.716457  0.461260  0.742063 -1.594768 -0.449151
2  0.481626  0.267296 -0.951306 -0.167853 -0.215942
0 -0.590575 -0.441858  0.380532  1.001047  1.564096


In [10]:
#轴1是列索引排序
df4_isort = df4.sort_index(axis=1, ascending=True)
print(df4_isort)

          0         1         2         3         3
2 -0.215942 -0.951306  0.481626  0.267296 -0.167853
0  1.564096  0.380532 -0.590575 -0.441858  1.001047
3 -0.668905  0.205363 -1.641073 -1.698706 -0.206155
3 -0.449151  0.742063  0.716457  0.461260 -1.594768
4  0.502977 -1.285855  1.401914 -0.229332 -0.761922


# 6.5 按值排序（机器学习，深度学习不重要，数据分析才需要）

In [12]:
# 按值排序,by后是column的值
import random
l=[random.randint(0,100) for i in range(24)] #生成24个随机数
df4 = pd.DataFrame(np.array(l).reshape(6,4)) #生成6行4列的dataframe
# print(df4) #查看数据,ndarray
# print('-'*50)
print(df4)
print('-'*50)
#按轴零排序，by后是列名,交换的是行
df4_vsort = df4.sort_values(by=3,axis=0, ascending=False) #寻找的是columns里的3,重要
print(df4_vsort)


    0   1   2   3
0  43  84  86  31
1  48  65  19  89
2  44  30   8  73
3   6   5  66  49
4  40  44  15  90
5  10  42  33  20
--------------------------------------------------
    0   1   2   3
4  40  44  15  90
1  48  65  19  89
2  44  30   8  73
3   6   5  66  49
0  43  84  86  31
5  10  42  33  20


In [11]:
#按轴1排序，by后行索引名，交换的是列
df4_vsort = df4.sort_values(by=3,axis=1, ascending=False) #寻找的是index里的3
print(df4_vsort)

    2   0   1    3
0  58  65  21   61
1  35  88  25   86
2  63  65  28    5
3  95  64  45   23
4  57   0  92  100
5  89  52  57   24


# 6.6 处理缺失数据（重要）

In [11]:
df_data = pd.DataFrame([np.random.randn(3), #第一行：3个标准正态分布的随机数           
                        [1., 2., np.nan],# 第二行：1.0, 2.0, NaN
                        [np.nan, 4., np.nan], # 第三行：NaN, 4.0, NaN
                        [1., 2., 3.]])# 第四行：1.0, 2.0, 3.0
print(df_data.head())

         0         1         2
0 -0.70405  1.751776 -0.286744
1  1.00000  2.000000       NaN
2      NaN  4.000000       NaN
3  1.00000  2.000000  3.000000


In [14]:
df_data.iloc[2,0]

np.float64(nan)

In [15]:
#isnull来判断是否有空的数据
print(df_data.isnull())

       0      1      2
0  False  False  False
1  False  False   True
2   True  False   True
3  False  False  False


In [17]:
#帮我计算df_data缺失率
print(df_data.isnull().sum()/len(df_data))#这里的缺失率是按照列来计算的

0    0.25
1    0.00
2    0.50
dtype: float64


## 删除缺失数据

In [18]:
#默认一个样本，任何一个特征缺失，就删除
#inplace True 修改的是原有的 df
#subset=[0]是指按第一列来删除,第一列有空值就删除对应的行
print(df_data.dropna(subset=[0]))#删除 DataFrame 中在指定列（这里是列 0）包含缺失值（NaN）的行，并返回一个新的 DataFrame
# df_data

         0         1         2
0 -0.70405  1.751776 -0.286744
1  1.00000  2.000000       NaN
3  1.00000  2.000000  3.000000


In [19]:
df_data

Unnamed: 0,0,1,2
0,-0.70405,1.751776,-0.286744
1,1.0,2.0,
2,,4.0,
3,1.0,2.0,3.0


In [21]:
#用得不多，用在某个特征缺失太多时，才会进行删除
print(df_data.dropna(axis=1))  #某列有 nan 就删除该列

          1
0  1.751776
1  2.000000
2  4.000000
3  2.000000


In [22]:
df_data

Unnamed: 0,0,1,2
0,-0.70405,1.751776,-0.286744
1,1.0,2.0,
2,,4.0,
3,1.0,2.0,3.0


## 填充缺失数据

In [None]:
#均值，中位数，众数填充

In [24]:
#给零列的空值填为-100，按特征（按列）去填充
print(df_data.iloc[:,0].fillna(-100.))#所有行，第0列的空值填为-100
df_data

0     -0.70405
1      1.00000
2   -100.00000
3      1.00000
Name: 0, dtype: float64


Unnamed: 0,0,1,2
0,-0.70405,1.751776,-0.286744
1,1.0,2.0,
2,,4.0,
3,1.0,2.0,3.0


In [25]:
#依次拿到每一列
for i in df_data.columns:
    print(df_data.loc[:,i])

0   -0.70405
1    1.00000
2        NaN
3    1.00000
Name: 0, dtype: float64
0    1.751776
1    2.000000
2    4.000000
3    2.000000
Name: 1, dtype: float64
0   -0.286744
1         NaN
2         NaN
3    3.000000
Name: 2, dtype: float64


In [28]:
df_data.iloc[:,0].fillna(-100.,inplace=True) #inplace=True后面会被删除

C:\Users\CZG\AppData\Local\Temp\ipykernel_24316\2218614896.py:1: ChainedAssignmentError: A value is being set on a copy of a DataFrame or Series through chained assignment using an inplace method.
Such inplace method never works to update the original DataFrame or Series, because the intermediate object on which we are setting values always behaves as a copy (due to Copy-on-Write).

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' instead, to perform the operation inplace on the original object, or try to avoid an inplace operation using 'df[col] = df[col].method(value)'.

See the documentation for a more detailed explanation: https://pandas.pydata.org/pandas-docs/stable/user_guide/copy_on_write.html
  df_data.iloc[:,0].fillna(-100.,inplace=True) #inplace=True后面会被删除


0     -0.70405
1      1.00000
2   -100.00000
3      1.00000
Name: 0, dtype: float64

In [30]:
df_data.iloc[:,2]=df_data.iloc[:,2].fillna(df_data.iloc[:,2].mean()) #用均值填充空值

In [31]:
df_data

Unnamed: 0,0,1,2
0,-0.70405,1.751776,-0.286744
1,1.0,2.0,1.356628
2,,4.0,1.356628
3,1.0,2.0,3.0
