# 函数应用

In [64]:
import pandas as pd
import numpy as np
# NumPy ufuncs operate element-wise on a DataFrame.
# The arguments to randn are the sizes of each dimension (5 rows, 4 columns).
shifted_noise = np.random.randn(5, 4) - 1
df = pd.DataFrame(shifted_noise)
print(df)
print(np.abs(df))  # element-wise absolute value

          0         1         2         3
0 -1.823654 -2.217253 -1.905701 -3.346511
1 -0.394204 -0.437067  0.067901 -1.748860
2 -0.597289 -2.327537 -0.593009 -0.154756
3  0.949637 -0.957908 -1.821659  0.067857
4 -0.321981 -1.047882 -2.765389 -2.245926
          0         1         2         3
0  1.823654  2.217253  1.905701  3.346511
1  0.394204  0.437067  0.067901  1.748860
2  0.597289  2.327537  0.593009  0.154756
3  0.949637  0.957908  1.821659  0.067857
4  0.321981  1.047882  2.765389  2.245926


In [65]:
# DataFrame.apply works column by column by default (axis=0): each call
# receives one column as a Series, so this prints the maximum of every column.
print(df.apply(lambda column: column.max(), axis=0))

0    0.949637
1   -0.437067
2    0.067901
3    0.067857
dtype: float64


In [66]:
# With axis=1 (spelled "columns" here) apply walks the rows instead: each call
# receives one row as a Series, so this prints the maximum of every row.
print(df.apply(lambda row: row.max(), axis="columns"))

0   -1.823654
1    0.067901
2   -0.154756
3    0.949637
4   -0.321981
dtype: float64


In [67]:
# DataFrame.map (called applymap before pandas 2.1) applies the function to
# every single element; here each value is formatted with two decimals.
print(df.map(lambda value: f'{value:.2f}'))
# map returned a new frame: the dtypes of df itself are still float64.
print(df.dtypes)

       0      1      2      3
0  -1.82  -2.22  -1.91  -3.35
1  -0.39  -0.44   0.07  -1.75
2  -0.60  -2.33  -0.59  -0.15
3   0.95  -0.96  -1.82   0.07
4  -0.32  -1.05  -2.77  -2.25
0    float64
1    float64
2    float64
3    float64
dtype: object


# 索引排序

In [68]:
# Series
# Show a sample of what randint produces: five integers drawn from [0, 5).
print(np.random.randint(5, size=5))
divider = '-' * 50
print(divider)
# The index is a separate random draw, so it will generally differ from the
# sample printed above and may contain duplicate labels.
random_labels = np.random.randint(5, size=5)
s4 = pd.Series(range(10, 15), index=random_labels)
print(s4)
print(divider)
# sort_index returns a new Series ordered by index label ...
print(s4.sort_index())
# ... while s4 itself keeps its original order.
print(s4)
# s4.loc[0:3]  (original note: loc fails outright when the index labels are not unique)
print(s4.iloc[0:3])
s4[0:3]  # a plain integer slice is positional by default

[4 0 2 4 0]
--------------------------------------------------
4    10
2    11
0    12
4    13
4    14
dtype: int64
--------------------------------------------------
0    12
2    11
4    10
4    13
4    14
dtype: int64
4    10
2    11
0    12
4    13
4    14
dtype: int64
4    10
2    11
0    12
dtype: int64


4    10
2    11
0    12
dtype: int64

In [69]:
# DataFrame
# Random values with randomly drawn (possibly repeated) row and column labels.
df4 = pd.DataFrame(
    data=np.random.randn(5, 5),
    index=np.random.randint(5, size=5),
    columns=np.random.randint(5, size=5),
)
print(df4)
# axis=0 ("index") sorts by the row labels; ascending=False gives descending order.
df4_isort = df4.sort_index(axis="index", ascending=False)
print(df4_isort)

          1         1         0         4         0
4 -0.025873  0.237141  0.165074 -1.134153  0.694082
1  0.371933 -2.368624  0.292611 -1.573073  0.217457
1 -0.216396  0.689673 -0.194823  0.195863  1.790911
0  0.631698 -0.606620  1.827383 -0.048728 -0.487976
1  1.444932  1.624664  0.042855 -0.139360  1.678678
          1         1         0         4         0
4 -0.025873  0.237141  0.165074 -1.134153  0.694082
1  0.371933 -2.368624  0.292611 -1.573073  0.217457
1 -0.216396  0.689673 -0.194823  0.195863  1.790911
1  1.444932  1.624664  0.042855 -0.139360  1.678678
0  0.631698 -0.606620  1.827383 -0.048728 -0.487976


In [70]:
# axis=1 ("columns") sorts by the column labels; ascending order is the default.
# Note: this rebinds df4_isort, so the row-sorted result is no longer available under that name.
df4_isort = df4.sort_index(axis="columns")
print(df4_isort)

          0         0         1         1         4
4  0.165074  0.694082  0.237141 -0.025873 -1.134153
1  0.292611  0.217457 -2.368624  0.371933 -1.573073
1 -0.194823  1.790911  0.689673 -0.216396  0.195863
0  1.827383 -0.487976 -0.606620  0.631698 -0.048728
1  0.042855  1.678678  1.624664  1.444932 -0.139360


In [71]:
# Sorting by value: `by` names the label whose values drive the sort.
import random
l = [random.randint(0, 100) for _ in range(24)]  # 24 random integers in [0, 100]
df4 = pd.DataFrame(np.array(l).reshape((6, 4)))  # arranged as 6 rows x 4 columns
print(df4)
print('-' * 50)
# axis=0 ("index"): `by` is looked up among the column labels (column 3 here)
# and whole rows are reordered.
df4_vsort = df4.sort_values(by=3, axis="index", ascending=False)
print(df4_vsort)


    0   1   2   3
0  20  29  58  22
1  93  52  12  64
2  46  78  33  44
3  82  71  55  47
4  71  66  53   1
5   1  24  39  63
--------------------------------------------------
    0   1   2   3
1  93  52  12  64
5   1  24  39  63
3  82  71  55  47
2  46  78  33  44
0  20  29  58  22
4  71  66  53   1


In [72]:
# axis=1 ("columns"): `by` is looked up among the row (index) labels (row 3 here)
# and whole columns are reordered.
df4_vsort = df4.sort_values(by=3, axis="columns", ascending=False)
print(df4_vsort)

    0   1   2   3
0  20  29  58  22
1  93  52  12  64
2  46  78  33  44
3  82  71  55  47
4  71  66  53   1
5   1  24  39  63


# 处理缺失数据

In [73]:
# Small frame with deliberately missing values: one random row followed by
# three hand-written rows, two of which contain NaN.
df_data = pd.DataFrame([
    np.random.randn(3),
    [1.0, 2.0, np.nan],
    [np.nan, 4.0, np.nan],
    [1.0, 2.0, 3.0],
])
print(df_data.head())

          0        1        2
0  0.787574  0.01204  1.50148
1  1.000000  2.00000      NaN
2       NaN  4.00000      NaN
3  1.000000  2.00000  3.00000


In [74]:
# isnull returns a boolean frame of the same shape: True marks a missing (NaN) entry.
print(df_data.isnull())

       0      1      2
0  False  False  False
1  False  False   True
2   True  False   True
3  False  False  False


In [75]:
# Missing-value ratio per column: the mean of the boolean mask equals
# (number of missing entries) / (number of rows).
print(df_data.isnull().mean())

0    0.25
1    0.00
2    0.50
dtype: float64


In [76]:
# By default dropna removes a row (sample) as soon as any of its features is missing.
# Passing inplace=True would modify the original DataFrame; it is not used here.
# subset=[0] restricts the check to column 0: only rows with a missing value there are dropped.
print(df_data.dropna(subset=[0]))

          0        1        2
0  0.787574  0.01204  1.50148
1  1.000000  2.00000      NaN
3  1.000000  2.00000  3.00000


In [77]:
# Rarely used: dropping a whole feature is only done when that feature has too many missing values.
print(df_data.dropna(axis=1))  # drops every column that contains a NaN

         1
0  0.01204
1  2.00000
2  4.00000
3  2.00000


In [78]:
# Print df_data again: the earlier dropna calls did not modify it.
print(df_data)

          0        1        2
0  0.787574  0.01204  1.50148
1  1.000000  2.00000      NaN
2       NaN  4.00000      NaN
3  1.000000  2.00000  3.00000


# 填充缺失值

In [79]:
# Fill the missing values of column 0 with -100 (filling is done per feature, i.e. per column).
print(df_data.iloc[:,0].fillna(-100.))
# fillna returned a new Series; df_data itself still contains the NaN.
print(df_data)

0      0.787574
1      1.000000
2   -100.000000
3      1.000000
Name: 0, dtype: float64
          0        1        2
0  0.787574  0.01204  1.50148
1  1.000000  2.00000      NaN
2       NaN  4.00000      NaN
3  1.000000  2.00000  3.00000


In [80]:
# Walk through the frame one column at a time; each column is a Series.
for _label, column in df_data.items():
    print(column)

0    0.787574
1    1.000000
2         NaN
3    1.000000
Name: 0, dtype: float64
0    0.01204
1    2.00000
2    4.00000
3    2.00000
Name: 1, dtype: float64
0    1.50148
1        NaN
2        NaN
3    3.00000
Name: 2, dtype: float64


In [86]:
# Fill the missing values of column 0 with -100 and write the result back into df_data.
df_data.iloc[:, 0] = df_data.iloc[:, 0].fillna(-100.)

In [83]:
# Replace the NaNs in column 2 with the mean of that column's present values.
col2 = df_data.iloc[:, 2]
df_data.iloc[:, 2] = col2.fillna(col2.mean())

In [84]:
# Display the frame after filling.
# NOTE(review): the saved execution counts are out of order (86, 83, 84) and the saved
# output still shows a missing value in column 0; a fresh top-to-bottom run would show
# -100.0 there. Re-run with Restart & Run All before sharing.
df_data

Unnamed: 0,0,1,2
0,0.787574,0.01204,1.50148
1,1.0,2.0,2.25074
2,,4.0,2.25074
3,1.0,2.0,3.0


# 层级索引

In [87]:
import pandas as pd
import numpy as np

# MultiIndex is pandas' hierarchical (multi-level) index type.
cloth_labels = [cloth for cloth in 'abcd' for _ in range(3)]  # a a a b b b c c c d d d
size_labels = [0, 1, 2] * 4  # 0 1 2 repeated once per cloth
index1 = pd.MultiIndex.from_arrays([cloth_labels, size_labels],
                                   names=['cloth', 'size'])

ser_obj = pd.Series(np.random.randn(12), index=index1)
print(ser_obj)
print(type(ser_obj))  # Series
hier_index = ser_obj.index
print(type(hier_index))  # the index type: MultiIndex
print(hier_index)
print(hier_index.levels)  # the distinct labels of each level
hier_index.codes  # less important: for every row, the position of its label within each level

cloth  size
a      0       0.321609
       1       0.718668
       2       0.674320
b      0       0.641929
       1      -0.294042
       2      -0.171909
c      0       0.313592
       1       0.557552
       2       1.317587
d      0      -0.258273
       1      -1.849511
       2      -2.166140
dtype: float64
<class 'pandas.core.series.Series'>
<class 'pandas.core.indexes.multi.MultiIndex'>
MultiIndex([('a', 0),
            ('a', 1),
            ('a', 2),
            ('b', 0),
            ('b', 1),
            ('b', 2),
            ('c', 0),
            ('c', 1),
            ('c', 2),
            ('d', 0),
            ('d', 1),
            ('d', 2)],
           names=['cloth', 'size'])
[['a', 'b', 'c', 'd'], [0, 1, 2]]


FrozenList([[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]])