### Python中的空值处理

In [1]:
import pandas as pd 
import numpy as np 
import datetime

- None: Python中的空值
- NaN: NumPy/pandas中数值型数据的空值（即浮点数 np.nan）
- NaT: Pandas中的时间空值

In [2]:
# Print the concrete Python type of each missing-value marker, in the order None, NaN, NaT.
print(*map(type, (None, np.nan, pd.NaT)))

<class 'NoneType'> <class 'float'> <class 'pandas._libs.tslibs.nattype.NaTType'>


In [6]:
# Demo frame with deliberately missing entries: ages are kept as text and
# exam dates as datetime.date objects, with None marking each missing cell.
df = pd.DataFrame(
    data=[
        ["张三", "26", "北京市", None],
        ["李四", "30", None, datetime.date.today() - datetime.timedelta(days=30)],
        ["王五", "28", None, datetime.date.today()],
        ["小明", None, "河北省", None],
    ],
    columns=['name', 'age', 'province', 'exam_date'],
)
df

Unnamed: 0,name,age,province,exam_date
0,张三,26.0,北京市,
1,李四,30.0,,2022-04-25
2,王五,28.0,,2022-05-25
3,小明,,河北省,


In [10]:
# Summarize each column's non-null count and dtype.
# NOTE(review): this cell's execution count (10) is later than the dtype-conversion
# cell that follows it (9), so the recorded float64/datetime64 dtypes appear to be
# post-conversion; a top-to-bottom run would presumably report different dtypes here.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   name       4 non-null      object        
 1   age        3 non-null      float64       
 2   province   2 non-null      object        
 3   exam_date  2 non-null      datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 256.0+ bytes


In [9]:
# Cast the columns to real dtypes so their missing entries become NaN / NaT.
df['age'] = df['age'].astype('float')
# Use pd.to_datetime instead of astype('datetime64'): pandas 2.x raises TypeError
# when asked to cast to a unit-less 'datetime64' dtype.
df['exam_date'] = pd.to_datetime(df['exam_date'])
df

Unnamed: 0,name,age,province,exam_date
0,张三,26.0,北京市,NaT
1,李四,30.0,,2022-04-25
2,王五,28.0,,2022-05-25
3,小明,,河北省,NaT


#### 判断空值

- isnull() / notnull(): 可跨数据类型判断各种空值（None、NaN、NaT）

In [11]:
# Element-wise boolean mask: True wherever the cell is missing.
df.isnull()

Unnamed: 0,name,age,province,exam_date
0,False,False,False,True
1,False,False,True,False
2,False,False,True,False
3,False,True,False,True


In [12]:
# Element-wise boolean mask: True wherever the cell holds a value (the inverse of isnull()).
df.notnull()

Unnamed: 0,name,age,province,exam_date
0,True,True,True,False
1,True,True,False,True
2,True,True,False,True
3,True,False,True,False


- 判断每一列是否含有空值

In [13]:
# Collapse the null mask per column: True if the column has at least one missing value.
df.isnull().any()

name         False
age           True
province      True
exam_date     True
dtype: bool

- 筛选出某一列（如 age）为空值的行

In [14]:
# Keep only the rows whose 'age' entry is missing.
df.loc[df['age'].isnull()]

Unnamed: 0,name,age,province,exam_date
3,小明,,河北省,NaT


#### 空值的删除与填充

- 删除空值: df.dropna()

In [16]:
# axis=1 drops every column that contains a null; axis=0 drops every row that contains one.
# The preceding bare `df` line was removed: only a cell's last expression is displayed,
# so it had no effect.
df.dropna(axis=1)

Unnamed: 0,name
0,张三
1,李四
2,王五
3,小明


- 填充空值：df.fillna()

In [18]:
# Replace missing ages with the column mean.
# Assign the result back instead of calling fillna(inplace=True) on df['age']:
# that form is chained assignment on an intermediate Series, which emits a
# warning in pandas 2.2 and leaves df unchanged under Copy-on-Write.
df['age'] = df['age'].fillna(df['age'].mean())
df

Unnamed: 0,name,age,province,exam_date
0,张三,26.0,北京市,NaT
1,李四,30.0,,2022-04-25
2,王五,28.0,,2022-05-25
3,小明,28.0,河北省,NaT


In [20]:
# Replace missing provinces with a placeholder label.
# Assign the result back instead of calling fillna(inplace=True) on df['province']:
# that form is chained assignment on an intermediate Series, which emits a
# warning in pandas 2.2 and leaves df unchanged under Copy-on-Write.
df['province'] = df['province'].fillna('未知')
df

Unnamed: 0,name,age,province,exam_date
0,张三,26.0,北京市,NaT
1,李四,30.0,未知,2022-04-25
2,王五,28.0,未知,2022-05-25
3,小明,28.0,河北省,NaT


In [21]:
# Replace missing exam dates with a fixed default date.
# An explicit Timestamp keeps the fill value's type aligned with a datetime column,
# and assigning the result back avoids fillna(inplace=True) on df['exam_date'],
# which is chained assignment that leaves df unchanged under Copy-on-Write.
df['exam_date'] = df['exam_date'].fillna(pd.Timestamp('2022-04-01'))
df

Unnamed: 0,name,age,province,exam_date
0,张三,26.0,北京市,2022-04-01
1,李四,30.0,未知,2022-04-25
2,王五,28.0,未知,2022-05-25
3,小明,28.0,河北省,2022-04-01
