# 数据清洗

## 空值和缺失值的处理

In [3]:
from pandas import DataFrame,Series
import pandas as pd
import numpy as np
from numpy import NaN

In [4]:
series_obj = Series([1,NaN,NaN])
series_obj.isnull()

0    False
1     True
2     True
dtype: bool

In [5]:
pd.isnull(series_obj)

0    False
1     True
2     True
dtype: bool

In [6]:
demo_arr = np.array([[None,'b','c'],['d',None,'f']])
df_obj = pd.DataFrame(demo_arr)
df_obj

Unnamed: 0,0,1,2
0,,b,c
1,d,,f


In [7]:
df_obj.isnull()

Unnamed: 0,0,1,2
0,True,False,False
1,False,True,False


In [8]:
df_obj.isnull().sum()

0    1
1    1
2    0
dtype: int64

In [9]:
series_obj.isnull().sum()

2

In [10]:
df_obj.notnull()

Unnamed: 0,0,1,2
0,False,True,True
1,True,False,True


In [11]:
df_obj.notnull().sum()

0    1
1    1
2    2
dtype: int64

In [12]:
df_obj2 = pd.DataFrame({"类别":['小说','散文随笔','青春文学','传记'],
                        "书名":[np.nan,'《皮囊》','《路程结束时》','《老舍自传》'],
                        "作者":['老舍',None,'张齐鑫','老舍']})

In [13]:
df_obj2

Unnamed: 0,书名,作者,类别
0,,老舍,小说
1,《皮囊》,,散文随笔
2,《路程结束时》,张齐鑫,青春文学
3,《老舍自传》,老舍,传记


In [14]:
df_obj2.dropna()

Unnamed: 0,书名,作者,类别
2,《路程结束时》,张齐鑫,青春文学
3,《老舍自传》,老舍,传记


In [15]:
df_obj2

Unnamed: 0,书名,作者,类别
0,,老舍,小说
1,《皮囊》,,散文随笔
2,《路程结束时》,张齐鑫,青春文学
3,《老舍自传》,老舍,传记


In [16]:
# Keep only rows that have at least 2 non-missing values; inplace=True
# modifies df_obj2 itself and the call returns None.
df_obj2.dropna(thresh = 2,inplace = True)

In [17]:
df_obj2

Unnamed: 0,书名,作者,类别
0,,老舍,小说
1,《皮囊》,,散文随笔
2,《路程结束时》,张齐鑫,青春文学
3,《老舍自传》,老舍,传记


In [18]:
df_obj3 = pd.DataFrame([[1.0,NaN,'a',NaN],[2.0,4.0,7,2.0],[3.0,NaN,8,3.0],[NaN,6.0,9,NaN]],columns = ['A','B','C','D'])
df_obj3

Unnamed: 0,A,B,C,D
0,1.0,,a,
1,2.0,4.0,7,2.0
2,3.0,,8,3.0
3,,6.0,9,


In [19]:
df_obj3.fillna(value = 66.0)

Unnamed: 0,A,B,C,D
0,1.0,66.0,a,66.0
1,2.0,4.0,7,2.0
2,3.0,66.0,8,3.0
3,66.0,6.0,9,66.0


In [20]:
df_obj3

Unnamed: 0,A,B,C,D
0,1.0,,a,
1,2.0,4.0,7,2.0
2,3.0,,8,3.0
3,,6.0,9,


In [21]:
# Forward-fill missing values from the previous row, filling at most one
# consecutive NaN per gap. DataFrame.ffill is used instead of
# fillna(method='ffill'), which newer pandas releases deprecate.
df_obj3.ffill(limit = 1)

Unnamed: 0,A,B,C,D
0,1.0,,a,
1,2.0,4.0,7,2.0
2,3.0,4.0,8,3.0
3,3.0,6.0,9,3.0


In [22]:
df_obj3.fillna({'A':5.0,'B':3.0})

Unnamed: 0,A,B,C,D
0,1.0,3.0,a,
1,2.0,4.0,7,2.0
2,3.0,3.0,8,3.0
3,5.0,6.0,9,


## 重复值的处理

In [23]:
person_info = pd.DataFrame([[1,'小铭',18,180,'女'],[2,'小月月',18,180,'女'],[3,'彭艳',25,185,'男'],[4,'刘华',58,175,'男'],[4,'刘华',58,175,'男'],[5,'周华',36,178,'男']],columns = ['id','name','age','height','gender'])
person_info

Unnamed: 0,id,name,age,height,gender
0,1,小铭,18,180,女
1,2,小月月,18,180,女
2,3,彭艳,25,185,男
3,4,刘华,58,175,男
4,4,刘华,58,175,男
5,5,周华,36,178,男


In [24]:
person_info.duplicated(keep = 'first')

0    False
1    False
2    False
3    False
4     True
5    False
dtype: bool

In [25]:
person_info.duplicated(subset = 'height',keep = False)

0     True
1     True
2    False
3     True
4     True
5    False
dtype: bool

In [26]:
person_info.drop_duplicates(keep = False)

Unnamed: 0,id,name,age,height,gender
0,1,小铭,18,180,女
1,2,小月月,18,180,女
2,3,彭艳,25,185,男
5,5,周华,36,178,男


In [27]:
person_info.drop_duplicates(keep = False,ignore_index = True)  # version issue: the installed pandas does not accept ignore_index (presumably older than the release that added it)

TypeError: drop_duplicates() got an unexpected keyword argument 'ignore_index'

## 异常值的处理

In [28]:
def three_sigma(ser1):
    """Return the entries of ``ser1`` that fall outside mean +/- 3 std.

    Applies the three-sigma rule: a value is an outlier when it is more
    than three standard deviations (``Series.std``) from the mean.

    Args:
        ser1: A numeric ``pandas.Series`` to screen for outliers.

    Returns:
        A ``pandas.Series`` containing only the outlying values, with their
        original index labels. Empty when no value lies outside the band.
    """
    mean_value = ser1.mean()
    std_value = ser1.std()
    lower_bound = mean_value - 3 * std_value
    upper_bound = mean_value + 3 * std_value
    # Each comparison must be parenthesised: ``|`` binds tighter than
    # ``<``/``>``, so without parentheses Python evaluates
    # ``ser1 | upper_bound`` inside a chained comparison and raises
    # instead of producing an element-wise mask.
    rule = (ser1 < lower_bound) | (ser1 > upper_bound)
    return ser1[rule]

In [29]:
df = pd.DataFrame({'A':[1,2,3,4],
                   'B':[2,3,5,2],
                   'C':[1,4,7,4],
                   'D':[1,5,30,3]})
df

Unnamed: 0,A,B,C,D
0,1,2,1,1
1,2,3,4,5
2,3,5,7,30
3,4,2,4,3


In [32]:
df.boxplot()

<matplotlib.axes._subplots.AxesSubplot at 0x2bfdf9e91d0>

In [34]:
df_obj4 = pd.DataFrame({'菜谱名':['红烧肉','铁板鱿鱼','干锅鸭张'],
                        '价格':[29,30,338]})
df_obj4

Unnamed: 0,价格,菜谱名
0,29,红烧肉
1,30,铁板鱿鱼
2,338,干锅鸭张


In [35]:
df_obj4.replace(to_replace = [30,338],value = [29,29])

Unnamed: 0,价格,菜谱名
0,29,红烧肉
1,29,铁板鱿鱼
2,29,干锅鸭张


## 更改数据类型

In [36]:
df_2 = pd.DataFrame({'A':[1,2,3,4],
                   'B':[2,3,5,2],
                   'C':[1,4,7,4],
                   'D':[1,5,30,3]})
df_2

Unnamed: 0,A,B,C,D
0,1,2,1,1
1,2,3,4,5
2,3,5,7,30
3,4,2,4,3


In [37]:
df_2.dtypes

A    int64
B    int64
C    int64
D    int64
dtype: object

In [38]:
df_2['B'].astype(dtype = 'float')

0    2.0
1    3.0
2    5.0
3    2.0
Name: B, dtype: float64

In [40]:
ser_obj= pd.Series(['1','1.2','4.2'])
pd.to_numeric(ser_obj)

0    1.0
1    1.2
2    4.2
dtype: float64