In [1]:
import pandas as pd
import numpy as np

# 1.处理缺失值

In [4]:
#生成一张含NaN的dataFrame
from io import StringIO
csv_data='''A,B,C,D
            1.0,2.0,3.0,4.0
            5.0,6.0,,8.0
            9.0,10.0,11.0,'''
csv_data=str(csv_data)
df_data=pd.read_csv(StringIO(csv_data))
df_data

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,9.0,10.0,11.0,


## 1.1 检查空值

In [5]:
df_data.isnull()  #会逐一检查每个值是否为NaN

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,False,True,False
2,False,False,False,True


In [6]:
df_data.isnull().sum() #统计哪些列含有空值

A    0
B    0
C    1
D    1
dtype: int64

## 1.2 策略一：消除带有空值的特征或样本

DataFrame.dropna(axis=0, how=’any’, thresh=None, subset=None, inplace=False)

axis指定删除还是列

how=‘any’：该行/列只要有NaN就删除  how=‘all’：全为NaN才删除

thresh=n：只保留至少有n个非NaN的行/列

subset=['C']:只去掉在C列中出现NaN的行

In [8]:
df_data.dropna() #删除所有含空值的样本

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


## 1.3 策略二：插入法
sklearn.preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=0, copy=True)

missing_values：缺失值的占位符，一般为nan

strategy：替换策略(mean/median/most_frequent)

axis=0:沿列

In [15]:
from sklearn.preprocessing import Imputer
imr=Imputer(missing_values='NaN',strategy='mean')
imr=imr.fit(df_data)
imputed_data=imr.transform(df_data)
imputed_data

array([[  1.,   2.,   3.,   4.],
       [  5.,   6.,   7.,   8.],
       [  9.,  10.,  11.,   6.]])

# 2. 处理类别数据

In [27]:
df=pd.DataFrame([['green','M',10.1,'class1'],
                ['red','L',13.5,'class2'],
                ['blue','XL',15.3,'class1']])
df.columns=['color','size','price','classLable']
df

Unnamed: 0,color,size,price,classLable
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


## 2.1 对有序特征：映射

In [28]:
size_mapping={
    'M':1,
    'L':2,
    'XL':3
}
df['size']=df['size'].map(size_mapping)
df

Unnamed: 0,color,size,price,classLable
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


## 2.2 对无序特征：独热编码

In [40]:
X=pd.get_dummies(df[['price','color','size']]) #get_dummies默认会对DataFrame中所有字符串类型的列进行独热编码
X

Unnamed: 0,price,size,color_blue,color_green,color_red
0,10.1,1,0,1,0
1,13.5,2,0,0,1
2,15.3,3,1,0,0


## 2.3 对类别编码

In [42]:
from sklearn.preprocessing import LabelEncoder
class_le=LabelEncoder()
y=class_le.fit_transform(df['classLable'])
y

array([0, 1, 0])

# 3. 划分测试集和训练集

In [45]:
from sklearn.cross_validation import train_test_split
from sklearn.datasets import load_iris
iris=load_iris()
X_train,X_test,y_train,y_test=train_test_split(iris.data,iris.target,test_size=0.2,random_state=0)
print (X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(120, 4) (30, 4) (120,) (30,)


# 4. 特征缩放：统一特征取值范围

决策树和随机森林不需要特征缩放

In [57]:
df=pd.DataFrame([[1,1,10,'class1'],
                [10,2,13.5,'class2'],
                [100,4,15.3,'class1']])
df.columns=['length','width','price','classLable']
df

Unnamed: 0,length,width,price,classLable
0,1,1,10.0,class1
1,10,2,13.5,class2
2,100,4,15.3,class1


## 4.1 标准化：让特征值变得符合正态分布，即均值为0，标准差为1
### 适合要求训练集大致符合正态分布的算法

In [56]:
from sklearn.preprocessing import StandardScaler

X=df.loc[:,['length','width','price']]

stdSc=StandardScaler()
stdSc.fit(X)   #StandardScaler只使用训练集fit一次，这样保证训练集和测试集使用相同的标准进行的特征缩放。
X_stdSc=stdSc.transform(X)
print (X_stdSc)
print (X_stdSc.mean(axis=0))
print (X_stdSc.std(axis=0))

[[-0.80538727 -1.06904497 -1.33302735]
 [-0.60404045 -0.26726124  0.25751665]
 [ 1.40942772  1.33630621  1.0755107 ]]
[ -7.40148683e-17  -1.48029737e-16   8.14163551e-16]
[ 1.  1.  1.]


## 4.2 归一化：将特征范围缩放到[0,1]

In [60]:
from sklearn.preprocessing import normalize

X=df.loc[:,['length','width','price']]

X_normal=normalize(X,axis=0)
X_normal

array([[ 0.00994988,  0.21821789,  0.44008049],
       [ 0.09949879,  0.43643578,  0.59410866],
       [ 0.99498793,  0.87287156,  0.67332315]])