# 0517 DataNormalization

规范化分类的三种方法参考
https://www.geeksforgeeks.org/data-normalization-in-data-mining/?ref=lbp

## 数据转换 transformation

### 数据集

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from sklearn import datasets
from sklearn import preprocessing
from sklearn.preprocessing import Binarizer
from sklearn.decomposition import PCA

In [2]:
# Load the California housing dataset through scikit-learn's fetch helper
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
# Show which container type the loader returned
type(housing)

sklearn.utils.Bunch

In [3]:
# Dimensions of the feature matrix as (rows, columns)
housing.data.shape

(20640, 8)

In [4]:
# Which attributes (feature columns) the dataset contains
housing.feature_names

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [5]:
# Target attribute: median house value
housing.target

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

In [6]:
# Slice out feature columns 2..4 and label them with the matching feature names
df = pd.DataFrame(housing.data[:, 2:5], columns=housing.feature_names[2:5])
df

Unnamed: 0,AveRooms,AveBedrms,Population
0,6.984127,1.023810,322.0
1,6.238137,0.971880,2401.0
2,8.288136,1.073446,496.0
3,5.817352,1.073059,558.0
4,6.281853,1.081081,565.0
...,...,...,...
20635,5.045455,1.133333,845.0
20636,6.114035,1.315789,356.0
20637,5.205543,1.120092,1007.0
20638,5.329513,1.171920,741.0


### 规范化

In [8]:
# Min-max normalization
## Using the formula: subtract the per-column minimum, divide by the per-column range
df1= (df-df.min())/(df.max()-df.min())
# df1
## Using the preprocessing module
df2=preprocessing.minmax_scale(df)
df2
######################函数##########################################
def MinMax_Norm(df):
    """Min-max normalize ``df`` column by column.

    Thin wrapper: hands ``df`` to ``preprocessing.minmax_scale`` (default
    feature range) and returns whatever that call produces.
    """
    return preprocessing.minmax_scale(df)
# Apply the wrapper to the sliced frame and display the scaled values
df_norm_test = MinMax_Norm(df)
df_norm_test


array([[0.0435123 , 0.02046866, 0.00894083],
       [0.03822395, 0.01892926, 0.0672104 ],
       [0.05275646, 0.02194011, 0.01381765],
       ...,
       [0.03090386, 0.0233229 , 0.0281398 ],
       [0.03178269, 0.02485928, 0.02068444],
       [0.03125246, 0.02457305, 0.03879032]])

In [10]:
# z-score normalization
## Using the formula
# ddof=0 (population standard deviation) matches what preprocessing.scale
# computes below; pandas' default ddof=1 would give slightly different values.
df3=(df-df.mean())/df.std(ddof=0)
#df3
## Using the preprocessing module
df4=preprocessing.scale(df)
df4
######################函数##########################################
def Zscore_Norm(df):
    """Standardize ``df`` column by column (z-score).

    Thin wrapper: hands ``df`` to ``preprocessing.scale`` with its default
    settings and returns whatever that call produces.
    """
    return preprocessing.scale(df)
# Apply the z-score wrapper to the sliced frame and display the result
df_norm_test2 = Zscore_Norm(df)
df_norm_test2

array([[ 0.62855945, -0.15375759, -0.9744286 ],
       [ 0.32704136, -0.26333577,  0.86143887],
       [ 1.15562047, -0.04901636, -0.82077735],
       ...,
       [-0.09031802,  0.04941393, -0.3695372 ],
       [-0.04021111,  0.15877763, -0.60442933],
       [-0.07044252,  0.1384028 , -0.03397701]])

In [12]:
# Decimal-scaling normalization
## Using the formula
# floor(log10(max)) + 1 is the smallest power of ten that brings every
# absolute value strictly below 1; ceil(log10(max)) would leave a value of
# exactly 1.0 whenever a column maximum is an exact power of ten.
df5=df/10**(np.floor(np.log10(df.abs().max()))+1)
#df5
######################函数##########################################
def Decimal_Norm(df):
    """Decimal-scaling normalization: divide each column by a power of ten.

    For every column the divisor is 10**j, where j is the smallest integer
    that makes the largest absolute value strictly less than 1.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Numeric data to scale.

    Returns
    -------
    Same type as ``df``
        The scaled data. All-zero columns stay zero instead of becoming NaN.
    """
    max_abs = df.abs().max()
    # A zero maximum would give log10(0) = -inf and then 0/0 = NaN; treat it
    # as 1 so all-zero data passes through as zeros. Adding the boolean mask
    # works for both a per-column Series and a scalar (Series input).
    safe_max = max_abs + (max_abs == 0)
    # floor + 1 rather than ceil: a maximum that is an exact power of ten
    # (e.g. 100) must still end up strictly below 1 (0.1, not 1.0).
    exponent = np.floor(np.log10(safe_max)) + 1
    return df / 10 ** exponent
# Apply the decimal-scaling wrapper to the sliced frame and display the result
df_norm_test3 = Decimal_Norm(df)
df_norm_test3

Unnamed: 0,AveRooms,AveBedrms,Population
0,0.006984,0.010238,0.00322
1,0.006238,0.009719,0.02401
2,0.008288,0.010734,0.00496
3,0.005817,0.010731,0.00558
4,0.006282,0.010811,0.00565
...,...,...,...
20635,0.005045,0.011333,0.00845
20636,0.006114,0.013158,0.00356
20637,0.005206,0.011201,0.01007
20638,0.005330,0.011719,0.00741


### 连续属性离散化

In [15]:
# df.Population
# type(df.Population)
# df.iloc[:,1]
# df_binned = pd.qcut(df.iloc[:,1],20,labels=range(20))

def Binning(col_Index, n_bins=20, data=None):
    """Discretize one column into equal-frequency (quantile) bins.

    Parameters
    ----------
    col_Index : int
        Positional index of the column to bin.
    n_bins : int, default 20
        Number of quantile bins; the bins are labelled 0 .. n_bins - 1.
    data : pandas.DataFrame, optional
        Frame to read the column from. Defaults to the notebook-level ``df``
        so existing ``Binning(i)`` calls keep working.

    Returns
    -------
    pandas.Series
        Categorical series holding the bin label of every row.
    """
    if data is None:
        # Backward compatibility: the original always used the global frame.
        data = df
    return pd.qcut(data.iloc[:, col_Index], n_bins, labels=range(n_bins))
# Bin the column at position 1 and display the resulting bin labels
df6=Binning(1)
df6

0         6
1         2
2        12
3        12
4        13
         ..
20635    16
20636    19
20637    16
20638    17
20639    17
Name: AveBedrms, Length: 20640, dtype: category
Categories (20, int64): [0 < 1 < 2 < 3 ... 16 < 17 < 18 < 19]

### 特征二值化

使用 sklearn 的 preprocessing 模块中的 Binarizer() 或 LabelEncoder() 类

In [22]:
def Bina(data_array,y):
    """Binarize values against the threshold ``y``.

    Parameters
    ----------
    data_array : array-like
        Values to binarize. Any input ``np.asarray`` accepts (ndarray, list,
        pandas Series/DataFrame) is allowed; it is flattened into a single
        column, so multi-dimensional input loses its column structure.
    y : float
        Threshold handed to ``Binarizer(threshold=y)``.

    Returns
    -------
    The single-column result of ``Binarizer.fit_transform``.
    """
    # np.asarray is a no-op for ndarrays and lets other containers, which
    # have no usable .reshape, work as well.
    X = np.asarray(data_array).reshape(-1, 1)
    return Binarizer(threshold=y).fit_transform(X)
# NOTE(review): housing.data is 2-D, so Bina's reshape(-1,1) flattens every
# feature value into one column before thresholding at 4 - confirm this is
# intended (housing.target may have been meant).
test=Bina(housing.data,4)
test

array([[1.],
       [1.],
       [1.],
       ...,
       [0.],
       [1.],
       [0.]])

In [17]:

housing.target.shape
# housing.target is a 1-D array with one value per record; reshape(-1,1)
# turns it into a single column with one row per record (sample)
housing.target.reshape(-1,1).shape

(20640, 1)

## 数据规约

### 属性规约，以PCA降维为例

In [24]:
# Display the full feature matrix that is fed to PCA below
housing.data

array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]])

In [27]:
def Reduction_pca(df,n='mle'):
    """Standardize ``df``, fit a PCA and report the variance it explains.

    Parameters
    ----------
    df : array-like of shape (n_samples, n_features)
        Data to reduce.
    n : int, float or 'mle', default 'mle'
        Forwarded to ``PCA(n_components=n)``. An integer keeps exactly that
        many components; 'mle' lets PCA estimate the number of components.
        See the scikit-learn PCA documentation for the other accepted forms.

    Returns
    -------
    ``pca.components_`` of the fitted model.

    Side effects
    ------------
    Prints the number of retained components and the summed explained
    variance ratio.
    """
    X = preprocessing.scale(df)  # standardize before PCA
    pca = PCA(n_components=n)
    pca.fit(X)
    # Share of the total variance carried by each retained component
    ratio = pca.explained_variance_ratio_
    s = sum(ratio)
    # Report the number of components actually kept (pca.n_components_),
    # not the raw argument, so that n='mle' no longer prints "前mle个...".
    print("前"+str(pca.n_components_)+"个属性解释了数据中"+str(s)+"的变化。")
    return pca.components_
# Fit PCA on the full feature matrix with 3 components and keep the returned components
pca_components = Reduction_pca(housing.data,3)

前3个属性解释了数据中0.6474174722211659的变化。


### 数值规约，以简单随机抽样为例

In [28]:
# replace=True samples with replacement; otherwise sampling is without replacement
df.sample(n=20)  # result is not assigned, so it is discarded here
df.sample(frac=0.1)# frac is the fraction of rows to sample
def Simple_Sample(df,sam_num,replace=False,random_state=None):
    """Draw a simple random sample of rows from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from.
    sam_num : int
        Number of rows to draw.
    replace : bool, default False
        True samples with replacement, which also allows ``sam_num`` to
        exceed the number of rows; False samples without replacement.
    random_state : optional
        Seed or random generator forwarded to ``DataFrame.sample``; pass a
        fixed value to make the sample reproducible.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, keeping their original index labels.
    """
    return df.sample(n=sam_num, replace=replace, random_state=random_state)
# Draw 5 random rows from the sliced frame and display them
df_sample=Simple_Sample(df,5)
df_sample

Unnamed: 0,AveRooms,AveBedrms,Population
19564,5.962162,1.086486,1796.0
10609,6.524752,1.029703,1137.0
10858,5.162162,1.135135,208.0
15840,5.025467,1.057725,1191.0
4694,5.081019,1.006944,884.0


### 分层抽样

In [89]:
# Build a DataFrame of the iris measurements and append the class label column
iris = datasets.load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)

In [95]:
# Stratified sampling: draw 20% of the rows from each class separately
# NOTE(review): only targets 0 and 1 are sampled; if the data holds further
# classes (iris normally has a third, target == 2) they are never drawn -
# confirm this is intended.
A=iris_df[iris_df.target==0].sample(frac=0.2)
B=iris_df[iris_df.target==1].sample(frac=0.2)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# way to stack the per-class samples and keeps the original row labels.
df_new_sampled=pd.concat([A, B])
df_new_sampled

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
13,4.3,3.0,1.1,0.1,0
5,5.4,3.9,1.7,0.4,0
25,5.0,3.0,1.6,0.2,0
49,5.0,3.3,1.4,0.2,0
23,5.1,3.3,1.7,0.5,0
9,4.9,3.1,1.5,0.1,0
41,4.5,2.3,1.3,0.3,0
4,5.0,3.6,1.4,0.2,0
32,5.2,4.1,1.5,0.1,0
33,5.5,4.2,1.4,0.2,0
