# preprocessing

    DataFrame中，axis = 0表示沿行索引方向（垂直向下）进行运算，即逐列计算；axis = 1表示沿列索引方向（水平向右）进行运算，即逐行计算

# 标准化，均值，方差缩放

In [2]:
# preprocessing.scale standardizes a single array-like dataset in one call:
# each column is shifted to zero mean and scaled to unit standard deviation.
from sklearn import preprocessing
import numpy as np
X_train = np.array([[1.,-1.,2.],
                    [2.,0.,0.],
                    [0.,1.,-1.]])

X_scaled = preprocessing.scale(X_train)# standardize the training data
print(X_scaled)

[[ 0.         -1.22474487  1.33630621]
 [ 1.22474487  0.         -0.26726124]
 [-1.22474487  1.22474487 -1.06904497]]


In [4]:
#对列操作,标准化后的数据操作
print("均值：",X_scaled.mean(axis = 0))
print("方差:",X_scaled.std(axis = 0))

均值： [0. 0. 0.]
方差: [1. 1. 1.]


In [8]:
# StandardScaler performs the same standardization as an object that is fitted
# first, so it also exposes the mean and standard deviation of the training set.

scaler = preprocessing.StandardScaler().fit(X_train)
standard = scaler.transform(X_train)

print(standard)
print("原始数据集的均值：",scaler.mean_)
print("原始数据集的标准差：",scaler.scale_)# scale_ holds the per-feature standard deviation; per the original note there is no std_ attribute for it


[[ 0.         -1.22474487  1.33630621]
 [ 1.22474487  0.         -0.26726124]
 [-1.22474487  1.22474487 -1.06904497]]
原始数据集的均值： [1.         0.         0.33333333]
原始数据集的标准差： [0.81649658 0.81649658 1.24721913]


In [9]:
# The fitted scaler can be applied to new data, which is then transformed
# with the mean and scale learned from the training set.
X_test = [[-1.,1.,0.]]
scaler.transform(X_test)

array([[-2.44948974,  1.22474487, -0.26726124]])

+ 可以通过向StandardScaler的构造函数传递with_mean=False或with_std=False来禁用居中或缩放

# min-max标准化

In [10]:
# MinMaxScaler rescales every feature of the training data to the range [0, 1].
X_train = np.array([[ 1., -1.,  2.],
                    [ 2.,  0.,  0.],
                    [ 0.,  1., -1.]])
min_max_scaler = preprocessing.MinMaxScaler()
X_train_minmax = min_max_scaler.fit_transform(X_train)
X_train_minmax

array([[0.5       , 0.        , 1.        ],
       [1.        , 0.5       , 0.33333333],
       [0.        , 1.        , 0.        ]])

In [12]:
# Apply the same fitted transformer instance to other data. Values outside the
# range seen during fit are mapped outside [0, 1].
X_test = np.array([[-3.,-1.,4.]])
X_test_minmax = min_max_scaler.transform(X_test)
X_test_minmax

array([[-1.5       ,  0.        ,  1.66666667]])

In [15]:
# Inspect the parameters the scaler learned during fit.
print(min_max_scaler.scale_)# per-feature multiplier, 1 / (max - min) of each training column for the [0, 1] range; not a standard deviation
print(min_max_scaler.min_)# per-feature offset added after multiplying, -min * scale_ for the [0, 1] range

[0.5        0.5        0.33333333]
[0.         0.5        0.33333333]


In [None]:
# MinMaxScaler's transformation written out by hand for X_train.
# The original cell was a formula sketch that could not run: `X` was not
# defined at this point and `max` / `min` resolved to the Python builtins.
range_min, range_max = min_max_scaler.feature_range  # target range of the fitted scaler
col_min = X_train.min(axis = 0)  # per-feature minimum
col_max = X_train.max(axis = 0)  # per-feature maximum
X_std = (X_train - col_min)/(col_max - col_min)

# Stretch the [0, 1] result onto the requested feature range.
X_scaled = X_std * (range_max - range_min) + range_min

+ MaxAbsScaler用法与MinMaxScaler相似，适用于已经以0为中心的数据或稀疏数据，缩放后训练数据的范围为[-1,1]

In [19]:
# MaxAbsScaler: fit on the training data, then reuse the fitted scaler on test data.
X_train = np.array([[ 1., -1.,  2.],
                    [ 2.,  0.,  0.],
                    [ 0.,  1., -1.]])
max_abs_scaler = preprocessing.MaxAbsScaler()
X_train_maxbs = max_abs_scaler.fit_transform(X_train)
print(X_train_maxbs)
print()
X_test = np.array([[ -3., -1.,  4.]])
X_test_maxbs = max_abs_scaler.transform(X_test)# still scaled with the factors fitted on X_train, so results can leave [-1, 1]
print(X_test_maxbs)
print()
print(max_abs_scaler.scale_)# per-feature maximum absolute value of X_train

[[ 0.5 -1.   1. ]
 [ 1.   0.   0. ]
 [ 0.   1.  -0.5]]

[[-1.5 -1.   2. ]]

[2. 1. 2.]


# 非线性变换
1. 分位数转换
2. 幂变换

## 映射到均匀分布

In [26]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split# splits a dataset into train and test parts
X,y = load_iris(return_X_y = True)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 0)

# QuantileTransformer is a non-parametric transform; by default it maps each
# feature onto a uniform distribution on [0, 1].
# n_quantiles is set to the number of training samples: the default of 1000 is
# larger than this training set, which makes sklearn emit a warning and fall
# back to n_samples anyway, so the transformed values are unchanged.
quantile_transformer = preprocessing.QuantileTransformer(n_quantiles=X_train.shape[0],random_state=0)

X_train_trans = quantile_transformer.fit_transform(X_train)
X_test_trans = quantile_transformer.transform(X_test)# transform the test set with the transformer fitted on the training data

# Percentiles of the first feature before the transform.
np.percentile(X_train[:,0],[0,25,50,75,100])

array([4.3, 5.1, 5.8, 6.5, 7.9])

In [27]:
# Percentiles of the first training feature after the quantile transform;
# they sit close to 0, 0.25, 0.5, 0.75 and 1, i.e. the feature is roughly uniform.
np.percentile(X_train_trans[:,0],[0,25,50,75,100])

array([0.        , 0.22268908, 0.50840336, 0.7605042 , 1.        ])

In [31]:
# Same check on the test split: raw percentiles first, then the transformed ones.
print(np.percentile(X_test[:, 0], [0, 25, 50, 75, 100]))
print(np.percentile(X_test_trans[:,0],[0, 25, 50, 75, 100]))

[4.4   5.    5.7   6.175 7.3  ]
[0.01260504 0.16386555 0.45588235 0.61659664 0.94537815]


# 分类模型评估

1. 准确率

In [1]:
# Accuracy: fraction of samples whose predicted label equals the true label.
from sklearn.metrics import accuracy_score

import numpy as np

y_pred = [0,2,1,3,9,9,8,5,8]
y_true = [0,1,2,3,2,6,3,5,9]

accuracy_score(y_true,y_pred)

# accuracy_score(y_true,y_pred,normalize=False) returns the number of correctly classified samples (3 of 9 here) instead of the fraction.

0.3333333333333333

2. 召回率

In [3]:
from sklearn import metrics
metrics.recall_score(y_true,y_pred,average='micro')# micro average: pools the counts of all classes before computing recall

0.3333333333333333

In [4]:
metrics.recall_score(y_true,y_pred,average='macro')# macro average: unweighted mean of the per-class recalls

  'recall', 'true', average, warn_for)


0.3125

3. F1

In [5]:
# Weighted F1: per-class F1 scores averaged with each class's number of true
# samples as its weight.
metrics.f1_score(y_true,y_pred,average='weighted')

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


0.37037037037037035

4. 混淆矩阵

In [6]:
# Confusion matrix: entry (i, j) counts the samples whose true label is the
# i-th class and whose predicted label is the j-th class.
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true,y_pred)

array([[1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0, 0, 1, 0],
       [0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0]], dtype=int64)

5. 分类报告

In [7]:
# Classification report: per-class precision / recall / f1-score and support
# (number of true samples), followed by the overall accuracy and the averages.
from sklearn.metrics import classification_report
y_true = [0,1,2,2,0]
y_pred = [0,0,2,2,0]
target_names = ['class 0','class 1','class 2']
print(classification_report(y_true,y_pred,target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.67      1.00      0.80         2
     class 1       0.00      0.00      0.00         1
     class 2       1.00      1.00      1.00         2

    accuracy                           0.80         5
   macro avg       0.56      0.67      0.60         5
weighted avg       0.67      0.80      0.72         5



  'precision', 'predicted', average, warn_for)


6. kappa score

In [9]:
#介于(-1,1)之间,score>0.8意味着好的分类;0或更低意味着不好
from sklearn.metrics import cohen_kappa_score
y_true = [2,0,2,2,0,1]
y_pred = [0,0,2,2,0,2]
cohen_kappa_score(y_true,y_pred)

0.4285714285714286

# ROC
1. 计算AUC值（ROC曲线下面积）

In [11]:
# Area under the ROC curve, computed from the true binary labels and the
# predicted scores.
from sklearn.metrics import roc_auc_score  # was `import ... import ...`, which is a SyntaxError
y_true = np.array([0,0,1,1])
y_scores = np.array([0.1,0.4,0.35,0.8])
# Three of the four (positive, negative) pairs are ranked correctly, hence 0.75.
roc_auc_score(y_true,y_scores)

0.75

In [22]:
import pandas as pd
# One-column DataFrame holding 0..9; the column gets the default label 0.
a = pd.DataFrame(data=range(10))
a  # last expression of the cell, shown as its output

Unnamed: 0,0
0,0
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9


In [37]:
# Count how many of the candidate values also occur in column 0 of `a`.
c = set(a[0].values)
d = {1, 2, 3, 4, 20}
e = c & d
len(e)

4

In [25]:
# Series holding 0..9 with the default integer index.
b = pd.Series(data=range(10))
b  # last expression of the cell, shown as its output

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64

In [26]:
# Show the type of `a`.
type(a)

pandas.core.frame.DataFrame

In [29]:
# .values exposes the Series data as a NumPy array.
b.values

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int64)

In [30]:
# Iterating a Series yields its values, so list(b) collects them into a list.
list(b)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]