In [22]:
import numpy as np
from sklearn import datasets
from sklearn import model_selection

In [23]:
# Load the bundled iris dataset; later cells read `iris.data` and `iris.target`.
iris = datasets.load_iris()

#### preprocessing

In [19]:
from sklearn import preprocessing as prep

In [21]:
# Fit a LabelEncoder on a handful of city names. fit() returns the encoder
# itself, so construction and fitting are chained.
cities = ["paris", "paris", "tokyo", "amsterdam"]
le = prep.LabelEncoder().fit(cities)
# classes_ holds the distinct labels that were seen (sorted, per the cell output).
list(le.classes_)

['amsterdam', 'paris', 'tokyo']

In [None]:
from sklearn.preprocessing import LabelBinarizer

In [24]:
# LabelBinarizer binarizes labels in a one-vs-all fashion.
# Difference from OneHotEncoder: LabelBinarizer encodes one-dimensional label
# data, so it is mainly meant for the target vector rather than for features.
binarizer = prep.LabelBinarizer()
binarizer.fit(iris.target)
t = binarizer.transform(iris.target)

#### impute

In [1]:
from sklearn import impute

In [10]:
# Imputer that replaces NaN entries using the 'mean' strategy (per-column mean).
simputer = impute.SimpleImputer(missing_values=np.nan, strategy='mean')

In [11]:
# Learn the per-column fill values from a tiny matrix with one missing entry.
train_rows = [[1, 2], [np.nan, 3], [7, 6]]
simputer.fit(train_rows)

SimpleImputer()

In [12]:
# Fill value learned for each column: the column mean with NaN ignored
# ((1 + 7) / 2 = 4 and (2 + 3 + 6) / 3 = 3.67 for the data fitted above).
simputer.statistics_

array([4.        , 3.66666667])

参数strategy：mean，median，most_frequent，constant

In [14]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
# 建立其他特征预测某一特征的回归模型，预测缺失值

In [15]:
# Iterative imputer limited to 10 imputation rounds, with a fixed seed so
# results are reproducible.
imp = IterativeImputer(max_iter=10, random_state=0)

In [16]:
from sklearn.impute import KNNImputer

In [17]:
# Small matrix with two missing entries (row 0 / col 2 and row 2 / col 0).
X = [
    [1, 2, np.nan],
    [3, 4, 3],
    [np.nan, 6, 5],
    [8, 8, 7],
]
# Each missing value is filled with the unweighted mean of that feature over
# the 2 nearest rows that have the feature present.
imputer = KNNImputer(n_neighbors=2, weights="uniform")
imputer.fit_transform(X)

array([[1. , 2. , 4. ],
       [3. , 4. , 3. ],
       [5.5, 6. , 5. ],
       [8. , 8. , 7. ]])

#### metrics

In [46]:
from sklearn import metrics

In [47]:
# Toy binary-classification example: each pair is (true label, predicted score).
labelled_scores = [
    (0, 0.2), (0, 0.6), (0, 0.4),
    (1, 0.6), (1, 0.8), (1, 0.8),
]
y = np.array([label for label, _ in labelled_scores])
scores = np.array([score for _, score in labelled_scores])

In [48]:
# roc_curve returns the false-positive rates, the true-positive rates and the
# score thresholds they were evaluated at. Note: `threads` holds those
# thresholds (the name is kept because a later cell reads it).
fpr, tpr, threads = metrics.roc_curve(y, scores)

In [49]:
# False-positive rate at each threshold.
fpr

array([0.        , 0.        , 0.33333333, 1.        ])

In [50]:
# True-positive rate at each threshold.
tpr

array([0.        , 0.66666667, 1.        , 1.        ])

In [54]:
# Score thresholds corresponding to the fpr/tpr entries.
threads

array([1.8, 0.8, 0.6, 0.2])

In [52]:
# Area under the curve; the x values (fpr here) must be monotonically
# increasing or decreasing.
metrics.auc(fpr, tpr)

0.9444444444444444

In [53]:
# Same ROC AUC computed in one step straight from the labels and scores.
metrics.roc_auc_score(y, scores)

0.9444444444444444

#### model_selection

In [62]:
from sklearn.model_selection import KFold
# With shuffle=False the folds follow the original ordering of the data.
k_fold = KFold(3, shuffle=False)
# split() returns a generator; each iteration yields
# (train_indices, test_indices) -- index arrays for the training and test
# sets, in that order.
k_fold.split(iris.data)

<generator object _BaseKFold.split at 0x12d6c6de0>

In [70]:
# ShuffleSplit draws n_splits independent random train/test partitions
# (n_splits=3 means three partitions). test_size/train_size stay at None, i.e.
# the library defaults. random_state is fixed so that re-running the notebook
# produces the same partitions.
spl = model_selection.ShuffleSplit(
    n_splits=3, test_size=None, train_size=None, random_state=0
)
# split() returns a generator that iterates 3 times; each iteration yields
# (train_indices, test_indices) -- index arrays, not the X/y data itself.
spl.split(iris.data, iris.target)

<generator object BaseShuffleSplit.split at 0x12d6c6ed0>

In [69]:
# Shuffle-split variant that stratifies each split by the class labels in y.
model_selection.StratifiedShuffleSplit(n_splits=10)

StratifiedShuffleSplit(n_splits=10, random_state=None, test_size=None,
            train_size=None)

In [74]:
# train_test_split is essentially a wrapper around ShuffleSplit that hands back
# the already-partitioned data instead of index arrays.
split_parts = model_selection.train_test_split(
    iris.data,
    iris.target,
    test_size=0.33,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [76]:
# cross_val_score(c, data, target, cv=kv, scoring=scorer)
# cross_val_score(c, data, target, cv=kv)
# 交叉验证，返回每次验证集上的score

```python
# 看一下有没有过拟合的嫌疑
train_size, train_accuracy, test_accuracy = learning_curve(c, data, target, cv=kv, shuffle=True, random_state=999, train_sizes = [0.1,0.25,0.5,0.75,0.9,1])
plt.plot(train_size, train_accuracy.mean(axis=1), 'o-', color='r', label='Training')
plt.plot(train_size, test_accuracy.mean(axis=1), 'o-', color='g', label='Test')
plt.show()
```

In [41]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection
from sklearn.model_selection import learning_curve,validation_curve,cross_val_score,GridSearchCV

In [31]:
# Load the bundled breast-cancer dataset (binary classification); a later cell
# reads `breast.data` and `breast.target`.
breast = datasets.load_breast_cancer()

In [55]:
# ROC-AUC scorer for cross-validation / grid search.
# The built-in "roc_auc" scorer passes continuous decision values
# (decision_function or predict_proba output) to roc_auc_score, which is what
# make_scorer(roc_auc_score, needs_threshold=True) used to request. That
# keyword was deprecated in scikit-learn 1.4 and later removed, so the named
# scorer is used to stay compatible with both old and new releases.
scorer = metrics.get_scorer("roc_auc")

In [59]:
# Leave-P-out splitter with p=1: every sample is used as the test set once.
lpo = model_selection.LeavePOut(1)
# Stratified 3-fold cross-validation. random_state only has an effect when
# shuffle=True, and recent scikit-learn raises ValueError if it is set while
# shuffle is False, so shuffling is enabled explicitly.
kv = model_selection.StratifiedKFold(3, shuffle=True, random_state=99)

In [39]:
# Candidate hyper-parameter values (presumably for a grid search over the
# random forest defined below -- the search itself is not shown here).
n_estimators = [10, 20, 30, 50, 100, 120, 150]
max_depth = list(range(2, 7))
min_impurity_decrease = [0, 0.05, 0.1, 0.15, 0.2, 0.5]
max_features = list(range(5, 30, 5))  # tune this one last
criterion = ['gini', 'entropy']

In [43]:
# Random-forest classifier with a fixed seed; all other settings are defaults.
clf = RandomForestClassifier(random_state=999)

In [78]:
import xgboost as xgb

In [80]:
# Wrap the breast-cancer feature matrix and its labels in an XGBoost DMatrix.
d = xgb.DMatrix(breast.data, label=breast.target)

作者：章华燕
链接：https://zhuanlan.zhihu.com/p/31182879
```python
params = {
    'booster': 'gbtree',
    'objective': 'multi:softmax',  # 多分类的问题
    'num_class': 10,               # 类别数，与 multi:softmax 并用
    'gamma': 0.1,                  # 用于控制是否后剪枝的参数,越大越保守，一般0.1、0.2这样子。
    'max_depth': 12,               # 构建树的深度，越大越容易过拟合
    'lambda': 2,                   # 控制模型复杂度的权重值的L2正则化项参数，参数越大，模型越不容易过拟合。
    'subsample': 0.7,              # 随机采样训练样本
    'colsample_bytree': 0.7,       # 生成树时进行的列采样
    'min_child_weight': 3,
    'silent': 1,                   # 设置成1则没有运行信息输出，最好是设置为0.
    'eta': 0.007,                  # 如同学习率
    'seed': 1000,
    'nthread': 4,                  # cpu 线程数
}
```