In [1]:
# 对nominal型分类特征编码


#  tip   独热编码容易造成维度灾难，使用前要注意如何对于特征进行合理的区分

import numpy as np
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer

In [2]:
# One-hot encode a single nominal feature (one state name per observation).
# The labels are written as a flat list and reshaped into an (n_samples, 1) column.
features = np.array(
    ['Texas', 'California', 'Texas', 'Delaware', 'Texas']
).reshape(-1, 1)

# Create the one-hot encoder
one_hot = LabelBinarizer()

# Fit the encoder on the labels and show the encoded indicator matrix
one_hot.fit_transform(features)

array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1]])

In [3]:
# Inspect the class labels the fitted encoder learned, via its `classes_` attribute
one_hot.classes_

array(['California', 'Delaware', 'Texas'], dtype='<U10')

In [4]:
# Reverse the encoding: encode the features, then map the indicator rows back to their labels
one_hot.inverse_transform(one_hot.transform(features))

array(['Texas', 'California', 'Texas', 'Delaware', 'Texas'], dtype='<U10')

In [5]:
# 利用pandas  进行独热编码的实现
import pandas as pd
# features[:, 0] turns the 2-D (n, 1) column into the 1-D input passed to get_dummies.
# The dtype is pinned to uint8 so the indicator columns are numeric 0/1: recent pandas
# releases default to boolean dummies, while uint8 was the default in older releases.
pd.get_dummies(features[:, 0], dtype=np.uint8)


Unnamed: 0,California,Delaware,Texas
0,0,0,1
1,1,0,0
2,0,0,1
3,0,1,0
4,0,0,1


In [10]:
# scikit-learn: encode observations that each carry several class labels

# Each tuple holds the labels attached to one observation.
# NOTE(review): 'Califorina' and 'Delware' look like misspellings of 'California' and
# 'Delaware'; they are kept as written because they are the labels the encoder learns.
multiclass_feature = [
    ('Texas', 'Florida'),
    ('Califorina', 'Alabama'),
    ('Texas', 'Florida'),
    ('Delware', 'Florida'),
    ('Texas', 'Alabama'),
    ('Alabama', 'Califorina'),
]

# Create a one-hot encoder that accepts multiple labels per observation
one_hot_multiclass = MultiLabelBinarizer()

# Fit on the label tuples and show the resulting indicator matrix
one_hot_multiclass.fit_transform(multiclass_feature)

array([[0, 0, 0, 1, 1],
       [1, 1, 0, 0, 0],
       [0, 0, 0, 1, 1],
       [0, 0, 1, 1, 0],
       [1, 0, 0, 0, 1],
       [1, 1, 0, 0, 0]])

In [11]:
# Show the labels learned by the multi-label encoder
one_hot_multiclass.classes_

array(['Alabama', 'Califorina', 'Delware', 'Florida', 'Texas'],
      dtype=object)

In [12]:
# Encode an ordinal categorical feature

# Use pandas to convert the ordered labels in a DataFrame column to integers
import pandas as pd

dataframe = pd.DataFrame({"Score": ["Low", "Low", "Medium", "Medium", "High"]})

# Explicit label -> rank mapping that preserves the Low < Medium < High ordering
scale_mapper = {"Low": 1,
                "Medium": 2,
                "High": 3}

# Apply the mapping with Series.map rather than Series.replace: replace() on an object
# column relies on implicit downcasting to reach an integer dtype, which recent pandas
# releases deprecate. Note that map() yields NaN for any label missing from the mapper.
score_encoded = dataframe["Score"].map(scale_mapper)
score_encoded

0    1
1    1
2    2
3    2
4    3
Name: Score, dtype: int64

In [14]:
# 处理特征字典编码
from sklearn.feature_extraction import DictVectorizer

In [15]:
# One dict per observation, mapping feature name -> value.
# NOTE(review): "Bule" in the first record looks like a misspelling of "Blue"; as written
# it is a distinct key from "Blue" — confirm before correcting.
data_dict = [
    {'Red': 2, "Bule": 4},
    {"Red": 4, "Blue": 3},
    {"Red": 1, "Yellow": 2},
    {"Red": 2, "Yellow": 2},
]

# sparse=False requests a dense array instead of a sparse matrix
dictvectorizer = DictVectorizer(sparse=False)

# Fit on the dicts and convert them to a feature matrix
features = dictvectorizer.fit_transform(data_dict)

features

array([[0., 4., 2., 0.],
       [3., 0., 4., 0.],
       [0., 0., 1., 2.],
       [0., 0., 2., 2.]])

In [16]:
# Get the feature (column) names learned by the vectorizer.
# get_feature_names() no longer exists in current scikit-learn releases, so prefer
# get_feature_names_out() when it is available and fall back to the old method otherwise.
# .tolist() converts the returned array to a plain list of str, matching the old return type.
if hasattr(dictvectorizer, "get_feature_names_out"):
    features_name = dictvectorizer.get_feature_names_out().tolist()
else:
    features_name = dictvectorizer.get_feature_names()

features_name

['Blue', 'Bule', 'Red', 'Yellow']

In [17]:
# Wrap the encoded matrix in a DataFrame, labelling columns with the vectorizer's feature names
dataframe = pd.DataFrame(data=features, columns=features_name)

dataframe

Unnamed: 0,Blue,Bule,Red,Yellow
0,0.0,4.0,2.0,0.0
1,3.0,0.0,4.0,0.0
2,0.0,0.0,1.0,2.0
3,0.0,0.0,2.0,2.0


In [4]:
# 处理缺失的分类值
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

# Fully observed rows: column 0 is the class label, the remaining columns are features
X = np.array([
    [0, 2.10, 1.45],
    [1, 1.18, 1.33],
    [0, 1.22, 1.27],
    [1, -0.21, -1.19],
])

# Rows whose class label (column 0) is missing
X_with_nan = np.array([
    [np.nan, 0.87, 1.31],
    [np.nan, -0.67, -0.22],
])

# Train a distance-weighted 3-nearest-neighbours classifier: features -> class label
clf = KNeighborsClassifier(n_neighbors=3, weights='distance')
trained_model = clf.fit(X[:, 1:], X[:, 0])

# Predict the missing class labels from the features of the incomplete rows
imputed_values = trained_model.predict(X_with_nan[:, 1:])

# Put the predicted labels back in front of their feature columns
X_with_impute = np.column_stack((imputed_values, X_with_nan[:, 1:]))

# Stack the imputed rows on top of the fully observed rows
np.concatenate((X_with_impute, X), axis=0)

array([[ 0.  ,  0.87,  1.31],
       [ 1.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

In [1]:
# 处理不均衡分类
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

  from numpy.core.umath_tests import inner1d


In [2]:
iris = load_iris()

# Feature matrix and target vector with the first 40 observations removed,
# which makes the classes imbalanced
features = iris.data[40:, :]
target = iris.target[40:]

# Binary target: 0 where the observation is class 0, 1 for every other class
target = np.where(target == 0, 0, 1)

# Per-class weights passed to the classifier
weights = {0: 0.9, 1: 0.1}

# Create a random forest classifier that uses the class weights
RandomForestClassifier(class_weight=weights)

RandomForestClassifier(bootstrap=True, class_weight={0: 0.9, 1: 0.1},
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [14]:
# Display the binary target vector
target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [3]:
# Indices of the observations belonging to each class
i_class0 = np.where(target == 0)[0]
i_class1 = np.where(target == 1)[0]

# Number of observations in each class
n_class0 = len(i_class0)
n_class1 = len(i_class1)

# For every class-0 observation, draw one class-1 observation without replacement.
# A seeded RandomState makes the drawn sample identical on every run and avoids
# touching NumPy's global random state.
rng = np.random.RandomState(0)
i_class1_downsampled = rng.choice(i_class1, size=n_class0, replace=False)

# Join the class-0 targets with the downsampled class-1 targets
np.hstack((target[i_class0], target[i_class1_downsampled]))

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [10]:
# Stack the class-0 feature rows on top of the downsampled class-1 feature rows
np.vstack((features[i_class0,:], features[i_class1_downsampled,:]))

array([[5. , 3.5, 1.3, 0.3],
       [4.5, 2.3, 1.3, 0.3],
       [4.4, 3.2, 1.3, 0.2],
       [5. , 3.5, 1.6, 0.6],
       [5.1, 3.8, 1.9, 0.4],
       [4.8, 3. , 1.4, 0.3],
       [5.1, 3.8, 1.6, 0.2],
       [4.6, 3.2, 1.4, 0.2],
       [5.3, 3.7, 1.5, 0.2],
       [5. , 3.3, 1.4, 0.2],
       [5.6, 2.7, 4.2, 1.3],
       [5. , 2. , 3.5, 1. ],
       [6.3, 2.5, 5. , 1.9],
       [6.1, 3. , 4.9, 1.8],
       [6.6, 3. , 4.4, 1.4],
       [7.1, 3. , 5.9, 2.1],
       [5.8, 2.7, 4.1, 1. ],
       [5. , 2.3, 3.3, 1. ],
       [5.2, 2.7, 3.9, 1.4],
       [6.2, 2.2, 4.5, 1.5]])

array([[5. , 3.5, 1.3, 0.3],
       [4.5, 2.3, 1.3, 0.3],
       [4.4, 3.2, 1.3, 0.2],
       [5. , 3.5, 1.6, 0.6],
       [5.1, 3.8, 1.9, 0.4],
       [4.8, 3. , 1.4, 0.3],
       [5.1, 3.8, 1.6, 0.2],
       [4.6, 3.2, 1.4, 0.2],
       [5.3, 3.7, 1.5, 0.2],
       [5. , 3.3, 1.4, 0.2]])