In [1]:
# 特征缩放
import numpy as np
from sklearn import preprocessing

In [2]:
# 创建特征
feature = np.array([[-500.5],
                   [-100.1],
                   [0],
                   [100.1],
                   [900.6]])
# 创建缩放器
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0,1))

# 缩放特征的值
scaled_feature = minmax_scale.fit_transform(feature)

# 查看特征
scaled_feature

array([[0.        ],
       [0.28577546],
       [0.35721933],
       [0.42866319],
       [1.        ]])

In [3]:
# 特征标准化
x = np.array([[-1000.5],
                   [-200.2],
                   [500.5],
                   [600.6],
                   [9000.6]])

scaler = preprocessing.StandardScaler()

standardized = scaler.fit_transform(x)

standardized

array([[-0.76066581],
       [-0.54174221],
       [-0.35006438],
       [-0.32268184],
       [ 1.97515424]])

In [4]:
# 如果数据中存在很大的异常值，推荐使用中位数，四分数间距进行缩放

robust_scaler = preprocessing.RobustScaler()

robust_standardized = robust_scaler.fit_transform(x)

robust_standardized

array([[-1.87437562],
       [-0.875     ],
       [ 0.        ],
       [ 0.125     ],
       [10.61451049]])

In [5]:
# 归一化
import numpy as np
from sklearn.preprocessing import Normalizer

# 创建特征矩阵
feature = np.array([[0.5, 0.5],
                   [1.1, 3.5],
                   [1.5, 20.2],
                   [1.63, 34.4],
                   [10.9, 3.3]])

# 创建归一化器
normalizer = Normalizer(norm='l2')

# 转换特征矩阵
normalizer.fit_transform(feature)

array([[0.70710678, 0.70710678],
       [0.2998266 , 0.95399372],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

In [7]:
# 生成多项式和交互特征
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

# 创建特征矩阵
features = np.array([[2,3],
                    [2,3],
                    [2,3]])

# 创建PolynomialFeatures 对象   degree 表示最高阶数
polynomial_interaction = PolynomialFeatures(degree=2, include_bias=False)
# 创建多项式特征
polynomial_interaction.fit_transform(features)


array([[2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.]])

In [8]:
# 转换特征
from sklearn.preprocessing import FunctionTransformer

# 创建特征矩阵
features = np.array([[2,3],
                    [2,3],
                    [2,3]])
# 定义一个简单的函数
def add_ten(x):
    return x + 10

# 创建转换器
ten_transformer = FunctionTransformer(add_ten)

# 创建特征矩阵
ten_transformer.transform(features)

array([[12, 13],
       [12, 13],
       [12, 13]])

In [9]:
# 可以用pandas的apply使用同样的转换
import pandas as pd

df = pd.DataFrame(features, columns=['feature_1', 'feature_2'])

df.apply(add_ten)

Unnamed: 0,feature_1,feature_2
0,12,13
1,12,13
2,12,13


In [1]:
# 识别异常值
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs

In [3]:
# 创建模拟数据
features, _ = make_blobs(n_samples= 10,
                        n_features = 2,
                        centers = 1,
                        random_state = 1)
# 将第一个观察值替换为极端值
features[0,0] = 10000
features[0,1] = 10000

#创建识别器
outlier_detector = EllipticEnvelope(contamination=.1)

# 拟合识别器
outlier_detector.fit(features)

# 预测异常值
outlier_detector.predict(features)

array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

In [4]:
# 创建一个特征
feature = features[:,0]
# 创建一个函数来返回异常值的下标
def indicies_of_outliers(x):
    q1, q3 = np.percentile(x, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (iqr* 1.5)
    upper_bound = q3 + (iqr* 1.5)
    return np.where((x > upper_bound) | (x < upper_bound))

# 执行函数
indicies_of_outliers(feature)
    

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int64),)

In [6]:
# 处理异常值

# tip 1  丢弃
import pandas as pd
# 创建数据帧
houses = pd.DataFrame()
houses['Price'] = [534433, 392333, 293222, 4322032]
houses['Bathrooms'] = [2, 3.5, 2, 116]
houses['Square_Feet'] = [1500, 2500, 1500, 48000]

# 筛选观察值
houses[houses['Bathrooms'] < 20]

Unnamed: 0,Price,Bathrooms,Square_Feet
0,534433,2.0,1500
1,392333,3.5,2500
2,293222,2.0,1500


In [7]:
# tip 2  标记  作为特殊值进行处理
houses['Outlier'] = np.where(houses['Bathrooms']< 20, 0, 1)

houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outlier
0,534433,2.0,1500,0
1,392333,3.5,2500,0
2,293222,2.0,1500,0
3,4322032,116.0,48000,1


In [8]:
# 对特征取对数值
houses['Log_Of_Square_Feet'] = [np.log(x) for x in houses['Square_Feet']]

# 查看数据
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outlier,Log_Of_Square_Feet
0,534433,2.0,1500,0,7.31322
1,392333,3.5,2500,0,7.824046
2,293222,2.0,1500,0,7.31322
3,4322032,116.0,48000,1,10.778956


In [9]:
# 将特征离散化

# 1.根据阈值将特征二值化
import numpy as np
from sklearn.preprocessing import Binarizer

# 创建特征
age = np.array([[6],
               [12],
               [20],
               [36],
               [65]])
# 创建二值化器
binarizer =Binarizer(18)

# 转换特征
binarizer.fit_transform(age)

array([[0],
       [0],
       [1],
       [1],
       [1]])

In [10]:
# 2. 根据多个阈值将数值型特征离散化
np.digitize(age, bins=[20, 30, 64])

array([[0],
       [0],
       [1],
       [2],
       [3]], dtype=int64)

In [11]:
# 使用聚类的方式将观察这进行分组
from sklearn.cluster import KMeans

features,_ = make_blobs(n_samples=50,
                       n_features=2,
                       centers=3,
                       random_state=1)

# 创建数据帧
dataframe = pd.DataFrame(features, columns=['feature_1', 'feature_2'])

# k_means 聚类器
clusterer = KMeans(3, random_state=0)

# 将聚类应用在特征上
clusterer.fit(features)

# 预测值
dataframe['group'] = clusterer.predict(features)

dataframe.head()

Unnamed: 0,feature_1,feature_2,group
0,-9.877554,-3.336145,0
1,-7.28721,-8.353986,2
2,-6.943061,-7.023744,2
3,-7.440167,-8.791959,2
4,-6.641388,-8.075888,2


In [13]:
#删除带有缺失值的观察值


# 1. 使用numpy
features = np.array([[1.1, 11.1],
                    [2.2, 22.2],
                    [3.3, 33.3],
                     [4.4, 44.4],
                    [np.nan, 55]])

# 只保留没有 缺失值的观察值
features[~np.isnan(features).any(axis=1)]

array([[ 1.1, 11.1],
       [ 2.2, 22.2],
       [ 3.3, 33.3],
       [ 4.4, 44.4]])

In [14]:
# 2.使用pandas

dataframe = pd.DataFrame(features, columns=['feature_1', 'feature_2'])

dataframe.dropna()

Unnamed: 0,feature_1,feature_2
0,1.1,11.1
1,2.2,22.2
2,3.3,33.3
3,4.4,44.4


In [15]:
# 填充缺失值

from fancyimpute import KNN
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs

features,_ = make_blobs(n_samples=1000,
                       n_features = 2,
                       random_state = 1)

# 标准化特征
scaler = StandardScaler()

standardized_features = scaler.fit_transform(features)

# 将第一个特征向量的第一个值替换为缺失值
true_value = standardized_features[0,0]
standardized_features[0,0] = np.nan

features_knn_imputed = KNN(k=5, verbose=0).complete(standardized_features)

print(true_value)

print(features_knn_imputed[0,0])

ModuleNotFoundError: No module named 'fancyimpute'