## one-hot编码转换
可以将类别数据转换为整型数据。

In [17]:
from sklearn.feature_extraction import DictVectorizer
onehot_encoder = DictVectorizer()
X = [
    {'city': 'New York'},
    {'city': 'San Francisco'},
    {'city': 'Chapel Hill'}
]

X1 = onehot_encoder.fit_transform(X).toarray()
X1

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [23]:
onehot_encoder.inverse_transform(X1)

[{'city=New York': 1.0},
 {'city=San Francisco': 1.0},
 {'city=Chapel Hill': 1.0}]

# 数据标准化

In [26]:
from sklearn import preprocessing
import numpy as np
X = np.array([
    [0., 0., 5., 13., 9., 1.],
    [0., 0., 13., 15., 10., 15.],
    [0., 3., 15., 2., 0., 11.]
])
print(preprocessing.scale(X))
#以均值为中心，以分量为单位缩放至单位方差。

[[ 0.         -0.70710678 -1.38873015  0.52489066  0.59299945 -1.35873244]
 [ 0.         -0.70710678  0.46291005  0.87481777  0.81537425  1.01904933]
 [ 0.          1.41421356  0.9258201  -1.39970842 -1.4083737   0.33968311]]


# 池袋分类

In [28]:
corpus = [
    'UNC played Duke in basketball',
    'Duke lost the basketball game'
]

In [41]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
print(vectorizer.fit_transform(corpus).todense())
print(vectorizer.vocabulary_)

[[0 1 1 0 1 0 1 0 0 1]
 [0 1 1 1 0 1 0 0 1 0]
 [1 0 0 0 0 0 0 1 0 0]]
{'unc': 9, 'played': 6, 'duke': 2, 'in': 4, 'basketball': 1, 'lost': 5, 'the': 8, 'game': 3, 'ate': 0, 'sandwich': 7}


In [42]:
print(vectorizer.get_feature_names())

['ate', 'basketball', 'duke', 'game', 'in', 'lost', 'played', 'sandwich', 'the', 'unc']


In [43]:
corpus.append('I ate a sandwich')
print(vectorizer.fit_transform(corpus).todense())
print(vectorizer.vocabulary_)

[[0 1 1 0 1 0 1 0 0 1]
 [0 1 1 1 0 1 0 0 1 0]
 [1 0 0 0 0 0 0 1 0 0]
 [1 0 0 0 0 0 0 1 0 0]]
{'unc': 9, 'played': 6, 'duke': 2, 'in': 4, 'basketball': 1, 'lost': 5, 'the': 8, 'game': 3, 'ate': 0, 'sandwich': 7}


# 欧几里得范数


In [44]:
from sklearn.metrics.pairwise import euclidean_distances
X = vectorizer.fit_transform(corpus).todense()
print('Distance between 1st and 2nd documents:', euclidean_distances(X[0], X[1]))
print('Distance between 1st and 3rd documents:', euclidean_distances(X[0], X[2]))
print('Distance between 2nd and 3rd documents:', euclidean_distances(X[1], X[2]))

Distance between 1st and 2nd documents: [[2.44948974]]
Distance between 1st and 3rd documents: [[2.64575131]]
Distance between 2nd and 3rd documents: [[2.64575131]]
