In [3]:
# 分类数据
data=[
    {'price':850000,'rooms':4,'neighborhood':'Queen Anne'},
    {'price':700000,'rooms':3,'neighborhood':'Fremont'},
    {'price':650000,'rooms':2,'neighborhood':'Wallingford'},
    {'price':600000,'rooms':1,'neighborhood':'Fremont'}
]
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer(sparse=False,dtype=int)
vec.fit_transform(data)
vec.get_feature_names()
# 这种方法的缺陷在于：如果分类有许多枚举值，数据维度会很大。因为数据中有许多0，因此用稀疏矩阵表示会比较高效。

['neighborhood=Fremont',
 'neighborhood=Queen Anne',
 'neighborhood=Wallingford',
 'price',
 'rooms']

In [4]:
# 稀疏矩阵来表示的数据
vec = DictVectorizer(sparse=True,dtype=int)
vec.fit_transform(data)
vec

DictVectorizer(dtype=<class 'int'>, separator='=', sort=True, sparse=True)

In [7]:
# 文本特征的表示  单词统计法
sample = ['problem of evil',
'evil queen',
'horizon problem']
from sklearn.feature_extraction.text import CountVectorizer
vec=CountVectorizer()
X=vec.fit_transform(sample)
X

<3x5 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [10]:
# 用pandas的dataframe来表示以上稀疏矩阵
import pandas as pd
pd.DataFrame(X.toarray(),columns=vec.get_feature_names())
# 这样就得到了没个单词出现次数的DataFrame
# 但是该方法的缺点是：原始单词统计会让一些常用词（如is）聚集太高权重，在分类算法中这样并不合理，解决该问题可以用TF-IDF（term frequency-inverse document frequency，词频逆文档频率，其大小与一个词的常见成都成反比）,通过单词在文档中出现的频率来衡量其权重。

Unnamed: 0,evil,horizon,of,problem,queen
0,1,0,1,1,0
1,1,0,0,0,1
2,0,1,0,1,0


In [11]:
# 使用TF-IDF方法表示以上数据
from sklearn.feature_extraction.text import TfidfVectorizer
vec = TfidfVectorizer()
X=vec.fit_transform(sample)
pd.DataFrame(X.toarray(),columns=vec.get_feature_names())

Unnamed: 0,evil,horizon,of,problem,queen
0,0.517856,0.0,0.680919,0.517856,0.0
1,0.605349,0.0,0.0,0.0,0.795961
2,0.0,0.795961,0.0,0.605349,0.0


In [14]:
# 为了让X输入一个多项式必须对其进行变换使其获得多项式特征
import numpy as np
x=np.array([1,2,3,4,5])
X=x[:,np.newaxis]
from sklearn.preprocessing import PolynomialFeatures
poly=PolynomialFeatures(degree=3,include_bias=False)
X2=poly.fit_transform(X)
print(X2)
# 第一列表示x第二列表示x**2第三列表示x**3

[[  1.   1.   1.]
 [  2.   4.   8.]
 [  3.   9.  27.]
 [  4.  16.  64.]
 [  5.  25. 125.]]
