In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [4]:
def dictvec():
    """
    Demonstrate feature extraction from dictionaries with DictVectorizer.

    String-valued entries ("city") are one-hot encoded, while numeric
    entries ("temperature") are carried over as-is. Prints, in order:
    the encoded matrix, the generated feature names, and the result of
    mapping the matrix back to dictionaries.

    Returns:
        None
    """
    records = [
        {"city": "北京", "temperature": 100},
        {"city": "上海", "temperature": 60},
        {"city": "深圳", "temperature": 30},
    ]

    # sparse=False yields a dense array; the library default (sparse=True)
    # would return a SciPy sparse matrix instead.
    # The instance is deliberately not called `dict`, so the builtin type
    # stays usable inside this function.
    vectorizer = DictVectorizer(sparse=False)

    # Learn the feature mapping and encode the records in one step.
    data = vectorizer.fit_transform(records)

    separator = "-" * 50
    print(data)
    print(separator)
    # Names of the generated feature columns.
    print(vectorizer.get_feature_names_out())
    print(separator)
    print(vectorizer.inverse_transform(data))

dictvec()

[[  0.   1.   0. 100.]
 [  1.   0.   0.  60.]
 [  0.   0.   1.  30.]]
--------------------------------------------------
['city=上海' 'city=北京' 'city=深圳' 'temperature']
--------------------------------------------------
[{'city=北京': 1.0, 'temperature': 100.0}, {'city=上海': 1.0, 'temperature': 60.0}, {'city=深圳': 1.0, 'temperature': 30.0}]


In [6]:
def countvec():
    """
    Demonstrate bag-of-words text feature extraction with CountVectorizer.

    Fits the vectorizer on three short English sentences and prints the
    learned vocabulary, the count matrix as returned by fit_transform,
    the type of that matrix, and its dense array form.
    """
    corpus = [
        "life is shor,i like python life",
        "life is too long,i dislike python",
        "life is short",
    ]

    # min_df / max_df bound a term's document frequency: with min_df=2 a
    # term must appear in at least two documents to be kept.
    counter = CountVectorizer(min_df=2)

    # Learn the vocabulary and count the terms in a single call.
    counts = counter.fit_transform(corpus)

    # Every retained token is one column holding its per-document count.
    print(counter.get_feature_names_out())
    print(counts)
    print(type(counts))
    print(counts.toarray())

countvec()

['is' 'life' 'python']
  (0, 1)	2
  (0, 0)	1
  (0, 2)	1
  (1, 1)	1
  (1, 0)	1
  (1, 2)	1
  (2, 1)	1
  (2, 0)	1
<class 'scipy.sparse._csr.csr_matrix'>
[[1 2 1]
 [1 1 1]
 [1 1 0]]


In [8]:
def countChineseVec():
    """
    Apply CountVectorizer to Chinese text that is only partly space-separated.

    Prints the learned vocabulary, the count matrix, and its dense form.
    The default token pattern keeps only tokens of two or more characters,
    so the single character "我" is dropped, and runs without spaces such
    as "人生苦短" are treated as one token.
    """
    sentences = [
        "人生苦短，我 喜欢 python python",
        "人生漫长，我 不用 python",
    ]

    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(sentences)

    print(vectorizer.get_feature_names_out())

    print(counts)
    print(counts.toarray())

    return None

countChineseVec()

['python' '不用' '人生漫长' '人生苦短' '喜欢']
  (0, 3)	1
  (0, 4)	1
  (0, 0)	2
  (1, 0)	1
  (1, 2)	1
  (1, 1)	1
[[2 0 0 1 1]
 [1 1 1 0 0]]


### tf-idf主要思想 如果某个词或者短语在一篇文章中出现的频率高，并且在其他文章中很少出现，则可以认为这个词或者短语具有很好的分类能力
### tf-idf主要用来评估一个词对于一个文件集或者一个语料库中的其中一份文件的重要程度
Tf: 词频（term frequency），即词在文档中出现的频率
idf: 逆文档频率（inverse document frequency）
$\text{idf} = \log(\text{总文档数量} / \text{该词出现的文档数量})$
用 tf*idf 来代表词的重要性程度

In [10]:
def minMaxStand():
    """
    Demonstrate min-max normalization with MinMaxScaler.

    Rescales every column of a small 3x4 sample into the range [0, 1]
    and prints the rescaled data.
    """
    samples = [[90, 2, 10, 40], [460, 4, 15, 45], [75, 3, 13, 46]]

    # Min-max scaling is driven by each column's extremes, which makes it
    # sensitive to outliers.
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled = scaler.fit_transform(samples)

    print(scaled)
    return None

minMaxStand()

[[0.03896104 0.         0.         0.        ]
 [1.         1.         1.         0.83333333]
 [0.         0.5        0.6        1.        ]]


In [12]:
def standerScale():
    """
    Demonstrate standardization (z-score scaling) with StandardScaler.

    Standardizes every column of a small 3x4 sample, then prints the
    scaled data followed by the fitted per-column mean, per-column
    variance, and the number of samples seen.
    """
    samples = [[90, 2, 10, 40], [460, 4, 15, 45], [75, 3, 13, 46]]

    scaler = StandardScaler()
    scaled = scaler.fit_transform(samples)

    print(scaled)
    # The printed labels read "mean", "variance" and "sample count".
    print("均值为:", scaler.mean_)
    print("方差为:", scaler.var_)
    print("样本数:", scaler.n_samples_seen_)

standerScale()

[[-0.66456798 -1.22474487 -1.29777137 -1.3970014 ]
 [ 1.41337698  1.22474487  1.13554995  0.50800051]
 [-0.748809    0.          0.16222142  0.88900089]]
均值为: [208.33333333   3.          12.66666667  43.66666667]
方差为: [3.17055556e+04 6.66666667e-01 4.22222222e+00 6.88888889e+00]
样本数: 3


### 标准化和归一化
对于归一化来说，如果出现了异常点，影响了最大值和最小值，那么结果会发生改变
对于标准化来说，如果出现了异常点，由于具有一定的数据量，异常点对于平均值的影响不大，所以方差改变较小

In [14]:
def im():
    """
    Demonstrate missing-value imputation with SimpleImputer.

    Fills the NaN entry of a small 3x2 sample using the 'mean' strategy
    and prints the completed data.
    """
    samples = [[1, 2], [np.nan, 3], [7, 6]]

    # With missing_values=np.nan only NaN entries count as missing; other
    # placeholders such as "?" have to be replaced with NaN beforehand.
    imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
    filled = imputer.fit_transform(samples)

    print(filled)
    return None

im()

[[1. 2.]
 [4. 3.]
 [7. 6.]]


### 特征选择
+ Filter过滤式:VarianceThreshold
+ Embedded嵌入式:正则化、决策树
+ Wrapper包裹式

In [16]:
from sklearn.feature_selection import VarianceThreshold

def var():
    """
    Demonstrate filter-style feature selection with VarianceThreshold.

    With threshold=0 the selector drops columns whose variance is zero,
    i.e. columns that hold the same value in every row. Prints the
    reduced data and then the indices of the columns that were kept.

    Returns:
        None
    """
    samples = [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]]

    selector = VarianceThreshold(threshold=0)
    reduced = selector.fit_transform(samples)

    print(reduced)

    # indices=True asks for the integer positions of the kept columns
    # instead of a boolean mask.
    kept = selector.get_support(indices=True)

    # A real f-string placeholder is used here. The previous form combined
    # an f-prefix that had no placeholders with "%" formatting, which would
    # misbehave if the right-hand operand were ever a tuple.
    print(f"The support is {kept}")

    return None

var()

[[2 0]
 [1 4]
 [1 1]]
The support is [1 2]


In [18]:
from sklearn.decomposition import PCA

def pca():
    """
    Demonstrate dimensionality reduction with principal component analysis.

    n_components semantics: a fractional value is the share of variance
    to retain, an integer is the number of features left after reduction.
    Prints the transformed data.
    """
    samples = [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]]

    # 0.9 is a fraction, so it requests retaining 90% of the variance.
    reducer = PCA(n_components=0.9)
    projected = reducer.fit_transform(samples)

    print(projected)
    return None

pca()

[[-1.76504522]
 [ 2.35339362]
 [-0.58834841]]
