# 1 DictVectorizer

In [2]:
from sklearn.feature_extraction import DictVectorizer
import pandas as pd

# Sample records: each dict is one sample, mapping feature name -> value.
data = [
    {'颜色': '红', '尺寸': '大', '价格': 100},
    {'颜色': '蓝', '尺寸': '中', '价格': 80},
    {'颜色': '红', '尺寸': '小', '价格': 50}
]

# Create the dict feature extractor.
# sparse=False returns a dense NumPy array instead of a SciPy sparse matrix.
# sparse=True (the default) returns a sparse matrix, which stores only the
# non-zero entries and therefore saves memory.
# Sparse matrix: a matrix in which most of the entries are 0.
dict_vec = DictVectorizer(sparse=False)

# Fit and transform: string-valued features are one-hot encoded (one
# "name=value" column per category), while numeric features such as '价格'
# are passed through unchanged.
X = dict_vec.fit_transform(data)
print(X)
# Show the generated feature (column) names, in the same order as X's columns.
print(dict_vec.get_feature_names_out())

[[100.   0.   1.   0.   1.   0.]
 [ 80.   1.   0.   0.   0.   1.]
 [ 50.   0.   0.   1.   1.   0.]]
['价格' '尺寸=中' '尺寸=大' '尺寸=小' '颜色=红' '颜色=蓝']


# 2 CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample corpus (3 documents).
documents = [
    "I love machine learning. Machine learning is interesting.",
    "I love coding. Coding is fun and useful.",
    "Machine learning and coding are my favorite skills."
]

# Build the CountVectorizer. By default it lowercases the text and keeps only
# tokens made of two or more word characters (so "I" is dropped and punctuation
# acts as a separator); no stop-word filtering is applied.
#
# min_df / max_df filter the vocabulary by *document frequency*, i.e. the
# number of documents a term appears in (not its total number of occurrences):
#   min_df: terms found in fewer documents than this threshold are ignored.
#   max_df: terms found in more documents than this threshold are ignored.
# An integer threshold is an absolute number of documents; a float threshold
# in [0.0, 1.0] is a proportion of the documents.
#
# min_df=2 therefore keeps only terms present in at least 2 of the 3 documents.
count_vec = CountVectorizer(min_df=2)

# Learn the vocabulary and build the sparse document-term count matrix.
X = count_vec.fit_transform(documents)

# Densify the matrix and label its columns with the learned terms for display.
# `pd` (pandas) is imported by the DictVectorizer cell at the top of the notebook.
df = pd.DataFrame(X.toarray(), columns=count_vec.get_feature_names_out())
df


Unnamed: 0,and,coding,is,learning,love,machine
0,0,0,1,2,1,2
1,1,2,1,0,1,0
2,1,1,0,1,0,1


In [9]:
# What happens when the text contains Chinese that has not been word-segmented?
# Sample corpus (3 documents).
documents = [
    "人生苦短，我喜欢 python", 
    "人生漫长，不用 python python",
    "人生漫漫，不用 python python python"
]

# min_df=1 keeps every term that appears in at least one document.
count_vec = CountVectorizer(min_df=1)

# Fit on the documents and transform them into a count matrix.
X = count_vec.fit_transform(documents)

# View the counts as a DataFrame. Chinese has no spaces between words, so each
# run of characters between punctuation/whitespace (e.g. '人生苦短') ends up as
# one single token instead of being split into words.
df = pd.DataFrame(X.toarray(), columns = count_vec.get_feature_names_out())
df

Unnamed: 0,python,不用,人生漫漫,人生漫长,人生苦短,我喜欢
0,1,0,0,0,1,1
1,2,1,0,1,0,0
2,3,1,1,0,0,0


In [None]:
import jieba

# Segment the Chinese text with jieba before handing it to CountVectorizer.
documents = [
    "人生苦短，我喜欢 python",
    "人生漫长，不用 python python",
    "人生漫漫，不用 python python python"
]


def jieba_tokenizer(text):
    """Return the tokens that ``jieba.cut`` produces for ``text`` as a list."""
    return list(jieba.cut(text))


# Pre-segment every document and re-join its tokens with single spaces so that
# CountVectorizer's default tokenization can see the word boundaries.
cut_documents = [" ".join(jieba_tokenizer(document)) for document in documents]
# Original (misspelled) name, kept as an alias so any cell that still refers to
# it keeps working.
cut_doucuments = cut_documents
print(cut_documents)  # the segmented, space-joined samples

# min_df=1 keeps every term that appears in at least one document.
# Single-character tokens (e.g. '我') and punctuation are still dropped, because
# the default token pattern only matches tokens of two or more word characters.
count_vec = CountVectorizer(min_df=1)

# Learn the vocabulary from the segmented text and build the count matrix.
X = count_vec.fit_transform(cut_documents)

# Densify and label the columns with the learned terms for display.
df = pd.DataFrame(X.toarray(), columns=count_vec.get_feature_names_out())
df


['人生 苦短 ， 我 喜欢   python', '人生 漫长 ， 不用   python   python', '人生 漫漫 ， 不用   python   python   python']


Unnamed: 0,python,不用,人生,喜欢,漫漫,漫长,苦短
0,1,0,1,1,0,0,1
1,2,1,1,0,0,1,0
2,3,1,1,0,1,0,0


# 3 TfidfVectorizer (TF-IDF)

In [14]:
# 1. Imports
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Sample corpus (3 documents).
documents = [
    "I love machine learning. Machine learning is interesting.",
    "I love coding. Coding is fun and useful.",
    "Machine learning and coding are my favorite skills."
]


# 2. Create the TF-IDF vectorizer.
# stop_words='english' drops the words in scikit-learn's built-in English
# stop-word list.
# Stop words: function words or very common words that occur frequently in
# text but usually contribute little to its meaning.
tfidf = TfidfVectorizer(min_df=1,stop_words='english')

# 3. Build the feature matrix (one row per document, one column per term).
X = tfidf.fit_transform(documents)

# 4. Display the TF-IDF weights as a DataFrame labelled with the learned terms.
pd.DataFrame(X.toarray(), columns=tfidf.get_feature_names_out())

Unnamed: 0,coding,favorite,fun,interesting,learning,love,machine,skills,useful
0,0.0,0.0,0.0,0.401429,0.610594,0.305297,0.610594,0.0,0.0
1,0.687703,0.0,0.452123,0.0,0.0,0.343851,0.0,0.0,0.452123
2,0.393511,0.51742,0.0,0.0,0.393511,0.0,0.393511,0.51742,0.0
