# 数据准备

In [1]:
# 示例数据
texts = [
    "这个电影太棒了，值得一看！",
    "服务很好，我很满意。",
    "剧情很无聊，太差了。",
    "浪费时间，完全不好看。"
]
labels = ["正面", "正面", "负面", "负面"]  # 对应的标签


# jieba分词

In [4]:
import jieba

# 分词函数
def preprocess(text):
    return " ".join(jieba.cut(text))  # 分词后用空格连接

# 应用到数据
processed_texts = [preprocess(text) for text in texts]
print(processed_texts)
# 输出示例: ['这个 电影 太棒了 ， 值得 一看 ！', ...]


['这个 电影 太棒了 ， 值得一看 ！', '服务 很 好 ， 我 很 满意 。', '剧情 很 无聊 ， 太差 了 。', '浪费时间 ， 完全 不 好看 。']


# 特征提取

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 初始化 TF-IDF 向量化器
vectorizer = TfidfVectorizer()

# 转换为 TF-IDF 特征矩阵
X = vectorizer.fit_transform(processed_texts)
print(X.shape)  # 输出矩阵形状，例如 (4, 15)
print(X[0])
print(X[1])
print(X[2])
print(X[3])


(4, 12)
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 4 stored elements and shape (1, 12)>
  Coords	Values
  (0, 11)	0.5
  (0, 10)	0.5
  (0, 3)	0.5
  (0, 0)	0.5
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2 stored elements and shape (1, 12)>
  Coords	Values
  (0, 7)	0.7071067811865476
  (0, 9)	0.7071067811865476
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 3 stored elements and shape (1, 12)>
  Coords	Values
  (0, 1)	0.5773502691896257
  (0, 6)	0.5773502691896257
  (0, 2)	0.5773502691896257
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 3 stored elements and shape (1, 12)>
  Coords	Values
  (0, 8)	0.5773502691896257
  (0, 5)	0.5773502691896257
  (0, 4)	0.5773502691896257


# 模型训练

In [8]:
from sklearn.linear_model import LogisticRegression

# 初始化模型
model = LogisticRegression()

# 训练模型
model.fit(X, labels)


# 测试与预测

In [12]:
# 新文本
new_texts = ["这个产品真不错！", "太差了，不推荐。"]
new_processed = [preprocess(text) for text in new_texts]
new_X = vectorizer.transform(new_processed)

# 预测
predictions = model.predict(new_X)
print(predictions)  # 输出: ['正面', '负面']


['正面' '负面']


# 评估模型

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 拆分数据
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.25, random_state=42)

# 训练与预测
model.fit(X_train, y_train)
y_pred = model.predict(X_test)



print(y_test)
# 计算准确率
print("准确率:", accuracy_score(y_test, y_pred))


['正面']
准确率: 0.0
