In [21]:
import pandas as pd
import re

# 读取整合后的数据集
df = pd.read_csv("question_solution.csv")

#id存入列表,这个后面要存到数据库
ids = df['id'].tolist()
df["fault_desc_clean"] = df["fault_desc"].apply(lambda x: re.sub(r"[^\u4e00-\u9fa5a-zA-Z]", " ", str(x)))
stopwords = ["的", "我", "怎么办", "电脑", "软件", "故障"]
df["fault_desc_clean"] = df["fault_desc_clean"].apply(lambda x: " ".join([w for w in x.split() if w not in stopwords]))


In [78]:
#对中文文本进行分词
from sklearn.feature_extraction.text import TfidfVectorizer 
import jieba
from sklearn.preprocessing import LabelEncoder

# 分词并添加空格（TfidfVectorizer默认按空格分词）
df["fault_desc_tokenized"] = df["fault_desc_clean"].apply(lambda x: " ".join(jieba.cut(x)))

# 重新创建并训练TF-IDF向量器
tfidf = TfidfVectorizer(max_features=200)
X = tfidf.fit_transform(df["fault_desc_tokenized"]).toarray()

#标签编码
le = LabelEncoder()
y = le.fit_transform(df["fault_type"])

import joblib
joblib.dump(le, 'label_encoder.pkl')

print(y)



[1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 2 2
 2 2 2 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 0 0 0 0 0 0 0 0 0 0 2 2 2 2
 2 2 2 0 0 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 0 0 0 0 0 0 0 0 0 0 2 2
 2 2 2 2 2 0 0 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3]


In [74]:
# 保存TF-IDF向量器
import joblib
joblib.dump(tfidf, "tfidf_vectorizer.pkl")
# 检查非零元素比例（验证是否解决问题）
non_zero_ratio = (X != 0).sum() / X.size
print(f"非零元素比例: {non_zero_ratio:.4f}")

非零元素比例: 0.0351


In [75]:
# 按问题类型分别保存问题描述的TF - IDF向量
type_question_vectors = {label: tfidf.transform(df[df["fault_type"] == label]["fault_desc_clean"]) for label in df["fault_type"].unique()}
joblib.dump(type_question_vectors, 'type_question_vectors.pkl')
print(type_question_vectors)

{'系统类': <Compressed Sparse Row sparse matrix of dtype 'float64'
	with 22 stored elements and shape (40, 200)>, '软件类': <Compressed Sparse Row sparse matrix of dtype 'float64'
	with 18 stored elements and shape (40, 200)>, '硬件类': <Compressed Sparse Row sparse matrix of dtype 'float64'
	with 6 stored elements and shape (34, 200)>, '网络类': <Compressed Sparse Row sparse matrix of dtype 'float64'
	with 5 stored elements and shape (24, 200)>}


In [76]:
from sklearn.model_selection import train_test_split  
#测试集占总数据比例20%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  
#支持向量机
from sklearn.svm import SVC  
model_svm = SVC(kernel="linear",C=10)  # 线性核适合文本  
model_svm.fit(X_train, y_train)  
# 保存模型
joblib.dump(model_svm, "fault_classifier.pkl")

['fault_classifier.pkl']

In [77]:
from sklearn.metrics import accuracy_score, confusion_matrix  
# 以SVM为例  
y_pred = model_svm.predict(X_test)  
print(f"准确率：{accuracy_score(y_test, y_pred):.2f}")  
print("混淆矩阵：\n", confusion_matrix(y_test, y_pred)) 

准确率：0.71
混淆矩阵：
 [[7 1 0 2]
 [1 4 0 0]
 [0 2 4 0]
 [2 0 0 5]]


In [81]:
model = joblib.load('fault_classifier.pkl')
tfidf = joblib.load('tfidf_vectorizer.pkl')
type_question_vectors = joblib.load('type_question_vectors.pkl')
labelen = joblib.load('label_encoder.pkl')
user_question="打印机提示“卡纸”，但打开后没找到纸张"
# 文本清洗函数
def clean_text(text):
    text = re.sub(r"[^\u4e00-\u9fa5a-zA-Z]", " ", str(text))
    stopwords = ["的", "我", "怎么办", "电脑", "软件", "故障"]
    text = " ".join([w for w in text.split() if w not in stopwords])
    text = " ".join(jieba.cut(text))
    return text

clean_question = clean_text(user_question)
question_vector = tfidf.transform([clean_question]).toarray()
predicted_index = model_svm.predict(question_vector)[0]
predicted_type = le.inverse_transform([predicted_index])[0]

print(f"预测索引: {predicted_index}")
print(f"预测类型：{predicted_type}")

from sklearn.metrics.pairwise import cosine_similarity

similarities = cosine_similarity(question_vector, type_question_vectors[predicted_type])[0]
most_similar_index = similarities.argmax()
print(similarities)
print(most_similar_index)

# for k, v in label_mapping.items():
#     if v == predicted_index:
#         predicted_type = k
#         print(f"预测的文本标签: {predicted_type}")
#         break

预测索引: 0
预测类型：硬件类
[0.         0.         0.         0.         0.         0.
 0.51599777 0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.51599777
 0.         0.19996349 0.         0.        ]
6


In [79]:
import pymysql
# 连接到MySQL数据库

conn = pymysql.connect(
    host='localhost',
    user='root',
    password='9696',
    database='qapairs',
    charset='utf8mb4'
)

try:
    with conn.cursor() as cursor:
        select_query = "SELECT id, fault_type, solution FROM combined_fault_data WHERE fault_type = %s"
        cursor.execute(select_query, (predicted_type,))
        data = cursor.fetchall()

        ids = [row[0] for row in data]
        solutions = [row[2] for row in data]

        id_value = ids[most_similar_index]
        solution = solutions[most_similar_index]
        #logging.info(f"Predicted type: {predicted_type}, Matched id: {id_value}")

finally:
    conn.close()

print({'solution': solution, 'predicted_type': predicted_type, 'id': id_value})



{'solution': '1. 检查打印机内部：打开打印机盖板，检查是否有残留纸张碎片\\n2. 重启打印机：关闭打印机电源，等待30秒后重新开启\\n3. 更新打印机驱动：到惠普官网下载对应型号的最新驱动程序', 'predicted_type': '硬件类', 'id': 27}
