# 设计多种模型的文本分类试验，上传不同参数配置下的试验结果

bayes分类(效果很差）

分析原因： 1. 分类少 2. 数据太少？

In [1]:
import math
import jieba
import re
import os
import json
from collections import defaultdict
import csv

jieba.initialize()

class BayesApproach:
    def __init__(self, path):
        self.p_class = defaultdict(int)
        self.word_class_prob = defaultdict(dict)
        self.load(path)
        
    def load(self, path):
        self.class_name_to_word_freq = defaultdict(dict)
        self.all_words = set()
        with open(path, encoding="utf-8") as file:
            csv_reader = csv.reader(file)
            next(csv_reader)
            for row in csv_reader:
                class_name = row[0]
                content = row[1]
                words = jieba.cut(content)
                self.all_words = self.all_words.union(set(words))
                self.p_class[class_name] += 1
                # class_name_to_word_freq： 【class : [word1: 10, word2: 20】
                word_freq = self.class_name_to_word_freq[class_name]
                for word in words:
                    if word not in word_freq:
                        word_freq[word] = 1
                    else:
                        word_freq += 1
        self.freq_to_prob()
        return
    
    def freq_to_prob(self):
        total_sample_count = sum(self.p_class.values())
        # p_class: [class1: 0.2, class2: 0.8]
        self.p_class = dict([c, self.p_class[c] / total_sample_count] for c in self.p_class)
        self.word_class_prob = defaultdict(dict)
        # class_name_to_word_freq: [class: [word1: 0.1, word2: 0.9]]
        for class_name, word_freq in self.class_name_to_word_freq.items():
            total_word_count = sum(count for count in word_freq.values()) #每个类别总词数
            for word in word_freq:
                prob = (word_freq[word] + 1) / (total_word_count + len(self.all_words))
                self.word_class_prob[class_name][word] = prob
            self.word_class_prob[class_name]["<unk>"] = 1 / (total_word_count + len(self.all_words))
        return
    
    def get_words_class_prob(self, words, class_name):
        result = 1
        for word in words:
            unk_prob = self.word_class_prob[class_name]["<unk>"]
            result *= self.word_class_prob[class_name].get(word, unk_prob)
        return result
    
    def get_class_prob(self, words, class_name):
        #P(x1)
        p_x = self.p_class[class_name]
        # P(w1, w2..wn|x1) = P(w1|x1) * P(w2|x1)...P(wn|x1)
        p_w_x = self.get_words_class_prob(words, class_name)
        return p_x * p_w_x
    
    def classify(self, sentence):
        words = jieba.lcut(sentence)
        results = []
        for class_name in self.p_class:
            # P(w1, w2..wn|x1) * P(x1)
            prob = self.get_class_prob(words, class_name)
            results.append([class_name, prob])
        results = sorted(results, key=lambda x:x[1], reverse=True)
        
        # P(w1, w2, w3...wn) = P(w1,w2..Wn|x1)*P(x1) + P(w1,w2..Wn|x2)*P(x2) ... P(w1,w2..Wn|xn)*P(xn)
        pw = sum([x[1] for x in results])
        results = [[c, prob/pw] for c, prob in results]
        
        for class_name, prob in results:
            print("classify[%s], prob[%s]"%(class_name, prob))
        return results
                
if __name__ == '__main__':
    path = "文本分类练习.csv"
    ba = BayesApproach(path)
    query = "味道很好，送餐快"
    result = ba.classify(query)
    print(result)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/44/bdwdgvls56gdjs1jzvrxf7q00000gn/T/jieba.cache
Loading model cost 0.292 seconds.
Prefix dict has been built successfully.


classify[0], prob[0.6663051639275882]
classify[1], prob[0.3336948360724118]
[['0', 0.6663051639275882], ['1', 0.3336948360724118]]


# svm 分类

In [2]:
import json
import jieba
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# 1 好评 0 差评
LABELS = {"1": 1, "0": 0}

#输入模型文件路径
#加载训练好的模型
def load_word2vec_model(path):
    model = Word2Vec.load(path)
    return model

#tag标签转化为类别标号
def label_to_label_index(labels):
    return [LABELS[y] for y in labels]

#文本向量化，使用了基于这些文本训练的词向量
def sentences_to_vectors(sentences, model):
    vectors = []
    for sentence in sentences:
        words = sentence.split()
        vector = np.zeros(model.vector_size)
        for word in words:
            try:
                vector += model.wv[word]
                # vector = np.max([vector, model.wv[word]], axis=0)
            except KeyError:
                vector += np.zeros(model.vector_size)
        vectors.append(vector / len(words))
    return np.array(vectors)

def load_sentence(path, model):
    sentences = []
    labels = []
    with open(path, encoding="utf-8") as file:
        csv_reader = csv.reader(file)
        next(csv_reader)
        for row in csv_reader:
            labels.append(row[0])
            sentences.append(" ".join(jieba.lcut(row[1])))
    train_x = sentences_to_vectors(sentences,model)
    train_y = label_to_label_index(labels)
    return train_x, train_y
            
            
def main():
    model = load_word2vec_model('model.w2v')
    train_x, train_y = load_sentence("文本分类练习.csv", model)
    classifier = SVC()
    classifier.fit(train_x, train_y)
    test_x, test_y = load_sentence("文本分类练习_test.csv", model)
    y_pred = classifier.predict(test_x)
    print(classification_report(test_y, y_pred))
    
if __name__ == "__main__":
    main()

              precision    recall  f1-score   support

           0       0.79      0.94      0.86      7987
           1       0.81      0.49      0.61      4000

    accuracy                           0.79     11987
   macro avg       0.80      0.72      0.74     11987
weighted avg       0.80      0.79      0.78     11987
