## 分词、去停用词

In [1]:
import multiprocessing
import jieba

# Directory that holds the raw splits; the tokenized copies are written next to them.
datapath = './mldata/'
# Input files and their output counterparts, paired by position (test, val, train).
file_list = [datapath + name for name in ('test.txt', 'val.txt', 'train.txt')]
write_list = [datapath + name for name in ('test_token.txt', 'val_token.txt', 'train_token.txt')]

def tokenFile(file_path, write_path):
    """Tokenize the text column of a tab-separated file with jieba.

    Each input line is expected to look like ``label<TAB>text``. For every
    such line the output file receives ``label<TAB>tokens``, where ``tokens``
    is ``str()`` of the list returned by ``jieba.lcut(text)``. Only the first
    two tab-separated fields of a line are used.

    Lines that contain no tab once surrounding whitespace is stripped (blank
    lines, or a label with no text) are skipped instead of aborting the whole
    file with an IndexError.

    Args:
        file_path: path of the UTF-8 input file.
        write_path: path of the UTF-8 output file; overwritten if it exists.
    """
    # Open the input first so a missing input file does not truncate an
    # existing output file.
    with open(file_path, 'r', encoding='utf-8') as f, \
            open(write_path, 'w', encoding='utf-8') as w:
        # Stream line by line rather than loading the whole file into memory.
        for line in f:
            fields = line.strip().split('\t')
            if len(fields) < 2:
                continue  # nothing to tokenize on this line
            label, text = fields[0], fields[1]
            token_sen = jieba.lcut(text)
            w.write(label + '\t' + str(token_sen) + '\n')
    print (file_path + ' has been token and token_file_name is ' + write_path)

# Tokenize the files concurrently.
# apply_async must receive the callable and its arguments separately; passing
# tokenFile(file_path, write_path) would run the call right here in the parent
# and submit its return value (None) as the task.
# A thread pool is used because functions defined in a notebook cell cannot be
# sent to process-pool workers on platforms that spawn fresh interpreters.
from multiprocessing.pool import ThreadPool

pool = ThreadPool(processes=4)
pending = [
    pool.apply_async(tokenFile, args=(file_path, write_path))
    for file_path, write_path in zip(file_list, write_list)
]
pool.close()
pool.join()  # close() must be called before join()
for result in pending:
    result.get()  # re-raise any exception that occurred inside a worker
print( "Sub-process(es) done.")


Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\szh\AppData\Local\Temp\jieba.cache
Loading model cost 0.790 seconds.
Prefix dict has been built successfully.


./mldata/test.txt has been token and token_file_name is ./mldata/test_token.txt
./mldata/val.txt has been token and token_file_name is ./mldata/val_token.txt
./mldata/train.txt has been token and token_file_name is ./mldata/train_token.txt
Sub-process(es) done.


## 文本向量化 tf-idf

In [2]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

def constructDataset(path):
    """
    Read a tab-separated file into two parallel lists.

    path: path of a UTF-8 text file whose lines look like ``label<TAB>text``
    rtype: (label_list, corpus_list) -- the first and second tab-separated
           field of every line; the second field keeps the line's trailing
           newline when it is the last field on the line
    """
    label_list, corpus_list = [], []
    with open(path, 'r', encoding='utf-8') as handle:
        for row in handle:
            columns = row.split('\t')
            label_list.append(columns[0])
            corpus_list.append(columns[1])
    return label_list, corpus_list

# Tokenized files, listed here in train / test / val order.
datapath = './mldata/'
write_list = [datapath + name for name in ('train_token.txt', 'test_token.txt', 'val_token.txt')]

train_path, test_path, val_path = write_list
train_label, train_set = constructDataset(train_path)
test_label, test_set = constructDataset(test_path)
val_label, val_set = constructDataset(val_path)

# Compute tf-idf over the whole corpus. Rows are ordered train, val, test.
# Note: vocabulary and idf weights are fitted on all three splits together.
corpus_set = train_set + val_set + test_set
print ("length of corpus is: " + str(len(corpus_set)))
# A float min_df is a document-frequency proportion (see the scikit-learn
# CountVectorizer docs): terms below 1e-5 are dropped as low-frequency words.
vectorizer = CountVectorizer(min_df=1e-5)
transformer = TfidfTransformer()
term_counts = vectorizer.fit_transform(corpus_set)
tfidf = transformer.fit_transform(term_counts)
words = vectorizer.get_feature_names_out()
print ("how many words: {0}".format(len(words)))
n_docs, n_terms = tfidf.shape
print ("tf-idf shape: ({0},{1})".format(n_docs, n_terms))


length of corpus is: 8660
how many words: 32848
tf-idf shape: (8660,32848)


## 构建训练集和测试集

In [3]:
from sklearn import preprocessing

# Split boundaries come from the real size of each loaded file rather than
# assumed 80/10/10 ratios, so labels and rows stay aligned whatever the
# proportions of the three files are.
# NOTE: this cell rebinds train_label/val_label/test_label and
# train_set/val_set/test_set, so the loading cell must be re-run before this
# one is run a second time.
train_end = len(train_label)
val_end = train_end + len(val_label)

# Encode the string labels as integers, in the same train, val, test order
# as the rows of the tf-idf matrix.
corpus_label = train_label + val_label + test_label
assert tfidf.shape[0] == len(corpus_label), "tf-idf rows and labels are misaligned"
encoder = preprocessing.LabelEncoder()
corpus_encode_label = encoder.fit_transform(corpus_label)
train_label = corpus_encode_label[:train_end]
val_label = corpus_encode_label[train_end:val_end]
test_label = corpus_encode_label[val_end:]

# Slice the tf-idf matrix at the same boundaries.
train_set = tfidf[:train_end]
val_set = tfidf[train_end:val_end]
test_set = tfidf[val_end:]
print(len(train_label))
print(len(val_label))
print(len(test_label))


6928
866
866


## 逻辑回归分类器

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Fit a logistic-regression classifier on the tf-idf training rows, report
# mean accuracy on the validation split, then a per-class report on test.
lr_model = LogisticRegression()
lr_model.fit(train_set, train_label)
print( "val mean accuracy: {0}".format(lr_model.score(val_set, val_label)))
y_pred = lr_model.predict(test_set)
# Take the class names from the fitted encoder so that name i always belongs
# to encoded label i, instead of hard-coding names in an assumed order.
print(classification_report(test_label, y_pred, target_names=list(encoder.classes_)))


val mean accuracy: 0.7829099307159353
              precision    recall  f1-score   support

          喜剧       0.81      0.93      0.87       541
          犯罪       0.84      0.65      0.73       325

    accuracy                           0.82       866
   macro avg       0.83      0.79      0.80       866
weighted avg       0.83      0.82      0.82       866



## 随机森林分类器

In [5]:
from sklearn.ensemble import RandomForestClassifier    

# Fit a 200-tree random forest (fixed seed) on the tf-idf training rows,
# report mean accuracy on the validation split, then a per-class report on test.
rf_model = RandomForestClassifier(n_estimators=200, random_state=1080)
rf_model.fit(train_set, train_label)
print( "val mean accuracy: {0}".format(rf_model.score(val_set, val_label)))
y_pred = rf_model.predict(test_set)
# Take the class names from the fitted encoder so that name i always belongs
# to encoded label i, instead of hard-coding names in an assumed order.
print(classification_report(test_label, y_pred, target_names=list(encoder.classes_)))

val mean accuracy: 0.7794457274826789
              precision    recall  f1-score   support

          喜剧       0.78      0.92      0.85       541
          犯罪       0.82      0.58      0.68       325

    accuracy                           0.79       866
   macro avg       0.80      0.75      0.76       866
weighted avg       0.80      0.79      0.78       866

