In [24]:
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from pprint import pprint
from time import time
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from cnews_loader import *

In [25]:
# 设置数据读取、模型、结果保存路径
base_dir = '/cnews'
train_dir = os.path.join(base_dir, 'cnews.train.txt')
test_dir = os.path.join(base_dir, 'cnews.test.txt')
val_dir = os.path.join(base_dir, 'cnews.val.txt')
vocab_dir = os.path.join(base_dir, 'cnews.vocab.txt')
save_dir = os.path.join('checkpoints', 'textcnn')
save_path = os.path.join(save_dir, 'best_validation')

In [26]:
train_contents, train_labels = read_file('./cnews/cnews.train.txt')
test_contents, test_labels = read_file('./cnews/cnews.test.txt')
val_counts = Counter(train_labels)
val_counts

Counter({'体育': 5000,
         '娱乐': 5000,
         '家居': 5000,
         '房产': 5000,
         '教育': 5000,
         '时尚': 5000,
         '时政': 5000,
         '游戏': 5000,
         '科技': 5000,
         '财经': 5000})

In [38]:
import re
#去除文本中的表情字符（只保留中英文和数字）
def clear_character(sentence):
    pattern1= '\[.*?\]'     
    pattern2 = re.compile('[^\u4e00-\u9fa5^a-z^A-Z^0-9]')   
    line1=re.sub(pattern1,'',sentence)
    line2=re.sub(pattern2,'',line1)   
    new_sentence=''.join(line2.split()) #去除空白
    return new_sentence

# train_text=list(map(lambda s: clear_character(s), train_contents))
train_text = list(map(lambda s: clear_character(str(s)), train_contents))
test_text=list(map(lambda s: clear_character(str(s)), test_contents))

TypeError: expected string or bytes-like object

In [28]:
import jieba
train_seg_text=list(map(lambda s: jieba.lcut(s), train_text))
test_seg_text=list(map(lambda s: jieba.lcut(s), test_text))

In [29]:
# 读取停用词
stop_words_path = "./百度停用词列表.txt"
def get_stop_words():
    file = open(stop_words_path, 'rb').read().decode('gbk').split('\r\n')
    return set(file)
stopwords = get_stop_words()

# 去掉文本中的停用词
def drop_stopwords(line, stopwords):
    line_clean = []
    for word in line:
        if word in stopwords:
            continue
        line_clean.append(word)
    return line_clean

train_st_text=list(map(lambda s: drop_stopwords(s,stopwords), train_seg_text))
test_st_text=list(map(lambda s: drop_stopwords(s,stopwords), test_seg_text))

In [30]:
le = LabelEncoder()
le.fit(train_labels)
label_train_id=le.transform(train_labels)
label_test_id=le.transform(test_labels)

In [31]:
train_c_text=list(map(lambda s: ' '.join(s), train_st_text))
test_c_text=list(map(lambda s: ' '.join(s), test_st_text))
tfidf_model = TfidfVectorizer(binary=False,token_pattern=r"(?u)\b\w+\b")
train_Data = tfidf_model.fit_transform(train_c_text)
test_Data = tfidf_model.transform(test_c_text)


In [32]:
from sklearn.linear_model import LogisticRegression
'''LR模型分类训练'''
classifier=LogisticRegression()
classifier.fit(train_Data, label_train_id)
pred = classifier.predict(test_Data)
from sklearn.metrics import classification_report
print(classification_report(label_test_id, pred,digits=4))


              precision    recall  f1-score   support

           0     1.0000    0.0450    0.0861      1000
           1     0.8235    0.0140    0.0275      1000
           2     0.1047    0.9890    0.1894      1000
           3     0.7778    0.0070    0.0139      1000
           4     1.0000    0.0030    0.0060      1000
           5     0.0000    0.0000    0.0000      1000
           6     0.8333    0.0050    0.0099      1000
           7     0.6930    0.0790    0.1418      1000
           8     0.9718    0.3450    0.5092      1000
           9     0.8000    0.0040    0.0080      1000

    accuracy                         0.1491     10000
   macro avg     0.7004    0.1491    0.0992     10000
weighted avg     0.7004    0.1491    0.0992     10000



In [33]:
parameters = {
    'tfidf__max_df': (0.75,),
#     'tfidf__stop_words':('english',stopwords),
    'tfidf__norm':('l2',),
    'tfidf__use_idf':(True,),
    'tfidf__smooth_idf':(True,),
    'tfidf__max_features':(None,),
#     'tfidf__ngram_range':((1, 1), (1, 2),(2, 2)),  # unigrams or bigrams

#     'clf__max_iter': (20,),
    'clf__penalty': ('l1','l2'),
    # 'clf__tol': (0.0001,0.00001,0.000001),
    'clf__solver': ( 'liblinear','saga',),
}
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")),
    ('clf', LogisticRegression()),
])

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(train_c_text, label_train_id)
print("done in %0.3fs" % (time() - t0))
print()


Performing grid search...
pipeline: ['tfidf', 'clf']
parameters:
{'clf__penalty': ('l1', 'l2'),
 'clf__solver': ('liblinear', 'saga'),
 'tfidf__max_df': (0.75,),
 'tfidf__max_features': (None,),
 'tfidf__norm': ('l2',),
 'tfidf__smooth_idf': (True,),
 'tfidf__use_idf': (True,)}
Fitting 5 folds for each of 4 candidates, totalling 20 fits
done in 14.643s



In [34]:
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


Best score: 0.118
Best parameters set:
	clf__penalty: 'l2'
	clf__solver: 'liblinear'
	tfidf__max_df: 0.75
	tfidf__max_features: None
	tfidf__norm: 'l2'
	tfidf__smooth_idf: True
	tfidf__use_idf: True


In [35]:
parameters = {
    'tfidf__max_df': (0.75,),
#     'tfidf__stop_words':('english',stopwords),
    'tfidf__norm':('l2',),
    'tfidf__use_idf':(True,),
    'tfidf__smooth_idf':(True,),
    'tfidf__max_features':(None,),
    # 'tfidf__ngram_range':((1, 1), (1, 2),(2, 2)),  # unigrams or bigrams

#     'clf__max_iter': (20,),
    'clf__penalty': ('l2',),
    'clf__C':(0.8,0.9,1.0,1.1,),
    'clf__tol': (0.001,0.0001,0.00001,0.000001,),
    'clf__solver': ( 'liblinear',),
}
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")),
    ('clf', LogisticRegression()),
])

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(train_c_text, label_train_id)
print("done in %0.3fs" % (time() - t0))
print()


Performing grid search...
pipeline: ['tfidf', 'clf']
parameters:
{'clf__C': (0.8, 0.9, 1.0, 1.1),
 'clf__penalty': ('l2',),
 'clf__solver': ('liblinear',),
 'clf__tol': (0.001, 0.0001, 1e-05, 1e-06),
 'tfidf__max_df': (0.75,),
 'tfidf__max_features': (None,),
 'tfidf__norm': ('l2',),
 'tfidf__smooth_idf': (True,),
 'tfidf__use_idf': (True,)}
Fitting 5 folds for each of 16 candidates, totalling 80 fits
done in 15.022s



In [36]:
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


Best score: 0.118
Best parameters set:
	clf__C: 1.0
	clf__penalty: 'l2'
	clf__solver: 'liblinear'
	clf__tol: 0.001
	tfidf__max_df: 0.75
	tfidf__max_features: None
	tfidf__norm: 'l2'
	tfidf__smooth_idf: True
	tfidf__use_idf: True


In [37]:
'''LR模型分类训练'''
classifier=LogisticRegression(C=1.1)
classifier.fit(train_Data, label_train_id)
pred = classifier.predict(test_Data)
from sklearn.metrics import classification_report
print(classification_report(label_test_id, pred,digits=4))


              precision    recall  f1-score   support

           0     1.0000    0.0450    0.0861      1000
           1     0.8235    0.0140    0.0275      1000
           2     0.1047    0.9890    0.1894      1000
           3     0.7778    0.0070    0.0139      1000
           4     1.0000    0.0030    0.0060      1000
           5     0.0000    0.0000    0.0000      1000
           6     0.8333    0.0050    0.0099      1000
           7     0.6930    0.0790    0.1418      1000
           8     0.9718    0.3450    0.5092      1000
           9     0.8000    0.0040    0.0080      1000

    accuracy                         0.1491     10000
   macro avg     0.7004    0.1491    0.0992     10000
weighted avg     0.7004    0.1491    0.0992     10000

