In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping
## 设置字体
from matplotlib.font_manager import FontProperties
fonts = FontProperties(fname = "/Library/Fonts/华文细黑.ttf",size=14)
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
train_df = pd.read_table('F:/DataSet/cnews/cnews.train.txt',names=['label','text'],encoding='utf-8')
train_df = train_df.dropna()
test_df = pd.read_table('F:/DataSet/cnews/cnews.test.txt',names=['label','text'],encoding='utf-8')
test_df = test_df.dropna()

In [4]:
train_df.head()

Unnamed: 0,label,text
0,体育,马晓旭意外受伤让国奥警惕 无奈大雨格外青睐殷家军记者傅亚雨沈阳报道 来到沈阳，国奥队依然没有...
1,体育,商瑞华首战复仇心切 中国玫瑰要用美国方式攻克瑞典多曼来了，瑞典来了，商瑞华首战求3分的信心也...
2,体育,冠军球队迎新欢乐派对 黄旭获大奖张军赢下PK赛新浪体育讯12月27日晚，“冠军高尔夫球队迎新...
3,体育,辽足签约危机引注册难关 高层威逼利诱合同笑里藏刀新浪体育讯2月24日，辽足爆发了集体拒签风波...
4,体育,揭秘谢亚龙被带走：总局电话骗局 复制南杨轨迹体坛周报特约记者张锐北京报道 谢亚龙已经被公安...


In [6]:
content_train = train_df.text.values.tolist()
content_test = test_df.text.values.tolist()

In [8]:
import jieba
content_S_train = []
for line in content_train:
    current_segment = jieba.lcut(line)
    if len(current_segment) > 1 and current_segment != '\r\n':
        content_S_train.append(current_segment)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 1.228 seconds.
Prefix dict has been built succesfully.


In [9]:
content_S_test = []
for line in content_test:
    current_segment = jieba.lcut(line)
    if len(current_segment) > 1 and current_segment != '\r\n':
        content_S_test.append(current_segment)

In [10]:
df_content_train=pd.DataFrame({'cutword':content_S_train})
df_content_train.head()

Unnamed: 0,cutword
0,"[马晓旭, 意外, 受伤, 让, 国奥, 警惕, , 无奈, 大雨, 格外, 青睐, 殷家..."
1,"[商瑞华, 首战, 复仇, 心切, , 中国, 玫瑰, 要, 用, 美国, 方式, 攻克,..."
2,"[冠军, 球队, 迎新, 欢乐, 派对, , 黄旭获, 大奖, 张军, 赢, 下, PK,..."
3,"[辽足, 签约, 危机, 引, 注册, 难关, , 高层, 威逼利诱, 合同, 笑里藏刀,..."
4,"[揭秘, 谢亚龙, 被, 带走, ：, 总局, 电话, 骗局, , 复制, 南杨, 轨迹,..."


In [11]:
df_content_test=pd.DataFrame({'cutword':content_S_test})
df_content_test.head()

Unnamed: 0,cutword
0,"[鲍勃, 库西, 奖归, 谁, 属, ？, , NCAA, 最强, 控卫, 是, 坎巴, ..."
1,"[麦基, 砍, 28, +, 18, +, 5, 却, 充满, 寂寞, , 纪录, 之夜,..."
2,"[黄蜂, vs, 湖人, 首发, ：, 科比, 冲击, 七, 连胜, , 火箭, 两旧, ..."
3,"[双面, 谢亚龙, 作秀, 终成, 做作, , 谁, 来, 为, 低劣, 行政, 能力, ..."
4,"[兔年, 首战, 山西, 换帅, 后, 有, 虎胆, , 张学文, 用, 乔丹, 名言, ..."


In [12]:
stopwords=pd.read_csv("F:/PythonClassCode/stopwords.txt",index_col=False,sep="\t",quoting=3,names=['stopword'],encoding='utf-8')

In [14]:
def drop_stopwords(contents,stopwords):
    contents_clean = []
    all_words = []
    for line in contents:
        line_clean = []
        for word in line:
            if word in stopwords:
                continue
            line_clean.append(word)
            all_words.append(str(word))
        contents_clean.append(line_clean)
    return contents_clean,all_words

contents_train = df_content_train.cutword.values.tolist()
stopwords = stopwords.stopword.values.tolist()
contents_clean_train,all_words_train = drop_stopwords(contents_train,stopwords)

contents_test = df_content_test.cutword.values.tolist()
contents_clean_test,all_words_test = drop_stopwords(contents_test,stopwords)

In [15]:
df_content_train = pd.DataFrame({'contents_clean_train':contents_clean_train})
df_content_test = pd.DataFrame({'contents_clean_test':contents_clean_test})

In [16]:
df_all_words_train = pd.DataFrame({'all_words_train':all_words_train})
df_all_words_test = pd.DataFrame({'all_words_test':all_words_test})

In [17]:
train_df = pd.DataFrame({'label':train_df['label'],'cutword':contents_clean_train})

In [18]:
test_df = pd.DataFrame({'label':test_df['label'],'cutword':contents_clean_test})

In [19]:
train_df.head()

Unnamed: 0,label,cutword
0,体育,"[马晓旭, 意外, 受伤, 国奥, 警惕, , 无奈, 大雨, 青睐, 殷家, 军, 傅亚..."
1,体育,"[商瑞华, 首战, 复仇, 心切, , 中国, 玫瑰, 美国, 方式, 攻克, 瑞典, 多..."
2,体育,"[冠军, 球队, 迎新, 欢乐, 派对, , 黄旭获, 大奖, 张军, 赢, PK, 赛,..."
3,体育,"[辽足, 签约, 危机, 引, 注册, 难关, , 高层, 威逼利诱, 合同, 笑里藏刀,..."
4,体育,"[揭秘, 谢亚龙, 带走, 总局, 电话, 骗局, , 复制, 南杨, 轨迹, 体坛周报,..."


In [4]:
train_df = pd.read_csv(r'F:\DataSet\cnews\cnews_train_cut.csv')
test_df = pd.read_csv(r'F:\DataSet\cnews\cnews_test_cut.csv')

In [5]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,classification,content,content_short
0,0,体育,马晓旭意外受伤让国奥警惕 无奈大雨格外青睐殷家军记者傅亚雨沈阳报道 来到沈阳，国奥队依然没有...,"['马晓旭', '意外', '受伤', '让', '国奥', '警惕', '大雨', '格外..."
1,1,体育,商瑞华首战复仇心切 中国玫瑰要用美国方式攻克瑞典多曼来了，瑞典来了，商瑞华首战求3分的信心也...,"['商瑞华', '首战', '复仇', '心切', '玫瑰', '要', '用', '美国'..."
2,2,体育,冠军球队迎新欢乐派对 黄旭获大奖张军赢下PK赛新浪体育讯12月27日晚，“冠军高尔夫球队迎新...,"['冠军', '球队', '迎新', '欢乐', '派对', '获', '大奖', '张军'..."
3,3,体育,辽足签约危机引注册难关 高层威逼利诱合同笑里藏刀新浪体育讯2月24日，辽足爆发了集体拒签风波...,"['辽', '足', '签约', '危机', '引', '注册', '难关', '威逼利诱'..."
4,4,体育,揭秘谢亚龙被带走：总局电话骗局 复制南杨轨迹体坛周报特约记者张锐北京报道? 谢亚龙已经被公安...,"['揭秘', '谢亚龙', '被', '带走', '电话', '骗局', '南杨', '轨迹..."


In [6]:
train_df = pd.DataFrame({'label':train_df['classification'],'cutword':train_df['content_short']})
train_df.head()

Unnamed: 0,label,cutword
0,体育,"['马晓旭', '意外', '受伤', '让', '国奥', '警惕', '大雨', '格外..."
1,体育,"['商瑞华', '首战', '复仇', '心切', '玫瑰', '要', '用', '美国'..."
2,体育,"['冠军', '球队', '迎新', '欢乐', '派对', '获', '大奖', '张军'..."
3,体育,"['辽', '足', '签约', '危机', '引', '注册', '难关', '威逼利诱'..."
4,体育,"['揭秘', '谢亚龙', '被', '带走', '电话', '骗局', '南杨', '轨迹..."


In [7]:
test_df = pd.DataFrame({'label':test_df['classification'],'cutword':test_df['content_short']})
test_df.head()

Unnamed: 0,label,cutword
0,体育,"['鲍勃', '库西', '奖', '归', '谁', '属', 'NCAA', '最强',..."
1,体育,"['麦基', '砍', '28', '却', '充满', '寂寞', '之', '夜', '..."
2,体育,"['黄蜂', 'vs', '湖人', '首发', '冲击', '七', '连胜', '两',..."
3,体育,"['双面', '谢亚龙', '作秀', '终成', '做作', '来', '为', '低劣'..."
4,体育,"['兔年', '首战', '山西', '换帅', '后', '有', '虎胆', '用', ..."


In [8]:
train_y = train_df.label
test_y = test_df.label
le = LabelEncoder()
train_y = le.fit_transform(train_y).reshape(-1,1)
test_y = le.transform(test_y).reshape(-1,1)

## 对数据集的标签数据进行one-hot编码
ohe = OneHotEncoder()
train_y = ohe.fit_transform(train_y).toarray()
test_y = ohe.transform(test_y).toarray()

In [9]:
max_words = 5000
max_len = 600
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(train_df.cutword)

for ii,iterm in enumerate(tok.word_index.items()):
    if ii < 10:
        print(iterm)
    else:
        break
print("===================")  
for ii,iterm in enumerate(tok.word_counts.items()):
    if ii < 10:
        print(iterm)
    else:
        break

("'的'", 1)
("'在'", 2)
("'了'", 3)
("'是'", 4)
("'月'", 5)
("'和'", 6)
("'日'", 7)
("'中'", 8)
("'有'", 9)
("'将'", 10)
("'马晓旭'", 1)
("'意外'", 414)
("'受伤'", 600)
("'让'", 7314)
("'国奥'", 32)
("'警惕'", 76)
("'大雨'", 19)
("'格外'", 197)
("'青睐'", 314)
("'殷家'", 1)


In [10]:
train_seq = tok.texts_to_sequences(train_df.cutword)
test_seq = tok.texts_to_sequences(test_df.cutword)
## 将每个序列调整为相同的长度
train_seq_mat = sequence.pad_sequences(train_seq,maxlen=max_len)
test_seq_mat = sequence.pad_sequences(test_seq,maxlen=max_len)

print(train_seq_mat.shape)
print(test_seq_mat.shape)

(50000, 600)
(10000, 600)


In [11]:
inputs = Input(name='inputs',shape=[max_len])
## Embedding(词汇表大小,batch大小,每个新闻的词长)
layer = Embedding(max_words+1,128,input_length=max_len)(inputs)
layer = LSTM(128)(layer)
layer = Dense(128,activation="relu",name="FC1")(layer)
layer = Dropout(0.5)(layer)
layer = Dense(10,activation="softmax",name="FC2")(layer)
model = Model(inputs=inputs,outputs=layer)
model.summary()
model.compile(loss="categorical_crossentropy",optimizer=RMSprop(),metrics=["accuracy"])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 600)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 600, 128)          640128    
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
FC1 (Dense)                  (None, 128)               16512     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
FC2 (Dense)                  (None, 10)                1290      
Total params: 789,514
Trainable params: 789,514
Non-trainable params: 0
_________________________________________________________________


In [14]:
model_fit = model.fit(train_seq_mat,train_y,batch_size=128,epochs=10,                      
                      callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)] 
                     )

Epoch 1/10
10368/50000 [=====>........................] - ETA: 27:36 - loss: 1.7020 - acc: 0.3165

KeyboardInterrupt: 

In [2]:
## 对测试集进行预测
test_pre = model.predict(test_seq_mat)

## 评价预测效果，计算混淆矩阵
confm = metrics.confusion_matrix(np.argmax(test_pre,axis=1),np.argmax(test_y,axis=1))
## 混淆矩阵可视化
Labname = ["体育","娱乐","家居","房产","教育","时尚","时政","游戏","科技","财经"]
plt.figure(figsize=(8,8))
sns.heatmap(confm.T, square=True, annot=True,
            fmt='d', cbar=False,linewidths=.8,
            cmap="YlGnBu")
plt.xlabel('True label',size = 14)
plt.ylabel('Predicted label',size = 14)
plt.xticks(np.arange(10)+0.5,Labname,fontproperties = fonts,size = 12)
plt.yticks(np.arange(10)+0.3,Labname,fontproperties = fonts,size = 12)
plt.show()


print(metrics.classification_report(np.argmax(test_pre,axis=1),np.argmax(test_y,axis=1)))


NameError: name 'model' is not defined