# Bayes_News_Classification.py
# coding: utf-8
# 1) Import the common processing packages
# ### How to install jieba: https://pypi.python.org/pypi/jieba/ ###
import pandas as pd
import numpy as np
import os
import jieba  # the jieba library must be installed first
os.chdir('/Users/a1/Desktop/算法实战/贝叶斯_新闻分类/贝叶斯-新闻分类/data')
# 2) Load the data
# pd.read_table docs: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_table.html
df_news = pd.read_table('val.txt', names=['category', 'theme', 'URL', 'content'], encoding='utf-8')
df_news = df_news.dropna()
df_news.head()
df_news.shape  # inspect the rough shape of the table
# 3) Use jieba to segment every content entry and store the token lists
content = df_news.content.values.tolist()  # pull one column out of the DataFrame as a plain Python list
print(content[1000])
content_S = []
for line in content:  # loop over every article and segment it
    current_segment = jieba.lcut(line)  # jieba.lcut() returns the tokens as a list
    if len(current_segment) > 1 and current_segment != ['\r\n']:  # keep entries with more than one token that are not just a line break
        content_S.append(current_segment)
content_S[1000]  # inspect the segmentation result
df_content = pd.DataFrame({'content_S': content_S})
df_content.head()
# 4) Start cleaning: remove the stopwords
# Load the stopword table
stopwords = pd.read_csv("stopwords.txt", index_col=False, sep="\t", quoting=3, names=['stopword'], encoding='utf-8')
stopwords
type(stopwords)
type(df_content)
# Define a helper. Input: the documents as a list of token lists, plus the stopword list.
# Output: the documents with stopwords removed, and a flat list of all kept words.
def drop_stopwords(contents, stopwords):
    contents_clean = []
    all_words = []
    for line in contents:
        line_clean = []
        for word in line:
            if word in stopwords:  # a set would make this lookup much faster
                continue
            line_clean.append(word)
            all_words.append(str(word))
        contents_clean.append(line_clean)
    return contents_clean, all_words  # mind the indentation
contents = df_content.content_S.values.tolist()
stopwords = stopwords.stopword.values.tolist()
contents_clean,all_words = drop_stopwords(contents,stopwords)
# First cleaning step done: every word that appears in the stopword table has been dropped from contents
df_content = pd.DataFrame({'contents_clean': contents_clean})  # wrap the cleaned lists in a dict and build a DataFrame
df_content.head()
df_all_words = pd.DataFrame({'all_words': all_words})  # all kept words, flattened into a single column
df_all_words
# 5) Count word frequencies (groupby & matplotlib)
# groupby docs: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.groupby.html
# note: dict-style .agg({'count': numpy.size}) was removed in pandas 1.0, so use groupby().size()
words_count = df_all_words.groupby('all_words').size().reset_index(name='count')
words_count = words_count.sort_values(by=['count'], ascending=False)
words_count.head()
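# A hedged aside: value_counts() builds the same frequency table in one call;
# the name words_count_alt is my own, not from the original script.
words_count_alt = df_all_words['all_words'].value_counts().reset_index()
words_count_alt.columns = ['all_words', 'count']  # normalize column names across pandas versions
words_count_alt.head()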
# 6) Visualize with a word cloud
# wordcloud library: https://github.com/amueller/word_cloud
from wordcloud import WordCloud  # the wordcloud package must be installed first
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib
# plotting configuration: https://matplotlib.org/users/customizing.html
matplotlib.rcParams['figure.figsize'] = [10.0, 8.0]
wordcloud = WordCloud(font_path='simhei.ttf', background_color="white", max_font_size=80)
word_frequency = {x[0]: x[1] for x in words_count.head(100).values}  # plot the 100 most frequent words
wordcloud = wordcloud.fit_words(word_frequency)
plt.imshow(wordcloud)
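# (Not in the original) hiding the axes makes the word cloud render more cleanly;
# both calls are standard matplotlib:
plt.axis('off')
plt.show()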
### 7) Extract keywords with TF-IDF
import jieba.analyse
# Print a raw, uncleaned article
index = 2400
print(df_news['content'][index])
# Then print the cleaned version of the same article,
# join its tokens back into one string, and extract keywords with jieba
print(df_content['contents_clean'][index])
print("--------------------------------------------------")
content_S_str = "".join(contents_clean[index])
print(" ".join(jieba.analyse.extract_tags(content_S_str, topK=5, withWeight=False)))
### 8) LDA topic model
# Input format: the segmented corpus as a list of token lists
# Goal: discover the topics running through the articles; this works like an unsupervised model
# gensim covers the NLP side; official docs: https://radimrehurek.com/gensim/
from gensim import corpora, models, similarities
import gensim
# First build the mapping table (dictionary), which acts as the bag-of-words vocabulary
dictionary = corpora.Dictionary(contents_clean)
corpus = [dictionary.doc2bow(sentence) for sentence in contents_clean]
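# A hedged peek at the format: each corpus entry is a bag-of-words, i.e. a list
# of (token_id, count) pairs for one document.
print(corpus[0][:5])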
# gensim's LDA model; a walkthrough: https://blog.csdn.net/angela2016/article/details/78208754
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=20)  # like K-means, you choose the number of topics yourself
# Topic no. 1 (out of the 20 topics), showing its top 5 words
print(lda.print_topic(1, topn=5))
# The top 5 words for each of the 20 topics
for topic in lda.print_topics(num_topics=20, num_words=5):
    print(topic[1])
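# A hedged sketch of per-document inference: get_document_topics is a standard
# gensim LdaModel method returning (topic_id, probability) pairs for one document.
doc_topics = lda.get_document_topics(corpus[0])
print(sorted(doc_topics, key=lambda t: -t[1])[:3])  # the 3 most likely topics for document 0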
### 9) Classify the news with a naive Bayes classifier
# Side reading on "if __name__ == '__main__'": http://blog.konghy.cn/2017/04/24/python-entry-program/
# 1) Build a labeled DataFrame from the cleaned contents and the original categories
df_train = pd.DataFrame({'contents_clean': contents_clean, 'label': df_news['category']})
df_train
# 2) See how many distinct labels there are
df_train.label.unique()
# 3) sklearn cannot work with these text labels directly, so map them to numbers with a dict
label_mapping = {"汽车": 1, "财经": 2, "科技": 3, "健康": 4, "体育": 5, "教育": 6, "文化": 7, "军事": 8, "娱乐": 9, "时尚": 0}
# Apply the mapping with map()
df_train['label'] = df_train['label'].map(label_mapping)
df_train.head()
# 4) Split the data into training and test sets with sklearn
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df_train['contents_clean'].values, df_train['label'].values, random_state=1)
# 5) Join each training document's token list back into one space-separated string,
# because sklearn's vectorizers expect one string per document
# join() usage: https://blog.csdn.net/weixin_40475396/article/details/78227747
words = []
for line_index in range(len(x_train)):
    try:
        words.append(' '.join(x_train[line_index]))  # words holds the training documents
    except Exception:
        print(line_index)
words[0]
print(len(words))
# 6) With the documents in words in the standard format, build the feature vectors
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer(analyzer='word', max_features=4000, lowercase=False)
# Fit the vectorizer on the training documents
vec.fit(words)
# 7) Bring in naive Bayes from sklearn; it takes the input features plus the label values
from sklearn.naive_bayes import MultinomialNB
# Instantiate the classifier
classifier = MultinomialNB()
# Train it on the vectorized training documents
classifier.fit(vec.transform(words), y_train)
# 8) The test set gets the same three steps: join the tokens into strings,
# turn the strings into feature vectors, then hand vectors and labels to the model
test_words = []
for line_index in range(len(x_test)):
    try:
        test_words.append(' '.join(x_test[line_index]))
    except Exception:
        print(line_index)
test_words[0]
# 9) Accuracy of naive Bayes on the count-based (word-frequency) vectors
classifier.score(vec.transform(test_words), y_test)
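# A hedged sketch going beyond one accuracy number: classification_report is a
# standard sklearn helper that breaks the result down per class (precision/recall/F1).
from sklearn.metrics import classification_report
y_pred = classifier.predict(vec.transform(test_words))
print(classification_report(y_test, y_pred))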
# 10) An alternative vectorization: TF-IDF weights instead of raw counts; the final score comes out slightly better
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(analyzer='word', max_features=4000, lowercase=False)
vectorizer.fit(words)
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(vectorizer.transform(words), y_train)
# 11) Accuracy of the naive Bayes classifier on the TF-IDF vectors
classifier.score(vectorizer.transform(test_words), y_test)
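# A hedged alternative wiring, not from the original script: an sklearn Pipeline
# bundles the vectorizer and the classifier so that fit/score run both steps;
# the step names 'tfidf' and 'nb' are my own choice.
from sklearn.pipeline import Pipeline
pipe = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer='word', max_features=4000, lowercase=False)),
    ('nb', MultinomialNB()),
])
pipe.fit(words, y_train)
print(pipe.score(test_words, y_test))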
# 12) Extra background: the input format for building feature vectors
# A small example first
from sklearn.feature_extraction.text import CountVectorizer
# Four short "articles"
# Format: a list of documents, one string per document, with the words inside each
# document separated by spaces; don't pass the whole corpus as a single string
texts = ["dog cat fish", "dog cat cat", "fish bird", 'bird']
# Instantiate a vectorizer
cv = CountVectorizer()
# Fit it and transform the texts in one step
cv_fit = cv.fit_transform(texts)
# The vocabulary: 4 distinct words in this corpus
print(cv.get_feature_names())  # renamed to get_feature_names_out() in scikit-learn >= 1.0
# The transformed vectors. Reading the first row: 'bird' appears 0 times in the first
# document, so the first value is 0; 'cat' appears once, so that value is 1; and so on
print(cv_fit.toarray())
print(cv_fit.toarray().sum(axis=0))
from sklearn.feature_extraction.text import CountVectorizer
texts = ["dog cat fish", "dog cat cat", "fish bird", 'bird']
# ngram_range makes the vectorizer emit word combinations too, giving more and richer
# features: the vector grows from the original 4 dimensions to 10 here. Don't go
# overboard, though; an ngram upper bound of 2 is usually enough.
cv = CountVectorizer(ngram_range=(1, 4))
cv_fit = cv.fit_transform(texts)
print(cv.get_feature_names())
print(cv_fit.toarray())
print(cv_fit.toarray().sum(axis=0))
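# A hedged tie-back to the classifier above: ngram_range works the same way in
# TfidfVectorizer, so the news model could also be tried with word bigrams, e.g.:
# vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), max_features=4000, lowercase=False)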