-
Notifications
You must be signed in to change notification settings - Fork 0
/
cluster_for_data.py
119 lines (97 loc) · 3.25 KB
/
cluster_for_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#! /usr/bin/python
# -*- coding: utf-8 -*-
from glob import glob
import io
import json
import math

import jieba
from gensim import corpora, models, similarities, matutils
from numpy import *
def loadData(filelist):
    """Load parsed titles from a list of JSON-lines files.

    Every line of every input file must be a JSON object with a
    'parse_title' key.

    filelist -- iterable of file paths to read.
    Returns a list of the 'parse_title' values, in file/line order.
    """
    data = []
    for fileName in filelist:
        with open(fileName) as f:
            for line in f:
                record = json.loads(line)
                data.append(record['parse_title'])
    return data
def getStopWords():
    """Read the stop-word list from 'stopwords.txt' (one word per line).

    The file is decoded as UTF-8 and each line is stripped of
    surrounding whitespace (including the trailing newline).
    Returns a list of unicode stop words.
    """
    # io.open decodes to unicode text on both Python 2 and 3, replacing
    # the Python-2-only str.decode('utf-8'); 'with' closes the file
    # handle the original version leaked.
    with io.open("stopwords.txt", "r", encoding="utf-8") as f:
        return [word.strip() for word in f]
def cutContent(content, stopwords):
    """Segment *content* with jieba and drop unwanted tokens.

    content   -- text to tokenize.
    stopwords -- collection of words to exclude.
    Returns the list of kept tokens, in segmentation order.
    """
    # Keep every segmented token except the bare-space tokens jieba
    # emits and anything on the stop-word list.
    return [token for token in jieba.cut(content)
            if token != u' ' and token not in stopwords]
def getMaxSimilarity(dictTopic, vector):
    """Find the existing topic most similar to *vector*.

    A topic's similarity is the mean cosine similarity between *vector*
    and every vector already assigned to that topic.

    dictTopic -- dict mapping topic id -> list of sparse vectors.
    vector    -- sparse (gensim bag-of-words / tfidf) vector.
    Returns (topic_id, similarity); (-1, 0) when dictTopic is empty or
    no topic has positive similarity.
    """
    maxValue = 0
    maxIndex = -1
    # .items() replaces the Python-2-only iteritems() so the function
    # also runs under Python 3.
    for topicId, cluster in dictTopic.items():
        oneSimilarity = mean([matutils.cossim(vector, v) for v in cluster])
        if oneSimilarity > maxValue:
            maxValue = oneSimilarity
            maxIndex = topicId
    return maxIndex, maxValue
def single_pass(corpus, titles, thres):
    """Cluster documents with the single-pass incremental algorithm.

    Each vector either joins the existing topic it is most similar to
    (mean cosine similarity > thres) or starts a new topic.

    corpus -- iterable of sparse (gensim-style) vectors.
    titles -- iterable of titles, parallel to corpus.
    thres  -- similarity threshold for joining an existing topic.
    Returns (dictTopic, clusterTopic): topic id -> list of vectors and
    topic id -> list of titles, respectively.
    """
    dictTopic = {}
    clusterTopic = {}
    numTopic = 0
    cnt = 0
    for vector, title in zip(corpus, titles):
        # The very first document has nothing to be compared against.
        if dictTopic:
            maxIndex, maxValue = getMaxSimilarity(dictTopic, vector)
        else:
            maxIndex, maxValue = -1, 0
        if maxIndex != -1 and maxValue > thres:
            # Similar enough: join the closest existing topic.
            dictTopic[maxIndex].append(vector)
            clusterTopic[maxIndex].append(title)
        else:
            # Otherwise open a new topic.  The maxIndex != -1 guard
            # above also removes a latent KeyError on dictTopic[-1]
            # when no topic had positive similarity.
            dictTopic[numTopic] = [vector]
            clusterTopic[numTopic] = [title]
            numTopic += 1
        cnt += 1
        if cnt % 1000 == 0:
            # Single-argument print() call form runs on both Py2 and Py3.
            print("processing {}".format(cnt))
    return dictTopic, clusterTopic
if __name__ == '__main__':
    # Load every news dump file and keep only the parsed titles.
    filelist = glob('../mobilenews/*.dat')
    datMat = loadData(filelist)
    print(type(datMat))
    stopWords = getStopWords()
    n = len(datMat)
    print('total records: {}'.format(n))
    # Tokenize every title, dropping stop words.
    cutData = [cutContent(title, stopWords) for title in datMat]
    print('cutData is done')
    # Build the vector space model: bag-of-words then tf-idf weighting.
    dictionary = corpora.Dictionary(cutData)
    corpus = [dictionary.doc2bow(title) for title in cutData]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    thres = 0.5  # minimum mean cosine similarity to join an existing topic
    dictTopic, clusterTopic = single_pass(corpus_tfidf, datMat, thres)
    print("num of Topic: {}".format(len(dictTopic)))
    for k, v in clusterTopic.items():
        # Encode for Python 2's byte-oriented stdout; NOTE(review): on
        # Python 3 this prints a bytes repr -- revisit if porting.
        cluster_title = '\t'.join(v).encode('utf-8')
        print("cluster idx:{} --- {}".format(k, cluster_title))