forked from svakulenk0/tweet2vec_clustering
-
Notifications
You must be signed in to change notification settings - Fork 0
/
clustering_pipeline.py
136 lines (116 loc) · 4.38 KB
/
clustering_pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Created on Oct 6, 2016
.. codeauthor: svitlana vakulenko
<svitlana.vakulenko@gmail.com>
Pipeline to create Tweet2Vec embeddings and cluster them
Dependencies:
* pandas
* fastcluster
* theano
* lasagne
'''
import os
import numpy as np
from collections import Counter
import csv
import fastcluster
from scipy.cluster.hierarchy import fcluster
from sklearn import preprocessing
from sklearn.metrics import pairwise
from tweet2vec.encode_char import save_embeddings
def hierarchical_clustering(embeddings, distance_metric='euclidean', max_d=1.0):
    '''
    Groups similar vector-embeddings into event-clusters.

    The embeddings are cast to integers, passed through
    ``preprocessing.scale`` and ``preprocessing.normalize`` (L2 norm) and
    clustered with average-linkage agglomerative clustering. The resulting
    hierarchy is cut into flat clusters at distance ``max_d``.

    :param embeddings: 2-D array-like with one embedding vector per row.
    :param distance_metric: metric name handed to ``fastcluster.linkage``.
    :param max_d: threshold handed to ``fcluster`` with
        ``criterion='distance'``.
    :return: the flat cluster ids produced by ``fcluster``; the caller
        expects one id per input row.
    '''
    # Original author's note: "precision flaw fix by cast to integers".
    # NOTE(review): the cast drops the fractional part of every component;
    # confirm this is still intended for float-valued embeddings.
    a = np.int64(embeddings)

    # Diagnostic only: count the distinct rows. Each row is viewed as one
    # opaque (void) item so that np.unique compares whole rows at once.
    row_view = np.ascontiguousarray(a).view(
        np.dtype((np.void, a.dtype.itemsize * a.shape[1])))
    _, idx = np.unique(row_view, return_index=True)
    unique_a = a[idx]
    print('Unique tweet vectors: %s' % (unique_a.shape,))

    # Scale, then L2-normalise, before computing pairwise distances.
    X = preprocessing.scale(a.astype(float))
    X = preprocessing.normalize(X, norm='l2')

    HL = fastcluster.linkage(X, method='average', metric=distance_metric)
    print(HL)

    print('max_d =  %s' % (max_d,))
    cluster_ids = fcluster(HL, max_d, criterion='distance')
    print(len(cluster_ids))
    return cluster_ids
def get_clustered_tweet_ids(cluster_ids=None, n_topics=None):
    '''
    Groups tweet indices by cluster, most populated cluster first.

    :param cluster_ids: sequence holding one cluster id per tweet, where
        position i is the cluster of tweet i. When None, the ids are
        unpickled from 'cluster_ids.p' in the current working directory.
    :param n_topics: number of largest clusters to return; a falsy value
        (None or 0) returns every cluster.
    :return: list of lists of tweet indices, one inner list per cluster,
        ordered by descending cluster size.
    '''
    # Identity test on purpose: `== None` on a numpy array is evaluated
    # element-wise and cannot be used as a truth value.
    if cluster_ids is None:
        try:
            import cPickle as pickle  # Python 2
        except ImportError:
            import pickle
        # NOTE: unpickling can execute arbitrary code; only load files
        # that this pipeline produced itself.
        with open('cluster_ids.p', 'rb') as fin:
            cluster_ids = pickle.load(fin)

    cluster_sizes = Counter(cluster_ids)
    n_clusters = len(cluster_sizes)
    if not n_topics:
        n_topics = n_clusters
    print('n_clusters: %s' % (n_clusters,))
    print('Return top %s' % (n_topics,))

    ids = np.asarray(cluster_ids)
    # For every selected cluster collect the positions of its members.
    return [(ids == cluster).nonzero()[0].tolist()
            for cluster, _ in cluster_sizes.most_common(n_topics)]
def show_tweets(clustered_tweets, tweets_path, output_path):
    '''
    Writes the tweets of every cluster into a separate file.

    Cluster number i is written to the file 'cluster<i>' inside
    ``output_path``, one tweet per line, encoded as UTF-8.

    :param clustered_tweets: iterable of clusters, each a sequence of
        0-based line indices into the tweets file.
    :param tweets_path: path of the UTF-8 text file with one tweet per line.
    :param output_path: directory for the cluster files; it is created
        when missing. A trailing path separator is optional.
    '''
    import io

    # Read raw bytes: bytes.splitlines() breaks on LF, CRLF and CR only,
    # so unusual Unicode line separators inside a tweet cannot shift the
    # line positions that the cluster indices refer to.
    with open(tweets_path, 'rb') as f:
        tweets = f.read().splitlines()

    if not os.path.isdir(output_path):
        os.makedirs(output_path)

    for idx, cluster in enumerate(clustered_tweets):
        print('Cluster (%d tweets)' % len(cluster))
        cluster_file = os.path.join(output_path, 'cluster' + str(idx))
        # Explicit UTF-8 text file: avoids the implicit ASCII encoding that
        # fails on non-ASCII tweets when unicode is written to a byte file.
        with io.open(cluster_file, 'w', encoding='utf-8') as fout:
            for tweet_index in cluster:
                fout.write(tweets[tweet_index].decode('utf-8') + u'\n')
def run_pipeline(tweets_path, models_path, embs_path, do_generate=True, do_cluster=True,
                 show_results=False, output_path=None, distance_threshold=1.0):
    '''
    Runs the pipeline: embed the tweets, cluster them, write the clusters.

    :param tweets_path: path of the text file with the tweets.
    :param models_path: passed through to ``save_embeddings``.
    :param embs_path: passed through to ``save_embeddings``; when
        ``do_generate`` is False the embeddings are loaded from this path
        with ``np.load`` instead.
    :param do_generate: create the embeddings with ``save_embeddings``.
    :param do_cluster: cluster the embeddings.
    :param show_results: write the tweets of each cluster to
        ``output_path``; implies clustering.
    :param output_path: directory for the cluster files; required when
        ``show_results`` is True.
    :param distance_threshold: cut-off distance passed to
        ``hierarchical_clustering`` as ``max_d``.
    :return: True once all requested steps have finished.
    :raises ValueError: when ``show_results`` is set without an
        ``output_path`` or when no embeddings were generated.
    '''
    # Fail before the expensive steps instead of after them.
    if show_results and output_path is None:
        raise ValueError('output_path is required when show_results is True')

    embeddings = None
    # 1st step: create vector-embeddings
    if do_generate:
        embeddings = save_embeddings(tweets_path, models_path, embs_path)
        # Explicit emptiness check: the truth value of a numpy array is
        # ambiguous and assert statements vanish under `python -O`.
        if embeddings is None or len(embeddings) == 0:
            raise ValueError('no embeddings were generated for %s' % (tweets_path,))
        embeddings = np.asarray(embeddings)

    # 2nd step: cluster (writing the results needs the cluster ids as well)
    if do_cluster or show_results:
        if not do_generate:
            embeddings = np.load(embs_path)
        cluster_ids = hierarchical_clustering(embeddings, max_d=distance_threshold)
        # Internal invariant: exactly one cluster id per embedding row.
        assert embeddings.shape[0] == len(cluster_ids)
        print(cluster_ids)

        # 3rd step: write the tweets of every cluster to disk
        if show_results:
            clustered_tweets = get_clustered_tweet_ids(cluster_ids)
            show_tweets(clustered_tweets, tweets_path, output_path)

    print('Finished.')
    return True