-
Notifications
You must be signed in to change notification settings - Fork 846
/
Copy pathdocument_clustering.py
250 lines (198 loc) · 8.74 KB
/
document_clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 12 20:42:12 2016
@author: DIP
"""
import pandas as pd
import numpy as np
# Load the movie data set. The 'Title' and 'Synopsis' columns are the only
# ones read in this section.
movie_data = pd.read_csv('movie_data.csv')
# Single-argument print(...) produces the same text on Python 2 and 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(movie_data.head())

movie_titles = movie_data['Title'].tolist()
movie_synopses = movie_data['Synopsis'].tolist()

# Preview the first record; the synopsis is capped at 1000 characters.
print('Movie: {}'.format(movie_titles[0]))
print('Movie Synopsis: {}'.format(movie_synopses[0][:1000]))
from normalization import normalize_corpus
from utils import build_feature_matrix
# normalize corpus
norm_movie_synopses = normalize_corpus(movie_synopses,
                                       lemmatize=True,
                                       only_text_chars=True)

# extract tf-idf features
# NOTE(review): min_df / max_df are presumably document-frequency bounds and
# ngram_range=(1, 2) presumably selects unigrams and bigrams -- confirm
# against utils.build_feature_matrix.
vectorizer, feature_matrix = build_feature_matrix(norm_movie_synopses,
                                                  feature_type='tfidf',
                                                  min_df=0.24, max_df=0.85,
                                                  ngram_range=(1, 2))

# view number of features
# (single-argument print(...) behaves the same on Python 2 and Python 3)
print(feature_matrix.shape)

# get feature names
# NOTE(review): if `vectorizer` is a scikit-learn vectorizer, newer releases
# only provide get_feature_names_out(); confirm the installed version before
# upgrading.
feature_names = vectorizer.get_feature_names()

# print sample features
print(feature_names[:20])
from sklearn.cluster import KMeans
def k_means(feature_matrix, num_clusters=5):
    """Fit a k-means model on ``feature_matrix``.

    Args:
        feature_matrix: Data passed straight to ``KMeans.fit``.
        num_clusters: Number of clusters to form (default 5).

    Returns:
        A ``(model, labels)`` tuple: the fitted ``KMeans`` estimator and its
        ``labels_`` attribute.
    """
    # The iteration cap is fixed at 10000, far above typical convergence.
    model = KMeans(max_iter=10000, n_clusters=num_clusters)
    model.fit(feature_matrix)
    return model, model.labels_
# Cluster the movies with k-means and store each movie's cluster label in a
# new 'Cluster' column of the data frame.
num_clusters = 5
km_obj, clusters = k_means(feature_matrix, num_clusters)
movie_data['Cluster'] = clusters
from collections import Counter
# get the total number of movies per cluster
c = Counter(clusters)
# list(...) keeps the printed form a plain list of (label, count) pairs on
# Python 3 as well, where items() returns a view object.
print(list(c.items()))
def get_cluster_data(clustering_obj, movie_data,
                     feature_names, num_clusters,
                     topn_features=10):
    """Collect the key features and member movies of every cluster.

    Args:
        clustering_obj: Fitted clustering object exposing a 2-D
            ``cluster_centers_`` array (one row per cluster).
        movie_data: DataFrame with a 'Cluster' column of cluster numbers and
            a 'Title' column.
        feature_names: Sequence indexed by the column positions of
            ``cluster_centers_``.
        num_clusters: Clusters ``0 .. num_clusters - 1`` are reported.
        topn_features: How many of the highest-valued centroid columns to
            report per cluster.

    Returns:
        dict mapping each cluster number to a dict with the keys
        'cluster_num', 'key_features' and 'movies'.
    """
    # For every centroid, column indices ordered from the largest value to
    # the smallest.
    ranked_columns = clustering_obj.cluster_centers_.argsort()[:, ::-1]

    summary = {}
    for cluster_id in range(num_clusters):
        top_columns = ranked_columns[cluster_id, :topn_features]
        members = movie_data[movie_data['Cluster'] == cluster_id]
        summary[cluster_id] = {
            'cluster_num': cluster_id,
            'key_features': [feature_names[col] for col in top_columns],
            'movies': members['Title'].values.tolist(),
        }
    return summary
def print_cluster_data(cluster_data):
    """Print a human-readable report for every cluster.

    Args:
        cluster_data: Mapping of cluster number to a details dict holding a
            'key_features' list and a 'movies' list of strings.

    Returns:
        None. The report is written to standard output.
    """
    # Each print(...) call takes exactly one argument so the output is the
    # same on Python 2 and Python 3; the original print statements are a
    # SyntaxError on Python 3.
    for cluster_num, cluster_details in cluster_data.items():
        print('Cluster {} details:'.format(cluster_num))
        print('-' * 20)
        print('Key features: {}'.format(cluster_details['key_features']))
        print('Movies in this cluster:')
        print(', '.join(cluster_details['movies']))
        print('=' * 40)
import matplotlib.pyplot as plt
from sklearn.manifold import MDS
from sklearn.metrics.pairwise import cosine_similarity
import random
from matplotlib.font_manager import FontProperties
def plot_clusters(num_clusters, feature_matrix,
                  cluster_data, movie_data,
                  plot_size=(16,8)):
    """Draw a 2-D scatter plot of the movies, styled by cluster.

    Cosine distances between the rows of ``feature_matrix`` are embedded in
    two dimensions with MDS (``random_state=1``). Every movie is drawn at its
    embedded position and annotated with its title. Each cluster gets a
    random color, a marker, and a legend entry made of its first five key
    features.

    Args:
        num_clusters: Not used by the function body; kept so existing callers
            keep working.
        feature_matrix: Feature matrix passed to ``cosine_similarity``.
            Presumably its rows are in the same order as ``movie_data``.
        cluster_data: Mapping of cluster number to a details dict with a
            'key_features' list; it must contain every label found in
            ``movie_data['Cluster']``.
        movie_data: DataFrame with 'Cluster' and 'Title' columns.
        plot_size: Figure size handed to ``plt.subplots``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # generate random color for clusters
    def generate_random_color():
        return '#%06x' % random.randint(0, 0xFFFFFF)

    # define markers for clusters
    markers = ['o', 'v', '^', '<', '>', '8', 's', 'p', '*', 'h', 'H', 'D', 'd']

    # build cosine distance matrix
    cosine_distance = 1 - cosine_similarity(feature_matrix)
    # dimensionality reduction using MDS
    mds = MDS(n_components=2, dissimilarity="precomputed",
              random_state=1)
    # get coordinates of clusters in new low-dimensional space
    plot_positions = mds.fit_transform(cosine_distance)
    x_pos, y_pos = plot_positions[:, 0], plot_positions[:, 1]

    # build cluster plotting data
    cluster_color_map = {}
    cluster_name_map = {}
    for cluster_num, cluster_details in cluster_data.items():
        # assign cluster features to unique label
        cluster_color_map[cluster_num] = generate_random_color()
        cluster_name_map[cluster_num] = ', '.join(
            cluster_details['key_features'][:5]).strip()

    # map each unique cluster label with its coordinates and movies
    cluster_plot_frame = pd.DataFrame({'x': x_pos,
                                       'y': y_pos,
                                       'label': movie_data['Cluster'].values.tolist(),
                                       'title': movie_data['Title'].values.tolist()
                                       })
    grouped_plot_frame = cluster_plot_frame.groupby('label')

    # set plot figure size and axes
    fig, ax = plt.subplots(figsize=plot_size)
    ax.margins(0.05)

    # plot each cluster using co-ordinates and movie titles
    for cluster_num, cluster_frame in grouped_plot_frame:
        # Clusters beyond the marker list fall back to a randomly picked
        # marker.
        marker = markers[cluster_num] if cluster_num < len(markers) \
            else np.random.choice(markers, size=1)[0]
        ax.plot(cluster_frame['x'], cluster_frame['y'],
                marker=marker, linestyle='', ms=12,
                label=cluster_name_map[cluster_num],
                color=cluster_color_map[cluster_num], mec='none')

    # Axis styling does not depend on the cluster, so it is applied once.
    # Booleans are used instead of the legacy 'off' strings, which current
    # matplotlib treats as truthy (ticks would stay visible).
    ax.set_aspect('auto')
    ax.tick_params(axis='x', which='both', bottom=False, top=False,
                   labelbottom=False)
    # The original passed top= here, which is an x-axis side; right= is the
    # matching side for the y axis.
    ax.tick_params(axis='y', which='both', left=False, right=False,
                   labelleft=False)

    fontP = FontProperties()
    fontP.set_size('small')
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.01), fancybox=True,
              shadow=True, ncol=5, numpoints=1, prop=fontP)

    # add labels as the film titles
    # Iterating the columns directly replaces DataFrame.ix, which has been
    # removed from pandas; the frame has a default integer index, so the
    # rows are visited in the same order.
    for x, y, title in zip(cluster_plot_frame['x'],
                           cluster_plot_frame['y'],
                           cluster_plot_frame['title']):
        ax.text(x, y, title, size=8)

    # show the plot
    plt.show()
# Summarize the k-means clusters (top five features each), print the
# summary, and plot the clusters.
cluster_data = get_cluster_data(km_obj, movie_data, feature_names,
                                num_clusters, topn_features=5)
print_cluster_data(cluster_data)
plot_clusters(num_clusters, feature_matrix, cluster_data, movie_data,
              plot_size=(16,8))
from sklearn.cluster import AffinityPropagation
def affinity_propagation(feature_matrix):
    """Cluster documents with affinity propagation.

    Args:
        feature_matrix: SciPy sparse document-feature matrix. The ``*``
            operator below is a matrix product only for sparse matrices.

    Returns:
        A ``(model, labels)`` tuple: the fitted ``AffinityPropagation``
        estimator and its ``labels_`` attribute.
    """
    # Pairwise dot products between documents (n_docs x n_docs).
    sim = feature_matrix * feature_matrix.T
    # toarray() yields a plain ndarray. The former todense() returned a
    # numpy.matrix, which recent scikit-learn releases refuse as input.
    sim = sim.toarray()
    # NOTE(review): the estimator is built with default settings, so `sim`
    # is presumably treated as a feature matrix rather than a precomputed
    # affinity. cluster_centers_ would then have one column per document,
    # not per vocabulary term -- confirm this is intended.
    ap = AffinityPropagation()
    ap.fit(sim)
    return ap, ap.labels_
# get clusters using affinity propagation
ap_obj, clusters = affinity_propagation(feature_matrix=feature_matrix)
movie_data['Cluster'] = clusters

# get the total number of movies per cluster
c = Counter(clusters)
# Single-argument print(...) works on Python 2 and 3; list(...) keeps the
# output a plain list on Python 3, where items() returns a view.
print(list(c.items()))

# get total clusters
total_clusters = len(c)
print('Total Clusters: {}'.format(total_clusters))

cluster_data = get_cluster_data(clustering_obj=ap_obj,
                                movie_data=movie_data,
                                feature_names=feature_names,
                                num_clusters=total_clusters,
                                topn_features=5)
print_cluster_data(cluster_data)
# Pass the cluster count found by affinity propagation, not the stale
# `num_clusters` left over from the k-means run.
plot_clusters(num_clusters=total_clusters,
              feature_matrix=feature_matrix,
              cluster_data=cluster_data,
              movie_data=movie_data,
              plot_size=(16,8))
from scipy.cluster.hierarchy import ward, dendrogram
def ward_hierarchical_clustering(feature_matrix):
    """Build a Ward linkage matrix from pairwise cosine distances.

    Args:
        feature_matrix: Feature matrix passed to ``cosine_similarity``.

    Returns:
        The linkage matrix produced by ``scipy.cluster.hierarchy.ward``.
    """
    distances = 1 - cosine_similarity(feature_matrix)
    # NOTE(review): `ward` receives the square distance matrix as-is. SciPy
    # documents its argument as either a condensed (1-D) distance matrix or
    # an array of observation vectors, so this 2-D input is presumably
    # treated as observations (one row per document) -- confirm intended.
    return ward(distances)
def plot_hierarchical_clusters(linkage_matrix, movie_data, figure_size=(8,12)):
    """Plot a left-oriented dendrogram and save it as a PNG file.

    Args:
        linkage_matrix: Linkage matrix passed to
            ``scipy.cluster.hierarchy.dendrogram``.
        movie_data: DataFrame whose 'Title' column supplies the leaf labels.
        figure_size: Figure size handed to ``plt.subplots``.

    Returns:
        None. The figure is written to 'ward_hierachical_clusters.png' in
        the current working directory at 200 dpi.
    """
    # set size
    fig, ax = plt.subplots(figsize=figure_size)
    movie_titles = movie_data['Title'].values.tolist()
    # plot dendrogram
    # The return value is not needed; the original rebound `ax` to it and
    # thereby lost the Axes handle.
    dendrogram(linkage_matrix, orientation="left", labels=movie_titles)
    # Booleans replace the legacy 'off' strings, which current matplotlib
    # treats as truthy (the x ticks would stay visible).
    plt.tick_params(axis='x',
                    which='both',
                    bottom=False,
                    top=False,
                    labelbottom=False)
    plt.tight_layout()
    plt.savefig('ward_hierachical_clusters.png', dpi=200)
# Compute the Ward linkage for all movies, then draw the dendrogram and
# save it to disk.
linkage_matrix = ward_hierarchical_clustering(feature_matrix)
plot_hierarchical_clusters(linkage_matrix, movie_data, figure_size=(8,10))