-
Notifications
You must be signed in to change notification settings - Fork 5
/
gmm.py
72 lines (65 loc) · 2.68 KB
/
gmm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
"""Trains a GMM on formant data (e.g. from the Hillenbrand corpus).
"""
import numpy as np
import pylab as pl
from sklearn.mixture import GMM
from sklearn import metrics
from collections import defaultdict
def parse(fname):
with open(fname) as f:
d = map(lambda l: l.rstrip('\n').split(), f.readlines())
header = d[0]
d = filter(lambda x: not 'NaN' in x, d)
return header, np.array(d[1:])
def eval_clusters(y_pred, y, X, gmm):
# 2D distance based 1-to-1 matching between y_pred and y
sety = set(y)
mapsety = dict(zip(xrange(len(sety)), sety))
assert(len(set(y_pred)) == len(sety))
maps_to = {}
d_m = np.ndarray((len(sety), len(sety)))
for i, phone in enumerate(sety): # i, phone: ahah!
for ncomponent in xrange(gmm.means_.shape[0]):
d_m[i,ncomponent] = np.linalg.norm(gmm.means_[ncomponent]
- np.mean(X[y==phone]))
for _ in xrange(d_m.shape[0]):
indices = np.unravel_index(d_m.argmin(), d_m.shape)
while mapsety[indices[0]] in maps_to or indices[1] in maps_to.values():
d_m[indices[0],indices[1]] = np.finfo('d').max
indices = np.unravel_index(d_m.argmin(), d_m.shape)
maps_to[mapsety[indices[0]]] = indices[1]
d_m[indices[0],indices[1]] = np.finfo('d').max
print maps_to
y_gold = np.array(map(lambda x: maps_to[x], y))
print "Adjusted rand scores:",
print metrics.adjusted_rand_score(y_gold, y_pred)
print "Homogeneity:",
print metrics.homogeneity_score(y_gold, y_pred)
print "Completeness:",
print metrics.completeness_score(y_gold, y_pred)
print "V-measure:",
print metrics.v_measure_score(y_gold, y_pred)
return y_pred, y_gold, maps_to
if __name__ == "__main__":
h, d = parse('data/formants.dat')
X = d[:, 3:5].astype(np.float)
y = d[:, 2]
sety = set(y)
print "All the", len(sety), "vowels:", sety
gmm = GMM(n_components=len(sety)) # default covar='diag'
gmm.fit(X)
y_pred, y_gold, maps_to = eval_clusters(gmm.predict(X), y, X, gmm)
#pl.scatter(X[:,1], X[:,0], s=20, c=y_gold)
import matplotlib.cm as cm
colors = cm.rainbow(np.linspace(0, 1, 2*len(sety)))
ax = pl.subplot(2, 1, 1)
for i, phone in enumerate(sety): # oups, I did it again
pl.scatter(X[y==phone,1], X[y==phone,0], s=20,
c=colors[2*i], label=phone)
pl.legend(bbox_to_anchor=(0., 1.02, 1., 1.102), loc=8,
ncol=len(sety)/2, mode="expand", borderaxespad=0.)
ax = pl.subplot(2, 1, 2)
for i, phone in enumerate(set(y_pred)): # oups, I did it again
pl.scatter(X[y_pred==phone,1], X[y_pred==phone,0], s=20,
c=colors[2*i], label=phone)
pl.show()