glove_cython.pyx (forked from maciejkula/glove-python)
#!python
#cython: boundscheck=False, wraparound=False
import numpy as np
import scipy.sparse as sp
import collections
from cython.parallel import parallel, prange

cdef inline double double_min(double a, double b) nogil: return a if a <= b else b
cdef inline int int_min(int a, int b) nogil: return a if a <= b else b
cdef inline int int_max(int a, int b) nogil: return a if a > b else b


cdef extern from "math.h" nogil:
    double sqrt(double)
    double c_log "log"(double)
    double c_abs "fabs"(double)

def fit_vectors(double[:, :] wordvec,
                double[:] wordbias,
                int[:] row,
                int[:] col,
                double[:] counts,
                int[:] shuffle_indices,
                double learning_rate,
                double max_count,
                double alpha,
                int no_threads):
    """
    Estimate GloVe word embeddings given the cooccurrence matrix.
    Modifies the word vector and word bias arrays in place.

    Training is performed via asynchronous stochastic gradient descent.
    """
    # Get the number of latent dimensions and
    # the number of cooccurrences.
    cdef int dim = wordvec.shape[1]
    cdef int no_cooccurrences = row.shape[0]

    # Hold the indices of the current words and
    # the cooccurrence count.
    cdef int word_a
    cdef int word_b
    cdef double count

    # Hold the norms of the word vectors.
    cdef double word_a_norm
    cdef double word_b_norm

    # Loss and gradient variables.
    cdef double prediction
    cdef double entry_weight = 0.0
    cdef double loss = 0.0

    # Iteration variables.
    cdef int j, i, shuffle_index

    # We iterate over random indices to simulate
    # shuffling the cooccurrence matrix.
    with nogil:
        for j in prange(no_cooccurrences, num_threads=no_threads,
                        schedule='dynamic'):
            shuffle_index = shuffle_indices[j]

            word_a = row[shuffle_index]
            word_b = col[shuffle_index]
            count = counts[shuffle_index]

            # Get the prediction, and accumulate
            # the vector norms as we go.
            prediction = 0.0
            word_a_norm = 0.0
            word_b_norm = 0.0

            for i in range(dim):
                prediction = prediction + wordvec[word_a, i] * wordvec[word_b, i]
                word_a_norm += wordvec[word_a, i] ** 2
                word_b_norm += wordvec[word_b, i] ** 2

            prediction = prediction + wordbias[word_a] + wordbias[word_b]
            word_a_norm = sqrt(word_a_norm)
            word_b_norm = sqrt(word_b_norm)

            # Compute the example weight and the gradient prefactor.
            entry_weight = double_min(1.0, (count / max_count)) ** alpha
            loss = entry_weight * (prediction - c_log(count))

            # Update step: apply gradients and reproject
            # onto the unit sphere.
            for i in range(dim):
                wordvec[word_a, i] = (wordvec[word_a, i] - learning_rate
                                      * loss * wordvec[word_b, i]) / word_a_norm
                wordvec[word_b, i] = (wordvec[word_b, i] - learning_rate
                                      * loss * wordvec[word_a, i]) / word_b_norm

            # Update the word biases.
            wordbias[word_a] -= learning_rate * loss
            wordbias[word_b] -= learning_rate * loss
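
# A minimal usage sketch for fit_vectors (illustrative only: it assumes
# this module is compiled and importable as `glove_cython`, and the
# cooccurrence triples below are made up):
#
#     import numpy as np
#     from glove_cython import fit_vectors
#
#     no_words, dim = 100, 50
#     wordvec = (np.random.rand(no_words, dim) - 0.5) / dim
#     wordbias = np.zeros(no_words, dtype=np.float64)
#
#     # COO-style cooccurrence data: counts[i] cooccurrences
#     # of word row[i] with word col[i].
#     row = np.array([0, 1, 2], dtype=np.int32)
#     col = np.array([1, 2, 0], dtype=np.int32)
#     counts = np.array([5.0, 2.0, 1.0])
#
#     shuffle_indices = np.arange(len(row), dtype=np.int32)
#
#     for epoch in range(10):
#         np.random.shuffle(shuffle_indices)
#         fit_vectors(wordvec, wordbias, row, col, counts,
#                     shuffle_indices, learning_rate=0.05,
#                     max_count=100.0, alpha=0.75, no_threads=2)
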
def transform_paragraph(double[:, :] wordvec,
                        double[:] wordbias,
                        double[:] paragraphvec,
                        int[:] row,
                        double[:] counts,
                        int[:] shuffle_indices,
                        double learning_rate,
                        double max_count,
                        double alpha,
                        int epochs):
    """
    Compute a vector representation of a paragraph. This has
    the effect of making the paragraph vector close to words
    that occur in it. The representation should be more
    similar to words that occur in it multiple times, and
    less close to words that are common in the corpus (have
    large word bias values). This should be similar to a
    tf-idf weighting.
    """
    # Get the number of latent dimensions and
    # the number of cooccurrences.
    cdef int dim = wordvec.shape[1]
    cdef int no_cooccurrences = row.shape[0]

    # Hold the indices of the current words and
    # the cooccurrence count.
    cdef int word_b, word_a
    cdef double count

    # Hold the norm of the paragraph vector.
    cdef double paragraphnorm

    # Loss and gradient variables.
    cdef double prediction
    cdef double entry_weight = 0.0
    cdef double loss = 0.0

    # Iteration variables.
    cdef int epoch, j, c, i, shuffle_index, start, stop

    # We iterate over random indices to simulate
    # shuffling the cooccurrence matrix.
    for epoch in range(epochs):
        for j in range(no_cooccurrences):
            shuffle_index = shuffle_indices[j]

            word_b = row[shuffle_index]
            count = counts[shuffle_index]

            # Get the prediction, and accumulate
            # the vector norm as we go.
            prediction = 0.0
            paragraphnorm = 0.0

            for i in range(dim):
                prediction = prediction + paragraphvec[i] * wordvec[word_b, i]
                paragraphnorm += paragraphvec[i] ** 2

            prediction += wordbias[word_b]
            paragraphnorm = sqrt(paragraphnorm)

            # Compute the example weight and the gradient prefactor.
            entry_weight = double_min(1.0, (count / max_count)) ** alpha
            loss = entry_weight * (prediction - c_log(count))

            # Update step: apply the gradient and reproject
            # onto the unit sphere.
            for i in range(dim):
                paragraphvec[i] = (paragraphvec[i] - learning_rate
                                   * loss * wordvec[word_b, i]) / paragraphnorm
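
# A minimal usage sketch for transform_paragraph (illustrative only:
# `wordvec` and `wordbias` would come from a model already trained with
# fit_vectors, and the word ids and counts below are made up):
#
#     import numpy as np
#     from glove_cython import transform_paragraph
#
#     dim = wordvec.shape[1]
#     paragraphvec = (np.random.rand(dim) - 0.5) / dim
#
#     # Words occurring in the paragraph and their occurrence counts.
#     row = np.array([3, 17, 42], dtype=np.int32)
#     counts = np.array([2.0, 1.0, 4.0])
#
#     shuffle_indices = np.arange(len(row), dtype=np.int32)
#     np.random.shuffle(shuffle_indices)
#
#     transform_paragraph(wordvec, wordbias, paragraphvec, row, counts,
#                         shuffle_indices, learning_rate=0.05,
#                         max_count=100.0, alpha=0.75, epochs=50)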