# model.py
import copy
import glob
import logging
import os

import numpy as np
from gensim import utils
from gensim.models.word2vec import Word2Vec, LineSentence, PathLineSentences

# Vocabulary of the static (compass) model; set by train_static() and read by
# the trim rule below.
gvocab = None
def my_rule(word, count, min_count):
    """Trim rule: keep a word only if it appears in the compass vocabulary."""
    if word in gvocab:
        return utils.RULE_KEEP
    else:
        return utils.RULE_DISCARD
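# Note: gensim calls a trim_rule with the fixed signature (word, count,
# min_count), so the compass vocabulary has to reach my_rule through the
# module-level gvocab variable rather than as an extra argument.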
class twec_model(object):
    def __init__(self, args):
        self.args = args
        self.size = args.size
        self.sg = args.sg
        self.static_iter = args.siter    # epochs for the static (compass) model
        self.dynamic_iter = args.diter   # epochs for each temporal slice
        self.negative = args.ns
        self.window = args.window
        self.static_alpha = args.alpha
        self.dynamic_alpha = args.alpha
        self.min_count = args.min_count
        self.workers = args.workers
        self.train = args.train
        self.test = args.test
        self.opath = args.opath
        self.init_mode = args.init_mode
        self.compass = None
        if not os.path.isdir(self.opath):
            os.makedirs(self.opath)
        with open(os.path.join(self.opath, "log.txt"), "w") as f_log:
            f_log.write(str(self.args))
            f_log.write('\n')
            logging.basicConfig(filename=os.path.realpath(f_log.name),
                                format='%(asctime)s : %(levelname)s : %(message)s',
                                level=logging.INFO)
    def initialize_from_compass(self, model):
        print("Initializing temporal embeddings from the atemporal compass.")
        if self.init_mode == "copy":
            model = copy.deepcopy(self.compass)
        else:
            vocab_m = model.wv.index2word
            indices = [self.compass.wv.vocab[w].index for w in vocab_m]
            new_syn1neg = np.array([self.compass.syn1neg[index] for index in indices])
            model.syn1neg = new_syn1neg
            if self.init_mode == "both":
                new_syn0 = np.array([self.compass.wv.syn0[index] for index in indices])
                model.wv.syn0 = new_syn0
        model.learn_hidden = False
        model.alpha = self.dynamic_alpha
        model.iter = self.dynamic_iter
        return model
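    # learn_hidden=False above freezes the output (context) matrix syn1neg, so
    # slice training only updates the input vectors inside the shared compass
    # space. Stock gensim's optimized routines may not honor this attribute;
    # this assumes the patched gensim the project is meant to run against.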
    def train_model(self, sentences):
        model = None
        if self.compass is None or self.init_mode != "copy":
            model = Word2Vec(sg=self.sg, size=self.size, alpha=self.static_alpha,
                             iter=self.static_iter, negative=self.negative,
                             window=self.window, min_count=self.min_count,
                             workers=self.workers)
            model.build_vocab(sentences,
                              trim_rule=my_rule if self.compass is not None else None)
        if self.compass is not None:
            model = self.initialize_from_compass(model)
        model.train(sentences, total_words=sum(len(s) for s in sentences),
                    epochs=model.iter, compute_loss=True)
        return model
    def train_static(self):
        if os.path.isfile(os.path.join(self.opath, "static.model")):
            self.compass = Word2Vec.load(os.path.join(self.opath, "static.model"))
            print("Static model loaded.")
        else:
            sentences = PathLineSentences(self.train)
            # Skip hidden files (e.g. .DS_Store) picked up by PathLineSentences.
            sentences.input_files = [s for s in sentences.input_files
                                     if not os.path.basename(s).startswith('.')]
            print("Training static embeddings.")
            self.compass = self.train_model(sentences)
            self.compass.save(os.path.join(self.opath, "static.model"))
        global gvocab
        gvocab = self.compass.wv.vocab  # expose the compass vocabulary to my_rule
    def train_temporal_embeddings(self):
        if self.compass is None:
            self.train_static()
        files = glob.glob(self.train + '/*.txt')
        tot_n_files = len(files)
        for n_file, fn in enumerate(sorted(files)):
            print("Training temporal embeddings: slice {} of {}.".format(n_file + 1, tot_n_files))
            sentences = LineSentence(fn)
            model = self.train_model(sentences)
            model.save(os.path.join(self.opath, os.path.splitext(os.path.basename(fn))[0]) + ".model")
    def evaluate(self):
        mfiles = glob.glob(self.opath + '/*.model')
        mods = []
        vocab_len = -1
        for fn in sorted(mfiles):
            if "static" in os.path.basename(fn):
                continue
            m = Word2Vec.load(fn)
            m.cbow_mean = True
            m.negative = self.negative
            m.window = self.window
            m.vector_size = self.size
            if vocab_len > 0 and vocab_len != len(m.wv.vocab):
                print("ERROR in evaluation: models with different vocab sizes {} != {}".format(vocab_len, len(m.wv.vocab)))
                return
            vocab_len = len(m.wv.vocab)
            mods.append(m)
        tfiles = glob.glob(self.test + '/*.txt')
        if len(mods) != len(tfiles):
            print("ERROR in evaluation: number mismatch between the models ({}) in the folder {} and the test files ({}) in the folder {}".format(len(mods), self.opath, len(tfiles), self.test))
            return
        mplps = []
        nlls = []
        for n_tfn, tfn in enumerate(sorted(tfiles)):
            sentences = LineSentence(tfn)
            # Taddy's document-scoring code (see https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/deepir.ipynb):
            # each temporal model scores every sentence of the test slice; the
            # model trained on the matching slice should score it highest.
            llhd = np.array([m.score(sentences) for m in mods])  # shape (mods, sents)
            lhd = np.exp(llhd - llhd.max(axis=0))  # subtract per-sentence max to avoid numeric overflow
            probs = (lhd / lhd.sum(axis=0)).mean(axis=1)  # posterior per model, shape (mods,)
            mplp = np.log(probs[n_tfn])
            mplps.append(mplp)
            nwords = len([w for s in sentences for w in s if w in mods[n_tfn].wv.vocab])
            nll = sum(llhd[n_tfn]) / nwords
            nlls.append(nll)
            print("Slice {} {}\n\t- Posterior log probability {:.4f}\n\t- Normalized log likelihood {:.4f}".format(n_tfn, tfn, mplp, nll))
        print()
        print("Mean posterior log probability: {:.4f}".format(sum(mplps) / len(mplps)))
        print("Mean normalized log likelihood: {:.4f}".format(sum(nlls) / len(nlls)))