gensim tutorial:

https://github.com/RaRe-Technologies/gensim/blob/develop/tutorials.md#tutorials

1. set up dictionary for the corpus
2. token2id, id2token
3. extract questions and answers: question mark; and sentences less than 20 words
4. word2vec

convert to lower case; but no expansion
try not converting lower case

In [1]:
from gensim import corpora,models
import os
import chardet   #需要导入这个模块，检测编码格式
import re
import numpy as np
import pickle 
import pandas as pd

In [2]:
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

In [7]:
# 获取单词的词性
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

In [9]:
words = [item for sublist in sentences for item in sublist]

In [10]:
tf_ = pd.DataFrame(words,columns=['word']).word.value_counts()/len(words)

In [70]:
tf_.describe()

count    3249.000000
mean        0.000308
std         0.002668
min         0.000003
25%         0.000013
50%         0.000029
75%         0.000072
max         0.075629
Name: word, dtype: float64

In [12]:
words = list(set(words))
len(words)

3249

In [13]:
tagged_sent = pos_tag(words)

wnl = WordNetLemmatizer()
lemmas = {}
for tag in tagged_sent:
    wordnet_pos = get_wordnet_pos(tag[1]) or wordnet.NOUN
    lemmas.update({tag[0]:wnl.lemmatize(tag[0], pos=wordnet_pos)})

In [14]:
# word number after lemmatizer
len(set(lemmas.values()))

2768

In [15]:
corpus_dict = corpora.Dictionary([words])
corpus = [corpus_dict.doc2bow(text) for text in [words]]
print(corpus_dict)

Dictionary(3249 unique tokens: ['!', '$', '%', "'", "'am"]...)


In [16]:
corpus_dict[0]
id2token = corpus_dict.id2token
token2id = corpus_dict.token2id

In [17]:
# pickle.dump(VAD_dict,open('./pre-data/VAD_dict.pickle','wb'))

In [18]:
VAD_extend = pd.read_csv('./affect-rich/VAD_extend_clean.csv')
VAD_extend.dropna(subset=['Word'],inplace=True)
VAD_extend.set_index('Word',inplace=True)

lambda_ = 0.1
VAD_extend = VAD_extend.clip(lower=3,upper=7) - [5,3,5]
VAD_extend = VAD_extend.mul(lambda_)

VAD_extend_dict = VAD_extend.to_dict('index')

In [19]:
len(VAD_extend)

26959

In [20]:
# number of lemmas in VAD dataframe (after extension)
sum([lemma in VAD_extend_dict.keys() for lemma in lemmas.values()])

2435

In [37]:
VAD_extend.describe()

Unnamed: 0,V,A,D
count,26959.0,26959.0,26959.0
mean,0.006111,0.122721,0.018835
std,0.107662,0.075985,0.082669
min,-0.2,0.0,-0.2
25%,-0.065,0.07,-0.032
50%,0.016667,0.115,0.026
75%,0.080679,0.165389,0.074333
max,0.2,0.4,0.2


---

In [21]:
model = models.Word2Vec(sentences, min_count=1,size=256)

In [27]:
word_embeddings = []
VAD_list = []
tf = []
for i in range(len(corpus_dict)):
    embedding = list(model.wv[id2token[i]])
    word_lemma = lemmas[id2token[i]]
    vad = list(VAD_extend_dict[word_lemma].values()) if word_lemma in VAD_extend_dict.keys() else [0,0,0]
    # after normalization for words outside dictionary the VAD value should be [0,0,0]
    word_embeddings.append(np.array(embedding))
    VAD_list.append(vad)
    tf.append(tf_.loc[id2token[i]])

In [28]:
word_embeddings = np.array(word_embeddings)

In [29]:
VAD = np.array(VAD_list)

In [88]:
pd.DataFrame(VAD).describe()

Unnamed: 0,0,1,2
count,3249.0,3249.0,3249.0
mean,0.033663,0.094035,0.033444
std,0.104415,0.094552,0.082482
min,-0.2,0.0,-0.2
25%,0.0,0.0,0.0
50%,0.014,0.076,0.017
75%,0.110167,0.149,0.089
max,0.2,0.4,0.2


In [30]:
tf = np.array(tf)

In [89]:
pd.DataFrame(tf).describe()

Unnamed: 0,0
count,3249.0
mean,0.000308
std,0.002668
min,3e-06
25%,1.3e-05
50%,2.9e-05
75%,7.2e-05
max,0.075629


In [5]:
def sentence_extract(line,dict_=dict_sub):
    encode_type = chardet.detect(line)  
    line = line.decode(encode_type['encoding']) #进行相应解码，赋给原标识符（变量）
    line = line.lower().replace('<eos>','\n').split('\n')
#     line = line.replace('<EOS>','\n').split('\n')
    line = list(filter(None,line))
    
    # extract easy dialogues
    idx = [i for i,x in enumerate(line) if '?' in x]
    diag_list = []
    for i in idx:
        if (i < len(line)-1):
            if (len(line[i].split())<20) & (len(line[i+1].split())<20):
                diag_list.append(line[i])
                diag_list.append('<go> '+line[i+1]+' <eos>')
                # diag_list.append(line[i+1])
    return diag_list

In [6]:
from smart_open import smart_open
class MyCorpus(object):
    def __init__(self, dirname):
        self.dirname = dirname
 
    def __iter__(self):
        for fname in os.listdir(self.dirname):
            for line in smart_open(os.path.join(self.dirname, fname), 'rb'):
                yield from sentence_extract(line)
                # return sentence_extract(line)

In [8]:
file_dir = '/Users/yan/Documents/document/EPFL/MA2/semesterprj/code/processed_data/OpenSubtitle/'
sentences = MyCorpus(file_dir)
sentences = [x.lstrip().rstrip() for x in sentences]
sentences = list(filter(None,sentences))

In [31]:
sentence_token2id = []
for i in range(len(sentences)):
    sentence_token2id.append([token2id.get(item,item) for item in sentences[i]])

In [32]:
enc_input = sentence_token2id[::2]
dec_input = [x[:-1] for x in sentence_token2id[1::2]] # '<go>' + 
target = [x[1:] for x in sentence_token2id[1::2]] # + '<eos>'

In [33]:
max_uttr_len_enc = max([len(x) for x in enc_input])
max_uttr_len_dec = max([len(x) for x in dec_input])
max_uttr_len_target = max([len(x) for x in target])
print(max_uttr_len_enc,max_uttr_len_dec,max_uttr_len_target)

19 20 20


In [34]:
enc_input = np.array([i + [0]*(max_uttr_len_enc-len(i)) for i in enc_input])
dec_input = np.array([i + [0]*(max_uttr_len_dec-len(i)) for i in dec_input])
target = np.array([i + [0]*(max_uttr_len_target-len(i)) for i in target])

In [35]:
from sklearn.model_selection import train_test_split
enc_train, enc_test, dec_train, dec_test, target_train, target_test = train_test_split(
    enc_input, dec_input, target, test_size=0.2, random_state=1)

enc_test, enc_val, dec_test, dec_val,target_test,target_val = train_test_split(
    enc_test, dec_test, target_test, test_size=0.5, random_state=1)


---
incorporate VAD embedding of words into cross-entropy loss

In [45]:
delta = 0.15

In [38]:
VAD.shape

(3249, 3)

In [39]:
V = VAD.shape[0]

In [82]:
VAD_loss = 1 + delta*np.linalg.norm(VAD,axis=1)

In [83]:
VAD_loss = V*VAD_loss/sum(VAD_loss)

In [99]:
sum(1 + delta*np.linalg.norm(VAD,axis=1))

3323.727584211128

In [65]:
VAD_loss.shape

(3249,)

---
$\mu(x_t)$
- uniform importance
- global importance
- local importance

In [90]:
# uniform importance
mu_ui = np.ones(tf.shape[0])

In [93]:
# globale importance
a = 0.001
mu_gi = a/(a+tf)

In [95]:
mu_gi.max()/mu_gi.min()

76.378590078329

In [96]:
# local importance
epsilon = 10e-8
mu_li = np.log(1/tf+epsilon)/sum(np.log(1/tf+epsilon))

In [97]:
mu_li.max()/mu_li.min()

4.891468104394948

---
saving

dec_input: `<go>` + answer
target: answer + `<eos>`

In [98]:
np.save('./pre-data/word_embeddings.npy',word_embeddings)
np.save('./pre-data/VAD.npy',VAD)
np.save('./pre-data/tf.npy',tf)
np.save('./pre-data/VAD_loss.npy',VAD_loss)

np.save('./pre-data/mu_ui.npy',mu_ui)
np.save('./pre-data/mu_gi.npy',mu_gi)
np.save('./pre-data/mu_li.npy',mu_li)

In [60]:
enc_input_len = [sum(x!=0) for x in enc_train]
dec_input_len = [sum(x!=0) for x in dec_train]
np.save('./pre-data/train/enc_input.npy',enc_train)
np.save('./pre-data/train/dec_input.npy',dec_train)
np.save('./pre-data/train/target.npy',target_train)
np.save('./pre-data/train/enc_input_len.npy',enc_input_len)
np.save('./pre-data/train/dec_input_len.npy',dec_input_len)

enc_input_len = [sum(x!=0) for x in enc_val]
dec_input_len = [sum(x!=0) for x in dec_val]
np.save('./pre-data/validation/enc_input.npy',enc_val)
np.save('./pre-data/validation/dec_input.npy',dec_val)
np.save('./pre-data/validation/target.npy',target_val)
np.save('./pre-data/validation/enc_input_len.npy',enc_input_len)
np.save('./pre-data/validation/dec_input_len.npy',dec_input_len)


In [61]:
enc_input_len = [sum(x!=0) for x in enc_test]
dec_input_len = [sum(x!=0) for x in dec_test]
np.save('./pre-data/test/enc_input.npy',enc_test)
np.save('./pre-data/test/dec_input.npy',dec_test)
np.save('./pre-data/test/target.npy',target_test)
np.save('./pre-data/test/enc_input_len.npy',enc_input_len)
np.save('./pre-data/test/dec_input_len.npy',dec_input_len)


In [62]:
pickle.dump(id2token,open('./pre-data/id2token.pickle','wb'))
pickle.dump(token2id,open('./pre-data/token2id.pickle','wb'))

In [20]:
# with open('id2token.pickle', 'rb') as file:
#     test = pickle.load(file)
