In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import os
import re
from itertools import chain 

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import wordnet as wn

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

from tensorflow.keras.utils import to_categorical
from tensorflow import keras
import tensorflow as tf
from scipy import sparse
import numpy as np

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [2]:
def get_bow(data, encoder):
    bows = []
    for ex in data:
        encoded = [encoder.get(w, 1) for w in ex]
        x = to_categorical(encoded, num_classes=len(encoder))
        x = np.sum(x, axis=0)
        bows.append(x)
    return np.array(bows)


In [3]:
def is_noun(word, tag):
    return (tag == "NN" or tag == "NNS") and word.isalpha() and len(word) > 1


In [4]:
def get_wordnet_repr(word):
    synset = wn.synsets(word, "n")
    if len(synset) > 0:
        return synset[0]
    else:
        return None

In [5]:
def preprocess_sentence(sent):
    sent = sent.strip()
    # delete unwanted special characters
    sent = re.sub(r"[@#\^\*\(\)\\\|~;\"=+`]", "", sent)

    # handle some special characters
    sent = sent.replace("$", " dollar ")
    sent = sent.replace("%", " percent ")
    sent = sent.replace("&", " and ")
    sent = re.sub("[-_:]", " ", sent)
    sent = sent.lower()
    
    return sent

In [50]:
class Dataset:
    def __init__(self, mode="train", stem_func=None, tag_func=nltk.pos_tag):
        if stem_func is not None:
            self.stem_func = stem_func
        else:
            self.stem_func = lambda x: x

        self.tag_func = tag_func
        self.mode = mode

        self.word2idx = None
        self.idx2word = None
        self.similarity_matrix = None

    def get_corpus(self):
        """List of Dialogues. Each dialogue is list of sentences (question and answer 
        are separate sentences). Each sentece is tokenized.
        """
        corpus = []
        with open(f"drive/MyDrive/task/data/dialog_{self.mode}.txt") as f:
            for line in f:
                line = preprocess_sentence(line)

                # simple heuristic to avoid mistake in pos tagging
                line = line.replace("yes", "Yes")

                line = line.replace("</q>", "?")
                line = line.replace("</a>", ".")

                # split by sentence
                line = re.split("<[qa]>", line)
                # drop empty strings created from split, and tokenize
                line = [nltk.word_tokenize(s) for s in line if s]
                corpus.append(line)

        print("finished loading corpus")
        return corpus

    def get_description(self):
        """List of descriptions. The descriptions are tokenized."""
        with open(f"drive/MyDrive/task/data/desc_{self.mode}.txt") as f:
            descriptions = []
            for line in f:
                line = preprocess_sentence(line)
                line = nltk.word_tokenize(line)
                descriptions.append(line)

        print("finished loading descriptions")
        return descriptions

    def extract_nouns(self, tagged_sentence):
        nouns = []
        for w in tagged_sentence:
            if not is_noun(*w):
                continue
            synset = get_wordnet_repr(w[0])
            if synset is None:
                continue
            nouns.append(synset.name())
        return nouns


    def extract_context(self, corpus):
        """Context from dialogues. 
        shape: [number of dialogue, number of context]"""
        contexts = []
        for i, dialog in enumerate(corpus):
            tagged_dialog = self.tag_func(dialog)
            context = set()
            for sent in tagged_dialog:
                nouns = self.extract_nouns(sent)
                context.update(nouns)
            contexts.append(list(context))

        print("finished extracting contexts")
        return contexts
    
    def extract_target(self, description, contexts):
        """Extract ground truth OOC.

        NOTE: There are quite a lot of instances there is no OOC.
        """
        targets = []
        tagged_description = self.tag_func(description)
        for i, desc in enumerate(tagged_description):
            nouns = self.extract_nouns(desc)
            ooc = set(nouns) - set(contexts[i])
            targets.append(list(ooc))

        print("finished extracting targets (OOCs)")
        return targets
    
    def set_vocab(self, context, target):
        """Vocabulary of the dataset."""
        vocab = set()
        vocab.update(chain.from_iterable(context))
        vocab.update(chain.from_iterable(target))

        self.word2idx = {word: idx + 2 for idx, word in enumerate(vocab)}
        self.word2idx["[pad]"] = 0
        self.word2idx["[unk]"] = 1

        self.idx2word = {idx: word for word, idx in self.word2idx.items()}
    

    def set_similarity_matrix(self):
        self.similarity_matrix = sparse.lil_matrix((len(self.word2idx), len(self.word2idx)))  # [V, V]

        for i, word1 in self.idx2word.items():
            if i in [0, 1]:
                continue

            w1 = wn.synset(word1)
            for j, word2 in self.idx2word.items():
                if j in [0, 1]:
                    continue
                if j == i:
                    self.similarity_matrix[i, j] = -1
                    continue

                w2 = wn.synset(word2)
                similarity = w1.wup_similarity(w2)
                if similarity is None:
                    similarity = 0

                self.similarity_matrix[i, j] = similarity
            if i % 100 == 0:
                print(i)
        self.similarity_matrix = self.similarity_matrix.tocsr()


        print("finished setting up similarity matrix")



    def get_data_target(self, word2idx=None, sim_mat=None):
        corpus = self.get_corpus()  #[N, D, t]
        description = self.get_description()  #[N, D]

        contexts = self.extract_context(corpus)  # [N, t]
        target = self.extract_target(description, contexts)  # [N, G]

        if word2idx is None:
            self.set_vocab(contexts, target)
            self.set_similarity_matrix()
            word2idx = self.word2idx
            sim_mat = self.similarity_matrix

        # TODO: change this part
        data_idx = [[word2idx.get(tok, 1) for tok in dialog] for dialog in contexts]  # [N, t]
        data = [sim_mat[idx].sum(0) for idx in data_idx]  # [N, V]
        # maxlen = np.max([ex.shape[0] for ex in data])
        # data = [csr_matrix((ex.data, ex.indices, np.pad(ex.indptr, (0, maxlen - ex.shape[0]))) for ex in data]
        data = np.vstack(data)
        print(data.shape)

        target_label = get_bow(target, word2idx)  # [N, V]

        defined_idx = np.where(target_label.sum(1) > 0)[0]

        return data[defined_idx], target_label[defined_idx]

In [264]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()

tokenizer.fit_on_texts([list(chain.from_iterable(dialog)) for dialog in train_corpus])

In [278]:
x = tokenizer.texts_to_sequences(train_corpus[0])
x = np.expand_dims(x, axis=0)
to_categorical(list(chain.from_iterable(x)), len(tokenizer.word_counts)+1)

  return array(a, dtype, copy=False, order=order, subok=True)


ValueError: ignored

In [252]:
len(train_corpus[0]

20

In [51]:
from nltk.tag import StanfordPOSTagger

STANFORD_POS_MODEL_PATH = "/content/drive/MyDrive/stanford-postagger-full-2020-11-17/models/english-bidirectional-distsim.tagger"
STANFORD_POS_JAR_PATH = "/content/drive/MyDrive/stanford-postagger-full-2020-11-17/stanford-postagger-4.2.0.jar"

pos_tagger = StanfordPOSTagger(STANFORD_POS_MODEL_PATH, STANFORD_POS_JAR_PATH)

lemma = nltk.wordnet.WordNetLemmatizer()

The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.tag.corenlp.CoreNLPPOSTagger[0m or [91mnltk.tag.corenlp.CoreNLPNERTagger[0m instead.
  super(StanfordPOSTagger, self).__init__(*args, **kwargs)


In [52]:
train_ds = Dataset(mode="train", stem_func=lemma.lemmatize, tag_func=nltk.pos_tag_sents)
val_ds = Dataset(mode="valid", stem_func=lemma.lemmatize, tag_func=nltk.pos_tag_sents)

In [53]:
train_corpus = train_ds.get_corpus()
train_description = train_ds.get_description()

finished loading corpus
finished loading descriptions


In [54]:
train_context = train_ds.extract_context(train_corpus)
train_target = train_ds.extract_target(train_description, train_context)

finished extracting contexts
finished extracting targets (OOCs)


## Debug

In [39]:
x = zip(train_context, train_target)
next(x)

(['grey.n.01',
  'bus.n.01',
  'ad.n.01',
  'photograph.n.01',
  'top.n.01',
  'woman.n.01',
  'color.n.01',
  'yoga.n.01',
  'bloomers.n.01'],
 ['tour.n.01', 'business_district.n.01'])

In [45]:
next(x)

(['hair.n.01',
  'man.n.01',
  'clean_and_jerk.n.01',
  'shirt.n.01',
  'color.n.01',
  'position.n.03',
  'wood.n.01',
  'laptop.n.01',
  'taste.n.01',
  'wear.n.01',
  'spectacles.n.01',
  'screen.n.01',
  'beard.n.01',
  'model.n.01',
  'brown.n.01',
  'table.n.01'],
 ['work.n.01', 'people.n.01'])

## Again

In [56]:
train_ds.set_vocab(train_context, train_target)
print("vocab_size:", len(train_ds.word2idx))

train_ds.set_similarity_matrix()

vocab_size: 4263
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
finished setting up similarity matrix


In [151]:
# TODO: change this part
data_idx = [[train_ds.word2idx.get(tok, 1) for tok in dialog] for dialog in train_context]  # [N, t]
train_data = [train_ds.similarity_matrix[idx].sum(axis=0) for idx in data_idx]  # [N, V]
train_data = np.vstack(train_data)
print(train_data.shape)

train_target_label = get_bow(train_target, train_ds.word2idx)  # [N, V]

defined_idx = np.where(train_target_label.sum(1) > 0)[0]

train_data, train_target_label = train_data[defined_idx], train_target_label[defined_idx]

(9666, 4263)


In [76]:
val_data, val_target = val_ds.get_data_target(train_ds.word2idx, train_ds.similarity_matrix)

finished loading corpus
finished loading descriptions
finished extracting contexts
finished extracting targets (OOCs)
(1208, 4263)


In [173]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
train_data = scaler.fit_transform(train_data)

In [174]:
train_data

array([[0.        , 0.        , 0.20718095, ..., 0.21807308, 0.2448492 ,
        0.17003071],
       [0.        , 0.        , 0.34301971, ..., 0.37115245, 0.39008822,
        0.30458703],
       [0.        , 0.        , 0.31039884, ..., 0.4722585 , 0.34761265,
        0.26523564],
       ...,
       [0.        , 0.        , 0.18723287, ..., 0.38439629, 0.24057819,
        0.16607384],
       [0.        , 0.        , 0.35053555, ..., 0.42531133, 0.36530319,
        0.281625  ],
       [0.        , 0.        , 0.28140507, ..., 0.47912925, 0.33149306,
        0.25030169]])

## Save

In [57]:
with open("/content/drive/MyDrive/word2idx.txt", "w") as f:
    for k, v in train_ds.word2idx.items():
        f.write(f"{k}, {v}\n")

sparse.save_npz("/content/drive/MyDrive/sim_mat", train_ds.similarity_matrix)

## Model

In [175]:
x = keras.layers.Input(shape=(train_data.shape[1],))  # [N, V]

h = keras.layers.Dense(512, activation="relu")(x)
h = keras.layers.Dense(1024, activation="relu")(h)
h = keras.layers.Dense(2048, activation="relu")(h)
y = keras.layers.Dense(train_target_label.shape[-1], activation="sigmoid")(h)

model = keras.models.Model(x, y)

model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-5), loss=keras.losses.BinaryCrossentropy(), metrics=['accuracy'])

In [176]:
model.fit(train_data, train_target_label, batch_size=16, epochs=10) # validation_data=(val_data, val_target)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f66d6777550>

## Predictions

In [177]:
train_data[800]

array([0.        , 0.        , 0.14685103, ..., 0.17356423, 0.1892548 ,
       0.11852542])

In [230]:
x = 1000
# pred = model(np.expand_dims(train_data[x], axis=0)).numpy()
# top10 = pred.flatten().argsort()[-20:][::-1]
top10 = np.array(train_data)[x].argsort()[::-1][:10]

print("pred\n")
for i in top10.flatten():
    # print(train_ds.idx2word[i], pred[0, i])
    print(train_ds.idx2word[i])

print("\ntarget\n")
for i in np.where(train_target_label[x] == 1)[0]:
    print(train_ds.idx2word[i])

# print("\ninput\n")
# for i in train_contexts[x]:
#     print(i)

pred

moped.n.01
sport_utility.n.01
cab.n.03
minivan.n.01
hatchback.n.01
ambulance.n.01
jeep.n.01
limousine.n.01
convertible.n.01
sedan.n.01

target

back.n.01


In [226]:
from nltk.wsd import lesk

lesk("a small glass pitcher sitting on a table with a flower in it", "pitcher", "n")
wn.synsets("pitcher")[2].definition()
wn.synset("home_plate.n.01").lemma_names()

['home_plate', 'home_base', 'home', 'plate']

In [92]:
x = 12
pred = model(val_data[x]).numpy()
top10 = pred.flatten().argsort()[-10:][::-1]
# top10 = np.array(train_data)[x].argsort()[::-1][:10]

print("pred\n")
for i in top10.flatten():
    # print(train_ds.idx2word[i], pred[0, i])
    print(train_ds.idx2word[i])

print("\ntarget\n")
for i in np.where(val_target[x] == 1)[0]:
    print(train_ds.idx2word[i])

pred

top.n.01
man.n.01
group.n.01
people.n.01
field.n.01
woman.n.01
person.n.01
baseball.n.01
couple.n.01
slope.n.01

target

slope.n.01
person.n.01
snow.n.01
