In [None]:
%load_ext autoreload
%autoreload 2

#### Check SimpleQuestions dataset path:

In [None]:
! ls ../uopenie_qa/SimpleWikidataQuestions/csv\ decoded

#### Load dataset and annotate with stanza or CoreNLP:

In [None]:
import numpy as np
import pandas as pd
from features_extractor import TripletsParserStanza, TripletsParserCoreNLP

parser = TripletsParserCoreNLP('', verbose=True)

In [None]:
# NOTE(review): in the visible notebook `data` is first assigned in the next cell,
# so this preview raises NameError on a fresh kernel (Restart & Run All).
data['test'].head()

In [None]:
# Annotate every dataset split, then store the annotated frame (pickle) and the
# three feature arrays (.npy) next to the source CSV.
data = {}

for part in ["train", "valid", "test"]:
    print(f"Process {part}... ", end="", flush=True)
    path = f"../uopenie_qa/SimpleWikidataQuestions/csv decoded/annotated_wd_data_{part}_answerable_decoded.csv"
    data[part] = pd.read_csv(path)
    data[part] = parser.annotate(data[part])
    data[part].to_pickle(path.replace('.csv', '_annotated.pkl'))
    features = parser.extract_features(data[part])
    for i, name in enumerate(["object", "subject", "relation"]):
        # Write through a context manager so each output file is flushed and
        # closed deterministically instead of leaking an open handle.
        with open(path.replace('.csv', f'_{name}_features.npy'), 'wb') as features_file:
            np.save(features_file, features[i])
    print('[Done]')

In [None]:
# Report the dimensions of every entry stored in `data`.
for part_name, frame in data.items():
    print(f"{part_name} shape:\t{frame.shape}")

In [None]:
from features_extractor import TripletsParser

parser = TripletsParser()

for key in data.keys():
    # Index with the loop variable: the string literal 'key' is not one of the
    # entries of `data`, so the original lookup raised KeyError.
    data[key] = parser.annotate(data[key])
    # Hand the annotated frame to the extractor, mirroring how
    # `parser.extract_features(data[part])` is called when the splits are loaded.
    # NOTE(review): this replaces the annotated frame with its features, as the
    # original assignment did - confirm that overwriting is intended.
    data[key] = parser.extract_features(data[key])

In [None]:
import os
import zipfile
from pathlib import Path
import wget

W2V_MODEL_PATH ='models/'
W2V_MODEL_NAME = 'wiki-news-300d-1M.vec.zip'  # 1.6G

directory = os.path.dirname(W2V_MODEL_PATH)
if not Path(directory).is_dir():
    print(f'Creating directory at {directory}',
          ' for saving word2vec pre-trained model')
    # exist_ok guards against the directory appearing between the check and the call
    os.makedirs(directory, exist_ok=True)

w2v_archive = os.path.join(directory, W2V_MODEL_NAME)
w2v_is_zipped = W2V_MODEL_NAME.endswith('.zip')
# The file that is actually loaded later: the archive with its ".zip" suffix
# stripped, or the downloaded file itself when it is not zipped.
w2v_model_file = w2v_archive[:-len('.zip')] if w2v_is_zipped else w2v_archive

# Skip all work when the usable model file is already in place.
if not Path(w2v_model_file).is_file():
    if not Path(w2v_archive).is_file():
        url = f'https://dl.fbaipublicfiles.com/fasttext/vectors-english/{W2V_MODEL_NAME}'
        print(f'Downloading word2vec pre-trained model to {w2v_archive}')
        wget.download(url, w2v_archive)
    if w2v_is_zipped:
        # The loading cell reads the extracted vectors, so unpack the archive here.
        print(f'Extracting {w2v_archive} to {directory}')
        with zipfile.ZipFile(w2v_archive) as archive:
            archive.extractall(directory)

In [None]:
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.models.wrappers import FastText

# Resolve the file to load: a ".zip" name refers to the extracted file that
# carries the same name without the ".zip" suffix.
w2v_is_zipped = W2V_MODEL_NAME.endswith('.zip')
w2v_file = W2V_MODEL_PATH + W2V_MODEL_NAME
if w2v_is_zipped:
    w2v_file = w2v_file[:-len('.zip')]

if w2v_is_zipped or w2v_file.endswith(('.vec', '.bin', '.bin.gz')):
    # word2vec exchange format; the binary flag is derived from the name of the
    # file that is really opened (previously the ".zip" branch tested the archive
    # suffix, so a zipped ".bin" model was always read as text).
    word2vec_model = KeyedVectors.load_word2vec_format(
        w2v_file, binary=w2v_file.endswith(('.bin', '.bin.gz')))
else:
    # Anything else is treated as a model saved with gensim's own save().
    word2vec_model = Word2Vec.load(w2v_file)

# Read the dimensionality from the model instead of probing the vector of 'tree':
# this does not require that word to be in the vocabulary and avoids the
# deprecated `.wv` alias on KeyedVectors.
word2vec_vector_length = word2vec_model.vector_size

In [None]:
import stanza

# English stanza pipeline; `nlp` is applied to each decoded triple member below.
stanza_options = {'lang': 'en', 'processors': 'tokenize,lemma,mwt,pos,ner'}
nlp = stanza.Pipeline(**stanza_options)

In [None]:
%%time

data['subject_annot'] = data.subject_decoded.map(nlp)
data['property_annot'] = data.property_decoded.map(nlp)
data['object_annot'] = data.object_decoded.map(nlp)

In [None]:
! ls annotated*

In [None]:
# Persist the annotated frame in the current working directory.
data.to_pickle('annotated_test_data.pkl')

In [None]:
def _extract_plain_features(row):
    """Build feature dictionaries for the subject, relation and object of one row.

    Parameters
    ----------
    row : object exposing ``subject_annot``, ``property_annot`` and
        ``object_annot`` attributes (e.g. a DataFrame row). Each annotation must
        provide ``.sentences`` (whose items have ``.tokens`` and ``.words``) and
        ``.ents`` - presumably stanza documents produced by the ``nlp`` pipeline.

    Returns
    -------
    tuple
        ``(subjects, relations, objects)``; each is a one-element list holding a
        dict with the keys ``'tokens'``, ``'lemmas'``, ``'ner'``, ``'postag'``
        and ``'w2v'``. The three matrices have 3 rows (only the first three
        entities / words are encoded, missing positions stay zero) and as many
        columns as NER types, POS tags and ``word2vec_vector_length``.

    Notes
    -----
    Reads the module-level ``word2vec_model`` and ``word2vec_vector_length``.
    Only the first sentence of each annotation is used; an annotation without
    sentences yields empty token lists and all-zero matrices instead of raising.
    """
    max_items = 3  # number of leading words / entities encoded per argument

    postag_tagtypes = {
        'XPOS': ['JJ', 'CD', 'VBD', '', 'RB', 'VBN', 'PRP', 'IN', 'VBP', 'TO', 'NNP', 'VB',
                 'VBZ', 'VBG', 'POS', 'NNS', 'NN', 'MD'],
        'UPOS': ['ADJ', 'ADP', 'PUNCT', 'ADV', 'AUX', 'SYM', 'INTJ', 'CCONJ', 'X',
                 'NOUN', 'DET', 'PROPN', 'NUM', 'VERB', 'PART', 'PRON', 'SCONJ'],
    }

    ner_tagtypes = {
        'ontonotes': ['PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT',
                      'WORK_OF_ART', 'LAW', 'LANGUAGE', 'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY',
                      'ORDINAL', 'CARDINAL'],
        'corenlp': ['TITLE', 'COUNTRY', 'DATE', 'PERSON', 'ORGANIZATION', 'MISC',
                    'LOCATION', 'NUMBER', 'CAUSE_OF_DEATH', 'NATIONALITY', 'ORDINAL',
                    'DURATION', 'CRIMINAL_CHARGE', 'CITY', 'RELIGION',
                    'STATE_OR_PROVINCE', 'IDEOLOGY', 'SET', 'URL', 'PERCENT', 'TIME',
                    'MONEY', 'HANDLE'],
    }

    def _one_hot(labels, vocabulary):
        """One-hot encode the first ``max_items`` labels; unknown labels give a zero row."""
        encoded = np.zeros((max_items, len(vocabulary)))
        for position, label in enumerate(labels[:max_items]):
            encoded[position] = [int(known == label) for known in vocabulary]
        return encoded

    def get_postags_sequence(sequence, tagtype='UPOS'):
        """Encode a list of POS tags against the chosen tag inventory."""
        return _one_hot(sequence, postag_tagtypes[tagtype])

    def get_ner_occurrences(ner_annot, tagtype='ontonotes'):
        """Encode the types of the leading entity mentions against the chosen inventory."""
        mentions = [entity.type for entity in ner_annot[:max_items]]
        return _one_hot(mentions, ner_tagtypes[tagtype])

    def _embed(placeholder, words):
        """Fill ``placeholder`` row by row with the vectors of the known words."""
        for position, word in enumerate(words[:len(placeholder)]):
            # Falsy lemmas and out-of-vocabulary words keep their zero row.
            if word and word in word2vec_model:
                placeholder[position, :] = word2vec_model[word]
        return placeholder

    def _describe(annotation):
        """Collect tokens, lemmas and the three feature matrices for one argument."""
        if annotation.sentences:
            first_sentence = annotation.sentences[0]
            tokens = [token.text for token in first_sentence.tokens]
            lemmas = [word.lemma for word in first_sentence.words]
            upos = [word.upos for word in first_sentence.words]
        else:
            # Nothing was segmented (e.g. empty text): fall back to empty lists.
            tokens, lemmas, upos = [], [], []

        described = {
            'tokens': tokens,
            'lemmas': lemmas,
            'ner': get_ner_occurrences(annotation.ents),
            'postag': get_postags_sequence(upos),
        }
        described['w2v'] = _embed(np.zeros((max_items, word2vec_vector_length)), lemmas)
        return described

    subjects = [_describe(row.subject_annot)]
    relations = [_describe(row.property_annot)]
    objects = [_describe(row.object_annot)]

    return subjects, relations, objects

In [None]:
data.head()

In [None]:
# NOTE(review): in the visible notebook `res` is first assigned in the next cell
# (`res = data.apply(_extract_features, axis=1)`), so this preview fails on a fresh kernel.
res.head()

In [None]:
def extract_matrix(row):
    """Join the NER, POS-tag and word2vec matrices of ``row[0]`` side by side and flatten them."""
    entry = row[0]
    blocks = [entry[name] for name in ('ner', 'postag', 'w2v')]
    return np.concatenate(blocks, axis=1).flatten()

def extract_one_matrix(row):
    """Concatenate the flattened subject, relation and object vectors, in that order."""
    pieces = [extract_matrix(row[part]) for part in ('subject', 'relation', 'object')]
    return np.concatenate(pieces, axis=0)

def _extract_features(document):
    """Wrap the output of ``_extract_plain_features`` in a dict keyed by argument role."""
    subject, relation, obj = _extract_plain_features(document)
    return {'subject': subject, 'relation': relation, 'object': obj}

# Row-wise feature dicts first, then one flat vector per row.
# NOTE(review): the same two statements are repeated in separate cells further down.
res = data.apply(_extract_features, axis=1)
features = res.apply(extract_one_matrix).values

In [None]:
res.iloc[0]['subject'][0]['ner'].shape

In [None]:
res

In [None]:
res = data.apply(_extract_features, axis=1)

In [None]:
features = res.apply(extract_one_matrix).values

In [None]:
np.stack(features).shape

In [None]:
# np.save() appends ".npy" to a bare filename that lacks it, which would create
# "test_features.pkl.npy"; writing through an open handle keeps the exact name
# that is loaded later. Stacking first stores a plain 2-D numeric array rather
# than an object array that would need pickle to be read back.
# NOTE: despite the ".pkl" suffix the file is in NumPy .npy format.
with open('test_features.pkl', 'wb') as features_file:
    np.save(features_file, np.stack(features))

In [None]:
! ls *.pkl

In [None]:
import os

import numpy as np


features_path = 'test_features.pkl'
# np.save() appends ".npy" to a bare filename without that suffix, so the array
# may have been written as "test_features.pkl.npy"; fall back to that name.
if not os.path.isfile(features_path) and os.path.isfile(features_path + '.npy'):
    features_path += '.npy'

# allow_pickle is required when the saved array has dtype=object (one vector per
# row). SECURITY: unpickling executes code - only load files written by this notebook.
features = np.load(features_path, allow_pickle=True)
if features.dtype == object:
    # Turn the array of per-row vectors into one 2-D numeric matrix.
    features = np.stack(features)

In [None]:
from sklearn.cluster import KMeans

# Fix the seed: clusters are inspected by their numeric id further down
# (e.g. class 5, class 1), and those ids change between runs without it.
kmeans = KMeans(init='k-means++', n_clusters=20, n_init=10, random_state=42)
kmeans.fit(features.tolist())

In [None]:
# Store the cluster id predicted for each feature vector in a new `class` column.
# NOTE(review): assumes `features` rows are in the same order as the rows of `data` - confirm.
data['class'] = kmeans.predict(features.tolist())

In [None]:
data.head()

In [None]:
# Export the question, the decoded triple and its cluster id.
# Note: the file is tab-separated even though its name ends in ".csv".
data[["question", "subject_decoded", "property_decoded", "object_decoded", "class"]].to_csv("annotated_wd_data_test_classified.csv", sep="\t")

In [None]:
data[data['class'] == 5].property_decoded.value_counts()

In [None]:
data[data['class'] == 1].property_decoded.value_counts()

In [None]:
data[data['class'] == 1].property_decoded.unique()