In [1]:
import os
import pandas as pd
import numpy as np
import pycrfsuite
import sklearn_crfsuite
import gensim
import re

from collections import namedtuple
from scipy.spatial.distance import cosine

from pymystem3 import Mystem

from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

from sklearn.model_selection import train_test_split, ShuffleSplit

Маппинг тем

In [2]:
# Topic mapping: position in the list is a topic index (as produced by
# argmax over the topic-probability matrix), the value is the label or list
# of labels it corresponds to.
# NOTE(review): values are presumably assessor topic ids and -1 presumably
# marks "no topic" — confirm against the labelling scheme.
translation = [27, 37, 19, [3, 16, 33], 11, 25, 21, 6, 29, [13, 33], 17, 25, 15, 22, 28, 41, 18, 40, 23, [7, 8], 31,
 20, 17, 32, 25, 38, 14, 25, 36, 25, 4, 24, 5, [7, 8], 25, 12, 9, 25, 24, 26, 30, 35, 13, 3,
 1, 14, 2, 17, 10, 0, 5, 21, 12, -1]

# Manual overrides of individual entries (applied before the deletion below,
# so the indices refer to positions in the literal above).
translation[27] = [25, 42, 34, 33, 12]
translation[22] = [17, 18]
translation[12] = [15, 27]
translation[43] = [3, 34, 12]
translation[21] = [20, 12]
translation[25] = [38, 12, 25]
translation[42] = [13, 16]
translation[10] = [17, 38]
translation[24] = [25, 37]

# Remove entry 47 entirely; every later entry shifts down by one position.
del translation[47]

# Normalise every entry to a set so single labels and label lists can be
# handled uniformly. An explicit type check replaces the former bare `except:`,
# which would also have hidden unrelated errors.
translation_set = [
    set(element) if isinstance(element, (list, tuple, set)) else {element}
    for element in translation
]
translation = translation_set[:]

In [3]:
# Spot-check: entry 1 of the mapping after every entry was converted to a set.
translation[1]

{37}

Лемматизация

In [4]:
# Single shared Mystem analyzer instance; stem() calls its analyze() method.
mystem = Mystem()

def stem(s):
    """Analyze `s` with Mystem and return one 4-tuple per non-blank token.

    Each tuple is ``(text, lex, pos, grammemes)``:
      * text      - the token text with surrounding whitespace stripped;
      * lex       - the ``lex`` field of the first analysis;
      * pos       - the leading run of uppercase ASCII letters of the first
                    analysis' ``gr`` string;
      * grammemes - the remaining word-like pieces of ``gr`` (everything
                    after the first), de-duplicated and comma-joined.

    Tokens without an ``analysis`` entry (or with an empty one) get empty
    strings for the last three fields. Tokens that are blank after stripping
    are dropped.
    """
    result = []
    for entry in mystem.analyze(s):
        token_text = entry['text'].strip()
        if len(token_text) == 0:
            continue
        if 'analysis' in entry and len(entry['analysis']) > 0:
            first = entry['analysis'][0]
            lemma = first['lex']
            pos_tag = re.match('^([A-Z]+)', first['gr']).group(0)
            # The set removes duplicates; the join order is therefore not fixed.
            grammemes = ','.join(set(re.findall(r"[\w']+", first['gr'])[1:]))
        else:
            lemma = pos_tag = grammemes = ''
        result.append((token_text, lemma, pos_tag, grammemes))
    return result

def get_pos_tag(word):
    """Return the part-of-speech tag of the first token that stem() finds in `word`.

    Returns an empty string when stem() yields no tokens at all (for example
    for an empty or whitespace-only `word`) instead of raising IndexError.
    The tag itself may also be '' when the first token has no analysis.
    """
    tokens = stem(word)
    if not tokens:
        return ''
    # Index 2 of a stem() tuple is the part-of-speech tag.
    return tokens[0][2]

Загрузка данных

In [5]:
# Load the raw inputs used by the rest of the notebook.
# vw.txt is read as a single string; the preprocessing cell splits it into lines.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used — confirm the file encoding (presumably UTF-8).
with open("./data/vw.txt") as fin:
    text = fin.read()
# Both pickles are looked up by phrase id in the preprocessing cell.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
topics = pd.read_pickle("./data/topics_assessors.pkl")
ptdw = pd.read_pickle("./data/ptdw")
# Pre-trained Word2Vec model, queried with "<word>_<POS>" keys by meta_word2vec().
w2v_model = gensim.models.Word2Vec.load("./data/transcriptions_w2v")

In [6]:
# Sanity check: the model vocabulary is keyed by "<word>_<POS>" strings.
# NOTE(review): newer gensim releases expect membership tests on
# `w2v_model.wv` rather than on the model itself — confirm against the
# installed version.
"банк_S" in w2v_model

True

Предобработка данных

In [7]:
# Build three parallel lists, one entry per labelled phrase:
#   phrases     - list of word tokens,
#   topic_probs - 2-D array of topic probabilities with one column per word,
#   labels      - the entry of `topics` for that phrase.
phrases, topic_probs, labels = [], [], []
for index, line in enumerate(text.split("\n")):
    temp = line.split()
    if not temp:
        # Blank line (e.g. produced by a trailing newline): nothing to parse.
        continue
    phrase_id = temp[0]
    # temp[1] is skipped — presumably the Vowpal Wabbit namespace marker; confirm.
    phrase = temp[2:]
    # `.values` replaces the removed `.as_matrix()` and returns the same ndarray.
    probs = ptdw[phrase_id].values
    if probs.ndim < 2:
        # A single-word phrase yields a 1-D array; make it a one-column matrix.
        probs = np.expand_dims(probs, axis=1)
    phrase_size = probs.shape[1]
    if phrase_size != len(phrase):
        raise ValueError(
            "line {}: phrase {!r} has {} words but {} probability columns".format(
                index, phrase_id, len(phrase), phrase_size))
    # Keep only phrases that have labels.
    if phrase_id in topics:
        phrases.append(phrase)
        topic_probs.append(probs)
        labels.append(topics[phrase_id])

In [8]:
def argmax_topic_segmentation(topic_prob):
    """Return, for every column of `topic_prob`, the row index of its largest value.

    `topic_prob` is a 2-D array; the result is a list with one index per
    column (rows are topics and columns are words in the way this notebook
    builds the matrix).
    """
    best_rows = []
    for column_index in range(topic_prob.shape[1]):
        column = topic_prob[:, column_index]
        best_rows.append(np.argmax(column))
    return best_rows

In [9]:
# Spot-check: map the most probable topic of each word of phrase 1 through
# the translation table.
[translation[i] for i in argmax_topic_segmentation(topic_probs[1])]

[{2}, {22}, {12}, {27}]

In [10]:
# Scratch check: splitting on "_" returns the whole string as a single part
# when it contains no underscore.
"23a".split("_")

['23a']

Генерация признаков

In [11]:
def meta_word2vec(meta_word, w2v_model):
    """Average the embeddings of the "_"-separated parts of `meta_word`.

    Each non-empty part is looked up under the key "<part>_<POS>", where the
    POS tag comes from get_pos_tag(). Parts without an embedding are ignored.

    Parameters:
        meta_word: a word or several words joined with "_".
        w2v_model: either an object exposing its vectors as ``.wv`` (e.g. a
            gensim Word2Vec model) or any mapping-like object supporting
            ``key in obj`` and ``obj[key]``.

    Returns:
        A float array with the mean vector, or None when no part has an
        embedding.
    """
    # Prefer `.wv` when present: direct lookups on the model object are
    # deprecated/removed in newer gensim releases.
    vectors = getattr(w2v_model, "wv", w2v_model)
    total = None
    length = 0
    for w in meta_word.split("_"):
        if not w:
            # "a__b" or a trailing "_" produces empty parts; nothing to look up.
            continue
        with_pos = w + "_" + get_pos_tag(w)
        if with_pos in vectors:
            vec = np.asarray(vectors[with_pos], dtype=float)
            # Size the accumulator from the first vector instead of
            # hard-coding the embedding dimension.
            total = vec.copy() if total is None else total + vec
            length += 1
    if length == 0:
        return None
    return total / length
    
def distance_vector(phrase, index, w2v_model):
    """Cosine distances from the word at `index` to every word of `phrase`.

    Parameters:
        phrase: sequence of (meta-)words.
        index: position in `phrase` of the reference word.
        w2v_model: model passed through to meta_word2vec().

    Returns:
        A list with one entry per word of `phrase`: the cosine distance
        between that word's vector and the reference vector, or None when
        either of the two has no embedding.
    """
    source_vector = meta_word2vec(phrase[index], w2v_model)
    if source_vector is None:
        # Without a reference vector no distance can be computed.
        return [None] * len(phrase)
    distances = []
    for word in phrase:
        # Fixed: previously called the undefined name `metaword2vec` and
        # omitted the model argument.
        word_vector = meta_word2vec(word, w2v_model)
        if word_vector is None:
            distances.append(None)
        else:
            distances.append(cosine(word_vector, source_vector))
    return distances

In [12]:
def phrase_to_features(phrase, topic_prob=None, w2v_model=None):
    """Build one feature dict per word of `phrase`.

    Parameters:
        phrase: sequence of words.
        topic_prob: optional 2-D array indexed as ``topic_prob[topic][word]``;
            when given, adds a "topic <t>" feature for every row.
        w2v_model: optional embedding model passed to meta_word2vec(); when
            given, adds the cosine distance to the previous ("-1 w2v") and
            next ("+1 w2v") word.

    Returns:
        A list of dicts, one per word. Every dict has a "word" entry. A
        distance feature is present only when both the word and that
        neighbour have an embedding.
    """
    # Embed every word exactly once; each vector is needed for the word itself
    # and for both of its neighbours.
    if w2v_model is not None:
        vectors = [meta_word2vec(word, w2v_model) for word in phrase]
    else:
        vectors = None

    last_index = len(phrase) - 1
    features = []
    for word_index, word in enumerate(phrase):
        word_features = {"word": word}
        if topic_prob is not None:
            for t in range(topic_prob.shape[0]):
                word_features["topic {}".format(t)] = topic_prob[t][word_index]
        if vectors is not None and vectors[word_index] is not None:
            current = vectors[word_index]
            if word_index > 0:
                left_word = vectors[word_index - 1]
                if left_word is not None:
                    word_features["-1 w2v"] = cosine(left_word, current)
            if word_index < last_index:
                right_word = vectors[word_index + 1]
                if right_word is not None:
                    word_features["+1 w2v"] = cosine(right_word, current)
        features.append(word_features)
    return features

Кросс-валидация модели (5 случайных разбиений, ShuffleSplit)

In [15]:
# A named experiment setting: `name` is used in reports, `value` is the
# object handed to the feature builder.
Param = namedtuple("Param", ["name", "value"])

# Feature-set variants to evaluate. Only the "enabled" variants are active.
# To run an ablation, add the disabled counterparts:
#   Param("Topics disabled", [None for i in range(len(phrases))])
#   Param("w2v disabled", None)
topic_variants = [Param(name="Topics enabled", value=topic_probs)]
w2v_variants = [Param(name="w2v enabled", value=w2v_model)]

params = {
    "topics": topic_variants,
    "w2v": w2v_variants,
}

In [16]:
%%time
params_scores = {}
rs = ShuffleSplit(n_splits=5)
splits_indexes = list(rs.split(list(range(len(phrases)))))
for topic_param in params["topics"]:
    rs = ShuffleSplit(n_splits=5)
    for w2v_param in params["w2v"]:
        X_data = [phrase_to_features(phrases[i], topic_param.value[i], w2v_param.value) for i in range(len(phrases))]
        y_data = [[str(x) for x in labels[i]] for i in range(len(labels))]
        scores = []
        for train_indexes, test_indexes in splits_indexes:
            X_train, X_test, y_train, y_test = [], [], [], []
            for train_index in train_indexes:
                X_train.append(X_data[train_index])
                y_train.append(y_data[train_index])
            for test_index in test_indexes:
                X_test.append(X_data[test_index])
                y_test.append(y_data[test_index])
            crf = sklearn_crfsuite.CRF(algorithm="lbfgs", max_iterations=100, 
                                   all_possible_transitions=True, all_possible_states=True)
            crf.fit(X_train, y_train)
            y_pred = crf.predict(X_test)
            scores.append(metrics.flat_f1_score(y_test, y_pred, average="weighted"))
        score = np.mean(scores)
        print("Topic param: {}, w2v_param : {} --- {}".format(topic_param.name, w2v_param.name, score))
        params_scores[(topic_param.name, w2v_param.name)] = score

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Topic param: Topics enabled, w2v_param : w2v enabled --- 0.5870079509871325
CPU times: user 53.7 s, sys: 144 ms, total: 53.8 s
Wall time: 54.7 s


0.59124186393515565