## Загрузка данных

In [None]:
import re
from conllu import parse
from conllu import models
from itertools import product
from collections import defaultdict
from typing import Iterable

In [None]:
from math import inf

In [None]:
# Path to the CoNLL-U treebank file; it is opened as UTF-8 text and parsed further down.
conllu_path = 'en_ewt-ud-train.conllu'

In [None]:
# # чтобы сверяться с grew-match
# dev_path = 'en_ewt-ud-dev.conllu.txt'
# train_path = 'en_ewt-ud-train.conllu'
# test_path = 'en_ewt-ud-test.conllu.txt'

# with open(dev_path, 'r', encoding='utf-8') as f:
#     devfile = f.read()
# dev = parse(devfile)

# with open(train_path, 'r', encoding='utf-8') as f:
#     trainfile = f.read()
# train = parse(trainfile)

# with open(test_path, 'r', encoding='utf-8') as f:
#     testfile = f.read()
# test = parse(testfile)

In [None]:
# Read the whole treebank into memory, release the file handle,
# and only then hand the text to the CoNLL-U parser.
with open(conllu_path, encoding='utf-8') as handle:
    conllufile = handle.read()
sentences = parse(conllufile)

In [None]:
# Keep one sentence around for the (commented-out) usage examples in this notebook.
some_sentence = sentences[23]
# Bare expression: displays the raw sentence text when run as a notebook cell.
some_sentence.metadata['text']

## Проверка на узлы

In [None]:
def token_match_node(token: models.Token, node_pattern: dict) -> bool:
    """Check whether a token satisfies every condition of a node pattern.

    Each key of ``node_pattern`` is resolved in this order:

    1. a top-level token field (``form``, ``lemma``, ``upos`` ...): the value
       must match the regex from its start, case-insensitively;
    2. a key of the token's ``feats`` mapping: the value must match the regex
       from its start, case-sensitively;
    3. the special key ``"exclude"``: its value is an iterable of feature
       names, none of which may be present in the token's ``feats``;
    4. anything else: the token does not match.

    A token without any features trivially satisfies ``"exclude"`` (it has
    none of the forbidden features).  Field or feature values that are
    ``None`` never match; other non-string field values are matched through
    their ``str()`` form instead of raising ``TypeError``.

    Returns ``True`` when all conditions hold (an empty pattern matches any
    token), ``False`` otherwise.
    """
    # Normalise a missing/empty feature set to an empty dict so that every
    # branch below can be written without a separate "has feats" check.
    feats = token.get('feats') or {}

    for feat, pattern in node_pattern.items():
        if feat in token:
            value = token[feat]
            if value is None or not re.match(pattern, str(value), re.I):
                return False
        elif feat in feats:
            value = feats[feat]
            if value is None or not re.match(pattern, str(value)):
                return False
        elif feat == "exclude":
            if any(excluded in feats for excluded in pattern):
                return False
        else:
            return False
    return True

In [None]:
# # пример для token_match_node

# ttoken = sentences[23][6]
# node_patterns = [{'upos': '^AUX$', 'Number': '^Plur$'}, # есть признаки, но не то значение
#                 {'upos': '^AUX$', 'Number': '^Sing$'}, # подходит
#                 {'NumType': '^.*$'}] # нет такого признака у токена

# for np in node_patterns:
#     print(np, token_match_node(ttoken, np))
# ttoken

In [None]:
def pretty_print(found_nodes, sentence):
    """Print the sentence on one line with the matched tokens in bold.

    ``found_nodes`` maps node names to lists of token indices.  A token is
    wrapped in ANSI bold escapes when its integer ``id`` minus one appears in
    any of those lists; tokens whose ``id`` is not an int are printed as is.
    Every word, including the last one, is followed by a single space.
    """
    highlighted = [idx for indices in found_nodes.values() for idx in indices]
    pieces = []
    for tok in sentence:
        form = tok['form']
        tok_id = tok['id']
        if isinstance(tok_id, int) and tok_id - 1 in highlighted:
            form = '\033[1m' + form + '\033[0m'
        pieces.append(form + ' ')
    print(''.join(pieces))

In [None]:
def search_suitable_tokens(token_list: models.TokenList, node_pattern: dict) -> list:
    """Return the indices of the tokens that match ``node_pattern``.

    Only rows with an integer ``id`` are considered; the returned values are
    ``id - 1``, in sentence order.  Rows with any other kind of id
    (presumably multiword-token ranges or empty nodes) are skipped *before*
    the pattern is applied, so their fields are never fed to the matcher.
    """
    return [
        token['id'] - 1
        for token in token_list
        # The id check comes first on purpose: it short-circuits the match
        # for rows that could never be returned anyway.
        if isinstance(token['id'], int) and token_match_node(token, node_pattern)
    ]

In [None]:
# # for pretty_print() and search_suitable_tokens()
# print(some_sentence.serialize())
# print(search_suitable_tokens(some_sentence, {'Number': 'Sing'}))
# pretty_print({'N': search_suitable_tokens(some_sentence, {'Number': 'Sing'})}, some_sentence)

In [None]:
def find_all_nodes(nodes: dict, sentence: models.TokenList):
    """Collect the candidate tokens for every node of the pattern.

    Returns ``{node_name: [token indices]}`` when each node has at least one
    candidate in the sentence.  As soon as a node turns out to have no
    candidate the search stops and ``False`` (not an empty dict) is returned.
    """
    candidates = {}
    for name, pattern in nodes.items():
        matched = search_suitable_tokens(sentence, pattern)
        if not matched:
            return False #changed {} to False
        candidates[name] = matched
    return candidates

In [None]:
# #пример 
# node_pattern = {
#     'N': {'NumType': '^Card$'},
#     'M': {'upos': '^aux$'},
# }
# for sentence in sentences:
#     if find_all_nodes(node_pattern, sentence):
#         pretty_print(find_all_nodes(node_pattern, sentence), sentence)

## Проверка на ограничения

### Relations

In [None]:
def all_deprels(token_list: models.TokenList) -> defaultdict:
    """Group the dependency arcs of a sentence by relation label.

    Returns a ``defaultdict(list)`` mapping each ``deprel`` to a list of
    ``(head - 1, id - 1)`` pairs in sentence order.  Tokens whose ``head`` or
    ``id`` is not an int are left out.
    """
    arcs = defaultdict(list)
    for tok in token_list:
        if not isinstance(tok['head'], int):
            continue
        if not isinstance(tok['id'], int):
            continue
        arc = (tok['head'] - 1, tok['id'] - 1)
        arcs[tok['deprel']].append(arc)
    return arcs

In [None]:
# #пример
# from pprint import pprint

# print(some_sentence.metadata['text'])
# pprint(all_deprels(some_sentence))

In [None]:
def pattern_relations(rel_pattern: str, sent_rels: defaultdict):
    """List the relation names of the sentence that the regex finds a hit in.

    The pattern is applied with ``re.search``, so it may match anywhere in the
    name unless it is anchored.  Names are returned in the iteration order of
    ``sent_rels``.
    """
    return [name for name in sent_rels if re.search(rel_pattern, name)]

In [None]:
# # пример
# p = r'mod$'
# pattern_relations(p, all_deprels(some_sentence))

In [None]:
def tokens_with_rel(rel_name: str, sent_rels: defaultdict, possible_pairs: Iterable[tuple]) -> set: 
    """Select the candidate pairs that are linked by the relation ``rel_name``.

    ``sent_rels`` maps relation names to lists of ``(head, dependent)`` pairs.
    The result is the set of those pairs that also occur in
    ``possible_pairs``.  When the sentence has no such relation an empty set
    is returned, so the result can always be combined with other sets.
    """
    # Explicit membership test instead of indexing: indexing a defaultdict
    # would silently insert the missing relation name.
    if rel_name not in sent_rels:
        return set()
    return set(sent_rels[rel_name]).intersection(possible_pairs)

In [None]:
# # пример
# ex_rel_name = 'advmod'
# ex_sent_rels = all_deprels(some_sentence)
# ex_possible_pairs = list(product(list(range(0, 30)), list(range(0, 30))))
# tokens_with_rel(ex_rel_name, ex_sent_rels, ex_possible_pairs)

In [None]:
def relpattern_tokens(possible_pairs: Iterable[tuple], sentence:models.TokenList, rel_pattern: str) -> set:
    """Keep the candidate pairs linked by a relation matching ``rel_pattern``.

    All relations of the sentence whose name matches the regex are looked up,
    and the union of the candidate pairs joined by any of them is returned.
    """
    arcs_by_rel = all_deprels(sentence)
    matched = set()
    for rel_name in pattern_relations(rel_pattern, arcs_by_rel):
        matched |= tokens_with_rel(rel_name, arcs_by_rel, possible_pairs)
    return matched

In [None]:
# # example
# ex_rel_pattern = r'mod$'
# ex_possible_pairs = list(product(list(range(0, 30)), list(range(0, 30))))
# relpattern_tokens(ex_possible_pairs, some_sentence, ex_rel_pattern)

### Linear distance

In [None]:
def linear_distance(possible_tokens_pairs: Iterable[tuple], lindist: tuple) -> set:
    """Keep the pairs whose signed distance lies inside ``lindist``.

    The distance of a pair is ``pair[1] - pair[0]`` (negative when the second
    element precedes the first).  ``lindist`` is ``(min_distance,
    max_distance)``; both bounds are inclusive.
    """
    return {
        pair
        for pair in possible_tokens_pairs
        if lindist[0] <= pair[1] - pair[0] <= lindist[1]
    }

In [None]:
# # example
# ex_tokens_pairs = [(2, 3), (4, 8), (4, 5), (2, 7)]
# linear_distance(ex_tokens_pairs, (1, 4))

### Совпадение/Несовпадение значений признаков

In [None]:
def pair_match_fconstraint(token_pair: tuple, sentence: models.TokenList, c_pattern: dict) -> bool:
    """Check a pair of tokens against agreement constraints on their features.

    ``token_pair`` holds two indices into ``sentence``.  ``c_pattern`` maps a
    constraint kind to the feature names it applies to:

    * ``'intersec'`` - both tokens must carry the same value of the feature;
    * ``'disjoint'`` - the tokens must carry different values of the feature.

    The feature names may be given as a list or as a single string, so
    ``{'intersec': 'Number'}`` means the same as ``{'intersec': ['Number']}``.

    Returns ``False`` when either token has no features at all or when a
    listed feature is missing from either token.  Constraint kinds other than
    the two above add no value check, but their features must still be
    present on both tokens.
    """
    first_feats = sentence[token_pair[0]]['feats']
    second_feats = sentence[token_pair[1]]['feats']
    if not (first_feats and second_feats):
        return False

    for kind, feat_names in c_pattern.items():
        # A bare string would otherwise be iterated character by character
        # and could never name a real feature.
        if isinstance(feat_names, str):
            feat_names = [feat_names]
        for name in feat_names:
            if name not in first_feats or name not in second_feats:
                return False
            same_value = first_feats[name] == second_feats[name]
            if kind == 'intersec' and not same_value:
                return False
            if kind == 'disjoint' and same_value:
                return False
    return True

In [None]:
# # пример
# constr = {'intersec': ['Number']}
# print(some_sentence[20], some_sentence[25], some_sentence[27])
# pair1, pair2 = (27, 25), (20, 25)
# pair_match_fconstraint(pair1, some_sentence, constr), pair_match_fconstraint(pair2, some_sentence, constr)

In [None]:
def feature_constraint(possible_token_pairs: Iterable[tuple], sentence: models.TokenList, constr_pattern: dict) -> set:
    """Keep the candidate pairs that satisfy the feature constraints.

    Every pair is checked with ``pair_match_fconstraint``; the pairs that pass
    are returned as a set.
    """
    return {
        candidate
        for candidate in possible_token_pairs
        if pair_match_fconstraint(candidate, sentence, constr_pattern)
    }

### Сопоставление со всеми ограничениями

In [None]:
def match_constraints(nodes_constraints: dict, nodes_tokens: dict, sentence: models.TokenList) -> "dict | bool":
    """Narrow the candidate tokens of each node down to those meeting the constraints.

    ``nodes_constraints`` maps a pair of node names to a dict of constraints
    for that pair.  Supported constraint kinds:

    * ``'deprels'`` - regex over the relation between the two tokens
      (first node is the head);
    * ``'lindist'`` - ``(min, max)`` bounds on the signed linear distance;
    * ``'fconstraint'`` - agreement constraints on feature values.

    ``nodes_tokens`` maps node names to lists of candidate token indices.  It
    is not modified; the narrowing is done on a copy.

    Returns the narrowed mapping (candidate lists sorted ascending), or
    ``False`` as soon as some pair of nodes has no token pair left.

    Raises ``ValueError`` for an unknown constraint kind.

    NOTE(review): candidates are filtered pair by pair.  That guarantees every
    surviving token takes part in a valid pair for each constraint it was
    checked against, but not that a single assignment of tokens to *all*
    nodes satisfies every constraint at once.
    """
    # Work on a private copy so that a failed match does not leave the
    # caller's mapping half-narrowed.
    narrowed = {name: list(tokens) for name, tokens in nodes_tokens.items()}

    for node_pair, pair_constraints in nodes_constraints.items():
        first, second = node_pair[0], node_pair[1]
        # Start from every combination of the current candidates of both nodes.
        pairs = list(product(narrowed[first], narrowed[second]))
        for kind, value in pair_constraints.items():
            if kind == 'deprels':
                pairs = relpattern_tokens(pairs, sentence, value)
            elif kind == 'lindist':
                pairs = linear_distance(pairs, value)
            elif kind == 'fconstraint':
                pairs = feature_constraint(pairs, sentence, value)
            else:
                raise ValueError('wrong constraint type')

            if not pairs:
                return False
            # Drop the tokens that no longer take part in any surviving pair.
            narrowed[first] = sorted({pair[0] for pair in pairs})
            narrowed[second] = sorted({pair[1] for pair in pairs})
    return narrowed

In [None]:
def filter_sentence(sentence: models.TokenList, nodes_pattern: dict, constraints: dict) -> bool:
    """Match a sentence against a full pattern (nodes plus constraints).

    First the candidate tokens of every node are looked up, then they are
    narrowed by the constraints.  Returns the mapping from node name to the
    list of surviving tokens when the sentence matches, ``False`` otherwise
    (despite the ``bool`` annotation, a successful match yields the mapping).
    """
    candidates = find_all_nodes(nodes_pattern, sentence)
    if not candidates:
        return False

    narrowed = match_constraints(constraints, candidates, sentence)
    return narrowed if narrowed else False

In [None]:
# Example query: node N must carry NumType=Card, node M may be any token
# (an empty pattern puts no restriction on the token).
n_pattern = {
    'N': {'NumType': '^Card$'}, 
    'M': {}
}
# The pair (M, N) must be linked by a `nummod` relation with M as the head.
r_pattern = {('M', 'N'): {'deprels': '^nummod$'}}
# Show the raw text of one sentence, then the result of matching it.
print(sentences[2].metadata['text'])
filter_sentence(sentences[2], n_pattern, r_pattern)

In [None]:
# Run the example query over the whole corpus and print every matching
# sentence with the matched tokens highlighted.
for sentence in sentences:
    tokens = filter_sentence(sentence, n_pattern, r_pattern)
    if tokens:
        pretty_print(tokens, sentence)

## Примеры

In [None]:
# Example queries.  The three lists below are parallel: the i-th name, the
# i-th node pattern and the i-th constraint set describe one query.
patterns_names = ['passive with by Agent', 'all the', '2+amod', 'SOmatchnumber']
node_patterns = [
    {
        'V': {},
        'S': {},
        'BY': {'lemma': '^by$'},
        'N': {},
    },
    {
        'A': {'lemma': '^all$'},
        'T': {'lemma': '^the$'},
    },
    {
        'N': {},
        'M': {'upos': 'ADJ'},
    },
    {
        'S': {},
        'V': {},
        'O': {},
    },
]
constraints = [
    {
        ('V', 'S'): {'deprels': '^aux:pass$'},
        ('V', 'N'): {'deprels': '^obl$'},
        ('N', 'BY'): {'deprels': '^case$'},
    },
    {
        # T directly follows A.
        ('A', 'T'): {'lindist': (1, 1)},
    },
    {
        # M precedes N by at least two positions.
        ('N', 'M'): {'deprels': '^amod$', 'lindist': (-inf, -2)},
    },
    {
        ('V', 'S'): {'deprels': '^.subj$'},
        ('V', 'O'): {'deprels': '^obj$'},
        # Feature names are given as a list: a bare string would be iterated
        # character by character by the feature-constraint check.
        ('S', 'O'): {'fconstraint': {'intersec': ['Number']}},
    },
]
# One (name, node pattern, constraints) tuple per query.
tasks = list(zip(patterns_names, node_patterns, constraints))

In [None]:
def probing_dict(class_label: str, sentences: Iterable[models.TokenList],
                nodes_pattern: dict, constraints: dict) -> dict:
    """Gather the texts of all sentences that match the pattern.

    Returns a ``defaultdict(list)`` of the form ``{class_label: [texts]}``.
    The label is only added once a sentence matches, so the result is empty
    when nothing matches.
    """
    collected = defaultdict(list)
    for sent in sentences:
        if not filter_sentence(sent, nodes_pattern, constraints):
            continue
        collected[class_label].append(sent.metadata['text'])
    return collected

In [None]:
# Collect the matching sentences for the first example query
# ('passive with by Agent'); each task is a (name, node pattern, constraints) tuple.
task = tasks[0]
probing_dict(task[0], sentences, task[1], task[2])

In [None]:
# Switch to the third example query ('2+amod').
task = tasks[2]

In [None]:
# Print every sentence matching the currently selected task, with the
# matched tokens highlighted.
for sentence in sentences:
    tokens = filter_sentence(sentence, task[1], task[2])
    if tokens:
        pretty_print(tokens, sentence)