### Загрузка данных

In [19]:
import re
from conllu import parse
from conllu import models
from itertools import product
from collections import defaultdict
from typing import Iterable

In [2]:
conllu_path = 'en_ewt-ud-train.conllu'

In [3]:
with open(conllu_path, 'r', encoding='utf-8') as f:
    conllufile = f.read()
sentences = parse(conllufile)

In [4]:
some_sentence = sentences[2]

In [5]:
type(some_sentence)

conllu.models.TokenList

In [6]:
node_pattern = {'N': {'upos':'VERB', 'lemma':'walk|go', 'Mood':'Ind|Imp'},
          'M': {'Number':'Plur'}}

## Проверка на узлы

_❓ - можно ли как-то убрать из регулярки ^ и $ при этом, чтобы от искал полное соответствие?_

_for Vika: проверить все ли норм при поиске с регистром_

In [35]:
def token_match_node(token: models.Token, node_pattern: str) -> bool:
    "Проверяет соответствует ли данный токен заданному паттерну"
    
    for feat in node_pattern:
        if feat in token.keys():
            if not re.search(node_pattern[feat], token[feat]): #match
                return False
        elif token['feats']:
            if feat in token['feats']:
                if not re.search(node_pattern[feat], token['feats'][feat]): #match
                    return False
        else:
            return False
    return True

In [9]:
def search_suitable_tokens(token_list: models.TokenList, node_pattern: str) -> list:
    "Ищет все токены подходящие под заданный паттерн"
    
    suitable_tokens = []
    for token in token_list:
        if token_match_node(token, node_pattern):
            suitable_tokens.append(token['id'])
    return suitable_tokens

In [46]:
def find_all_nodes(nodes: dict, sentence: models.TokenList) -> dict:
    "проверяет есть ли в предложении все заданные ноды"
    
    nodes_tokens = {}
    for node in nodes:
        sutable_tokens = search_suitable_tokens(sentence, nodes[node])
        if sutable_tokens:
            nodes_tokens[node] = sutable_tokens
        else:
            return False
    return nodes_tokens

_надо, наверное, как-то проверять, что для разных паттернов разные токены?_

## Проверка на ограничения

_❓ как лучше? пройтись по всем constraints (deprels, lindist, treedist, common_feats) и на каждом шагу выкидывать те пары, которые не подходят или проходить по каждой паре и проверять на все constraints (и соответственно если хоть одна подошла, то оставлять предложение)_

In [12]:
nodes_constraints = {("N", "M"): {'deprels': "nsubj|aux", "lindist": (-1, 1)}}

In [14]:
def all_deprels(token_list: models.TokenList) -> defaultdict:
    "создает словарь вида {'relation': (head, dependent)} из всех отношений в предложении"
    
    deprels = defaultdict(list)
    for t in token_list:
        deprels[t['deprel']].append((t['head'], t['id']))
    return deprels

In [15]:
def pattern_relations(rel_pattern: str, sent_rels: defaultdict):
    "возвращает все названия отношений, которые подходят под заданный паттерн"
    
    rels = []
    for rel in sent_rels:
        if re.search(rel_pattern, rel):
            rels.append(rel)
    return rels

In [22]:
def tokens_with_rel(rel_name: str, sent_rels: defaultdict, possible_pairs: Iterable[tuple]): #rel_in_sent
    "возвращает множество из пар токенов, между которыми отношение rel_name"
    
    if not rel_name in sent_rels:
        return False
    else:
        return set(sent_rels[rel_name]).intersection(possible_pairs)

In [23]:
def relpattern_tokens(possible_pairs: Iterable[tuple], sentence:models.TokenList, rel_pattern: str) -> set:
    "возвращает все пары токенов, которые попадают под заданный паттерн rel_pattern"
    
    sent_rels = all_deprels(sentence)
    pattern_rels = pattern_relations(rel_pattern, sent_rels)
    all_suitable_rels = set()
    for rel in pattern_rels:
        all_suitable_rels = all_suitable_rels | tokens_with_rel(rel, sent_rels, possible_pairs)
    return all_suitable_rels

In [47]:
# возвращает пары токенов, между которыми заданное расстояние
def linear_distance(possible_tokens_pairs: set, lindist: tuple) -> set:
    "возвращает пары токенов, между которыми заданное расстояние"
    
    suitable_tokens = set()
    for pair in possible_tokens_pairs:
        dist = pair[1] - pair[0]
        if dist >= lindist[0] and dist <= lindist[1]:
            suitable_tokens.add(pair)
    return suitable_tokens

In [25]:
def match_constraints(nodes_constraints: dict, nodes_tokens: dict, sentence: models.TokenList) -> bool:
    for nodes in nodes_constraints:
        suitable_pairs = list(product(nodes_tokens[nodes[0]], nodes_tokens[nodes[1]])) #всевозможные комбинации токенов для нодов
        for constraint in nodes_constraints[nodes]:
            if constraint == 'deprels':
                suitable_pairs = relpattern_tokens(suitable_pairs, sentence, nodes_constraints[nodes][constraint])
                if not suitable_pairs:
                    return False
            if constraint == 'lindist':
                suitable_pairs = linear_distance(suitable_pairs, nodes_constraints[nodes][constraint])
                if not suitable_pairs:
                    return False
            #союда надо добавить то же самое для расстояния в дереве и совпадения/несовпадения признаков 
    return True

In [42]:
def filter_sentence(sentence: models.TokenList, nodes_pattern: dict, constraints: dict) -> bool:
    "возвращает True, если предложение соответствует заданному паттерну"
    
    found_nodes = find_all_nodes(nodes_pattern, sentence)
    if not found_nodes:
        return False
    else:
        if not match_constraints(constraints, found_nodes, sentence): 
            return False
        else: 
            return True 

In [49]:
# примеры запросов
patterns_names = ['passive with by Agent', 'all the']
node_patterns = [
    {
        'V': {},
        'S': {},
        'BY': {'lemma': '^by$'},
        'N': {},
    },
    {
        'A': {'lemma': '^all$'},
        'T': {'lemma': '^the$'},
    }
]
constraints = [
    {
        ('V', 'S'): {'deprels': '^aux:pass$'},
        ('V', 'N'): {'deprels': '^obl$'},
        ('N', 'BY'): {'deprels': '^case$'},
    },
    {
        ('A', 'T'): {'lindist': (1, 1)}
    }
]

In [50]:
probing_data = defaultdict(list)
for pattern in patterns_names:
    for sentence in sentences:
        if filter_sentence(sentence, node_patterns[1], constraints[1]):
            probing_data[pattern].append(sentence.metadata['text'])

In [54]:
probing_data['passive with by Agent']

['If he or she did not, then they should have all the same rights as other Iraqis.',
 'Unreported by the international media, the Valley of Kashmir has seen an ethnic and cultural genocide that has resulted in the fleeing from the valley of almost all the Hindu families who have been living there since human habitation was first recorded.',
 'Unreported by the international media, the Valley of Kashmir has seen an ethnic and cultural genocide that has resulted in the fleeing from the valley of almost all the Hindu families who have been living there since human habitation was first recorded.',
 'All the regional countries have publicly backed the Karzai government and supported the electoral process, but serious undercurrents remain as they all have their favourite contenders in Afghanistan.',
 'And of all the people who the county is getting to find a replacement...is none other than Lee Brown, the original supervisor of the Atlanta PD, who was in office during the murders and the sub