In [None]:
import os
import stanza
from spacy_stanza import StanzaLanguage
import time
from datetime import datetime, timedelta
from collections import defaultdict

In [None]:
# Stanza pipeline for Ukrainian (lang='uk'); get_collocations() calls it directly
# to read per-word POS tags and morphological features.
stanza_nlp = stanza.Pipeline(lang='uk')
# The same pipeline wrapped by spacy_stanza's StanzaLanguage; find_noun_children()
# uses it for the token attributes children / dep_ / pos_ / lemma_.
nlp = StanzaLanguage(stanza_nlp)

In [None]:
def is_animate(word):
    """Return True if *word* is a noun or proper noun marked as animate.

    Args:
        word: An object exposing ``pos`` (the part-of-speech tag) and
            ``feats`` (a ``"Key=Value|Key=Value"`` feature string, or
            ``None``). The caller in this file passes the words of a
            Stanza sentence.

    Returns:
        bool: True when ``pos`` is ``"NOUN"`` or ``"PROPN"`` and the
        feature string contains ``"=Anim"``; False otherwise.
    """
    # ``feats`` may be None when a word has no morphological features;
    # treat that as "not animate" instead of raising TypeError on ``in``.
    feats = word.feats or ""
    return word.pos in ("NOUN", "PROPN") and "=Anim" in feats

def find_noun_children(text, animate_word, collocs, *, pipeline=None):
    """Count adjective + noun collocations for one word form in *text*.

    The text is parsed, and for every token whose surface text equals
    *animate_word* the function counts:

    * each direct ``ADJ`` child attached with the ``amod`` relation, and
    * each ``ADJ`` child of such an adjective attached with ``conj``
      (coordinated adjectives).

    Keys have the form ``"<adjective lemma> <noun lemma>"``.

    Note that *every* token matching *animate_word* is visited, so a form
    that occurs twice in *text* contributes twice within a single call.

    Args:
        text: Raw text to parse.
        animate_word: Surface form of the noun to look for.
        collocs: Mapping from collocation string to count; updated in
            place. A ``defaultdict(int)`` or a plain ``dict`` both work.
        pipeline: Optional callable ``text -> doc`` where ``doc.sents``
            yields iterables of tokens exposing ``text``, ``lemma_``,
            ``pos_``, ``dep_`` and ``children``. Defaults to the
            module-level ``nlp`` object.

    Returns:
        The same *collocs* mapping that was passed in.
    """
    parse = nlp if pipeline is None else pipeline
    doc = parse(text)
    for sent in doc.sents:
        for token in sent:
            if token.text != animate_word:
                continue
            for child in token.children:
                if child.pos_ != 'ADJ' or child.dep_ != 'amod':
                    continue
                key = ' '.join([child.lemma_, token.lemma_])
                collocs[key] = collocs.get(key, 0) + 1
                # Coordinated adjectives hang off the first adjective, e.g. in
                # "То її держить йому тутешній волохатий бог, лютий, скупий і ненажерливий;"
                # this also picks up "скупий" and "ненажерливий".
                for grandch in child.children:
                    if grandch.pos_ == 'ADJ' and grandch.dep_ == 'conj':
                        # -> "скупий бог", "ненажерливий бог"
                        key = ' '.join([grandch.lemma_, token.lemma_])
                        collocs[key] = collocs.get(key, 0) + 1
    return collocs

In [None]:
# Wall-clock start; the elapsed time is reported after get_collocations() runs.
t1 = time.time()

# Location of the corpus, relative to the repository root.
rel_path = "tasks/02-structural-linguistics/data/"

# NOTE: '__file__' is a quoted string literal, not the module attribute, so
# abspath() resolves it against the current working directory.
script_path = os.path.abspath('__file__')
path_list = script_path.split(os.sep)
# Drop the trailing '__file__' component to get the directory parts.
script_directory = path_list[:-1]
# Keep only the first four path components and append the data directory.
PATH = "{}/{}".format("/".join(script_directory[:4]), rel_path)

def get_collocations():
    """Extract adjective + animate-noun collocations from ``tyhrolovy.txt``.

    Reads ``PATH + "tyhrolovy.txt"``, runs ``stanza_nlp`` on every
    non-blank line, and for each animate noun found (see ``is_animate``)
    counts its adjective collocations via ``find_noun_children``. The
    counts are written to ``uk_collocations.txt`` in the current working
    directory, one ``"<count>: <collocation>"`` entry per line, sorted by
    descending count. A short summary is printed to stdout.

    Returns:
        None.
    """
    # Explicit UTF-8: the corpus is Cyrillic text, and the platform default
    # encoding is not guaranteed to be able to decode it.
    with open(PATH + "tyhrolovy.txt", "r", encoding="utf-8") as f:
        data = f.readlines()

    lines = 0
    anim_count = 0
    uk_collocs = defaultdict(int)

    for line in data:
        lines += 1
        if not line.strip():
            # Blank / whitespace-only line: counted, but nothing to parse.
            continue

        stanza_doc = stanza_nlp(line)

        # Distinct animate word forms in this line, in first-seen order.
        animate_forms = []
        for sent in stanza_doc.sentences:
            for word in sent.words:
                if is_animate(word):
                    anim_count += 1
                    if word.text not in animate_forms:
                        animate_forms.append(word.text)

        # find_noun_children() visits every token whose text equals the given
        # form, so it is called once per distinct form. Calling it once per
        # occurrence would count each collocation of a repeated form several
        # times. It updates uk_collocs in place.
        for form in animate_forms:
            find_noun_children(line, form, uk_collocs)

    sorted_colls = sorted(uk_collocs.items(), reverse=True, key=lambda kv: kv[1])

    # Open the output only once results exist, and let ``with`` close it even
    # if a write fails.
    with open("uk_collocations.txt", "w", encoding="utf-8") as out_file:
        for colloc, count in sorted_colls:
            out_file.write('{}: {}\n'.format(count, colloc))

    print("Total lines processed: {}".format(lines))
    print("\nTotal number of animate nouns: {}".format(anim_count))
    print("\nTotal number of collocations: {}".format(len(sorted_colls)))
    print("\nSee all collocations in 'uk_collocations.txt'")
    print("\nMost common collocations:", sorted_colls[:20])
    
# Run the extraction, then report the wall-clock time elapsed since t1 was taken.
get_collocations()    
t2 = time.time()
end_time = t2 - t1  # elapsed seconds
print("Time: ", str(timedelta(seconds=end_time)))  # timedelta formats the seconds as H:MM:SS

# Output recorded from a previous run of the cell above:
# Total lines processed: 5327
# Total number of animate nouns: 4976
# Total number of collocations: 655
# See all collocations in 'uk_collocations.txt'
# Most common collocations: [('старий Сірко', 86), ('дикий коза', 6), ('старий Сірчих', 6), ('великий начальник', 5), ('сонячний зайчик', 5), ('цибатий зять', 5), ('старий Мороз', 5), ('керований Григорій', 5), 
# ('вірний пес', 5), ('рідний людина', 4), ('близький людина', 4), ('нав’ючений кінь', 4), ('звіровий собака', 4), 
# ('плямистий оленя', 4), ('ходовий вивірка', 4), ('смугастий звірок', 3), ('другий ведмідь', 3), ('якутський пес', 3), 
# ('вражий мама', 3), ('великий пан', 3)]