# Collocations

In [2]:
import spacy
import nltk

In [93]:
from tqdm.notebook import tqdm

In [3]:
nlp = spacy.load("en_core_web_md")

In [4]:
from nltk.corpus import wordnet

In [35]:
# Seed verbs; the next cell expands this list with WordNet synonyms.
first_set = ['say', 'tell', 'speak', 'claim', 'communicate']

In [43]:
# Build the set of verbs to track: every seed verb plus the lemma names of
# its first WordNet verb synset.
syn_series = set()
for word in first_set:
    word_synset = wordnet.synsets(word, 'v')[0]
    syn_lemmas = word_synset.lemma_names()
    print(f'{word} --- {syn_lemmas}')
    # The seed itself is kept even if it is absent from its own synset.
    syn_series.add(word)
    syn_series.update(syn_lemmas)
print(f'\nResulting series: {syn_series}')

say --- ['state', 'say', 'tell']
tell --- ['state', 'say', 'tell']
speak --- ['talk', 'speak', 'utter', 'mouth', 'verbalize', 'verbalise']
claim --- ['claim']
communicate --- ['communicate', 'pass_on', 'pass', 'pass_along', 'put_across']

Resulting series: {'state', 'utter', 'tell', 'speak', 'pass_on', 'put_across', 'pass', 'say', 'talk', 'pass_along', 'verbalise', 'claim', 'communicate', 'verbalize', 'mouth'}


In [44]:
# Load the blog corpus as a list of lines (one list entry per line of the file).
# NOTE(review): the absolute path is machine-specific; consider a configurable data directory.
# The explicit encoding keeps decoding independent of the platform locale
# (assumes the file is UTF-8 — confirm).
with open('/home/yevhen/prj/prj-nlp-2020/tasks/02-structural-linguistics/data/blog2008.txt', 'r', encoding='utf-8') as f:
    blogs = f.read().splitlines()

In [45]:
len(blogs)

303994

In [49]:
n = 215
blogs[n]

'" We do n\'t mind going to any extent , as nobody is involved from the government or agency side , " he said .'

In [98]:
def find_adverbs_with_ly(doc, syn_series, stats_dict):
    """Count '-ly' adverbs that are direct syntactic children of tracked verbs.

    Args:
        doc: iterable of token-like objects exposing ``lemma_``, ``children``,
            ``pos_`` and ``text`` (a spaCy ``Doc`` in this notebook).
        syn_series: collection of verb lemmas to track.
        stats_dict: mapping ``verb lemma -> {adverb lemma: count}``. It is
            updated in place; a verb that is missing from it gets an entry
            created the first time one of its adverbs is seen.

    Returns:
        The same ``stats_dict`` object, with the counts incremented.
    """
    for token in doc:
        if token.lemma_ not in syn_series:
            continue
        for child in token.children:
            # Only children tagged as adverbs whose surface form ends in "ly".
            if child.pos_ == 'ADV' and child.text.endswith('ly'):
                # setdefault avoids a KeyError when the caller did not
                # pre-seed stats_dict with every tracked verb.
                adverb_counts = stats_dict.setdefault(token.lemma_, {})
                adverb_counts[child.lemma_] = adverb_counts.get(child.lemma_, 0) + 1
    return stats_dict

In [296]:
# Count '-ly' adverbs per tracked verb over the whole blog corpus.
stats_dict = {verb: {} for verb in syn_series}
# nlp.pipe processes the texts in batches instead of one nlp() call per text;
# tqdm needs an explicit total because it is wrapping a generator.
for doc in tqdm(nlp.pipe(blogs), total=len(blogs)):
    find_adverbs_with_ly(doc, syn_series, stats_dict)
# Bound once after the loop so the name exists even when `blogs` is empty.
stats_on_all_blogs = stats_dict

HBox(children=(FloatProgress(value=0.0, max=303994.0), HTML(value='')))




In [297]:
stats_on_all_blogs

{'state': {'subtly': 1,
  'suggestively': 1,
  'only': 5,
  'calmly': 1,
  'bluntly': 7,
  'unequivocally': 12,
  'exactly': 2,
  'repeatedly': 5,
  'merely': 2,
  'absurdly': 1,
  'previously': 6,
  'specifically': 4,
  'publicly': 22,
  'briefly': 3,
  'expressly': 3,
  'simply': 7,
  'clearly': 19,
  'really': 1,
  'especially': 1,
  'emphatically': 5,
  'recently': 6,
  'obscenely': 1,
  'forcefully': 2,
  'plainly': 3,
  'directly': 1,
  'formally': 1,
  'honestly': 1,
  'accurately': 1,
  'correctly': 4,
  'explicitly': 13,
  'authoritatively': 1,
  'persistently': 1,
  'shortly': 1,
  'famously': 2,
  'officially': 1,
  'flatly': 7,
  'basically': 2,
  'ludicrously': 1,
  'generically': 1,
  'virtually': 2,
  'properly': 1,
  'categorically': 2,
  'rightly': 2,
  'baldly': 1,
  'formerly': 1,
  'finally': 1,
  'respectively': 1,
  'essentially': 1,
  'firmly': 1,
  'adamantly': 1,
  'inaccurately': 1,
  'incorrectly': 3,
  'uncategorically': 1,
  'additionally': 1,
  'naturally'

In [298]:
# For every verb, reorder its adverb counts from most to least frequent.
# sorted() is stable, so adverbs with equal counts keep their original order.
sorted_dict = {
    verb: dict(sorted(adverb_counts.items(), key=lambda pair: pair[1], reverse=True))
    for verb, adverb_counts in stats_on_all_blogs.items()
}

In [299]:
sorted_dict

{'state': {'publicly': 22,
  'clearly': 19,
  'explicitly': 13,
  'unequivocally': 12,
  'bluntly': 7,
  'simply': 7,
  'flatly': 7,
  'previously': 6,
  'recently': 6,
  'only': 5,
  'repeatedly': 5,
  'emphatically': 5,
  'specifically': 4,
  'correctly': 4,
  'briefly': 3,
  'expressly': 3,
  'plainly': 3,
  'incorrectly': 3,
  'exactly': 2,
  'merely': 2,
  'forcefully': 2,
  'famously': 2,
  'basically': 2,
  'virtually': 2,
  'categorically': 2,
  'rightly': 2,
  'falsely': 2,
  'subtly': 1,
  'suggestively': 1,
  'calmly': 1,
  'absurdly': 1,
  'really': 1,
  'especially': 1,
  'obscenely': 1,
  'directly': 1,
  'formally': 1,
  'honestly': 1,
  'accurately': 1,
  'authoritatively': 1,
  'persistently': 1,
  'shortly': 1,
  'officially': 1,
  'ludicrously': 1,
  'generically': 1,
  'properly': 1,
  'baldly': 1,
  'formerly': 1,
  'finally': 1,
  'respectively': 1,
  'essentially': 1,
  'firmly': 1,
  'adamantly': 1,
  'inaccurately': 1,
  'uncategorically': 1,
  'additionally': 

In [305]:
# Show at most the ten most frequent adverbs for each verb.
for verb, adverbs in sorted_dict.items():
    top_ten = list(adverbs.items())[:10]
    print(verb, top_ten, '\n')

state [('publicly', 22), ('clearly', 19), ('explicitly', 13), ('unequivocally', 12), ('bluntly', 7), ('simply', 7), ('flatly', 7), ('previously', 6), ('recently', 6), ('only', 5)] 

utter [('actually', 2), ('really', 1), ('casually', 1), ('pejoratively', 1), ('only', 1)] 

tell [('recently', 25), ('reportedly', 14), ('privately', 12), ('only', 9), ('basically', 9), ('finally', 9), ('repeatedly', 8), ('actually', 8), ('really', 8), ('specifically', 8)] 

speak [('directly', 33), ('publicly', 15), ('fiercely', 12), ('only', 11), ('briefly', 9), ('generally', 9), ('recently', 8), ('openly', 8), ('politically', 7), ('loudly', 7)] 

pass_on [] 

put_across [] 

pass [('quickly', 13), ('unanimously', 11), ('only', 9), ('probably', 8), ('recently', 5), ('overwhelmingly', 5), ('easily', 4), ('narrowly', 4), ('eventually', 4), ('apparently', 3)] 

say [('actually', 76), ('recently', 73), ('repeatedly', 55), ('simply', 46), ('explicitly', 39), ('publicly', 36), ('basically', 34), ('really', 31),

## Про Тигроловів

In [112]:
# Load the text as a list of lines.
# NOTE(review): the absolute path is machine-specific; consider a configurable data directory.
# The explicit encoding matters here because the text is Ukrainian
# (assumes the file is UTF-8 — confirm).
with open('/home/yevhen/prj/prj-nlp-2020/tasks/02-structural-linguistics/data/tyhrolovy.txt', 'r', encoding='utf-8') as f:
    # NOTE(review): the original comprehension was left unfinished (a syntax
    # error); dropping blank lines is the assumed intent — confirm.
    tigers = [item for item in f.read().splitlines() if item.strip()]

In [306]:
import stanza #former StanfordNLP

In [116]:
stanza.download('uk')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.0.0.json: 116kB [00:00, 3.79MB/s]                    
2020-03-21 00:59:36 INFO: Downloading default packages for language: uk (Ukrainian)...
Downloading http://nlp.stanford.edu/software/stanza/1.0.0/uk/default.zip: 100%|██████████| 239M/239M [01:52<00:00, 2.13MB/s] 
2020-03-21 01:01:34 INFO: Finished downloading models and saved to /home/yevhen/stanza_resources.


In [117]:
stanza_nlp = stanza.Pipeline('uk')

2020-03-21 01:02:45 INFO: Loading these models for language: uk (Ukrainian):
| Processor | Package |
-----------------------
| tokenize  | iu      |
| mwt       | iu      |
| pos       | iu      |
| lemma     | iu      |
| depparse  | iu      |

2020-03-21 01:02:45 INFO: Use device: cpu
2020-03-21 01:02:45 INFO: Loading: tokenize
2020-03-21 01:02:45 INFO: Loading: mwt
2020-03-21 01:02:45 INFO: Loading: pos
2020-03-21 01:02:46 INFO: Loading: lemma
2020-03-21 01:02:46 INFO: Loading: depparse
2020-03-21 01:02:47 INFO: Done loading processors!


In [290]:
def get_anim_amod(doc, stats_dict):
    """Count "adjective + animate noun" lemma pairs in a parsed document.

    An adjective is paired with a word when it is attached to it directly as
    ``amod``, or when it is a ``conj`` of a word whose own head is that word.
    The pair is counted only if the target word is tagged ``NOUN`` and its
    features contain ``Animacy=Anim``.

    Args:
        doc: parsed document exposing ``sentences``; every sentence exposes
            ``words`` whose items have ``upos``, ``deprel``, ``head``
            (1-based index into ``words``; 0 is treated as "no head"),
            ``lemma`` and ``feats``.
        stats_dict: mapping ``"<adjective lemma> <noun lemma>" -> count``;
            updated in place.

    Returns:
        The same ``stats_dict`` object, with the counts incremented.
    """
    for sent in doc.sentences:
        words = sent.words
        for word in words:
            if word.upos != 'ADJ' or word.deprel not in ('conj', 'amod'):
                continue
            head_noun_index = word.head
            if word.deprel == 'conj' and head_noun_index > 0:
                # A coordinated adjective hangs off the first conjunct, so the
                # candidate noun is that conjunct's head.
                head_noun_index = words[head_noun_index - 1].head
            if head_noun_index <= 0:
                # Head 0 means there is no governing word; indexing with -1
                # would silently pick the last word of the sentence.
                continue
            parent_word = words[head_noun_index - 1]
            # feats may be None for words without morphological features.
            if parent_word.upos == 'NOUN' and 'Animacy=Anim' in (parent_word.feats or ''):
                phrase = ' '.join([word.lemma, parent_word.lemma])
                stats_dict[phrase] = stats_dict.get(phrase, 0) + 1
    return stats_dict

In [292]:
# Count "adjective + animate noun" pairs over every line of the text.
tigers_stats_dict = {}
# Iterate over `tigers`, the list loaded from the file; the previously used
# `tigers2` is not defined anywhere in this notebook and fails on a fresh kernel.
for tiger in tqdm(tigers):
    doc = stanza_nlp(tiger)
    get_anim_amod(doc, tigers_stats_dict)
# Bound once after the loop so the name exists even when there are no lines.
stats_on_all_tigers = tigers_stats_dict

HBox(children=(FloatProgress(value=0.0, max=2665.0), HTML(value='')))




In [293]:
stats_on_all_tigers

{'вогненноокий хробак': 1,
 'пекельний потвора': 1,
 'фіктивний потвора': 1,
 'наївний казок': 1,
 'китайський казок': 1,
 'легендарний дракон': 1,
 'справжній дракон': 1,
 'найбільший дракон': 1,
 'найстрашніший дракон': 1,
 'велетенський циклоп': 1,
 'двоокий циклоп': 1,
 'живий мертвяк': 1,
 'приречений безнадійний': 1,
 'змордований безнадійний': 1,
 'великий начальник': 5,
 'рідний товариш': 1,
 'обірваний людина': 1,
 'брудний людина': 1,
 'зарослий людина': 1,
 'випадковий людина': 1,
 'змарнований людина': 1,
 'знеосіблений людина': 1,
 'гордий сокола': 1,
 'відомий нащадка': 1,
 'гордий нащадка': 1,
 'перший каторжанина': 1,
 'страшний злочинця': 1,
 'державний злочинця': 1,
 'русявий Юнак': 1,
 'екзальтований пасажир': 1,
 'безперервний мандрівка': 1,
 'десятиденний мандрівка': 1,
 'різнокольоровий абажур': 1,
 'експансивний мешканець': 1,
 'горластий мешканець': 1,
 'колгоспний колективізатор': 1,
 'радгоспний бюрократ': 1,
 'індустріальний авантюрник': 1,
 'потенціальний зл

In [307]:
stats_on_all_tigers['сонячний зайчик']

4

In [314]:
# Find the most frequent phrase. Only a strictly larger count replaces the
# current best, so on ties the phrase encountered first is kept.
the_most_key, the_most_value = "", 0
for key, value in stats_on_all_tigers.items():
    if value <= the_most_value:
        continue
    the_most_key, the_most_value = key, value
print(the_most_key, the_most_value)

великий начальник 5
