In [0]:
from IPython.display import clear_output

In [0]:
# Install the style-checking tools and enable the pycodestyle_magic extension.
!pip install pycodestyle flake8 pycodestyle_magic
%load_ext pycodestyle_magic
# Wipe the installer log from this cell's output.
clear_output()

In [0]:
import pandas as pd
import numpy as np
import pickle

In [0]:
# Fetch the collocation-count archive from Dropbox and unpack it in place.
# Presumably it holds the "<domain>_collocation_counts.xlsx" workbooks that
# the loading cells read -- confirm against the archive contents.
!wget https://www.dropbox.com/s/7om3xdy3psg5207/collocation_counts.zip?dl=0 -O collocation_counts.zip
!unzip collocation_counts.zip
# Drop the wget/unzip output from the cell.
clear_output()

In [0]:
# Subject areas to process; each name is used to build the workbook file name
# "<domain>_collocation_counts.xlsx" in the loading cells.
domains = [
    'linguistics',
    'sociology',
    'history',
    'politology',
    'law',
    'psychology',
    'economics',
]

# Per-domain lookup tables (domain -> {ngram: value}), filled by later cells.
unigram_probs, bigram_probs = {}, {}

In [0]:
# For every domain, read the 'unigrams' sheet of its workbook, keep only the
# rows tagged 'S' or 'V', and store an {ngram: raw frequency} lookup.
# NOTE(review): despite the `unigram_probs` name, the stored values come from
# the 'raw frequency' column, not from a normalised probability.
for domain in domains:
    unigrams = pd.read_excel(f"{domain}_collocation_counts.xlsx",
                             sheet_name='unigrams')
    unigrams = (unigrams[unigrams['tag'].isin(['S', 'V'])]
                .reset_index(drop=True))
    # Pair the two columns directly instead of looking rows up one by one;
    # as before, a repeated ngram ends up with the value of its last row.
    unigram_probs[domain] = dict(zip(unigrams['ngram'],
                                     unigrams['raw frequency']))

In [0]:
# For every domain, read the 'bigrams' sheet of its workbook, keep only the
# rows whose tag pair is built from 'S' and 'V', and store an
# {ngram: raw frequency} lookup.
# NOTE(review): despite the `bigram_probs` name, the stored values come from
# the 'raw frequency' column, not from a normalised probability.
for domain in domains:
    bigrams = pd.read_excel(f"{domain}_collocation_counts.xlsx",
                            sheet_name='bigrams')
    if domain == 'history':
        # The headers of this one sheet are overwritten positionally.
        # NOTE(review): presumably its header row differs from the other
        # workbooks -- confirm. The assignment fails unless the sheet has
        # exactly eight columns.
        bigrams.columns = ["index", "frequency", "likelihood ratio", "ngram",
                           "pmi", "raw frequency", "t score", "tag"]
    bigrams = (bigrams[bigrams['tag'].isin(['S S', 'S V', 'V S', 'V V'])]
               .reset_index(drop=True))
    # Pair the two columns directly instead of looking rows up one by one;
    # as before, a repeated ngram ends up with the value of its last row.
    bigram_probs[domain] = dict(zip(bigrams['ngram'],
                                    bigrams['raw frequency']))

In [0]:
# Merge the per-domain frequency dictionaries into a single one.
# When a word or bigram occurs in more than one dictionary,
# its values are averaged (done in the cells that follow).
unigram, bigram = {}, {}

# Gather every ngram seen in at least one domain.
unigram_keys, bigram_keys = set(), set()
for domain in domains:
    unigram_keys.update(unigram_probs[domain].keys())
    bigram_keys.update(bigram_probs[domain].keys())
unigram_keys, bigram_keys = list(unigram_keys), list(bigram_keys)

In [0]:
# Average each unigram's value over the domains in which it occurs.
for key in unigram_keys:
    unigram[key] = np.mean([unigram_probs[name][key]
                            for name in domains
                            if key in unigram_probs[name]])

In [0]:
# Average each bigram's value over the domains in which it occurs.
for key in bigram_keys:
    bigram[key] = np.mean([bigram_probs[name][key]
                           for name in domains
                           if key in bigram_probs[name]])

In [0]:
# Serialise the merged dictionaries, unigrams first, each to its own file.
for target, table in (("unigrams.pkl", unigram), ("bigrams.pkl", bigram)):
    with open(target, "wb") as sink:
        pickle.dump(table, sink)

In [0]:
from google.colab import files

# Hand both pickle files to Colab's download helper, unigrams first.
for artifact in ("unigrams.pkl", "bigrams.pkl"):
    files.download(artifact)