# Вычисление показателей

In [None]:
import re
import os
import json
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
from tqdm import tqdm_notebook

In [None]:
# PATH_bigrams -- путь к папке с лемматизированными текстами основного корпуса
# PATH_bigrams_lists -- путь к папке со списками биграмм
# PATH_reference -- путь к папке с лемматизированными текстами референтного корпуса
# PATH_reference_lists -- путь к папке со списками биграмм референтного корпуса
# PATH2 -- путь для сохранения файлов
# файлы, заканчивающиеся на _words.txt -- списки биграмм, очищенные от данных о частотности и т. д.

#### Функция для вычисления показателя Weirdness:

In [None]:
def count_weirdness(pos, PATH_bigrams_lists, PATH_reference_lists, PATH2,
                    *, size_corp=1970426, size_ref=1165252):
    """Rank the bigrams of one POS pattern by Weirdness and save the table.

    Reads ``<PATH_bigrams_lists>/bigrams_<pos>.txt`` and
    ``<PATH_reference_lists>/reference_<pos>.txt`` as tab-separated tables
    with the columns ``id, freq, range, ngram``, keeps the bigrams present in
    both lists (inner merge on ``ngram``) and computes

        weirdness = (freq_corp / size_corp) / (freq_ref / size_ref)

    Args:
        pos: POS-pattern suffix used in the file names (e.g. ``'s_v'``).
        PATH_bigrams_lists: folder with the bigram lists of the main corpus.
        PATH_reference_lists: folder with the bigram lists of the reference
            corpus.
        PATH2: output prefix; the result is written to ``PATH2 + pos``
            (plain string concatenation, no separator is inserted).
        size_corp: size of the main corpus used for normalisation.
        size_ref: size of the reference corpus used for normalisation.

    Returns:
        None. The table, sorted by descending weirdness, is written as a
        tab-separated file.
    """
    columns = ['id', 'freq', 'range', 'ngram']

    corp = pd.read_csv('{}/bigrams_{}.txt'.format(PATH_bigrams_lists, pos), sep='\t',
                       names=columns, index_col=False)
    # Rows labelled 0 and 1 of the reference list are discarded -- presumably
    # header lines of the exported list (TODO confirm). The main-corpus list is
    # read without such a drop; rows without a matching ngram vanish in the
    # inner merge below.
    ref = pd.read_csv('{}/reference_{}.txt'.format(PATH_reference_lists, pos), sep='\t',
                      names=columns, index_col=False).drop([0, 1], axis=0)

    df = pd.merge(corp, ref, on='ngram', suffixes=('_corp', '_ref')).drop(['id_corp', 'id_ref'], axis=1)

    df['size_corp'] = size_corp
    df['size_ref'] = size_ref

    # The column must be called 'weirdness': it is the key used for sorting and
    # for the exported column list below.
    df['weirdness'] = (df['freq_corp'] / df['size_corp']) / (df['freq_ref'] / df['size_ref'])

    weirdness_sorted = df.sort_values('weirdness', ascending=False)
    weirdness_sorted[['ngram',
                      'weirdness',
                      'freq_corp',
                      'size_corp',
                      'freq_ref',
                      'size_ref']].to_csv(PATH2 + pos, sep='\t', index=False)

In [None]:
# Run the Weirdness computation once per POS-pattern suffix.
# PATH_bigrams_lists, PATH_reference_lists and PATH2 are not assigned in this
# cell; they are expected to be defined before it is executed.
pos_list = ['s_v', 'v_s', 'pr_s', 's_pr', 'a_s', 'adv_v', 'v_pr']
for pos in tqdm_notebook(pos_list):
    count_weirdness(pos, PATH_bigrams_lists, PATH_reference_lists, PATH2)

#### Функция для вычисления показателя G2 (Log-Likelihood):

In [None]:
def count_loglikelihood(pos, PATH_bigrams_lists, PATH_reference_lists, PATH2,
                        *, size_corp=1970426, size_ref=1165252):
    """Rank the bigrams of one POS pattern by the log-likelihood G2 statistic.

    Reads ``<PATH_bigrams_lists>/bigrams_<pos>.txt`` and
    ``<PATH_reference_lists>/reference_<pos>.txt`` as tab-separated tables
    with the columns ``id, freq, range, ngram`` and keeps the bigrams present
    in both lists (inner merge on ``ngram``). For every bigram:

        e1 = size_corp * (freq_corp + freq_ref) / (size_corp + size_ref)
        e2 = size_ref  * (freq_corp + freq_ref) / (size_corp + size_ref)
        g2 = 2 * (freq_corp * ln(freq_corp / e1) + freq_ref * ln(freq_ref / e2))

    Args:
        pos: POS-pattern suffix used in the file names (e.g. ``'s_v'``).
        PATH_bigrams_lists: folder with the bigram lists of the main corpus.
        PATH_reference_lists: folder with the bigram lists of the reference
            corpus.
        PATH2: output prefix; the result is written to ``PATH2 + pos``
            (plain string concatenation, no separator is inserted).
        size_corp: size of the main corpus.
        size_ref: size of the reference corpus.

    Returns:
        None. The table, sorted by descending ``g2``, is written as a
        tab-separated file.
    """
    columns = ['id', 'freq', 'range', 'ngram']

    corp = pd.read_csv('{}/bigrams_{}.txt'.format(PATH_bigrams_lists, pos), sep='\t',
                       names=columns, index_col=False)
    # Rows labelled 0 and 1 of the reference list are discarded -- presumably
    # header lines of the exported list (TODO confirm).
    ref = pd.read_csv('{}/reference_{}.txt'.format(PATH_reference_lists, pos), sep='\t',
                      names=columns, index_col=False).drop([0, 1], axis=0)

    df = pd.merge(corp, ref, on='ngram', suffixes=('_corp', '_ref')).drop(['id_corp', 'id_ref'], axis=1)

    df['size_corp'] = size_corp
    df['size_ref'] = size_ref

    # Expected frequencies under the hypothesis that the bigram is equally
    # likely in both corpora.
    total_freq = df['freq_corp'] + df['freq_ref']
    total_size = df['size_corp'] + df['size_ref']
    df['e1'] = df['size_corp'] * total_freq / total_size
    df['e2'] = df['size_ref'] * total_freq / total_size

    df['g2'] = 2 * ((df['freq_corp'] * np.log(df['freq_corp'] / df['e1']))
                    + (df['freq_ref'] * np.log(df['freq_ref'] / df['e2'])))

    g2_sorted = df.sort_values('g2', ascending=False)
    g2_sorted[['ngram',
               'g2',
               'freq_corp',
               'size_corp',
               'freq_ref',
               'size_ref']].to_csv(PATH2 + pos, sep='\t', index=False)

In [None]:
# Run the log-likelihood (G2) computation once per POS-pattern suffix.
# PATH_bigrams_lists, PATH_reference_lists and PATH2 are not assigned in this
# cell; they are expected to be defined before it is executed.
pos_list = ['s_v', 'v_s', 'pr_s', 's_pr', 'a_s', 'adv_v', 'v_pr']
for pos in tqdm_notebook(pos_list):
    count_loglikelihood(pos, PATH_bigrams_lists, PATH_reference_lists, PATH2)

#### Функция для вычисления TF-IDF на основе внешнего корпуса:

Вычисление для каждой биграммы количества документов внешнего корпуса, в которых она встречается:

In [None]:
def count_number_of_documents(words, num_documents, PATH_reference):
    """Count, for every bigram, the reference documents that contain it.

    Every file in ``PATH_reference`` is read as UTF-8 text and each bigram is
    looked up as a literal substring of that text.

    Args:
        words: iterable of bigram strings; empty strings are ignored.
        num_documents: mapping ``bigram -> counter`` that is updated in place.
            It must supply a start value for unseen keys (e.g. a
            ``defaultdict``); every processed bigram ends up as a key.
        PATH_reference: folder with the reference-corpus texts.

    Returns:
        The same ``num_documents`` mapping, incremented by one for every
        document in which the bigram occurs.
    """
    # An empty string is a substring of every text and would be "found" in
    # every document, so blank entries (e.g. from a trailing newline) are skipped.
    bigrams = [elem for elem in words if elem]

    for file in os.listdir(PATH_reference):
        # os.path.join works whether or not PATH_reference ends with a separator.
        with open(os.path.join(PATH_reference, file), 'r', encoding='utf-8') as f:
            text = f.read()

        for elem in bigrams:
            # Literal containment instead of re.search: a bigram is plain text,
            # not a pattern, so characters such as '.' or '(' must not be
            # interpreted as regex syntax. Adding 0 still registers the key
            # with its start value when the bigram is absent.
            num_documents[elem] += int(elem in text)

    return num_documents

In [None]:
# Accumulate, over all POS patterns, the number of reference documents each
# bigram occurs in. Counters start at 1 (the defaultdict factory), so a bigram
# found in k documents ends up with the value k + 1.
# PATH_bigrams and PATH_reference are not assigned in this cell; they are
# expected to be defined before it is executed.
pos_list = ['pr_s', 's_v', 'v_s', 's_pr', 'a_s', 'adv_v', 'v_pr']
num_documents = defaultdict(lambda: 1)

for pos in tqdm_notebook(pos_list):
    # Explicit UTF-8, matching how the corpus texts are read elsewhere in this
    # notebook; the platform default encoding would break on Cyrillic text.
    with open('{}/bigrams_{}_words.txt'.format(PATH_bigrams, pos), encoding='utf-8') as f:
        # One bigram per line; blank lines (e.g. after the final newline) are
        # dropped so that an empty string is never searched for.
        words = [line for line in f.read().split('\n') if line]

    num_documents = count_number_of_documents(words, num_documents, PATH_reference)

Подсчёт TF-IDF:

In [None]:
def count_tf_idf(pos, num_documents, PATH_bigrams_lists, PATH2, *, total_documents=20):
    """Rank the bigrams of one POS pattern by TF-IDF and save the table.

    Reads ``<PATH_bigrams_lists>/bigrams_<pos>_words.txt`` as a tab-separated
    table with the columns ``id, freq, range, ngram`` and computes

        tf_idf = freq * log10(total_documents / num_documents[ngram])

    Args:
        pos: POS-pattern suffix used in the file names (e.g. ``'s_v'``).
        num_documents: mapping ``bigram -> document counter``; it is indexed
            with ``[]``, so it must provide a value for every bigram (e.g. a
            ``defaultdict``).
        PATH_bigrams_lists: folder with the bigram lists.
        PATH2: output prefix; the result is written to ``PATH2 + pos``
            (plain string concatenation, no separator is inserted).
        total_documents: number of documents used as the IDF numerator.

    Returns:
        None. The table, sorted by descending ``tf_idf``, is written as a
        tab-separated file.
    """
    # NOTE(review): this reads the *_words.txt file as a four-column table;
    # confirm that this file really carries the frequency columns.
    df = pd.read_csv('{}/bigrams_{}_words.txt'.format(PATH_bigrams_lists, pos), sep='\t',
                     names=['id', 'freq', 'range', 'ngram'], index_col=False)

    # Vectorised computation: the per-row chained assignment
    # (df[col].loc[i] = ...) writes to a temporary object and is not
    # guaranteed to update the frame.
    doc_counts = df['ngram'].map(lambda ngram: num_documents[ngram]).astype(float)
    df['tf_idf'] = df['freq'] * np.log10(total_documents / doc_counts)

    tf_idf_sorted = df.sort_values('tf_idf', ascending=False)
    tf_idf_sorted.to_csv(PATH2 + pos, sep='\t', index=False)

In [None]:
# Run the TF-IDF computation once per POS-pattern suffix, using the document
# counters collected in `num_documents`.
# PATH_bigrams_lists and PATH2 are not assigned in this cell; they are
# expected to be defined before it is executed.
pos_list = ['s_v', 'v_s', 'pr_s', 's_pr', 'a_s', 'adv_v', 'v_pr']
for pos in tqdm_notebook(pos_list):
    count_tf_idf(pos, num_documents, PATH_bigrams_lists, PATH2)

#### Функция для вычисления T-Score:

Сначала с помощью программы AntConc был получен частотный список слов на основе исследуемого корпуса.

In [None]:
# Load the word frequency list as a tab-separated table with the columns
# id, freq_wordlist, word. The rows labelled 0, 1 and 2 are dropped --
# presumably header lines of the exported list (TODO confirm).
# NOTE(review): the path is built by plain concatenation, so
# PATH_bigrams_lists has to end with a path separator for this to resolve.
wordlist = pd.read_csv(PATH_bigrams_lists + 'wordlist.txt', sep='\t',
                   names=['id', 'freq_wordlist', 'word'], index_col=False).drop([0, 1, 2], axis=0)

In [None]:
def add_word_freqs(df, wordlist):
    """Attach the corpus frequency of each bigram component to the table.

    Args:
        df: bigram table with a ``split`` column holding the list of words of
            each bigram. The columns ``first_word`` and ``sec_word`` are
            (re)written in place.
        wordlist: table with the columns ``word`` and ``freq_wordlist``.

    Returns:
        A new DataFrame with the columns of ``df`` plus ``freq_first_word``
        and ``freq_sec_word``. Both merges are inner joins, so bigrams whose
        first or second word is missing from ``wordlist`` are dropped.
    """
    # Column-wise extraction instead of a row loop with chained assignment
    # (df[col].loc[i] = ...), which is not guaranteed to write into df.
    df['first_word'] = df['split'].str[0]
    df['sec_word'] = df['split'].str[1]

    word_freqs = wordlist[['freq_wordlist', 'word']]

    df_w = pd.merge(df, word_freqs, left_on='first_word', right_on='word').drop('word', axis=1)
    df_w = df_w.rename(columns={'freq_wordlist': 'freq_first_word'})

    df_w = pd.merge(df_w, word_freqs, left_on='sec_word', right_on='word').drop('word', axis=1)
    df_w = df_w.rename(columns={'freq_wordlist': 'freq_sec_word'})

    return df_w

In [None]:
def count_t_score(pos, PATH_bigrams, PATH2, wordlist, *, corpus_size=125277):
    """Rank the bigrams of one POS pattern by T-Score and save the table.

    Reads ``<PATH_bigrams>/bigrams_<pos>_words.txt`` as a tab-separated table
    with the columns ``id, freq, range, ngram``, looks up the frequency of
    both component words in ``wordlist`` and computes

        t = (f/N - (f1/N) * (f2/N)) / sqrt(f / N / N)

    where ``f`` is the bigram frequency, ``f1``/``f2`` the word frequencies
    and ``N`` is ``corpus_size``.

    Args:
        pos: POS-pattern suffix used in the file names (e.g. ``'s_v'``).
        PATH_bigrams: folder containing the bigram list.
        PATH2: output prefix; the result is written to ``PATH2 + pos``
            (plain string concatenation, no separator is inserted).
        wordlist: table with the columns ``word`` and ``freq_wordlist``.
        corpus_size: corpus size ``N`` used for normalisation.

    Returns:
        None. The table, sorted by descending ``t-score``, is written as a
        tab-separated file.
    """
    # NOTE(review): this reads the *_words.txt file as a four-column table;
    # confirm that this file really carries the frequency columns.
    df = pd.read_csv('{}/bigrams_{}_words.txt'.format(PATH_bigrams, pos), sep='\t',
                     names=['id', 'freq', 'range', 'ngram'], index_col=False)

    df['first_word'] = ''
    df['sec_word'] = ''
    df['split'] = df['ngram'].str.split()

    # The merged table must be bound to the name used below; binding it to a
    # different name left `df_w` undefined.
    df_w = add_word_freqs(df, wordlist)

    observed = df_w['freq'] / corpus_size
    expected = (df_w['freq_first_word'] / corpus_size) * (df_w['freq_sec_word'] / corpus_size)
    df_w['t-score'] = (observed - expected) / np.sqrt(df_w['freq'] / corpus_size / corpus_size)

    t_score_sorted = df_w.sort_values('t-score', ascending=False)
    t_score_sorted.to_csv(PATH2 + pos, sep='\t', index=False)

In [None]:
# Run the T-Score computation once per POS-pattern suffix, using the word
# frequency table `wordlist`.
# PATH_bigrams and PATH2 are not assigned in this cell; they are expected to
# be defined before it is executed.
pos_list = ['s_v', 'v_s', 'pr_s', 's_pr', 'a_s', 'adv_v', 'v_pr']
for pos in tqdm_notebook(pos_list):
    count_t_score(pos, PATH_bigrams, PATH2, wordlist)

#### Функция для вычисления C-Value и NC-Value:

Сначала с помощью программы AntConc были получены списки 3-grams, 4-grams, 5-grams и 6-grams на основе исследуемого корпуса.

In [None]:
# Load the 3- to 6-gram lists from the working directory. Each list is a
# tab-separated table (rank, freq, range, ngrams) whose rows labelled 0 and 1
# are discarded right after reading.
ngrams3, ngrams4, ngrams5, ngrams6 = (
    pd.read_csv(list_file, names=['rank', 'freq', 'range', 'ngrams'],
                sep='\t', low_memory=False).drop([0, 1], axis=0)
    for list_file in ('ngrams3.txt', 'ngrams4.txt', 'ngrams5.txt', 'ngrams6.txt')
)

# Stack the lists, longest n-grams first, under a fresh 0..n-1 index.
data = pd.concat([ngrams6, ngrams5, ngrams4, ngrams3], ignore_index=True)

Затем создаем словарь, в котором для каждого сочетания двух слов из списков 3-grams, 4-grams, 5-grams и 6-grams записаны все ngrams, в состав которых оно входит, и частотность этих ngrams.

In [None]:
# For every pair of adjacent words found in the longer n-grams, collect:
#   'freq'   -- the summed frequency of the n-grams containing the pair
#               (added once per position at which the pair occurs);
#   'ngrams' -- a mapping from each containing n-gram to its frequency.
d = defaultdict(lambda: {'freq': 0, 'ngrams': {}})

for i in tqdm_notebook(data.index):
    ngram_text = data['ngrams'].loc[i]
    ngram_freq = data['freq'].loc[i]
    lst = ngram_text.split()

    # Slide a two-word window over the n-gram.
    for left_word, right_word in zip(lst, lst[1:]):
        entry = d[left_word + ' ' + right_word]
        entry['freq'] += ngram_freq
        entry['ngrams'][ngram_text] = ngram_freq

Функция для подсчёта C-Value:

In [None]:
def count_c_value_for_ngrams(d, df, d_for_ngrams):
    """Compute the C-Value of every bigram in ``df``.

    For a bigram that is not a key of ``d`` the value is
    ``log2(2) * freq``. Otherwise it is
    ``log2(2) * (freq - d[bigram]['freq'] / len(d[bigram]['ngrams']))`` and the
    longer n-grams recorded for that bigram are additionally scored through
    ``count_c_value_for_bigger_ngrams``.

    Args:
        d: mapping ``bigram -> {'freq': total, 'ngrams': {ngram: freq}}``.
        df: bigram table with the columns ``ngram`` and ``freq``; its
            ``c-value`` column is (re)written in place.
        d_for_ngrams: accumulator ``{'ngram': [...], 'c-value': [...]}`` for
            the longer n-grams.

    Returns:
        Tuple ``(df, d_for_ngrams)``.
    """
    length_weight = np.log2(2)  # log2 of the n-gram length; bigrams have 2 words
    c_values = []

    for i, bigram, freq in zip(tqdm_notebook(df.index), df['ngram'], df['freq']):
        nested = d.get(bigram)

        if not nested:
            c_values.append(length_weight * freq)
        else:
            c_values.append(length_weight * (freq - nested['freq'] / len(nested['ngrams'])))
            d_for_ngrams = count_c_value_for_bigger_ngrams(d_for_ngrams, d, df, i)

    # The column is assigned in one step: per-row chained assignment
    # (df[col].loc[i] = ...) is not guaranteed to write into df, and it would
    # push float values into a column that may have been created as integer.
    df['c-value'] = c_values

    return df, d_for_ngrams

Функция для подсчёта C-Value для 3-grams, 4-grams, 5-grams и 6-grams, если в них встречается биграмма, для которой уже вычислено C-Value:

In [None]:
def count_c_value_for_bigger_ngrams(d_for_ngrams, d, df, i):
    """Score every longer n-gram that contains the bigram in row ``i``.

    For each n-gram ``a`` recorded in ``d[bigram]['ngrams']`` the C-Value is

        log2(|a|) * (f(a) - sum(f(b)) / count(b))

    where ``b`` ranges over the other recorded n-grams that contain ``a`` as a
    substring; when there are none (or their summed frequency is 0) the value
    is ``log2(|a|) * f(a)``.

    Args:
        d_for_ngrams: accumulator ``{'ngram': [...], 'c-value': [...]}``;
            one entry per scored n-gram is appended.
        d: mapping ``bigram -> {'freq': total, 'ngrams': {ngram: freq}}``.
        df: bigram table; only ``df['ngram'].loc[i]`` is read.
        i: index label of the bigram row.

    Returns:
        The updated ``d_for_ngrams``.
    """
    # NOTE(review): an n-gram is appended once per call, so an n-gram that is
    # recorded under several bigrams appears several times in the accumulator.
    containing = d[df['ngram'].loc[i]]['ngrams']

    for elem, elem_freq in containing.items():
        # Frequencies of the strictly longer n-grams that include `elem`.
        nested_freqs = [freq for b_elem, freq in containing.items()
                        if elem in b_elem and elem != b_elem]
        nested_total = sum(nested_freqs)

        # The logarithm applies to the n-gram length only; it is a weight
        # multiplying the (corrected) frequency, exactly as in the bigram case.
        length_weight = np.log2(len(elem.split()))

        if nested_total != 0:
            c_value = length_weight * (elem_freq - nested_total / len(nested_freqs))
        else:
            c_value = length_weight * elem_freq

        d_for_ngrams['ngram'].append(elem)
        d_for_ngrams['c-value'].append(c_value)

    return d_for_ngrams

Функция, которая для каждого ngram записывает в словарь контекстные слова: 

In [None]:
def create_w_context(c_value_250, pos, PATH_bigrams):
    """Collect the words that occur directly next to each candidate n-gram.

    Every file in ``PATH_bigrams`` is read as UTF-8 text. For each occurrence
    of an n-gram followed by a space, the word immediately before it and the
    word immediately after it (when present) are counted.

    Args:
        c_value_250: table whose ``ngram`` column lists the candidate n-grams.
        pos: POS-pattern suffix; not used by this function.
        PATH_bigrams: folder with the texts to search. NOTE(review): every
            file in the folder is scanned -- confirm it holds corpus texts only.

    Returns:
        Nested ``defaultdict``: ``w_context[ngram][word] -> occurrence count``.
    """
    w_context = defaultdict(lambda: defaultdict(lambda: 0))

    # Compile each pattern once instead of once per file. The n-gram is
    # escaped so that it is matched literally, and the pattern is a raw string
    # so that `\w` is not treated as a string escape.
    patterns = [(elem, re.compile(r'(\w+ )?{} (\w+)?'.format(re.escape(elem))))
                for elem in c_value_250['ngram']]

    for file in tqdm_notebook(os.listdir(PATH_bigrams)):
        # os.path.join works whether or not PATH_bigrams ends with a separator.
        with open(os.path.join(PATH_bigrams, file), 'r', encoding='utf-8') as f:
            text = f.read()

        for elem, pat in patterns:
            for words in pat.findall(text):
                for w in words:
                    # The left group captures its trailing space and an
                    # unmatched optional group yields '': normalise the word
                    # and skip empty captures so that the same word is counted
                    # under one key and '' is never a context word.
                    w = w.strip()
                    if w:
                        w_context[elem][w] += 1
    return w_context

Функция для подсчета количества ngrams, с которыми встречаются извлеченные контекстные слова:

In [None]:
def create_w_count(w_context, pos):
    """Count, for each context word, how many n-grams it accompanies.

    Args:
        w_context: mapping ``ngram -> collection of context words``.
        pos: POS-pattern suffix; not used by this function.

    Returns:
        ``defaultdict`` (default 0) mapping ``word -> number of n-grams`` in
        whose context the word appears; a word is counted at most once per
        n-gram.
    """
    w_count = defaultdict(lambda: 0)

    for context_words in w_context.values():
        # dict.fromkeys removes repeats while keeping first-seen order.
        for w in dict.fromkeys(context_words):
            w_count[w] += 1

    return w_count

In [None]:
def count_nc_value(i, w_context, w_count, c_value_250):
    """Compute the NC-Value of the n-gram in row ``i``.

    The result is ``0.8 * c-value + 0.2 * context``, where ``context`` sums,
    over the context words of the n-gram,
    ``frequency_with_this_ngram * w_count[word] / len(w_count)``.

    Args:
        i: index label of the row in ``c_value_250``.
        w_context: mapping ``ngram -> context words``; the context is either a
            mapping ``word -> frequency`` or a plain sequence of words (each
            occurrence then counts once).
        w_count: mapping ``word -> number of n-grams the word accompanies``.
        c_value_250: table with the columns ``ngram`` and ``c-value``.

    Returns:
        The NC-Value as a float.
    """
    context = w_context[c_value_250['ngram'].loc[i]]

    # Iterating a mapping yields each word exactly once, so counting equal
    # keys always gives 1; the stored frequencies have to be used instead.
    word_freqs = context if isinstance(context, dict) else Counter(context)

    # NOTE(review): the weight is normalised by the number of distinct context
    # words (len(w_count)); confirm this is the intended denominator.
    context_words = 0
    for word, freq in word_freqs.items():
        context_words += freq * w_count[word] / len(w_count)

    return 0.8 * c_value_250['c-value'].loc[i] + 0.2 * context_words

In [None]:
def count_c_nc_value(pos, d, PATH_bigrams, PATH2, *, top_n=250,
                     c_value_dir='/home/zu_ann/jahresarbeit/testfile'):
    """Compute C-Value and NC-Value rankings for one POS pattern.

    Steps:
      1. read ``<PATH_bigrams>/bigrams_<pos>.txt`` (tab-separated, columns
         ``id, freq, range, ngram``) and compute the C-Value of every bigram
         and of the longer n-grams containing it;
      2. write the full C-Value ranking to ``<c_value_dir>/c_value_<pos>.txt``;
      3. take the ``top_n`` best candidates, collect their context words and
         compute the NC-Value;
      4. write the NC-Value ranking to ``<PATH2>/nc_value_<pos>.txt``.

    Args:
        pos: POS-pattern suffix used in the file names (e.g. ``'s_v'``).
        d: mapping ``bigram -> {'freq': total, 'ngrams': {ngram: freq}}``.
        PATH_bigrams: folder holding the bigram list; the same folder is
            passed on as the text folder for the context search.
        PATH2: output folder for the NC-Value ranking.
        top_n: number of best C-Value candidates that receive an NC-Value.
        c_value_dir: output folder for the full C-Value ranking.

    Returns:
        None. Both rankings are written as tab-separated files.
    """
    df = pd.read_csv('{}/bigrams_{}.txt'.format(PATH_bigrams, pos), sep='\t',
                     names=['id', 'freq', 'range', 'ngram'], index_col=False)

    # Float from the start: the C-Values written into this column are floats.
    df['c-value'] = 0.0
    d_for_ngrams = {'ngram': [], 'c-value': []}

    df, d_for_ngrams = count_c_value_for_ngrams(d, df, d_for_ngrams)

    # ignore_index gives every row a unique label. Both parts start at 0, so
    # without it a label could occur twice and `.loc[label]` would return a
    # Series instead of a single value.
    c_value_all = pd.concat([df[['ngram', 'c-value']], pd.DataFrame(d_for_ngrams)],
                            ignore_index=True).sort_values('c-value', ascending=False)
    c_value_all.to_csv('{}/c_value_{}.txt'.format(c_value_dir, pos),
                       sep='\t')

    # head(top_n) keeps exactly top_n rows (a [0:249] slice keeps only 249);
    # the copy makes the selection an independent frame that can take a new
    # column.
    c_value_250 = c_value_all.head(top_n).copy()

    w_context = create_w_context(c_value_250, pos, PATH_bigrams)

    w_count = create_w_count(w_context, pos)

    # Build the whole column at once instead of per-row chained assignment.
    c_value_250['nc-value'] = [count_nc_value(i, w_context, w_count, c_value_250)
                               for i in tqdm_notebook(c_value_250.index)]

    c_value_250.sort_values('nc-value', ascending=False).to_csv('{}/nc_value_{}.txt'.format(PATH2, pos),
                                                                sep='\t', index=False)

In [None]:
# Compute the C-Value and NC-Value rankings once per POS-pattern suffix.
# The list is bound to `pos_list`, the name the loop iterates over, so the
# cell no longer depends on a `pos_list` left behind by an earlier cell.
# `d`, PATH_bigrams and PATH2 are not assigned in this cell; they are expected
# to be defined before it is executed.
pos_list = ['s_v', 'v_s', 'pr_s', 's_pr', 'a_s', 'adv_v', 'v_pr']
for pos in tqdm_notebook(pos_list):
    count_c_nc_value(pos, d, PATH_bigrams, PATH2)

В результате для каждого морфологического шаблона (Verb + Noun, Noun + Verb, Prep + Noun, Noun + Prep, Verb + Prep, Adj + Noun, Adv + Verb) были получены списки bigrams, отсортированные с использованием перечисленных методов.