In [1]:
import pandas as pd
from tqdm import tqdm
from scipy.stats import mannwhitneyu
import numpy as np
import warnings
import statistics
# Silence every warning for the rest of the session. Note that this also hides
# pandas diagnostics such as SettingWithCopyWarning and FutureWarning.
warnings.filterwarnings("ignore")

# Register `progress_apply` on pandas objects so row-wise applies show a tqdm bar.
tqdm.pandas()

# Import data

Import the manifest, which lists for every utterance its corpus, the path to the audio, the transcription, the audio duration, the informant code and the number of tokens in the transcription.

In [3]:
# Load the utterance-level manifest. The columns this notebook relies on later are
# 'corpus', 'informant', 'duration' and 'tokens_len'.
df = pd.read_excel('manifest.xlsx')

Let's set a threshold for each corpus: the maximum number of minutes of audio to sample from it.

In [4]:
# Minutes of audio to sample from each corpus. Corpora that share a budget are
# listed together; the insertion order is the order in which the corpora are
# processed later on.
amount_of_minutes_per_corpus = {
    **dict.fromkeys(
        ['Keba', 'Dvina', 'Pinega', 'Pyoza', 'Tserkovnoe', 'Vaduga', 'Veegora'], 63
    ),
    **dict.fromkeys(['Manturovo', 'LukhTeza', 'Novgorod'], 441),
    **dict.fromkeys(['Opochka', 'Pyatiusovo', 'Shetnevo'], 147),
    **dict.fromkeys(['Luzhnikovo', 'Nekhochi', 'Rogovatka'], 441),
    **dict.fromkeys(['Malinino', 'Mikhaylov', 'Popovka'], 147),
    'Don': 441,
}

In [5]:
# The manifest stores clip length in 'duration'; keep a minutes version next to it,
# because the per-corpus limits are expressed in minutes.
df['duration_min'] = df['duration'].div(60)

# Display the dtypes with 'tokens_len' cast to float64. The cast frame is only
# shown, not assigned, so `df` itself keeps its original 'tokens_len' dtype.
df.astype({'tokens_len': np.float64}).dtypes

Unnamed: 0.1      int64
Unnamed: 0        int64
corpus           object
path             object
text             object
duration        float64
informant        object
tokens_len      float64
duration_min    float64
dtype: object

# Prepare functions for creating subcorpora

We define a function that selects utterances for the dataset. Going through the informants of a corpus in turn, we randomly draw one utterance per informant and add it to the dataset if it still fits within the duration limit. Every utterance that has been drawn is flagged so that it is not drawn again.

In [6]:
def get_random_element(corpus, limit, random_state, data=None):
    """Draw a random, speaker-balanced subcorpus lasting less than ``limit`` minutes.

    The utterances of ``corpus`` are visited speaker by speaker in repeated
    passes. In each pass one not-yet-drawn utterance is sampled per speaker and
    kept only if the running total stays strictly below ``limit``. Every drawn
    utterance is flagged, whether it was kept or not, so it is never drawn
    again. Sampling stops after a pass that keeps nothing new; the very first
    pass is exempt, so a first round of over-long draws gets one retry.

    Parameters
    ----------
    corpus : str
        Value of the ``corpus`` column to sample from.
    limit : float
        Exclusive upper bound on the summed ``duration_min`` of the result.
    random_state : int
        Seed forwarded to every ``DataFrame.sample`` call.
    data : pandas.DataFrame, optional
        Manifest to sample from; needs the columns ``corpus``, ``informant``
        and ``duration_min``. Defaults to the notebook-level ``df``.

    Returns
    -------
    pandas.DataFrame
        The kept rows in the order they were drawn, with their original index
        labels and an ``AlreadyChosen`` column equal to 0. If nothing fits, an
        empty frame with the manifest's columns.
    """
    source = df if data is None else data

    # Work on a private copy: the flag column must be written to a real frame,
    # not to a slice of the shared manifest.
    pool = source[source['corpus'] == corpus].copy()
    pool['AlreadyChosen'] = 0
    speakers = pool['informant'].unique()

    kept_rows = []         # single-row frames, concatenated once at the end
    kept_minutes = 0       # running total instead of re-summing the result each time
    kept_before_pass = -1  # sentinel so the first pass may finish empty-handed once

    # For integer limits this guard never fails, because rows are accepted only
    # while the total stays below `limit`; the loop really ends at the break below.
    while round(kept_minutes) - 0.3 < limit:
        for speaker in speakers:
            candidates = pool[(pool['informant'] == speaker) & (pool['AlreadyChosen'] == 0)]
            if candidates.empty:
                continue
            row = candidates.sample(n=1, random_state=random_state)
            # Flag the draw even when it turns out not to fit.
            pool.loc[row.index, 'AlreadyChosen'] = 1
            row_minutes = row['duration_min'].sum()
            if kept_minutes + row_minutes < limit:
                kept_rows.append(row)
                kept_minutes += row_minutes
        if len(kept_rows) == kept_before_pass:
            break
        kept_before_pass = len(kept_rows)

    if not kept_rows:
        return pd.DataFrame(columns=source.columns)
    return pd.concat(kept_rows)

This function checks how representative the sample drawn with each random seed is. For every seed, the Mann-Whitney U test compares the sampled subcorpus with the full corpus, once on audio durations and once on the number of tokens per utterance.

In [7]:
def count_distributions(corpus, limit, random_states=range(200, 500), alpha=0.05):
    """Score each random seed by how well its sample matches the full corpus.

    For every seed a subcorpus is drawn with ``get_random_element`` and compared
    with all utterances of ``corpus`` by two Mann-Whitney U tests: one on
    ``duration_min`` and one on ``tokens_len``.

    Parameters
    ----------
    corpus : str
        Value of the ``corpus`` column to evaluate.
    limit : float
        Duration limit in minutes passed on to ``get_random_element``.
    random_states : iterable of int, optional
        Seeds to try. Defaults to 200..499.
    alpha : float, optional
        Significance level; a test "passes" when its p-value is above it.

    Returns
    -------
    list of list
        One record per seed:
        ``[corpus, seed, audio_ok, text_ok, sum_duration_min, sum_tokens_len]``,
        where the two flags are 1 when the respective p-value exceeds ``alpha``
        and 0 otherwise.
    """
    # The reference distributions do not depend on the seed, so build them once.
    full_corpus = df[df['corpus'] == corpus]
    full_minutes = full_corpus['duration_min'].astype(np.float64)
    full_tokens = full_corpus['tokens_len'].astype(np.float64)

    d = []
    for i in tqdm(random_states):
        res = get_random_element(corpus, limit, i)
        _, p_length_audio = mannwhitneyu(res['duration_min'].astype(np.float64), full_minutes)
        _, p_length_sent = mannwhitneyu(res['tokens_len'].astype(np.float64), full_tokens)

        # Each flag depends on its own p-value only. A NaN p-value compares as
        # False and yields 0 for that flag without affecting the other one.
        d.append([
            corpus,
            i,
            int(p_length_audio > alpha),
            int(p_length_sent > alpha),
            sum(res['duration_min']),
            sum(res['tokens_len']),
        ])
    return d

# Find the best random state, where most of the subcorpora have a distribution similar to their full corpus

In [68]:
# Make sure the shared manifest carries the helper flag column before sampling.
df['AlreadyChosen'] = 0

# One list of per-seed records per corpus, in the order of the limits dictionary.
all_stats_res2 = [
    count_distributions(corpus, limit)
    for corpus, limit in tqdm(amount_of_minutes_per_corpus.items())
]

100%|██████████████████████████████████████████████████████████████████████████████| 20/20 [6:28:49<00:00, 1166.48s/it]


In [69]:
def flatten(xss):
    """Concatenate the items of every iterable in ``xss`` into one flat list (one level deep)."""
    flat = []
    for xs in xss:
        flat.extend(xs)
    return flat
# Merge the per-corpus lists into a single flat list of per-seed records.
all_stats_res_2 = flatten(all_stats_res2)

In [70]:
# Column order mirrors the records produced by count_distributions.
# NOTE(review): the two flag columns record p > 0.05, although their labels say "0.5".
df_all_stats_res = pd.DataFrame(
    all_stats_res_2,
    columns=['Corpus', 'Random_state', 'P-value greater than 0.5 (audio)',
             'P-value greater than 0.5 (text)', 'Sum audio', 'Sum tokens'],
)

# The file name is spelled in plain ASCII. The previous literal began with the
# Cyrillic letter "с" (U+0441), which looks like Latin "c" but names a different file.
df_all_stats_res.to_excel('corpus_statistics_by_random_state.xlsx')

# Count tokens for each variant

In [10]:
# Per-seed statistics for all corpora; this sheet carries a 'Variant' column
# (dialect group) alongside 'Corpus'.
all_random_states = pd.read_excel('corpus_statistics_by_random_state_all.xlsx')

# Distinct variant labels, in order of first appearance.
variants = all_random_states['Variant'].unique().tolist()

In [100]:
def count_entries(corpus, limit, random_state):
    """Return the number of utterances ``get_random_element`` selects for these settings."""
    return len(get_random_element(corpus, limit, random_state))

In [101]:
def _entries_for_row(row):
    """Count the utterances drawn for one (corpus, random state) row of the table."""
    corpus_name = row['Corpus']
    return count_entries(corpus_name, amount_of_minutes_per_corpus[corpus_name], row['Random_state'])


# Re-draw every subcorpus once to record its size; the tqdm bar tracks the rows.
all_random_states['entries'] = all_random_states.progress_apply(_entries_for_row, axis=1)

100%|████████████████████████████████████████████████████████████████████████████████████████| 3453/3453 [7:50:28<00:00,  8.17s/it]


In [102]:
# pandas picks the Excel writer from the file extension, so the name needs one.
all_random_states.to_excel('corpus_statistics_by_random_state_all_entries.xlsx')

In [11]:
# Largest 'Sum tokens' seen for each variant. Despite the variable name these
# are per-variant maxima, not medians.
median = all_random_states.groupby('Variant')['Sum tokens'].max().to_dict()

# Leave three variants out of the reference value.
# NOTE(review): the reason for excluding exactly these is not stated here - confirm.
for excluded_variant in ("Arhangelsk", "pskovskie", "ryazan"):
    median.pop(excluded_variant)

# Mean of the remaining maxima: the token volume the corpora are matched against.
statistics.mean(list(median.values()))

56172.142857142855

In [12]:
# Show the per-variant maxima that remain after the three exclusions.
median

{'Desna': 58007,
 'Kostroma': 64281,
 'Povolzie': 57290,
 'donskie': 56434,
 'mezhzon': 47455,
 'novgorod': 55929,
 'seliger': 53809}

# Find the optimal amount of tokens for variant: we are trying to make proportionate volumes for each group of dialects

Previously, we calculated the volume of tokens for each group of dialects depending on the random state. Now, for every corpus, we look for the random state whose token volume is closest to the target value computed above (the mean of the per-variant maxima) and choose it.

In [13]:
# Target token volume: the mean of the per-variant maxima kept in `median`,
# computed here rather than pasted in as a literal.
target_tokens = statistics.mean(list(median.values()))

corpora = list(all_random_states['Corpus'].unique())
choosed_data_all = []
for corpus in tqdm(corpora):
    corpus_rows = all_random_states[all_random_states['Corpus'] == corpus]
    # Address the token total by name, so additional columns cannot shift it.
    deltas = (corpus_rows['Sum tokens'] - target_tokens).abs()
    # idxmin returns the first minimum, so ties go to the earliest row, exactly
    # as a scan with a strict '<' comparison would resolve them.
    choose = corpus_rows.loc[deltas.idxmin()]
    choosed_data_all.append(choose.tolist())

100%|████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 2856.67it/s]


In [14]:
# One row per corpus: the random state chosen for it and its statistics.
# NOTE(review): the two flag columns record p > 0.05, although their labels say "0.5".
random_state_result = pd.DataFrame(choosed_data_all, columns=['Corpus', 'Variant', 'Random_state', 
                                                              'P-value greater than 0.5 (audio)', 
                                                              'P-value greater than 0.5 (text)', 'Sum audio', 'Sum tokens'])

# Totals per variant. Several columns are selected with a list: selecting them
# with a bare tuple is rejected by pandas 2.x.
random_state_result.groupby('Variant')[['Sum tokens', 'Sum audio']].agg(['sum'])

Unnamed: 0_level_0,Sum tokens,Sum audio
Unnamed: 0_level_1,sum,sum
Variant,Unnamed: 1_level_2,Unnamed: 2_level_2
Arhangelsk,57160,440.884767
Desna,57700,440.932917
Kostroma,63941,440.970433
Povolzie,56715,440.994333
donskie,56174,440.9919
mezhzon,47455,440.985933
novgorod,55929,440.98955
pskovskie,59809,440.934667
ryazan,56529,440.9499
seliger,53809,440.997933


In [15]:
# Total tokens per variant for the chosen random states, as {variant: total}.
res = random_state_result.groupby('Variant')['Sum tokens'].sum().to_dict()

# Spread of the variant totals (population standard deviation); smaller means
# the dialect groups are better balanced.
np.std(list(res.values()))

3971.4174660944423

In [16]:
# One dict per corpus (column name -> value); the final sampling loop reads
# 'Corpus' and 'Random_state' from these.
res_state = random_state_result.to_dict('records')

In [17]:
# Make sure the shared manifest carries the helper flag column before sampling.
df['AlreadyChosen'] = 0

# Re-draw each corpus with the random state chosen for it.
subcorpora = [
    get_random_element(item['Corpus'], amount_of_minutes_per_corpus[item['Corpus']], item['Random_state'])
    for item in tqdm(res_state)
]

# Stack everything in one go, seeded with an empty frame that has the manifest's columns.
df_balanced = pd.concat([pd.DataFrame(columns=df.columns), *subcorpora])

100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [02:22<00:00,  7.13s/it]


In [20]:
# Persist the balanced manifest. With the default settings to_excel also writes
# the index as the first column of the sheet.
df_balanced.to_excel('manifest_balanced.xlsx')