In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the per-condition summary statistics table. Lines in the CSV that start
# with '#' are skipped as comments, and the three design factors are used
# together as the row MultiIndex.
index_columns = ['gender', 'utterance', 'onset']
stats = pd.read_csv('oh2011-statistics.csv', comment='#', index_col=index_columns)
stats

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,vot_mean,vot_sd,f0_mean,f0_sd
gender,utterance,onset,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
female,isolation,fortis,14,9,246,19
female,isolation,lenis,58,21,211,19
female,isolation,aspirated,72,21,255,21
female,sentence,fortis,14,8,270,20
female,sentence,lenis,44,21,224,20
female,sentence,aspirated,52,20,288,25
male,isolation,fortis,17,9,139,21
male,isolation,lenis,57,21,120,18
male,isolation,aspirated,85,20,147,23
male,sentence,fortis,16,8,151,25


In [239]:
# Experimental design constants.
genders = ('female', 'male')
n_subject_per_gender = 19  # 19 subjects per gender
n_condition = stats.shape[0] // len(genders)  # number of conditions excluding gender (6)
repetition = 6  # each condition is repeated 6 times

# Every subject contributes one row per (condition, repetition) pair.
rows_per_subject = n_condition * repetition

# Subjects are emitted interleaved by gender (female i, male i, female i+1, ...).
# This relies on `stats` listing the female rows before the male rows so that
# the tiled `conditions` array below lines up with these subject rows.
sid_list, gender_list = [], []
for i in range(n_subject_per_gender):
    for gender in genders:
        # Male subject ids are offset so they are numbered after the female ids.
        offset = n_subject_per_gender if gender == 'male' else 0
        sid = f's{offset + i:02d}'
        sid_list += [sid] * rows_per_subject
        gender_list += [gender] * rows_per_subject

df = pd.DataFrame({'sid': sid_list, 'gender': gender_list})

# Repeat each index entry of `stats` `repetition` times, consecutively.
conditions = []
for condition in stats.index.to_list():
    conditions += [condition] * repetition

# Stack the repeated conditions `n_subject_per_gender` times and write them
# into the columns named after the index levels of `stats`.
df[stats.index.names] = np.tile(conditions, (n_subject_per_gender, 1))
df = df.sort_values(['sid', 'utterance'], ignore_index=True)
df

Unnamed: 0,sid,gender,utterance,onset
0,s00,female,isolation,fortis
1,s00,female,isolation,fortis
2,s00,female,isolation,fortis
3,s00,female,isolation,fortis
4,s00,female,isolation,fortis
...,...,...,...,...
1363,s37,male,sentence,aspirated
1364,s37,male,sentence,aspirated
1365,s37,male,sentence,aspirated
1366,s37,male,sentence,aspirated


In [266]:
def get_gamma_params(mean, std):
    """Convert a mean and standard deviation into gamma-distribution parameters.

    Uses the method-of-moments identities for a gamma distribution with
    shape ``k`` and scale ``theta``: ``mean = k * theta`` and
    ``variance = k * theta ** 2``.

    Parameters
    ----------
    mean : number
        Target mean of the distribution.
    std : number
        Target standard deviation of the distribution.

    Returns
    -------
    tuple
        ``(shape, scale)``.
    """
    variance = std ** 2
    shape = mean ** 2 / variance
    scale = variance / mean
    return shape, scale

def generate_vot(condition, n=n_subject_per_gender*repetition):
    """Draw ``n`` random VOT values for one condition.

    VOT is sampled from a gamma distribution because Korean voiceless
    consonants only take positive VOT values.

    Parameters
    ----------
    condition
        Label used to look up one row of the module-level ``stats`` table
        via ``stats.loc``.
    n : int, optional
        Number of samples to draw. The default is computed once, when this
        function is defined, from ``n_subject_per_gender * repetition``.

    Returns
    -------
    numpy.ndarray
        Samples from ``np.random.gamma`` (legacy global RNG).
    """
    row = stats.loc[condition]
    vot_mean, vot_sd = row[['vot_mean', 'vot_sd']]
    alpha, theta = get_gamma_params(vot_mean, vot_sd)
    return np.random.gamma(alpha, theta, n)

def generate_f0(condition, n=n_subject_per_gender*repetition):
    """Draw ``n`` random F0 values for one condition from a normal distribution.

    Parameters
    ----------
    condition
        Label used to look up one row of the module-level ``stats`` table
        via ``stats.loc``.
    n : int, optional
        Number of samples to draw. The default is computed once, when this
        function is defined, from ``n_subject_per_gender * repetition``.

    Returns
    -------
    numpy.ndarray
        Samples from ``np.random.normal`` (legacy global RNG), using the
        row's ``f0_mean`` as the mean and ``f0_sd`` as the standard deviation.
    """
    row = stats.loc[condition]
    f0_mean, f0_sd = row[['f0_mean', 'f0_sd']]
    return np.random.normal(f0_mean, f0_sd, n)

# Seed NumPy's legacy global RNG so the generated data set is reproducible.
np.random.seed(988)

# Create both measurement columns as floats; rows matching a condition in
# `stats` are overwritten in the loop below.
df['vot'] = 0.
df['f0'] = 0.

for condition in stats.index:
    # Boolean mask of the rows whose index-level columns all equal this condition.
    where = (df[stats.index.names] == condition).all(axis=1)
    # Draw exactly as many samples as there are matching rows instead of
    # relying on the generators' default `n`, which is frozen when they are
    # defined and can fall out of step with `df`.
    n_rows = int(where.sum())
    # Keep the VOT-then-F0 order: it determines how the seeded random stream
    # is consumed and therefore the exact values written out.
    df.loc[where, 'vot'] = generate_vot(condition, n=n_rows)
    df.loc[where, 'f0'] = generate_f0(condition, n=n_rows)

# Write the simulated data set without the positional index.
df.to_csv('../data/vowel/vot-f0.csv', index=False)
df

Unnamed: 0,sid,gender,utterance,onset,vot,f0
0,s00,female,isolation,fortis,15.403059,289.105041
1,s00,female,isolation,fortis,11.418402,253.149570
2,s00,female,isolation,fortis,7.220005,255.336127
3,s00,female,isolation,fortis,10.753237,252.388272
4,s00,female,isolation,fortis,12.803663,261.077793
...,...,...,...,...,...,...
1363,s37,male,sentence,aspirated,66.914421,148.526309
1364,s37,male,sentence,aspirated,79.489516,160.233426
1365,s37,male,sentence,aspirated,46.449577,118.192883
1366,s37,male,sentence,aspirated,45.119535,154.551274
