In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [36]:
# Root folder of the feature-combination experiment artifacts.
# NOTE(review): absolute, user-specific path — the notebook only runs on a machine with this layout.
base_dir = "/Users/lmatayoshi/Documents/Projects/tesis_notebooks/final_v2_experiments/dev/features_combination/"
# Directory prefixes later passed to load_proportions_dict (one per feature family).
eers_legendre_dir = base_dir + "kappa_eers/mix_legendre/"
eers_dct_dir = base_dir + "kappa_eers/mix_dct/"

In [37]:
# Phoneme symbols, in the row order used by every table built in this notebook.
kappa_phonemes = list('GbwBDmis')
# Sub-folder names looked up under each EER directory by the loader.
filenames = 's i_m b_B_D_G_w'.split()
# 0.0 .. 1.0 in steps of 0.1, kept as strings because they double as
# file names and as DataFrame column labels.
proportions = ['%.1f' % (step / 10.0) for step in range(11)]

In [38]:
def kappa_positives_negatives_dict():
    """Return the hard-coded per-phoneme counts as three dictionaries.

    Returns:
        tuple: ``(total_dict, positives_dict, negatives_dict)``; each maps a
        phoneme symbol to an integer count. In this table every total equals
        positives + negatives.
    """
    # One row per phoneme: (symbol, total, positives, negatives).
    count_table = (
        ('G', 865, 222, 643),
        ('b', 923, 528, 395),
        ('w', 1243, 743, 500),
        ('B', 1597, 428, 1169),
        ('D', 2929, 920, 2009),
        ('m', 3920, 3234, 686),
        ('i', 6167, 4929, 1238),
        ('s', 8035, 7555, 480),
    )
    total_dict = {}
    positives_dict = {}
    negatives_dict = {}
    for symbol, total, positives, negatives in count_table:
        total_dict[symbol] = total
        positives_dict[symbol] = positives
        negatives_dict[symbol] = negatives
    return (total_dict, positives_dict, negatives_dict)

In [39]:
def load_proportions_dict(dir):
    """Load one value per (phoneme, proportion) from the files under ``dir``.

    For every name in the module-level ``filenames`` and every entry ``p`` of
    ``proportions`` the file ``dir + name + "/" + p`` is read. Each line is
    split on ":"; the first field is the phoneme key and the second is parsed
    with ``float`` (EER values, judging by the directory names).

    Args:
        dir: Directory prefix as a string; it is concatenated verbatim, so it
            must already end with a path separator. (The name shadows the
            builtin but is kept because it is the public parameter name.)

    Returns:
        dict: ``{phoneme: {proportion: value}}`` for every phoneme in
        ``kappa_phonemes``. Pairs never seen in any file keep an empty-list
        placeholder.

    Raises:
        KeyError: a file names a phoneme that is not in ``kappa_phonemes``.
        IndexError, ValueError: a kept line is not of the form ``key:number``.
        IOError/OSError: an expected file cannot be opened.
    """
    # Placeholders are sized from `proportions` itself instead of a literal 11,
    # so the structure stays complete if the list of proportions changes.
    proportions_dict = {
        phoneme: {p: [] for p in proportions} for phoneme in kappa_phonemes
    }
    for filename in filenames:
        for p in proportions:
            proportions_f = dir + filename + "/" + p
            with open(proportions_f, "r") as f:
                rows = [raw.strip().split(":") for raw in f]
            # The last line of every file is dropped unconditionally.
            # NOTE(review): presumably a trailing non-data line — confirm
            # against the files, otherwise the final phoneme is lost.
            for row in rows[:-1]:
                proportions_dict[row[0]][p] = float(row[1])
    return proportions_dict

In [40]:
# Nested {phoneme: {proportion: value}} dict read from the mix_legendre folder.
legendre_proportions_dict = load_proportions_dict(eers_legendre_dir)

In [41]:
# Same structure as above, read from the mix_dct folder.
dct_proportions_dict = load_proportions_dict(eers_dct_dir)

### Step by step

In [42]:
# Step-by-step version of the loader: start with an empty-list placeholder for
# every (phoneme, proportion) pair. Sized from `proportions` rather than a
# literal 11 so no key is silently dropped if the list changes length.
proportions_dict = {
    phoneme: {p: [] for p in proportions} for phoneme in kappa_phonemes
}

In [43]:
# Fill the placeholders from the Legendre files: one file per
# (sub-folder, proportion), one "phoneme:value" entry per line.
for filename in filenames:
    for p in proportions:
        proportions_f = eers_legendre_dir + filename + "/" + p
        with open(proportions_f, "r") as f:
            # Split every line on ":" and discard the file's last line.
            lines = [raw.strip().split(":") for raw in f][:-1]
            for line in lines:
                phoneme_key, raw_value = line[0], line[1]
                proportions_dict[phoneme_key][p] = float(raw_value)

# New dataframe 

In [44]:
def dataframe_from_proportions_dict(proportions_dict):
    """Build the per-phoneme summary table from a nested proportions dict.

    Args:
        proportions_dict: ``{phoneme: {proportion: value}}`` containing every
            phoneme in ``kappa_phonemes`` and every entry of ``proportions``
            (the structure returned by ``load_proportions_dict``).

    Returns:
        pandas.DataFrame: float-valued frame indexed by ``kappa_phonemes``.
        Columns are the entries of ``proportions`` followed by ``'min'`` (row
        minimum over the proportion columns), ``'positives'``, ``'negatives'``
        and ``'total'`` (counts from ``kappa_positives_negatives_dict``).

    Raises:
        KeyError: a phoneme or proportion is missing from ``proportions_dict``.
        ValueError, TypeError: a stored value is not numeric (for example a
            placeholder that was never filled in).
    """
    kappa_total_dict, kappa_positives_dict, kappa_negatives_dict = kappa_positives_negatives_dict()

    # Build the numeric matrix in one step, with no `None` seed row to strip
    # afterwards, so it is float from the start rather than object dtype.
    values_matrix = np.array(
        [[proportions_dict[phoneme][p] for p in proportions] for phoneme in kappa_phonemes],
        dtype=float,
    )
    row_minimum = values_matrix.min(axis=1)
    counts = np.array(
        [
            [kappa_positives_dict[phoneme], kappa_negatives_dict[phoneme], kappa_total_dict[phoneme]]
            for phoneme in kappa_phonemes
        ],
        dtype=float,
    )

    base_matrix = np.column_stack((values_matrix, row_minimum, counts))
    columns = list(proportions) + ['min', 'positives', 'negatives', 'total']
    dataframe = pd.DataFrame(base_matrix, index=kappa_phonemes, columns=columns)
    # `np.float` was removed in NumPy 1.24; the builtin `float` is the same dtype.
    return dataframe.astype(float)

In [45]:
# Summary table for the Legendre proportions; the bare name renders it as the cell output.
legendre_dataframe = dataframe_from_proportions_dict(legendre_proportions_dict)
legendre_dataframe

Unnamed: 0,0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0,min,positives,negatives,total
G,0.162162,0.166667,0.166667,0.18018,0.18018,0.189736,0.197512,0.195956,0.198198,0.199669,0.202703,0.162162,222.0,643.0,865.0
b,0.121519,0.128788,0.146835,0.157197,0.164557,0.162025,0.164557,0.164773,0.164557,0.168126,0.16962,0.121519,528.0,395.0,923.0
w,0.150605,0.152086,0.151364,0.149872,0.15074,0.149394,0.15074,0.153432,0.151377,0.152086,0.152938,0.149394,743.0,500.0,1243.0
B,0.205607,0.200935,0.200935,0.209829,0.212617,0.214953,0.214024,0.215495,0.214953,0.214953,0.21729,0.200935,428.0,1169.0,1597.0
D,0.181682,0.179691,0.187436,0.193629,0.200597,0.207609,0.212544,0.221503,0.225,0.228261,0.231458,0.179691,920.0,2009.0,2929.0
m,0.153989,0.141399,0.144712,0.144315,0.147495,0.151206,0.153061,0.154607,0.154581,0.154519,0.155612,0.141399,3234.0,686.0,3920.0
i,0.254443,0.25421,0.25269,0.254443,0.255427,0.256058,0.257515,0.258537,0.260097,0.259282,0.258959,0.25269,4929.0,1238.0,6167.0
s,0.325444,0.295833,0.307964,0.313848,0.314583,0.316667,0.320833,0.320833,0.316744,0.316667,0.31875,0.295833,7555.0,480.0,8035.0


In [46]:
# Raw (unrounded) values of the '0.0' column, one per phoneme.
legendre_dataframe['0.0'].values

array([ 0.16216216,  0.12151899,  0.15060462,  0.20560748,  0.18168243,
        0.15398887,  0.25444265,  0.32544379])

In [47]:
# Summary table for the DCT proportions, exported rounded to 3 decimals;
# the frame displayed below keeps full precision.
dct_dataframe = dataframe_from_proportions_dict(dct_proportions_dict)
output_filename = base_dir + "kappa_csvs/feature_combination_dct.csv"
dct_dataframe.round(3).to_csv(path_or_buf=output_filename, index_label="Phonemes")
dct_dataframe

Unnamed: 0,0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0,min,positives,negatives,total
G,0.162162,0.167963,0.189189,0.198198,0.202177,0.203468,0.211712,0.216767,0.220721,0.221127,0.225225,0.162162,222.0,643.0,865.0
b,0.121519,0.134177,0.146835,0.151899,0.15937,0.162025,0.164773,0.162879,0.163744,0.16962,0.172348,0.121519,528.0,395.0,923.0
w,0.150605,0.15074,0.153184,0.158,0.161507,0.16,0.163828,0.165429,0.168237,0.169583,0.171198,0.150605,743.0,500.0,1243.0
B,0.205607,0.200376,0.206159,0.206095,0.212273,0.215569,0.223268,0.224299,0.228642,0.232272,0.232311,0.200376,428.0,1169.0,1597.0
D,0.181682,0.188043,0.190396,0.203261,0.21561,0.221739,0.232952,0.233449,0.238291,0.238043,0.243902,0.181682,920.0,2009.0,2929.0
m,0.153989,0.144315,0.144712,0.147186,0.146424,0.145428,0.14723,0.150146,0.151603,0.153061,0.154519,0.144315,3234.0,686.0,3920.0
i,0.254443,0.251986,0.252019,0.256318,0.261489,0.263745,0.265445,0.266383,0.26979,0.272213,0.271405,0.251986,4929.0,1238.0,6167.0
s,0.325444,0.283333,0.285109,0.297177,0.308959,0.310417,0.3137,0.314319,0.312932,0.314583,0.3125,0.283333,7555.0,480.0,8035.0


In [31]:
# Raw (unrounded) values of the '0.0' column, one per phoneme.
dct_dataframe['0.0'].values

array([ 0.16216216,  0.12151899,  0.15060462,  0.20560748,  0.18168243,
        0.15398887,  0.25444265,  0.32544379])

In [159]:
# Per-phoneme counts as (n_phonemes, 1) column vectors, ordered like kappa_phonemes.
kappa_total_dict, kappa_positives_dict, kappa_negatives_dict = kappa_positives_negatives_dict()
kappa_total, kappa_positives, kappa_negatives = [
    np.array([counts[phoneme] for phoneme in kappa_phonemes]).reshape(-1, 1)
    for counts in (kappa_total_dict, kappa_positives_dict, kappa_negatives_dict)
]
# Phoneme symbols as a column vector as well.
kappa_column = np.array(kappa_phonemes).reshape(-1, 1)

In [160]:
# Accumulator for the per-phoneme minimum, filled by the loop in the next cell.
minimum = []

In [161]:
# Build the (phoneme x proportion) matrix and the per-phoneme minimum.
# `minimum` is re-initialised here so the cell can be re-run on its own:
# it is rebound to an ndarray at the end, which has no `.append`.
minimum = []
phoneme_rows = []
for phoneme in kappa_phonemes:
    phoneme_values = np.array([proportions_dict[phoneme][p] for p in proportions])
    minimum.append(np.min(phoneme_values))
    phoneme_rows.append(phoneme_values)
# Stack the rows once, with no `None` seed row to add and strip again
# (the seed row also forced the whole matrix to object dtype).
base_matrix = np.vstack(phoneme_rows)
minimum = np.array(minimum)

In [162]:
# Append the summary columns in order: min, positives, negatives, total.
# Re-running this cell appends the four columns again; rebuild base_matrix first.
base_matrix = np.column_stack((
    base_matrix,
    minimum.reshape(-1, 1),
    kappa_positives,
    kappa_negatives,
    kappa_total,
))

In [163]:
# Label the matrix: rows are phonemes, columns are the proportions plus the four summary columns.
dataframe = pd.DataFrame(base_matrix, index=kappa_phonemes, columns=proportions + ['min', 'positives', 'negatives', 'total'])

In [167]:
# Cast every column to float64. `np.float` was only an alias of the builtin
# and was removed in NumPy 1.24, so the builtin is used directly.
dataframe = dataframe.astype(float)

In [168]:
# Destination of the Legendre CSV.
# NOTE(review): the DCT table is written under base_dir + "kappa_csvs/"; this one goes
# directly under base_dir — confirm the different folder is intended.
output_filename = base_dir + "feature_combination_legendre.csv"

In [172]:
# Display the step-by-step Legendre table at full precision.
dataframe

Unnamed: 0,0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0,min,positives,negatives,total
G,0.162162,0.166667,0.166667,0.18018,0.18018,0.189736,0.197512,0.195956,0.198198,0.199669,0.202703,0.162162,222.0,643.0,865.0
b,0.121519,0.128788,0.146835,0.157197,0.164557,0.162025,0.164557,0.164773,0.164557,0.168126,0.16962,0.121519,528.0,395.0,923.0
w,0.150605,0.152086,0.151364,0.149872,0.15074,0.149394,0.15074,0.153432,0.151377,0.152086,0.152938,0.149394,743.0,500.0,1243.0
B,0.205607,0.200935,0.200935,0.209829,0.212617,0.214953,0.214024,0.215495,0.214953,0.214953,0.21729,0.200935,428.0,1169.0,1597.0
D,0.181682,0.179691,0.187436,0.193629,0.200597,0.207609,0.212544,0.221503,0.225,0.228261,0.231458,0.179691,920.0,2009.0,2929.0
m,0.153989,0.141399,0.144712,0.144315,0.147495,0.151206,0.153061,0.154607,0.154581,0.154519,0.155612,0.141399,3234.0,686.0,3920.0
i,0.254443,0.25421,0.25269,0.254443,0.255427,0.256058,0.257515,0.258537,0.260097,0.259282,0.258959,0.25269,4929.0,1238.0,6167.0
s,0.325444,0.295833,0.307964,0.313848,0.314583,0.316667,0.320833,0.320833,0.316744,0.316667,0.31875,0.295833,7555.0,480.0,8035.0


In [169]:
# Export a copy rounded to 3 decimals; the in-memory frame is left unrounded.
dataframe.round(3).to_csv(path_or_buf=output_filename, index_label="Phonemes")

# Main

In [5]:
def main(eers_dir, csv_filename, values):
    """Collect per-phoneme values from one file per entry of ``values`` and export them.

    For each ``v`` in ``values`` the file ``eers_dir + str(v)`` is read. Every
    line is split on ":" and only lines that yield exactly two fields are kept
    as ``phoneme -> value`` pairs.

    Args:
        eers_dir: Directory prefix as a string; concatenated verbatim with
            ``str(v)``, so it must already end with a path separator.
        csv_filename: Path of the CSV file to write.
        values: Sequence of keys; used both as file names (via ``str``) and as
            column labels. Any iterable is accepted.

    Returns:
        pandas.DataFrame: float-valued frame indexed by phoneme, with one
        column per entry of ``values`` followed by ``'corrects'``,
        ``'incorrects'`` and ``'total'`` (taken from the positives, negatives
        and total counts of ``kappa_positives_negatives_dict``), sorted by
        ascending ``'total'``. A copy rounded to 3 decimals is written to
        ``csv_filename`` with index label ``"Phonemes"``.

    Raises:
        KeyError: a file lacks an entry for one of ``kappa_phonemes``.
        ValueError: a stored value cannot be parsed as a float.
        IOError/OSError: an expected file cannot be opened.
    """
    values = list(values)

    eers_by_value = {}
    for v in values:
        with open(eers_dir + str(v), "r") as f:
            fields = [raw.strip().split(":") for raw in f]
        # Skip blank or malformed lines: only "key:value" pairs are kept.
        eers_by_value[v] = dict(pair for pair in fields if len(pair) == 2)

    kappa_total_dict, kappa_positives_dict, kappa_negatives_dict = kappa_positives_negatives_dict()

    # Assemble a purely numeric matrix. The previous version seeded the matrix
    # with the phoneme symbols, which made numpy convert every number to a
    # string that then had to be parsed back.
    eer_matrix = np.array(
        [[float(eers_by_value[v][phoneme]) for v in values] for phoneme in kappa_phonemes],
        dtype=float,
    )
    counts = np.array(
        [
            [kappa_positives_dict[phoneme], kappa_negatives_dict[phoneme], kappa_total_dict[phoneme]]
            for phoneme in kappa_phonemes
        ],
        dtype=float,
    )

    dataframe = pd.DataFrame(
        np.hstack((eer_matrix, counts)),
        index=kappa_phonemes,
        columns=values + ['corrects', 'incorrects', 'total'],
    )
    # `np.float` was removed in NumPy 1.24; the builtin `float` is the same dtype.
    dataframe = dataframe.astype(float)
    dataframe = dataframe.sort_values(by=['total'])
    dataframe.round(3).to_csv(path_or_buf=csv_filename, index_label="Phonemes")
    return dataframe


# Fusion systems

In [12]:
# Table for the "legendre_best_alpha" files, one column per value 0.1 .. 1.0.
# NOTE(review): `eers_dir` and `csvs_dir` are not assigned in any cell above this one in the
# visible notebook (`eers_dir` is only bound further down, `csvs_dir` nowhere) — define both
# before this cell so it survives Restart & Run All.
dataframe = main(
    eers_dir + "legendre_best_alpha_eers/",
    csvs_dir + "legendre_best_alpha.csv",
    [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
)

In [13]:
# Display the table returned by main() at full precision.
dataframe

Unnamed: 0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0,corrects,incorrects,total
G,0.222395,0.225505,0.223071,0.220721,0.22395,0.224124,0.22395,0.216216,0.225225,0.222395,222.0,643.0,865.0
b,0.183612,0.175515,0.174363,0.176136,0.179747,0.183712,0.18481,0.1875,0.189873,0.186177,528.0,395.0,923.0
w,0.158348,0.160097,0.161507,0.158,0.158666,0.159162,0.16142,0.160162,0.161507,0.158816,743.0,500.0,1243.0
B,0.2284,0.226408,0.227371,0.227055,0.221963,0.222412,0.221963,0.221963,0.224123,0.225423,428.0,1169.0,1597.0
D,0.23913,0.234783,0.238043,0.242907,0.240014,0.240014,0.244803,0.244898,0.246889,0.24888,920.0,2009.0,2929.0
m,0.16035,0.161808,0.161581,0.16035,0.161741,0.162028,0.162415,0.166181,0.16517,0.166181,3234.0,686.0,3920.0
i,0.25563,0.255224,0.253222,0.253635,0.256058,0.256058,0.255224,0.256866,0.258006,0.259289,4929.0,1238.0,6167.0
s,0.304167,0.312984,0.320833,0.321305,0.325,0.330756,0.329167,0.325,0.324024,0.31875,7555.0,480.0,8035.0


In [18]:
# Mean over phonemes of the first 10 columns (the value columns; the three
# count columns at the end are excluded). Positional slice made explicit.
column_means = dataframe.mean(axis=0)
column_means.iloc[:10].values

array([ 0.21900393,  0.21904028,  0.219999  ,  0.22001357,  0.22089228,
        0.22228324,  0.222969  ,  0.22234812,  0.22435224,  0.22323873])

In [12]:
# Table for the Legendre supervector files, one column per value 0.0 .. 1.0.
# NOTE(review): relies on `eers_dir` / `csvs_dir` already being defined in the kernel — confirm.
main(
    eers_dir + "features_combination_supervectors_legendre_eers/",
    csvs_dir + "features_combination_supervectors_legendre.csv",
    [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
)

Unnamed: 0,0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0,corrects,incorrects,total
G,0.162162,0.166539,0.167963,0.174184,0.183515,0.189189,0.193694,0.200622,0.198198,0.202703,0.202703,222.0,643.0,865.0
b,0.121519,0.125,0.136709,0.14264,0.153409,0.162025,0.163004,0.164557,0.16468,0.170455,0.173825,528.0,395.0,923.0
w,0.150605,0.156,0.154724,0.152086,0.152086,0.149394,0.149394,0.15074,0.15,0.152086,0.154,743.0,500.0,1243.0
B,0.205607,0.198598,0.209396,0.214953,0.222412,0.226867,0.222992,0.221963,0.221557,0.224123,0.224979,428.0,1169.0,1597.0
D,0.181682,0.181522,0.189149,0.192391,0.201593,0.207609,0.213043,0.217279,0.219333,0.22001,0.221503,920.0,2009.0,2929.0
m,0.153989,0.145022,0.147186,0.147959,0.150146,0.151515,0.15315,0.152752,0.154519,0.154917,0.154519,3234.0,686.0,3920.0
i,0.254443,0.258481,0.257739,0.254413,0.25525,0.254616,0.255021,0.253635,0.25421,0.253446,0.252827,4929.0,1238.0,6167.0
s,0.325444,0.296492,0.299026,0.302846,0.302083,0.300993,0.30086,0.299694,0.298345,0.298743,0.3,7555.0,480.0,8035.0


In [13]:
# Table for the DCT supervector files, one column per value 0.0 .. 1.0.
# NOTE(review): relies on `eers_dir` / `csvs_dir` already being defined in the kernel — confirm.
main(
    eers_dir + "features_combination_supervectors_dct_eers/",
    csvs_dir + "features_combination_supervectors_dct.csv",
    [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
)

Unnamed: 0,0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0,corrects,incorrects,total
G,0.162162,0.174184,0.175676,0.189189,0.19038,0.191291,0.192773,0.206843,0.207207,0.207207,0.211509,222.0,643.0,865.0
b,0.121519,0.126582,0.141282,0.151899,0.156962,0.162025,0.162025,0.164359,0.167089,0.172152,0.177215,528.0,395.0,923.0
w,0.150605,0.150442,0.15121,0.15,0.156,0.157738,0.158773,0.158,0.16,0.162853,0.163315,743.0,500.0,1243.0
B,0.205607,0.201882,0.203271,0.21028,0.213003,0.216424,0.217282,0.222295,0.224979,0.229256,0.231308,428.0,1169.0,1597.0
D,0.181682,0.182781,0.190144,0.201593,0.212359,0.218019,0.221982,0.223992,0.231458,0.230435,0.233449,920.0,2009.0,2929.0
m,0.153989,0.140723,0.145714,0.145462,0.144811,0.15102,0.152041,0.15337,0.153989,0.153989,0.153061,3234.0,686.0,3920.0
i,0.254443,0.251775,0.250404,0.253398,0.255224,0.260097,0.263131,0.263328,0.267367,0.268174,0.268174,4929.0,1238.0,6167.0
s,0.325444,0.283333,0.289583,0.294748,0.297816,0.298312,0.303111,0.304167,0.3,0.29996,0.301787,7555.0,480.0,8035.0


# Step by step

In [19]:
# Inputs for the step-by-step walk-through of main().
# NOTE(review): this rebinds `base_dir`, which an earlier cell set to a different folder;
# cells run after this one see the new value. `eers_dir` here already ends in the
# ".../features_combination_supervectors_legendre_eers/" folder.
base_dir = "/Users/lmatayoshi/Documents/Projects/tesis_notebooks/kappa_experiments/"
eers_dir = base_dir + "features_combination_supervectors_legendre_eers/"
csv_filename = base_dir + "features_combination_supervectors_legendre.csv"

In [22]:
# 0.0 .. 1.0 in steps of 0.1, as floats this time; str() of each value is the
# file name read in the loop below. Rebinds the string-valued `proportions`
# defined near the top of the notebook.
proportions = [step / 10.0 for step in range(11)]

In [23]:
# One slot per proportion, initialised to None until its file is read.
eers_by_proportion = dict.fromkeys(proportions)

In [24]:
# Read one file per proportion and keep its "phoneme:value" lines as a dict.
for p in proportions:
    with open(eers_dir + str(p), "r") as f:
        lines = f.readlines()
        split_lines = [raw.strip().split(":") for raw in lines]
        # Lines that do not split into exactly two fields are ignored.
        eer_degree_i = dict(pair for pair in split_lines if len(pair) == 2)
        eers_by_proportion[p] = eer_degree_i

In [25]:
# Per-phoneme counts as (n_phonemes, 1) column vectors, ordered like kappa_phonemes.
kappa_total_dict, kappa_positives_dict, kappa_negatives_dict = kappa_positives_negatives_dict()
kappa_total, kappa_positives, kappa_negatives = [
    np.array([counts[phoneme] for phoneme in kappa_phonemes]).reshape(-1, 1)
    for counts in (kappa_total_dict, kappa_positives_dict, kappa_negatives_dict)
]

In [26]:
# Seed column holding the phoneme symbols; it is sliced off again ([:, 1:]) when the DataFrame is built.
base_matrix = np.array(kappa_phonemes).reshape(-1,1)

In [27]:
# Append one column per proportion. Iterate `proportions` itself rather than
# the dict's keys: the DataFrame built next labels the columns from
# `proportions`, and dict key order is only guaranteed from Python 3.7 on, so
# iterating the keys could attach values to the wrong column label.
for key in proportions:
    results_dict = eers_by_proportion[key]
    results = [float(results_dict[phoneme]) for phoneme in kappa_phonemes]
    results = np.array(results).reshape(-1,1)
    base_matrix = np.hstack((base_matrix, results))

# Then the three count columns: positives, negatives, total.
base_matrix = np.hstack((base_matrix, kappa_positives))
base_matrix = np.hstack((base_matrix, kappa_negatives))
base_matrix = np.hstack((base_matrix, kappa_total))

In [28]:
# Drop the seed column of phoneme symbols and label the remaining columns.
dataframe = pd.DataFrame(base_matrix[:, 1:], index=kappa_phonemes, columns=proportions+['corrects', 'incorrects', 'total'])
# Cast to float64. Two fixes here: a stray trailing `b` made this line a
# syntax error, and `np.float` (removed in NumPy 1.24) is replaced by `float`.
dataframe = dataframe.astype(float)
dataframe = dataframe.sort_values(by=['total'])
# Export a copy rounded to 3 decimals; the in-memory frame keeps full precision.
dataframe.round(3).to_csv(path_or_buf=csv_filename, index_label="Phonemes")


In [56]:
# Scratch cell: the phoneme symbols joined with underscores.
"_".join(['G', 'b', 'w', 'B', 'D', 'm', 'i', 's'])

'G_b_w_B_D_m_i_s'

In [2]:
# Scratch arithmetic: difference between 0.189 and 0.187 relative to 0.188.
# NOTE(review): the source of these three numbers is not shown in the notebook — confirm or remove.
(0.189 - 0.187) / 0.188

0.010638297872340436