In [1]:
import numpy as np
from sklearn.metrics import roc_curve
from scipy.optimize import brentq
from scipy.interpolate import interp1d
import pandas as pd

# Score-combinations for Kappa

In [2]:
base_dir = "/Users/lmatayoshi/Documents/Projects/tesis_notebooks/final_v2_experiments/dev/score_combination/"
eers_dir = base_dir + "kappa/single/"
csvs_dir = base_dir + "kappa/csvs/"

In [3]:
kappa_phonemes = ['G', 'b', 'w', 'B', 'D', 'm', 'i', 's']

In [4]:
def all_phonemes_empty_dict(phonemes):
    return dict(zip(phonemes, [0.0 for p in phonemes]))

In [5]:
def kappa_positives_negatives_dict():
    phonemes = ['G', 'b', 'w', 'B', 'D', 'm', 'i', 's']
    total_values = [865, 923, 1243, 1597, 2929, 3920, 6167, 8035]
    total_dict = dict(zip(phonemes, total_values))
    positives_values = [222, 528, 743, 428, 920, 3234, 4929, 7555]
    positives_dict = dict(zip(phonemes, positives_values))
    negatives_values = [643, 395, 500, 1169, 2009, 686, 1238, 480]
    negatives_dict = dict(zip(phonemes, negatives_values))
    return (total_dict, positives_dict, negatives_dict)

In [6]:
def check_logids(lines_supervectors, lines_second_source):
    logids_supervectors = [l[0] for l in lines_supervectors]
    logids_second_source = [l[0] for l in lines_second_source]
    if not logids_supervectors == logids_second_source:
        raise Exception(phoneme + " logids doesn't match in supervectors vs second source")
    

In [7]:
def label_for(label):
    if label == "True":
        return 1
    elif label == "False":
        return -1

In [8]:
def compute_EER(tuples):
    labels = [t[1] for t in tuples]
    hyperplane_distances = [t[0] for t in tuples]
    fpr, tpr, _ = roc_curve(labels, hyperplane_distances)
    # eer is where fpr is equal to fnr
    eer = brentq(lambda x : 1. - x - interp1d(fpr, tpr)(x), 0., 1.)
    return eer

Estrategia, diccionario con todos los kappa phonemes por factor. 2 estructuras, una para cada par de features: Supervectors + Legendre y Supervectors + DCT

In [9]:
factors = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

In [10]:
def main(factors, eers_dir, second_source, csvs_dir):
    ### Dictionary generation 
    supervectors_dir = eers_dir + "svm/"
    second_source_dir = eers_dir + second_source + "/"
    csv_filename = csvs_dir + second_source + ".csv"
    factors_dict_second_source = dict(zip(factors, [all_phonemes_empty_dict(kappa_phonemes) for f in factors]))
    for factor in factors:
        for phoneme in kappa_phonemes:
            original_phoneme = phoneme
            if phoneme in ['g', 'y', 'd', 'b', 'n']:
                phoneme = phoneme + "_lowercase"
            supervectors_filename = supervectors_dir + phoneme
            second_source_filename = second_source_dir + phoneme
            with open(supervectors_filename, "r") as f_supervectors:
                lines_supervectors = f_supervectors.readlines()
                lines_supervectors = [l.split(" ") for l in lines_supervectors]
            with open(second_source_filename, "r") as f_second_source:
                lines_second_source = f_second_source.readlines()
                lines_second_source = [l.split(" ") for l in lines_second_source]
            check_logids(lines_supervectors, lines_second_source)
            
            phoneme_values = []
            for i in range(len(lines_supervectors)):
                new_value = float(lines_supervectors[i][4]) + factor*float(lines_second_source[i][4])
                new_line = lines_supervectors[i]
                label = label_for(new_line[3])
                new_tuple = (new_value, label)
                new_line[4] = str(new_value)
                new_line = " ".join(new_line) + "\n"
                phoneme_values.append(new_tuple)
            factors_dict_second_source[factor][original_phoneme] = compute_EER(phoneme_values)
    
    ### Dataframe generation
    kappa_total_dict, kappa_positives_dict, kappa_negatives_dict = kappa_positives_negatives_dict()
    kappa_total = np.array([kappa_total_dict[phoneme] for phoneme in kappa_phonemes]).reshape(-1,1)
    kappa_positives = np.array([kappa_positives_dict[phoneme] for phoneme in kappa_phonemes]).reshape(-1,1)
    kappa_negatives = np.array([kappa_negatives_dict[phoneme] for phoneme in kappa_phonemes]).reshape(-1,1)
    
    base_matrix = np.array(kappa_phonemes).reshape(-1,1)
    for key in factors:
        results_dict = factors_dict_second_source[key]
        results = [float(results_dict[phoneme]) for phoneme in kappa_phonemes]
        results = np.array(results).reshape(-1,1)
        base_matrix = np.hstack((base_matrix, results))
    
    minimums = []
    for row in base_matrix[:, 1:]:
        minimums.append(np.min(np.array(row).astype(np.float)))
    minimums = np.array(minimums).reshape(-1, 1)
    base_matrix = np.column_stack((base_matrix, minimums))
    
    base_matrix = np.hstack((base_matrix, kappa_positives))
    base_matrix = np.hstack((base_matrix, kappa_negatives))
    base_matrix = np.hstack((base_matrix, kappa_total))
    
    dataframe = pd.DataFrame(base_matrix[:, 1:], index=kappa_phonemes, columns=factors+['min','corrects', 'incorrects', 'total'])
    dataframe = dataframe.astype(np.float)
    dataframe = dataframe.sort_values(by=['total'])
    dataframe.round(3).to_csv(path_or_buf=csv_filename, index_label="Phonemes")
    return dataframe

In [11]:
legendre_dataframe = main(factors, eers_dir, "legendre", csvs_dir)
legendre_dataframe

Unnamed: 0,0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0,min,corrects,incorrects,total
G,0.162162,0.161742,0.166407,0.167963,0.169518,0.172508,0.167963,0.171171,0.175676,0.178791,0.178849,0.161742,222.0,643.0,865.0
b,0.121519,0.123106,0.122967,0.128083,0.129114,0.131646,0.131646,0.136709,0.138088,0.143012,0.141772,0.121519,528.0,395.0,923.0
w,0.150605,0.153326,0.15366,0.154778,0.154778,0.154778,0.1527,0.151704,0.15074,0.148048,0.146703,0.146703,743.0,500.0,1243.0
B,0.205607,0.203825,0.199316,0.200935,0.198598,0.195611,0.196262,0.197531,0.199139,0.19846,0.19846,0.195611,428.0,1169.0,1597.0
D,0.181682,0.181185,0.182868,0.182678,0.183696,0.185797,0.187656,0.190217,0.192682,0.196117,0.2,0.181185,920.0,2009.0,2929.0
m,0.153989,0.152058,0.14966,0.14723,0.146568,0.147211,0.147186,0.145773,0.145773,0.145265,0.14723,0.145265,3234.0,686.0,3920.0
i,0.254443,0.253635,0.251775,0.251047,0.251212,0.250972,0.249814,0.250355,0.250295,0.250404,0.250644,0.249814,4929.0,1238.0,6167.0
s,0.325444,0.316745,0.314828,0.310417,0.3125,0.311534,0.310417,0.3125,0.316496,0.31317,0.316667,0.310417,7555.0,480.0,8035.0


In [12]:
legendre_dataframe['min'].values
legendre_dataframe[0.0].values

array([ 0.16216216,  0.12151899,  0.15060462,  0.20560748,  0.18168243,
        0.15398887,  0.25444265,  0.32544379])

In [13]:
dct_dataframe = main(factors, eers_dir, "dct", csvs_dir)
dct_dataframe

Unnamed: 0,0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0,min,corrects,incorrects,total
G,0.162162,0.161742,0.166667,0.171171,0.18018,0.18507,0.184685,0.184787,0.180404,0.184685,0.18818,0.161742,222.0,643.0,865.0
b,0.121519,0.121519,0.124051,0.128788,0.129597,0.134177,0.140152,0.140152,0.142045,0.146262,0.149367,0.121519,528.0,395.0,923.0
w,0.150605,0.152,0.152568,0.155002,0.154954,0.154778,0.154465,0.154,0.154,0.152076,0.154613,0.150605,743.0,500.0,1243.0
B,0.205607,0.203271,0.202737,0.198598,0.201027,0.199316,0.200935,0.201027,0.200653,0.200802,0.201027,0.198598,428.0,1169.0,1597.0
D,0.181682,0.179242,0.181522,0.183176,0.187169,0.190217,0.193122,0.197611,0.2,0.202831,0.203086,0.179242,920.0,2009.0,2929.0
m,0.153989,0.154037,0.15102,0.14723,0.146877,0.145949,0.145949,0.145773,0.145245,0.145773,0.14723,0.145245,3234.0,686.0,3920.0
i,0.254443,0.252408,0.251986,0.250964,0.250558,0.250752,0.249138,0.248326,0.247981,0.248648,0.250761,0.247981,4929.0,1238.0,6167.0
s,0.325444,0.316667,0.304699,0.303069,0.302501,0.298875,0.297917,0.295301,0.295833,0.29375,0.293845,0.29375,7555.0,480.0,8035.0


In [14]:
dct_dataframe[0.0].values

array([ 0.16216216,  0.12151899,  0.15060462,  0.20560748,  0.18168243,
        0.15398887,  0.25444265,  0.32544379])

# Previous version

In [208]:
all_phonemes = ["Y", "f", "c", "x", "G", "d", "b", "g", "z", "w", "N", "B", "rr", "u", "p", "D", "y", "k", "m", "t", "l", "i", "r", "n", "s", "o", "a", "e"]

In [209]:
#factors = [0.2 * i for i in range(0,6)]
factors = [0.0, 0.1, 0.3, 0.5, 0.7, 1.0]

In [210]:
base_dir = "/Users/lmatayoshi/Documents/Projects/tesis_notebooks/main_experiments/score_combinations/dev/"

In [211]:
base_output_dir = base_dir + "output/"
output_dir_legendre = base_output_dir + "supervectors_legendre/"
output_dir_dct = base_output_dir + "supervectors_dct/"
supervectors_dir = base_dir + "supervectors_single_phonemes/"
legendre_dir = base_dir + "legendre_single_phonemes/"
dct_dir = base_dir + "dct_single_phonemes/"

In [212]:
def load_positives_negatives_dict():
    phonemes = [ "Y", "f", "c", "x", "G", "d", "b", "g", "z", "w", "N", "B", "rr", "u", "p", "D", "y", "k", "m", "t", "l", "i", "r", "n", "s", "o", "a", "e"]
    positive_values = [ 53, 682, 405, 590, 222, 773, 528, 887, 189, 743, 911, 428, 491, 1948, 1657, 920, 2453, 1708, 3234, 2938, 3505, 4929, 3650, 7152, 7555, 8040, 10144, 10597]
    negative_values = [10, 10, 105, 153, 643, 89, 395, 114, 997, 500, 443, 1169, 1739, 482, 1055, 2009, 574, 1472, 686, 1542, 1373, 1238, 2617, 476, 480, 2077, 2069, 3484 ]
    total_values = [positive_values[i] + negative_values[i] for i in range(len(positive_values))]
    total_instances = np.float(sum(total_values))
    weighted_values = [v/total_instances for v in total_values]
    positives_dict = dict(zip(phonemes, positive_values))
    negatives_dict = dict(zip(phonemes, negative_values))
    total_dict = dict(zip(phonemes, total_values))
    weighted_dict = dict(zip(phonemes, weighted_values))
    return (total_dict, weighted_dict, positives_dict, negatives_dict)

In [213]:
def all_phonemes_empty_dict(phonemes):
    return dict(zip(phonemes, [0.0 for p in phonemes]))

In [214]:
def check_logids(lines_supervectors, lines_legendre, lines_dct):
    logids_supervectors = [l[0] for l in lines_supervectors]
    logids_legendre = [l[0] for l in lines_legendre]
    logids_dct = [l[0] for l in lines_dct]
    if not logids_supervectors == logids_legendre:
        raise Exception(phoneme + " logids doesn't match in supervectors vs legendre")
    if not logids_supervectors == logids_dct:
        raise Exception(phoneme + " logids doesn't match in supervectors vs dct")

In [215]:
def label_for(label):
    if label == "True":
        return 1
    elif label == "False":
        return -1

In [216]:
def compute_EER(tuples):
    labels = [t[1] for t in tuples]
    hyperplane_distances = [t[0] for t in tuples]
    fpr, tpr, _ = roc_curve(labels, hyperplane_distances)
    # eer is where fpr is equal to fnr
    eer = brentq(lambda x : 1. - x - interp1d(fpr, tpr)(x), 0., 1.)
    return eer

In [217]:
def main():
    factors_dict_legendre = dict(zip(factors, [all_phonemes_empty_dict(all_phonemes) for f in factors]))
    factors_dict_dct = dict(zip(factors, [all_phonemes_empty_dict(all_phonemes) for f in factors]))
    for factor in factors:
        for phoneme in all_phonemes:
            original_phoneme = phoneme
            if phoneme in ['g', 'y', 'd', 'b', 'n']:
                phoneme = phoneme + "_lowercase"
            supervectors_filename = supervectors_dir + phoneme
            legendre_filename = legendre_dir + phoneme
            dct_filename = dct_dir + phoneme
            output_file_legendre = output_dir_legendre + phoneme
            output_file_dct = output_dir_dct + phoneme

            with open(supervectors_filename, "r") as f_supervectors:
                lines_supervectors = f_supervectors.readlines()
                lines_supervectors = [l.split(" ") for l in lines_supervectors]
            with open(legendre_filename, "r") as f_legendre:
                lines_legendre = f_legendre.readlines()
                lines_legendre = [l.split(" ") for l in lines_legendre]
            with open(dct_filename, "r") as f_dct:
                lines_dct = f_dct.readlines()
                lines_dct = [l.split(" ") for l in lines_dct]

            check_logids(lines_supervectors, lines_legendre, lines_dct)
            with open(output_file_legendre, "w") as f_output_legendre:        
                phoneme_values_legendre = []
                for i in range(len(lines_supervectors)):
                    new_value = float(lines_supervectors[i][4]) + factor*float(lines_legendre[i][4])
                    new_line = lines_supervectors[i]
                    label = label_for(new_line[3])
                    new_tuple = (new_value, label)
                    new_line[4] = str(new_value)
                    new_line = " ".join(new_line) + "\n"
                    phoneme_values_legendre.append(new_tuple)
                    f_output_legendre.write(new_line)
            with open(output_file_dct, "w") as f_output_dct:
                phoneme_values_dct = []
                for i in range(len(lines_supervectors)):
                    new_value = float(lines_supervectors[i][4]) + factor*float(lines_dct[i][4])
                    new_line = lines_supervectors[i]
                    label = label_for(new_line[3])
                    new_tuple = (new_value, label)
                    new_line[4] = str(new_value)
                    new_line = " ".join(new_line) + "\n"
                    phoneme_values_dct.append(new_tuple)
                    f_output_dct.write(new_line)
                factors_dict_legendre[factor][original_phoneme] = compute_EER(phoneme_values_legendre)
                factors_dict_dct[factor][original_phoneme] = compute_EER(phoneme_values_dct)
    return factors_dict_legendre, factors_dict_dct

In [218]:
factors_dict_legendre, factors_dict_dct = main()

# Save to dataframe

In [238]:
csvs_dir = base_dir + "csvs"
csv_legendre_filename = base_dir + "legendre_eers_by_factor.csv"
csv_dct_filename = base_dir + "dct_eers_by_factor.csv"

In [239]:
def generate_dataframe(factors_dict, output_filename):
    original_phonemes_column = factors_dict[factors[0]].keys()
    phonemes_column = np.array(original_phonemes_column).reshape(-1,1)
    total_dict, weighted_dict, positives_dict, negatives_dict = load_positives_negatives_dict()
    n_positives = np.array([positives_dict[phoneme] for phoneme in original_phonemes_column]).reshape(-1,1)
    n_negatives = np.array([negatives_dict[phoneme] for phoneme in original_phonemes_column]).reshape(-1,1)
    n_total = np.array([total_dict[phoneme] for phoneme in original_phonemes_column]).reshape(-1,1)
    matrix = phonemes_column

    for factor in factors:
        factor_results = factors_dict[factor].values()
        factor_results = np.array(factor_results).reshape(-1,1)
        matrix = np.hstack((matrix, factor_results))

    matrix = np.hstack((matrix, n_positives))
    matrix = np.hstack((matrix, n_negatives))
    matrix = np.hstack((matrix, n_total))

    dataframe = pd.DataFrame(matrix[:, 1:], index=original_phonemes_column, columns=factors + ['n_positives', 'n_negatives', 'n_total'])
    dataframe = dataframe.astype(np.float)
    dataframe = dataframe.sort_values(by=['n_total'])
    dataframe.round(3).to_csv(path_or_buf=output_filename, index_label="Phonemes")
    print dataframe

In [240]:
generate_dataframe(factors_dict_legendre, csv_legendre_filename)
generate_dataframe(factors_dict_dct, csv_dct_filename)

         0.0       0.1       0.3       0.5       0.7       1.0  n_positives  \
Y   0.383562  0.383562  0.400000  0.400000  0.396226  0.339806         53.0   
c   0.439506  0.439506  0.441916  0.430556  0.429630  0.428571        405.0   
f   0.384164  0.360704  0.334311  0.300000  0.300000  0.300000        682.0   
x   0.164407  0.163399  0.156560  0.156863  0.158858  0.159322        590.0   
d   0.363519  0.359551  0.348315  0.351876  0.359551  0.359551        773.0   
G   0.162162  0.162162  0.163297  0.167963  0.171171  0.178315        222.0   
b   0.121519  0.123176  0.126582  0.136536  0.136364  0.145833        528.0   
g   0.234470  0.236842  0.228070  0.236842  0.252537  0.251409        887.0   
z   0.222222  0.217676  0.216931  0.219876  0.220662  0.217653        189.0   
w   0.150605  0.151247  0.154000  0.153673  0.153432  0.152086        743.0   
N   0.217344  0.209932  0.210757  0.212953  0.210487  0.206234        911.0   
B   0.205607  0.203457  0.198598  0.200000  0.199316

In [232]:
original_phonemes_column = factors_dict_legendre[factors[0]].keys()
phonemes_column = np.array(original_phonemes_column).reshape(-1,1)

In [233]:
total_dict, weighted_dict, positives_dict, negatives_dict = load_positives_negatives_dict()
n_positives = np.array([positives_dict[phoneme] for phoneme in original_phonemes_column]).reshape(-1,1)
n_negatives = np.array([negatives_dict[phoneme] for phoneme in original_phonemes_column]).reshape(-1,1)
n_total = np.array([total_dict[phoneme] for phoneme in original_phonemes_column]).reshape(-1,1)

In [234]:
matrix = phonemes_column

for factor in factors:
    factor_results = factors_dict_legendre[factor].values()
    factor_results = np.array(factor_results).reshape(-1,1)
    matrix = np.hstack((matrix, factor_results))

matrix = np.hstack((matrix, n_positives))
matrix = np.hstack((matrix, n_negatives))
matrix = np.hstack((matrix, n_total))

dataframe = pd.DataFrame(matrix[:, 1:], index=original_phonemes_column, columns=factors + ['n_positives', 'n_negatives', 'n_total'])
dataframe = dataframe.astype(np.float)
dataframe = dataframe.sort_values(by=['n_total'])
dataframe.round(3).to_csv(path_or_buf=csv_filename, index_label="Phonemes")
dataframe

Unnamed: 0,0.0,0.1,0.3,0.5,0.7,1.0,n_positives,n_negatives,n_total
Y,0.383562,0.383562,0.4,0.4,0.396226,0.339806,53.0,10.0,63.0
c,0.439506,0.439506,0.441916,0.430556,0.42963,0.428571,405.0,105.0,510.0
f,0.384164,0.360704,0.334311,0.3,0.3,0.3,682.0,10.0,692.0
x,0.164407,0.163399,0.15656,0.156863,0.158858,0.159322,590.0,153.0,743.0
d,0.363519,0.359551,0.348315,0.351876,0.359551,0.359551,773.0,89.0,862.0
G,0.162162,0.162162,0.163297,0.167963,0.171171,0.178315,222.0,643.0,865.0
b,0.121519,0.123176,0.126582,0.136536,0.136364,0.145833,528.0,395.0,923.0
g,0.23447,0.236842,0.22807,0.236842,0.252537,0.251409,887.0,114.0,1001.0
z,0.222222,0.217676,0.216931,0.219876,0.220662,0.217653,189.0,997.0,1186.0
w,0.150605,0.151247,0.154,0.153673,0.153432,0.152086,743.0,500.0,1243.0


# Step by step

Armar una matriz para el primer factor. Guardar en distintos CSVs

In [126]:
key = factors[0]

In [127]:
actual = factors_dict_legendre[key]

In [152]:
total_dict, weighted_dict, positives_dict, negatives_dict = load_positives_negatives_dict()

In [172]:
n_positives = np.array([positives_dict[phoneme] for phoneme in all_phonemes]).reshape(-1,1)
n_negatives = np.array([negatives_dict[phoneme] for phoneme in all_phonemes]).reshape(-1,1)
n_total = np.array([total_dict[phoneme] for phoneme in all_phonemes]).reshape(-1,1)

In [173]:
phonemes_column = np.array(actual.keys()).reshape(-1,1)
results = np.array(actual.values()).reshape(-1,1)
matrix = np.hstack((phonemes_column, results))
matrix = np.hstack((matrix, n_positives))
matrix = np.hstack((matrix, n_negatives))
matrix = np.hstack((matrix, n_total))

array([['rr', '0.321695412332', '53', '10', '63'],
       ['B', '0.205607476635', '682', '10', '692'],
       ['D', '0.181682429068', '405', '105', '510'],
       ['G', '0.162162162162', '590', '153', '743'],
       ['N', '0.217343578485', '222', '643', '865'],
       ['Y', '0.383561643836', '773', '89', '862'],
       ['a', '0.353310778153', '528', '395', '923'],
       ['c', '0.43950617284', '887', '114', '1001'],
       ['b', '0.121518987343', '189', '997', '1186'],
       ['e', '0.399827784157', '743', '500', '1243'],
       ['d', '0.363518758086', '911', '443', '1354'],
       ['g', '0.234469873891', '428', '1169', '1597'],
       ['f', '0.384164222874', '491', '1739', '2230'],
       ['i', '0.254442649436', '1948', '482', '2430'],
       ['k', '0.34074941452', '1657', '1055', '2712'],
       ['m', '0.153988868275', '920', '2009', '2929'],
       ['l', '0.279315263909', '2453', '574', '3027'],
       ['o', '0.410203527815', '1708', '1472', '3180'],
       ['n', '0.424369747898', '

In [134]:
actual.values()

[0.3216954123315411,
 0.20560747663513795,
 0.1816824290679321,
 0.16216216216220683,
 0.21734357848518115,
 0.3835616438356078,
 0.35331077815300704,
 0.4395061728395388,
 0.1215189873431145,
 0.39982778415655507,
 0.3635187580855961,
 0.23446987389070525,
 0.38416422287390034,
 0.25444264943604555,
 0.3407494145199063,
 0.15398886827481292,
 0.2793152639086352,
 0.41020352781546804,
 0.4243697478981574,
 0.29478672985874443,
 0.3254437869822485,
 0.31753630126056465,
 0.35369609856259693,
 0.2944228274956561,
 0.1506046170758523,
 0.3376280145358456,
 0.1644067796610168,
 0.2222222222222223]

In [132]:
base_matrix

array(['rr', 'B', 'D', 'G', 'N', 'Y', 'a', 'c', 'b', 'e', 'd', 'g', 'f',
       'i', 'k', 'm', 'l', 'o', 'n', 'p', 's', 'r', 'u', 't', 'w', 'y',
       'x', 'z'], 
      dtype='|S2')

In [None]:
def main(eer_degree_dir, csv_filename, plots_dir, weighted_average_dir, degree, alpha_values, alpha_values_str):
    baselines_dicc = load_baselines()
    results = []
    
    # Armo las distintas columnas para armar la matriz por degree
    for alpha in alpha_values_str:
        with open(eer_degree_dir + alpha, "r") as f:
            result = f.read()
        result = result.split("\n")
        result = [l.split(": ") for l in result]
        result = [r for r in result if len(r) == 2]
        if len(result) != 28:
            raise Exception("Phonemes are not complete")
        results.append(result)
    
    total_dict, weighted_dict, positives_dict, negatives_dict = load_positives_negatives_dict()
    base_matrix = np.array(results[0])
    phonemes = base_matrix[:, 0]
    n_positives = np.array([positives_dict[phoneme] for phoneme in phonemes]).reshape(-1,1)
    n_negatives = np.array([negatives_dict[phoneme] for phoneme in phonemes]).reshape(-1,1)
    n_total = np.array([total_dict[phoneme] for phoneme in phonemes]).reshape(-1,1)
    
    for i in range(1, len(results)):
        new_column = np.array(results[i])[:, 1].reshape(-1,1)
        base_matrix = np.hstack((base_matrix, new_column))
    
    ### PLOT WEIGHTED AVERAGES ###
    # COMPUTE BASELINE FOR DEGREE
    baseline = compute_weighted_average(baselines_dicc[degree]["baseline_ols"], weighted_dict)
    weighted_averages = []
    weighted_average_matrix = np.copy(base_matrix)
    for i in range(weighted_average_matrix.shape[0]):
        #weighted_average_matrix[i,1:] = weight_row(total_dict, weighted_average_matrix[i,0], weighted_average_matrix[i,1:])
        weighted_average_matrix[i,1:] = weighted_average_matrix[i,1:].astype(np.float) * weighted_dict[weighted_average_matrix[i,0]]
    for j in range(1, weighted_average_matrix.shape[1]):
        eers = weighted_average_matrix[:,j].astype(np.float)
        weighted_averages.append(np.sum(eers))
    plt.clf()
    min_index_weighted_averages = np.argmin(weighted_averages)
    min_average = weighted_averages[min_index_weighted_averages]
    min_alpha = alpha_values[min_index_weighted_averages]
    title_weighted_averages = "Grado " + str(degree) + ". Min alpha: " + str(min_alpha) + ", value: " + str(min_average)
    plt.plot(alpha_values, weighted_averages)
    plt.axhline(y= baseline, linestyle="--", color="black")
    plt.title(title_weighted_averages)
    weighted_average_plot_filename = weighted_average_dir + "weighted_average_plot"
    plt.savefig(weighted_average_plot_filename)
    plt.clf()
    
    weighted_average_series = pd.Series(weighted_averages, index=alpha_values)
    weighted_average_csv_filename = weighted_average_dir + "weighted_average.csv"
    weighted_average_series.round(3).to_csv(path=weighted_average_csv_filename, index_label="Phonemes")
    ##############################

    base_matrix = np.hstack((base_matrix, n_positives))
    base_matrix = np.hstack((base_matrix, n_negatives))
    base_matrix = np.hstack((base_matrix, n_total))
    
    dataframe = pd.DataFrame(base_matrix[:, 1:], index=phonemes, columns=alpha_values + ['n_positives', 'n_negatives', 'n_total'])
    
    for index, row in dataframe.iterrows():  
        xs = row.index[:len(alpha_values)]
        ys = np.array(row[:len(alpha_values)]).astype(np.float)
        baseline_numpy = baselines_dicc[degree]['baseline_numpy_legendre'][index]
        baseline = baselines_dicc[degree]['baseline_ols'][index]
        plt.semilogx(xs, list(ys))
        plt.axhline(y=baseline, linestyle="--", color="black")
        delta = (np.min(ys) - baseline) / baseline
        anotate(plt, xs, ys, delta, baseline)

        title = "Grado " + str(degree) + ". Phoneme: "+ index + ". Positives: " + str(row[-2]) + ", Negatives: " + str(row[-1])
        plt.title(title)
        if index in ['g', 'y', 'd', 'b', 'n']:
            index = index + "_lowercase"
        plot_filename = plots_dir + index
        plt.savefig(plot_filename)
        plt.clf()
        
    dataframe = dataframe.astype(np.float)
    dataframe = dataframe.sort_values(by=['n_total'])
    dataframe.round(3).to_csv(path_or_buf=csv_filename, index_label="Phonemes")