In [266]:
#!/usr/bin/env python
# coding: utf-8

'''
    Recursively walk through all folders and load the paths of the wpa#.txt and ca#.txt files
'''
import numpy as np
import os
import re

In [267]:
# Paths: every working directory hangs off the public data set root.
root = r'/home/littlewing/Projects/MachineLearningMicrosoftPetnica/QualificationRound/p2_publicDataSet/'
inputs = root + r'inputs/'      # one entry per data set to evaluate
outputs = root + r'outputs/'    # reference result files
data_set = root + r'set/'       # folders holding the ca#/wpa# files

In [268]:
# Minimum confidence at which an answer is counted as a predicted 'Yes'.
threshold = 0.7

# data_set_folder = input()
# curr_data_set = data_set + data_set_folder

# Raw string: \d and \. must reach the regex engine untouched; in a normal
# string literal they are invalid escape sequences and trigger a warning.
# Group 1 captures the id of a ca<id>.txt file, group 2 of a wpa<id>.txt file.
regex = r'ca(\d+)\.txt|wpa(\d+)\.txt'
re_obj = re.compile(regex)


In [269]:

'''
New BSD License

Copyright (c) 2007–2020 The scikit-learn developers.
All rights reserved.
'''

# Copied from sklearn.utils and modified.
def stable_cumsum(arr, axis=None, rtol=1e-05, atol=1e-08):
    """Return the cumulative sum of ``arr`` along ``axis``, accumulated in float64.

    ``rtol`` and ``atol`` are accepted for signature compatibility but are not
    used by this modified copy.
    """
    return np.cumsum(arr, axis=axis, dtype=np.float64)

# Copied from sklearn.metrics and modified.
def _binary_clf_curve(y_true, y_score, pos_label=None):
    """Count false and true positives at every distinct score threshold.

    Parameters
    ----------
    y_true : array-like
        Labels; entries equal to ``pos_label`` are treated as positives.
    y_score : array-like
        Scores; flattened and sorted in decreasing order.
    pos_label : optional
        Label of the positive class, ``1.`` when omitted.

    Returns
    -------
    fps, tps, thresholds : arrays
        Cumulative false-positive and true-positive counts at the last index of
        each run of equal scores, and the score found at that index.
    """
    positive = 1. if pos_label is None else pos_label

    labels = np.ravel(y_true) == positive
    scores = np.ravel(y_score)

    # Stable sort, then reversed, so that scores run from highest to lowest.
    order = np.argsort(scores, kind="mergesort")[::-1]
    scores, labels = scores[order], labels[order]

    # Positions where the sorted score changes, plus the final position.
    change_points = np.where(np.diff(scores))[0]
    threshold_idxs = np.r_[change_points, labels.size - 1]

    # Every sample has unit weight, so the false positives are simply the
    # number of samples seen so far minus the true positives.
    tps = stable_cumsum(labels * 1.)[threshold_idxs]
    fps = 1 + threshold_idxs - tps

    return fps, tps, scores[threshold_idxs]


def roc_curve(y_true, y_score, pos_label=None):
    """Compute the points of a ROC curve.

    Parameters are forwarded unchanged to ``_binary_clf_curve``.

    Returns
    -------
    fpr, tpr, thresholds : arrays
        False-positive rates, true-positive rates and the matching score
        thresholds. A leading point with zero counts and threshold
        ``thresholds[0] + 1`` is prepended. A rate whose final count is not
        positive is returned as an array of NaN.
    """
    fps, tps, thresholds = _binary_clf_curve(y_true, y_score, pos_label=pos_label)

    if len(fps) > 2:
        # Keep the end points and every point where the second difference of
        # either count is non-zero; the remaining points are dropped.
        bends = np.logical_or(np.diff(fps, 2), np.diff(tps, 2))
        keep = np.where(np.r_[True, bends, True])[0]
        fps, tps, thresholds = fps[keep], tps[keep], thresholds[keep]

    # Prepend the starting point of the curve.
    tps = np.r_[0, tps]
    fps = np.r_[0, fps]
    thresholds = np.r_[thresholds[0] + 1, thresholds]

    fpr = np.repeat(np.nan, fps.shape) if fps[-1] <= 0 else fps / fps[-1]
    tpr = np.repeat(np.nan, tps.shape) if tps[-1] <= 0 else tps / tps[-1]

    return fpr, tpr, thresholds




In [270]:
def TPR_FPR(tp, fp, p, n):
    """Return ``(tpr, fpr)`` computed as ``tp / p`` and ``fp / n``.

    Parameters
    ----------
    tp, fp : number
        True-positive and false-positive counts.
    p, n : number
        Totals used as denominators for ``tp`` and ``fp`` respectively.

    Returns
    -------
    tuple
        ``(tpr, fpr)``. A rate whose denominator is zero is undefined and is
        returned as NaN instead of raising ``ZeroDivisionError``.
    """
    undefined = float('nan')
    tpr = tp / p if p else undefined
    fpr = fp / n if n else undefined

    return (tpr, fpr)

def calc_eer(mapa_ca, mapa_wpa):
    """Estimate the equal error rate over the keys present in both mappings.

    Parameters
    ----------
    mapa_ca : dict
        Key -> correct answer; the value ``'Yes'`` is the positive class.
    mapa_wpa : dict
        Key -> score handed to ``roc_curve``.

    Returns
    -------
    The false-positive rate, rounded to 3 decimals, at the ROC point where
    ``|FNR - FPR|`` is smallest (NaN entries are ignored by the search).
    """
    # Only keys that exist in both dicts contribute a (label, score) pair.
    shared_keys = [key for key in mapa_ca if key in mapa_wpa]
    y_true = [1 if mapa_ca[key] == 'Yes' else 0 for key in shared_keys]
    y_score = [mapa_wpa[key] for key in shared_keys]

    fpr, tpr, _ = roc_curve(y_true, y_score, pos_label=1)

    # FNR is 1 - TPR; pick the point where it is closest to FPR.
    gap = np.absolute((1 - tpr) - fpr)
    closest = np.nanargmin(gap)

    return round(fpr[closest], 3)


In [271]:
def print_output(res_list):
    """Print a right-aligned table of result rows to stdout.

    Parameters
    ----------
    res_list : iterable
        Rows of exactly seven values: ``pq, nq, vq, TPR, FPR, EER, file``.
        The first six are printed in 5-character columns under a header line;
        the seventh follows after `` : `` in a 6-character column.
    """
    header_fmt = '{:>5}  {:>5}  {:>5}  {:>5}  {:>5}  {:>5}'
    row_fmt = '{:>5}  {:>5}  {:>5}  {:>5}  {:>5}  {:>5} : {:>6}'

    print(header_fmt.format('pq', 'nq', 'vq', 'TPR', 'FPR', 'EER'))

    for pq, nq, vq, TPR, FPR, EER, file in res_list:
        print(row_fmt.format(pq, nq, vq, TPR, FPR, EER, file))

def output_diff(output, expected_output):
    """Return the absolute per-column differences between two result tables.

    Parameters
    ----------
    output : list of rows
        Computed rows; every element but the last is numeric, the last element
        is the row label (the file name).
    expected_output : list of rows
        Reference rows with the same layout.

    Returns
    -------
    list of rows
        One row per row of ``output``: the absolute differences rounded to
        3 decimals, followed by the label of the ``output`` row.

    Notes
    -----
    Both tables are built from independent directory listings whose order is
    not guaranteed to agree, so rows are paired by label. When no reference row
    carries the same label, the row at the same position is used instead.
    """
    expected_by_label = {row[-1]: row for row in expected_output}
    diff_rows = []

    for i, row in enumerate(output):
        reference = expected_by_label.get(row[-1])
        if reference is None:
            # No row with this label: fall back to positional pairing.
            reference = expected_output[i]

        diff_row = [
            round(abs(float(row[col]) - float(reference[col])), 3)
            for col in range(len(row) - 1)
        ]
        diff_row.append(row[-1])
        diff_rows.append(diff_row)

    return diff_rows

In [272]:
def output(input_root):
    """Compute one evaluation row per data set listed in ``input_root``.

    Every entry of ``input_root`` names a data set: the part of the entry name
    before ``.txt`` is the folder looked up under the module-level ``data_set``
    path. That folder is walked recursively and the first line of every file
    whose name matches ``re_obj`` is read:

    * ``ca<id>.txt``  - the correct answer for question ``<id>``;
    * ``wpa<id>.txt`` - a percentage, stored as a fraction (value / 100).

    Files matching neither pattern are skipped. A question is "valid" when it
    has both a correct answer and a percentage; a valid question is predicted
    'Yes' when its fraction is at least the module-level ``threshold``.

    Parameters
    ----------
    input_root : str
        Directory whose entries name the data sets to evaluate.

    Returns
    -------
    list
        Rows ``[pq, nq, vq, TPR, FPR, EER, file]``: number of 'Yes' answers,
        number of 'No' answers, number of valid questions, the rates from
        ``TPR_FPR`` rounded to 3 decimals, the value from ``calc_eer`` and the
        entry name.

    Raises
    ------
    KeyError
        If a ``ca`` file holds an answer that is not a key of the counter dict.
    """
    results = []

    for input_file in os.listdir(input_root):
        curr_data_set = os.path.join(data_set, input_file.split('.txt')[0])

        questions = {'Yes': 0, 'No': 0, 'Valid': 0, 'Total': 0}
        TP, FP = 0, 0

        mapa_ca = dict()
        mapa_wpa = dict()

        valid_yes = 0
        valid_no = 0

        # Collect the content of the ca and wpa files.
        for root_, dir_, files_ in os.walk(curr_data_set):
            for item in files_:
                matches = re_obj.match(item)
                if matches is None:
                    # Unrelated file: ignore it instead of failing on None.
                    continue

                with open(os.path.join(root_, item), 'r') as f:
                    item_content = f.readline().strip('% \n')

                if matches.group(1) is None:
                    # wpa file: percentage -> fraction.
                    mapa_wpa[matches.group(2)] = float(item_content) / 100
                else:
                    # ca file: correct answer.
                    mapa_ca[matches.group(1)] = str(item_content)
                    questions['Total'] += 1
                    questions[item_content] += 1

        # Count the valid questions and the true/false positives among them.
        for k in mapa_ca:
            if k not in mapa_wpa:
                continue

            questions['Valid'] += 1
            correct_answer = mapa_ca[k]

            if correct_answer == 'Yes':
                valid_yes += 1
            elif correct_answer == 'No':
                valid_no += 1

            if mapa_wpa[k] >= threshold:
                # Predicted 'Yes'.
                if correct_answer == 'Yes':
                    TP += 1
                else:
                    FP += 1

        TPR, FPR = TPR_FPR(TP, FP, valid_yes, valid_no)
        TPR = round(TPR, 3)
        FPR = round(FPR, 3)
        EER = calc_eer(mapa_ca, mapa_wpa)

        results.append([questions['Yes'], questions['No'], questions['Valid'],
                        TPR, FPR, EER, input_file])

    return results

In [273]:
def expected_output(output_root):
    """Load the reference result rows stored in ``output_root``.

    The first line of every file in the directory is read as comma-separated
    values ``pq,nq,vq,TPR,FPR,EER``; surrounding spaces, commas and the newline
    are stripped first.

    Parameters
    ----------
    output_root : str
        Directory holding the reference files, with or without a trailing
        path separator.

    Returns
    -------
    list
        Rows ``[pq, nq, vq, TPR, FPR, EER, file_name]`` with the first three
        values as ``int`` and the next three as ``float``, in directory-listing
        order.
    """
    rows = []

    for output_file in os.listdir(output_root):
        # os.path.join keeps this working when output_root has no trailing '/'.
        curr_output_file = os.path.join(output_root, output_file)

        with open(curr_output_file, 'r') as f:
            fields = f.readline().strip(' ,\n').split(',')

        pq, nq, vq = int(fields[0]), int(fields[1]), int(fields[2])
        TPR, FPR, EER = float(fields[3]), float(fields[4]), float(fields[5])

        rows.append([pq, nq, vq, TPR, FPR, EER, output_file])

    return rows

In [274]:
# Results are stored under their own names: rebinding `output`,
# `expected_output` or `output_diff` would replace the functions with lists
# and make this cell fail when it is run a second time.
print('output:')
output_rows = output(inputs)
print_output(output_rows)

print('\nexpected output:')
expected_rows = expected_output(outputs)
print_output(expected_rows)

print('\ndiff:')
diff_rows = output_diff(output_rows, expected_rows)
print_output(diff_rows)

output:
   pq     nq     vq    TPR    FPR    EER
  420    320    539  0.843  0.083  0.108 : 10.txt
  206    401    574  0.472  0.087  0.168 :  3.txt
  539    462    886  0.772  0.121  0.158 :  5.txt
  141    109    250  0.745  0.165   0.22 :  1.txt
  439    643   1052  0.845  0.154  0.154 :  2.txt
  143    482    467  0.841   0.25  0.206 :  9.txt
  578    272    687  0.889   0.04  0.066 :  7.txt
  122    105    185  0.716  0.044  0.133 :  8.txt
  216    259    433  0.376  0.034  0.216 :  4.txt
  818    510   1125  0.548  0.074  0.233 :  6.txt

expected output:
   pq     nq     vq    TPR    FPR    EER
  420    320    539  0.843  0.083  0.104 : 10.txt
  206    401    574  0.472  0.087  0.162 :  3.txt
  539    462    886  0.772  0.121  0.162 :  5.txt
  141    109    250  0.745  0.165  0.215 :  1.txt
  439    643   1052  0.845  0.154  0.154 :  2.txt
  143    482    467  0.841   0.25  0.202 :  9.txt
  578    272    687  0.889   0.04  0.069 :  7.txt
  122    105    185  0.716  0.044  0.135 :