In [14]:
from catboost import CatBoostClassifier
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, plot_roc_curve, make_scorer, f1_score, roc_auc_score, det_curve
from sklearn import preprocessing
from scipy import stats
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate, LeaveOneGroupOut, PredefinedSplit, GridSearchCV
import matplotlib.pyplot as plt
import os
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import CategoricalNB
import json
from datetime import datetime as dt
from docx import Document
from docx.shared import Cm, Pt

%matplotlib inline

In [15]:
def update_dict(d, u):
    """Recursively merge mapping ``u`` into dictionary ``d`` and return ``d``.

    Values of ``u`` that are themselves mappings are merged key by key into the
    matching entry of ``d``; every other value simply overwrites the entry in
    ``d``. ``d`` is modified in place.

    Args:
        d: Dictionary that receives the update.
        u: Mapping holding the new (possibly nested) values.

    Returns:
        The same ``d`` object, after merging.
    """
    import collections.abc
    for k, v in u.items():
        if isinstance(v, collections.abc.Mapping):
            current = d.get(k)
            # A missing or non-mapping entry cannot be merged into, so the
            # nested values are merged into a fresh dict that replaces it.
            if not isinstance(current, collections.abc.Mapping):
                current = {}
            d[k] = update_dict(current, v)
        else:
            d[k] = v
    return d


def update_file_with_results(file_path, results_dict):
    """Merge ``results_dict`` into the JSON document stored at ``file_path``.

    The file is read, merged with ``results_dict`` through ``update_dict`` and
    written back to the same path with sorted keys and a two-space indent.

    Args:
        file_path: Path of an existing JSON file.
        results_dict: Mapping with the results to merge into the file content.
    """
    with open(file_path, 'r') as source:
        stored = json.load(source)

    merged = update_dict(stored, results_dict)

    with open(file_path, 'w') as target:
        json.dump(merged, target, sort_keys=True, indent=2)
        
        
def get_dict_with_results(json_path):
    """Load the JSON file at ``json_path`` and return its parsed content."""
    with open(json_path, 'r') as source:
        return json.load(source)

In [16]:
def eer(fpr, fnr, thresholds):
    """Locate the operating point where ``fpr`` and ``fnr`` are closest.

    Args:
        fpr: Array of false-positive rates, one per threshold.
        fnr: Array of false-negative rates, aligned with ``fpr``.
        thresholds: Indexable sequence of thresholds aligned with ``fpr``.

    Returns:
        Tuple ``(rate, threshold)``: the value of ``fpr`` and the threshold at
        the index minimising ``|fnr - fpr|`` (NaN gaps are ignored).
    """
    gap = np.absolute(fnr - fpr)
    closest = np.nanargmin(gap)
    return fpr[closest], thresholds[closest]


def auc_roc(fpr, tpr):
    """Return the area under the curve given by x = ``fpr`` and y = ``tpr``."""
    area = metrics.auc(x=fpr, y=tpr)
    return area


def confusion_matrix_thr(y_true, proba, threshold):
    """Binarise ``proba`` at ``threshold`` and count the confusion-matrix cells.

    A score strictly greater than ``threshold`` is predicted as label 1, any
    other score as label 0. The input ``proba`` is left untouched.

    Args:
        y_true: True labels (0 or 1).
        proba: Array-like of scores for label 1.
        threshold: Decision threshold applied to ``proba``.

    Returns:
        Tuple ``(tn, fp, fn, tp)``. With the index mapping used below, label 0
        plays the role of the positive class: ``tp`` counts true 0 predicted
        0, ``fn`` true 0 predicted 1, ``fp`` true 1 predicted 0 and ``tn``
        true 1 predicted 1.
    """
    # Build a fresh 0/1 vector in a single comparison: this neither overwrites
    # the caller's array nor re-examines values that were already binarised.
    predict = (np.asarray(proba) > threshold).astype(int)

    # Rows are true labels, columns are predicted labels, both ordered [0, 1].
    matr = metrics.confusion_matrix(y_true, predict, labels=[0, 1])

    tp = matr[0, 0]
    fp = matr[1, 0]
    fn = matr[0, 1]
    tn = matr[1, 1]

    return tn, fp, fn, tp


def calc_metrics(y_test, proba, thresholds):
    """Sweep ``thresholds`` and derive error rates, F-score, EER and AUC.

    For every threshold the confusion-matrix counts are obtained from
    ``confusion_matrix_thr`` and turned into FPR, TPR, FNR and F-score. A rate
    whose denominator is zero (NaN) is recorded as 1; the F-score is stored
    as computed and may therefore be NaN.

    Args:
        y_test: True labels.
        proba: Array-like of scores, thresholded independently per threshold.
        thresholds: Sequence of thresholds to evaluate.

    Returns:
        Dict with keys ``'FAR'`` (array of FPR values), ``'FRR'`` (array of FNR
        values), ``'F'`` (array of F-scores), ``'EER'``, ``'EER_thr'`` and
        ``'AUC-ROC'``.
    """
    far, tpr_curve, frr, f_scores = [], [], [], []

    # 0/0 is expected at the extreme thresholds and handled explicitly below,
    # so numpy's divide/invalid warnings would only be noise.
    with np.errstate(divide='ignore', invalid='ignore'):
        for thr in thresholds:
            # Hand over a private copy: the thresholding helper may overwrite
            # the array it receives.
            tn, fp, fn, tp = confusion_matrix_thr(y_test, np.array(proba), thr)

            fpr = fp / (tn + fp)
            tpr = tp / (tp + fn)
            fnr = fn / (tp + fn)

            far.append(1 if np.isnan(fpr) else fpr)
            tpr_curve.append(1 if np.isnan(tpr) else tpr)
            frr.append(1 if np.isnan(fnr) else fnr)
            f_scores.append(tp / (tp + 0.5 * (fn + fp)))

    # Convert once at the end instead of re-allocating with np.append per step.
    FPR = np.array(far, dtype=float)
    TPR = np.array(tpr_curve, dtype=float)
    FNR = np.array(frr, dtype=float)
    F_score = np.array(f_scores, dtype=float)

    EER, EER_thr = eer(fpr=FPR, fnr=FNR, thresholds=thresholds)
    AUC_ROC = auc_roc(fpr=FPR, tpr=TPR)

    return {'FAR': FPR,
            'FRR': FNR,
            'F': F_score,
            'EER': EER,
            'EER_thr': EER_thr,
            'AUC-ROC': AUC_ROC}


def iterate_over_cv_results(results):
    """Yield one flat record per cross-validation entry of ``results``.

    ``results`` is walked as data type -> window type -> window size -> model,
    then through ``['cross_validation']['valid_user']``. The top-level key
    ``'stub'`` is skipped.

    Yields:
        Dict with keys ``df_type``, ``window_type``, ``window_size``,
        ``model``, ``valid_user`` and ``accuracy`` (converted to a numpy array).
    """
    for df_type, by_window_type in results.items():
        if df_type == 'stub':
            continue

        for window_type, by_window_size in by_window_type.items():
            for window_size, by_model in by_window_size.items():
                for model, payload in by_model.items():
                    per_user = payload['cross_validation']['valid_user']
                    for valid_user, entry in per_user.items():
                        record = {'df_type': df_type,
                                  'window_type': window_type,
                                  'window_size': window_size,
                                  'model': model,
                                  'valid_user': valid_user}
                        record['accuracy'] = np.array(entry['accuracy'])
                        yield record

                        
def iterate_over_final_results(results):
    """Yield one flat record per final-validation (valid user, intruder) pair.

    ``results`` is walked as data type -> window type -> window size -> model,
    then through ``['final_validation']['valid_user']`` and each user's
    ``['extracted_user']`` mapping. The top-level key ``'stub'`` is skipped.

    Yields:
        Dict with keys ``df_type``, ``window_type``, ``window_size``,
        ``model``, ``valid_user``, ``intruder``, ``test`` (numpy array),
        ``proba`` (column 1 of the stored two-dimensional array) and ``time``
        (numpy array, or an empty list when the entry has no ``'time'`` key).
    """
    for df_type, by_window_type in results.items():
        if df_type == 'stub':
            continue

        for window_type, by_window_size in by_window_type.items():
            for window_size, by_model in by_window_size.items():
                for model, payload in by_model.items():
                    per_user = payload['final_validation']['valid_user']
                    for valid_user, user_entry in per_user.items():
                        for intruder, entry in user_entry['extracted_user'].items():
                            record = {'df_type': df_type,
                                      'window_type': window_type,
                                      'window_size': window_size,
                                      'model': model,
                                      'valid_user': valid_user,
                                      'intruder': intruder}
                            record['test'] = np.array(entry['test'])
                            record['proba'] = np.array(entry['proba'])[:, 1]
                            if 'time' in entry.keys():
                                record['time'] = np.array(entry['time'])
                            else:
                                record['time'] = []
                            yield record
            

def avg_accuracy(results):
    """Average the cross-validation accuracy per experiment configuration.

    Records from ``iterate_over_cv_results`` are grouped by
    ``(df_type, window_type, window_size, model)``; the accuracies of all
    valid users in a group are averaged together.

    Returns:
        Dict mapping each configuration tuple to ``{'accuracy': mean_value}``.
    """
    grouped = {}
    for res in iterate_over_cv_results(results):
        key = (res['df_type'], res['window_type'], res['window_size'], res['model'])
        bucket = grouped.setdefault(key, {'accuracy': []})
        bucket['accuracy'].append(res['accuracy'])

    return {key: {'accuracy': np.array(bucket['accuracy']).mean()}
            for key, bucket in grouped.items()}
          
    
def avg_common_metrics(results, thresholds):
    """Average EER and AUC-ROC per experiment configuration.

    Every record from ``iterate_over_final_results`` is scored with
    ``calc_metrics`` over ``thresholds``; the resulting EER and AUC-ROC values
    are grouped by ``(df_type, window_type, window_size, model)`` and averaged.

    Returns:
        Dict mapping each configuration tuple to
        ``{'EER': mean_eer, 'AUC-ROC': mean_auc}``.
    """
    grouped = {}
    for res in iterate_over_final_results(results):
        key = (res['df_type'], res['window_type'], res['window_size'], res['model'])
        bucket = grouped.setdefault(key, {'EER': [], 'AUC-ROC': []})

        scores = calc_metrics(res['test'], res['proba'], thresholds)

        bucket['EER'].append(scores['EER'])
        bucket['AUC-ROC'].append(scores['AUC-ROC'])

    return {key: {'EER': np.array(bucket['EER']).mean(),
                  'AUC-ROC': np.array(bucket['AUC-ROC']).mean()}
            for key, bucket in grouped.items()}

In [63]:
def add_columns_names(table, names, row_index = 0):
    """Write ``names`` left to right into the cells of row ``row_index``.

    Each name is converted with ``str`` and assigned to the ``text`` attribute
    of the cell in the matching column. Returns the same ``table``.
    """
    for column, name in enumerate(names):
        cell = table.rows[row_index].cells[column]
        cell.text = str(name)
    return table


def add_rows_names(table, names, col_index = 0):
    """Write ``names`` top to bottom into the cells of column ``col_index``.

    Each name is converted with ``str`` and assigned to the ``text`` attribute
    of the cell in the matching row. Returns the same ``table``.
    """
    for row, name in enumerate(names):
        cell = table.rows[row].cells[col_index]
        cell.text = str(name)
    return table


# Layout shared by every report table: two label columns followed by one
# column per model.
_REPORT_HEADER = ['Метрика', 'Размер окна, с', 'CatBoostClassifier', 'RandomForest', 'SVM-SVC', 'LogisticRegression']
_METRIC_LABEL_COLUMN = 0
_WINDOW_LABEL_COLUMN = 1
_MODEL_COLUMNS = {'CatBoost': 2, 'RandomForest': 3, 'SVC': 4, 'LogReg': 5}


def _save_metric_table(results, df_type, window_type, window_sizes, metric,
                       document_name, higher_is_better=True):
    """Build one metric table and save it as ``document_name + '.docx'``.

    The table has a header row, one row per entry of ``window_sizes`` (in the
    given order) and a final 'Лучший результат' row holding, per model column,
    the best value found among the rows above.

    Args:
        results: Dict keyed by ``(df_type, window_type, window_size, model)``
            whose values are dicts containing ``metric``.
        df_type: Data type to report; other entries of ``results`` are ignored.
        window_type: Window type to report; other entries are ignored.
        window_sizes: Window-size tags (e.g. ``'60s'``) defining the rows.
        metric: Key read from each result dict; also used as the row title.
        document_name: Output file name without the ``.docx`` extension.
        higher_is_better: Whether the best value is the maximum (``True``) or
            the minimum (``False``).
    """
    window_sizes = list(window_sizes)
    # Rows follow the order of `window_sizes`, so values always line up with
    # the labels written into the window column.
    row_of_window = {wnd: row for row, wnd in enumerate(window_sizes, start=1)}
    best_row = len(window_sizes) + 1

    word_document = Document()
    table = word_document.add_table(rows=best_row + 1, cols=len(_REPORT_HEADER))
    table.style = 'TableGrid'

    for col, title in enumerate(_REPORT_HEADER):
        table.rows[0].cells[col].text = title
    table.rows[1].cells[_METRIC_LABEL_COLUMN].text = str(metric)

    for wnd, row in row_of_window.items():
        table.rows[row].cells[_WINDOW_LABEL_COLUMN].text = str(wnd).replace('s', '')
    table.rows[best_row].cells[_WINDOW_LABEL_COLUMN].text = 'Лучший результат'

    best_by_column = {}
    for key, scores in results.items():
        if key[0] != df_type or key[1] != window_type:
            continue

        row = row_of_window.get(key[2])
        col = _MODEL_COLUMNS.get(key[3])
        if row is None or col is None:
            # Window size or model without a row/column in this report.
            continue

        value = scores[metric]
        table.rows[row].cells[col].text = str(round(value, 3))

        if value != value:
            # NaN never compares as better; keep it out of the best-value row.
            continue

        current = best_by_column.get(col)
        if current is None:
            best_by_column[col] = value
        elif higher_is_better and value > current:
            best_by_column[col] = value
        elif not higher_is_better and value < current:
            best_by_column[col] = value

    for col, value in best_by_column.items():
        table.rows[best_row].cells[col].text = str(round(value, 3))

    word_document.add_page_break()
    word_document.save(document_name + '.docx')


def generate_common_accuracy_tables(results, df_type, window_type, window_sizes):
    """Save the accuracy table for one (data type, window type) pair.

    Writes ``<df_type>_<window_type>.docx`` with one row per window size, one
    column per model and a final row with the highest accuracy per model.

    Args:
        results: Dict keyed by ``(df_type, window_type, window_size, model)``
            with values of the form ``{'accuracy': value}``.
        df_type: Data type to report.
        window_type: Window type to report.
        window_sizes: Window-size tags defining the table rows.
    """
    _save_metric_table(results, df_type, window_type, window_sizes,
                       metric='accuracy',
                       document_name='_'.join([df_type, window_type]))

    # The original table titled this row 'Accuracy' (capitalised); the helper
    # uses the metric key, so nothing else needs to change for other metrics.


def generate_common_metrics_tables(results, df_type, window_type, window_sizes):
    """Save the AUC-ROC and EER tables for one (data type, window type) pair.

    Writes ``<df_type>_<window_type>_AUC-ROC.docx`` and
    ``<df_type>_<window_type>_EER.docx``. The final row of each table holds
    the best value per model: the highest AUC-ROC and the lowest EER.

    Args:
        results: Dict keyed by ``(df_type, window_type, window_size, model)``
            with values containing ``'AUC-ROC'`` and ``'EER'``.
        df_type: Data type to report.
        window_type: Window type to report.
        window_sizes: Window-size tags defining the table rows.
    """
    for metr, higher_is_better in (('AUC-ROC', True), ('EER', False)):
        _save_metric_table(results, df_type, window_type, window_sizes,
                           metric=metr,
                           document_name='_'.join([df_type, window_type, metr]),
                           higher_is_better=higher_is_better)

In [40]:
# Folder with the per-data-type result files. Built with os.path.join so the
# separator matches the host OS (on Windows this is still '.\\_results').
RESULTS_PATH = os.path.join('.', '_results')

# Decision thresholds swept when computing FAR/FRR/EER: 0.0, 0.05, ..., 1.0.
THRESHOLDS = np.arange(0.0, 1.01, 0.05)

# 'wifi' results
DATA_TYPE = 'wifi'
RESULTS_FILE = DATA_TYPE + '_results.json'
wifi_results = get_dict_with_results(os.path.join(RESULTS_PATH, RESULTS_FILE))

# 'bt' results
DATA_TYPE = 'bt'
RESULTS_FILE = DATA_TYPE + '_results.json'
bt_results = get_dict_with_results(os.path.join(RESULTS_PATH, RESULTS_FILE))

# 'location' results are currently disabled:
# DATA_TYPE = 'location'
# RESULTS_FILE = DATA_TYPE + '_results.json'
# location_results = get_dict_with_results(os.path.join(RESULTS_PATH, RESULTS_FILE))

In [43]:
# Mean cross-validation accuracy per (data type, window type, window size, model).
# The variable names follow the '<data type>_metrics' pattern because the
# report-generation cell resolves them dynamically from DATA_TYPES.
wifi_metrics = avg_accuracy(wifi_results)
bt_metrics = avg_accuracy(bt_results)
# location_metrics = avg_accuracy(location_results)

In [44]:
# Mean EER and AUC-ROC per (data type, window type, window size, model),
# computed over THRESHOLDS. The variable names follow the
# '<data type>_common_metrics' pattern because the report-generation cell
# resolves them dynamically from DATA_TYPES.
wifi_common_metrics = avg_common_metrics(wifi_results, THRESHOLDS)
bt_common_metrics = avg_common_metrics(bt_results, THRESHOLDS)
# location_common_metrics = avg_common_metrics(location_results, THRESHOLDS)

In [45]:
# Report configuration: which data types, window types and window sizes get a
# table. The commented-out lines keep the full lists for reference.
# DATA_TYPES = ['wifi', 'bt', 'location']
DATA_TYPES = ['wifi', 'bt']
WINDOW_TYPES = ['rolling', 'sampling']
# WINDOWS = ['5s', '10s', '30s', '60s', '90s', '120s', '240s', '600s']
WINDOWS = ['60s', '90s', '120s']

In [62]:
# One accuracy document per (data type, window type). Each call already covers
# every window size in WINDOWS, so no per-window loop is needed. The metric
# dicts are looked up by name in the notebook namespace ('<data type>_metrics').
for d_t in DATA_TYPES:
    for wnd_t in WINDOW_TYPES:
        generate_common_accuracy_tables(globals()[d_t + '_metrics'], d_t, wnd_t, WINDOWS)

# One AUC-ROC and one EER document per (data type, window type), looked up as
# '<data type>_common_metrics'.
for d_t in DATA_TYPES:
    for wnd_t in WINDOW_TYPES:
        generate_common_metrics_tables(globals()[d_t + '_common_metrics'], d_t, wnd_t, WINDOWS)

{}
{'CatBoost': ['120s', 0.9642906996040824]}
{'CatBoost': ['120s', 0.9642906996040824], 'LogReg': ['120s', 0.9092073795637521]}
{'CatBoost': ['120s', 0.9642906996040824], 'LogReg': ['120s', 0.9092073795637521], 'RandomForest': ['120s', 0.9809506763791974]}
{'CatBoost': ['60s', 0.9576989773722036], 'LogReg': ['120s', 0.9092073795637521], 'RandomForest': ['120s', 0.9809506763791974]}
{'CatBoost': ['60s', 0.9576989773722036], 'LogReg': ['60s', 0.8984688196345694], 'RandomForest': ['120s', 0.9809506763791974]}
{'CatBoost': ['60s', 0.9576989773722036], 'LogReg': ['60s', 0.8984688196345694], 'RandomForest': ['60s', 0.9707212831502225]}
{'CatBoost': ['90s', 0.9640165597756015], 'LogReg': ['60s', 0.8984688196345694], 'RandomForest': ['60s', 0.9707212831502225]}
{'CatBoost': ['90s', 0.9640165597756015], 'LogReg': ['90s', 0.9070144801411496], 'RandomForest': ['60s', 0.9707212831502225]}
{'CatBoost': ['90s', 0.9640165597756015], 'LogReg': ['90s', 0.9070144801411496], 'RandomForest': ['90s', 0.97

{}
{'CatBoost': ['120s', 0.07168516147970695]}
{'CatBoost': ['120s', 0.07168516147970695], 'LogReg': ['120s', 0.1277135948138579]}
{'CatBoost': ['120s', 0.07168516147970695], 'LogReg': ['120s', 0.1277135948138579], 'RandomForest': ['120s', 0.04263398509585668]}
{'CatBoost': ['60s', 0.08194346950487771], 'LogReg': ['120s', 0.1277135948138579], 'RandomForest': ['120s', 0.04263398509585668]}
{'CatBoost': ['60s', 0.08194346950487771], 'LogReg': ['60s', 0.13728601898558934], 'RandomForest': ['120s', 0.04263398509585668]}
{'CatBoost': ['60s', 0.08194346950487771], 'LogReg': ['60s', 0.13728601898558934], 'RandomForest': ['60s', 0.06474705189382272]}
{'CatBoost': ['90s', 0.06923504647724625], 'LogReg': ['60s', 0.13728601898558934], 'RandomForest': ['60s', 0.06474705189382272]}
{'CatBoost': ['90s', 0.06923504647724625], 'LogReg': ['90s', 0.13289119183096418], 'RandomForest': ['60s', 0.06474705189382272]}
{'CatBoost': ['90s', 0.06923504647724625], 'LogReg': ['90s', 0.13289119183096418], 'RandomF

{'CatBoost': ['120s', 0.16285010756940915], 'LogReg': ['120s', 0.39883752784803256], 'RandomForest': ['120s', 0.1035084975238262]}
{'CatBoost': ['60s', 0.2415056009300803], 'LogReg': ['120s', 0.39883752784803256], 'RandomForest': ['120s', 0.1035084975238262]}
{'CatBoost': ['60s', 0.2415056009300803], 'LogReg': ['60s', 0.4282943184161751], 'RandomForest': ['120s', 0.1035084975238262]}
{'CatBoost': ['60s', 0.2415056009300803], 'LogReg': ['60s', 0.4282943184161751], 'RandomForest': ['60s', 0.12824692365706278]}
{'CatBoost': ['90s', 0.1799079219753021], 'LogReg': ['60s', 0.4282943184161751], 'RandomForest': ['60s', 0.12824692365706278]}
{'CatBoost': ['90s', 0.1799079219753021], 'LogReg': ['90s', 0.4062542824250298], 'RandomForest': ['60s', 0.12824692365706278]}
{'CatBoost': ['90s', 0.1799079219753021], 'LogReg': ['90s', 0.4062542824250298], 'RandomForest': ['90s', 0.10277003755656645]}
{'CatBoost': ['90s', 0.1799079219753021], 'LogReg': ['90s', 0.4062542824250298], 'RandomForest': ['90s', 

{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{'CatBoost': ['120s', 0.8618402441575137]}
{'CatBoost': ['120s', 0.8618402441575137], 'LogReg': ['120s', 0.7170804725531268]}
{'CatBoost': ['120s', 0.8618402441575137], 'LogReg': ['120s', 0.7170804725531268], 'RandomForest': ['120s', 0.8799620250588359]}
{'CatBoost': ['60s', 0.8191199189993269], 'LogReg': ['120s', 0.7170804725531268], 'RandomForest': ['120s', 0.8799620250588359]}
{'CatBoost': ['60s', 0.8191199189993269], 'LogReg': ['60s', 0.7170741268418472], 'RandomForest': ['120s', 0.8799620250588359]}
{'CatBoost': ['60s', 0.8191199189993269], 'LogReg': ['60s', 0.7170741268418472], 'RandomForest': ['60s', 0.8154799261677675]}
{'CatBoost': ['90s', 0.8405138278084341], 'LogReg': ['60s', 0.7170741268418472], 'RandomForest': ['60s', 0.8154799261677675]}
{'CatBoost': ['90s', 0.8405138278084341], 'LogReg': ['90s', 0.7211473071894697], 'RandomForest': ['60s', 0.8154799261677675]}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{'CatBoost': ['120s', 0.23362094949857726]}
{'CatBoo