In [1]:
from catboost import CatBoostClassifier
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, plot_roc_curve, make_scorer, f1_score, roc_auc_score
from sklearn import preprocessing
from scipy import stats
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import CategoricalNB
from sklearn.model_selection import cross_validate, LeaveOneGroupOut, PredefinedSplit, GridSearchCV
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from datetime import datetime as dt
import os
import json

%matplotlib inline

In [2]:
def concat_dataframes(path, df_type):
    """Load every per-user CSV whose file name contains ``df_type`` and stack them.

    ``path`` must contain one sub-directory per user. The integer user id is
    taken from the part of the directory name that follows the first
    underscore (``int(name.split('_')[1])``).

    Parameters
    ----------
    path : str
        Directory holding the per-user sub-directories.
    df_type : str
        Substring a CSV file name must contain to be loaded. For every value
        other than ``'broadcasts'`` the ``timestamp`` column is dropped.

    Returns
    -------
    pandas.DataFrame
        All matching files concatenated with a fresh ``0..n-1`` index and an
        added ``user`` column holding the integer user id.

    Raises
    ------
    ValueError
        If no file matching ``df_type`` is found under ``path``.
    """
    frames = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order of the result reproducible from run to run.
    for user in sorted(os.listdir(path)):
        user_dir = os.path.join(path, user)
        if not os.path.isdir(user_dir):
            # Stray files next to the user folders would make listdir() fail.
            continue

        for file in sorted(os.listdir(user_dir)):
            if df_type not in file:
                continue

            df = pd.read_csv(os.path.join(user_dir, file))

            if df_type != 'broadcasts':
                df = df.drop(["timestamp"], axis=1)

            df["user"] = int(user.split('_')[1])
            frames.append(df)

    if not frames:
        raise ValueError(
            "No files matching '{}' were found under '{}'".format(df_type, path)
        )

    return pd.concat(frames, ignore_index=True)


def get_broadcasts_dataframe(data_path, window_type, window_size):
    """Load the broadcasts frame and prepare the matching results file.

    The data is read from ``<data_path>/<window_type>/<window_size>``.

    Returns
    -------
    tuple
        ``(dataframe, results_file_path)``.
    """
    broadcasts_dir = os.path.join(data_path, window_type, window_size)
    broadcasts_df = concat_dataframes(broadcasts_dir, 'broadcasts')
    results_path = create_file_for_results('broadcasts')
    return broadcasts_df, results_path


def get_broadcasts_list(file):
    """Return the lines of the text file ``file`` with newline characters removed."""
    with open(file, 'r') as handle:
        return [row.replace('\n', '') for row in handle]


def create_file_for_results(data_type):
    """Make sure the JSON results file for ``data_type`` exists and return its path.

    The file lives in a ``_results`` folder inside the current working
    directory and is named ``<data_type>_results.json``. A missing file is
    initialised with ``{"stub": null}``; an existing file is left untouched.

    Parameters
    ----------
    data_type : str
        Prefix of the results file name.

    Returns
    -------
    str
        Path of the results file, relative to the working directory.
    """
    # Built with os.path.join instead of a hard-coded '.\\_results': the
    # backslash is a path separator only on Windows, elsewhere it would create
    # a directory literally named '.\_results'.
    res_folder = os.path.join('.', '_results')
    os.makedirs(res_folder, exist_ok=True)

    file = os.path.join(res_folder, data_type + '_results.json')
    if not os.path.exists(file):
        with open(file, 'w') as f:
            json.dump({'stub': None}, f)

    return file


def update_file_with_results(file_path, results_dict):
    """Deep-merge ``results_dict`` into the JSON document stored at ``file_path``.

    Nested mappings are merged key by key; any other value replaces what was
    stored before. The file is rewritten with sorted keys and 2-space indent.

    Parameters
    ----------
    file_path : str
        Path of an existing JSON file whose top level is an object.
    results_dict : Mapping
        Possibly nested results to merge in.

    Raises
    ------
    TypeError
        If the merged document is not JSON serialisable. The file on disk is
        left unchanged in that case.
    """
    import collections.abc

    def update(d, u):
        for k, v in u.items():
            if isinstance(v, collections.abc.Mapping):
                existing = d.get(k)
                # A stored non-mapping (e.g. the initial ``"stub": null``)
                # cannot be merged into; start from an empty dict instead.
                if not isinstance(existing, collections.abc.MutableMapping):
                    existing = {}
                d[k] = update(existing, v)
            else:
                d[k] = v
        return d

    with open(file_path, 'r') as f:
        res = json.load(f)

    res = update(res, results_dict)

    # Serialise before opening for writing: opening with 'w' truncates the
    # file, so a serialisation error would otherwise wipe earlier results.
    payload = json.dumps(res, sort_keys=True, indent=2)

    with open(file_path, 'w') as f:
        f.write(payload)
        
        
def get_dict_with_results(json_path):
    """Parse the JSON file at ``json_path`` and return the decoded object."""
    with open(json_path, 'r') as handle:
        return json.load(handle)


def get_cv_split(X, y, group_labels, valid_user_label):
    """Build a ``test_fold`` array for ``PredefinedSplit`` from group labels.

    Samples whose group equals ``valid_user_label`` get ``-1``, which
    ``PredefinedSplit`` treats as "never in a test set". Every other group
    becomes its own test fold, numbered ``0, 1, 2, ...`` in ascending order of
    the group label. This is the order in which ``LeaveOneGroupOut`` holds
    groups out.

    The membership test uses every sample of a group, so groups may be
    non-contiguous and may contain a single sample.

    Parameters
    ----------
    X, y
        Unused; kept so existing callers keep working.
    group_labels : array-like of shape (n_samples,)
        Group (original user id) of every sample.
    valid_user_label
        Group that must always stay in the training part.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_samples,)`` with the fold index per sample.

    Raises
    ------
    ValueError
        If fewer than two distinct groups are present (same condition under
        which ``LeaveOneGroupOut`` refuses to split).
    """
    groups = np.asarray(group_labels)
    unique_groups = np.unique(groups)
    if unique_groups.shape[0] < 2:
        raise ValueError(
            "The groups parameter contains fewer than 2 unique groups (%s). "
            "At least 2 are required." % unique_groups
        )

    predefined_split_array = np.zeros(groups.shape[0])
    next_fold = 0
    for group in unique_groups:
        members = groups == group
        if group == valid_user_label:
            predefined_split_array[members] = -1
        else:
            predefined_split_array[members] = next_fold
            next_fold += 1
    return predefined_split_array


def fit_label_encoder(broadcasts_list):
    """Return a new ``LabelEncoder`` fitted on ``broadcasts_list``."""
    # LabelEncoder.fit() returns the fitted encoder itself.
    return LabelEncoder().fit(broadcasts_list)


def prepare_dataset(df, user):
    """Turn ``df`` into ``(X, y, group_labels)`` for a one-vs-rest task on ``user``.

    Works on a copy of ``df``. The ``user`` column is relabelled by
    ``split_users_into_two_classes``; ``labels`` is returned as the group
    array; ``labels`` and ``timestamp`` are then dropped. ``y`` is the last
    remaining column and ``X`` is everything before it.
    """
    binary_df = split_users_into_two_classes(df.copy(), user)

    groups = binary_df.labels.to_numpy().copy()
    matrix = binary_df.drop(['labels', 'timestamp'], axis=1).to_numpy().copy()

    features, target = matrix[:, :-1], matrix[:, -1]
    return features, target, groups


def split_users_into_two_classes(df, valid_user_label):
    """Relabel ``df["user"]`` in place: 1 for ``valid_user_label``, 0 otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``user`` column; it is modified in place.
    valid_user_label
        User id that becomes the positive class.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    # Take the mask before writing anything. Relabelling the others to 0 first
    # and comparing afterwards would turn every row into class 1 whenever the
    # valid user's own id is 0.
    is_valid = df["user"] == valid_user_label
    df.loc[~is_valid, "user"] = 0
    df.loc[is_valid, "user"] = 1
    return df


def generate_train_dataset(df, user, ex_user, is_SVM = False):
    """Build a class-balanced, shuffled training set and the held-out frames.

    Steps, all performed on copies of ``df``:

    1. every row labelled ``ex_user`` is moved to the held-out list;
    2. for each remaining label, ``int(0.25 * n - 1)`` randomly chosen rows
       are moved to the held-out list as well;
    3. the rest is relabelled with ``split_users_into_two_classes``;
    4. the larger class is randomly down-sampled to the smaller one's size;
    5. with ``is_SVM`` the negative class is relabelled from 0 to -1;
    6. ``labels`` and ``timestamp`` are dropped and the rows are shuffled.

    No random seed is set here, so the result differs between runs.

    Returns
    -------
    tuple
        ``(X, y, df_for_test)``: all columns but the last, the last column,
        and the list of held-out frames (``ex_user`` rows first).
    """
    remaining = df.copy()

    extracted_rows = remaining[remaining.labels == ex_user].copy()
    held_out = [extracted_rows]
    remaining = remaining.drop(extracted_rows.index, axis=0)

    for label in remaining.labels.unique():
        if label == ex_user:
            continue
        label_rows = remaining[remaining.labels == label]
        n_holdout = int((0.25 * label_rows.shape[0]) - 1)
        holdout_rows = label_rows.sample(n_holdout).copy()
        held_out.append(holdout_rows)
        remaining = remaining.drop(holdout_rows.index, axis=0)

    remaining = split_users_into_two_classes(remaining.copy(), user)

    genuine_rows = remaining[remaining.user == 1]
    intruder_rows = remaining[remaining.user != 1]
    surplus = intruder_rows.shape[0] - genuine_rows.shape[0]

    # Down-sample whichever class is larger so both end up the same size.
    if surplus > 0:
        remaining = remaining.drop(intruder_rows.sample(surplus).index)
    else:
        remaining = remaining.drop(genuine_rows.sample(-surplus).index)

    if is_SVM:
        remaining.loc[remaining.user == 0, 'user'] = -1

    matrix = remaining.drop(["labels", 'timestamp'], axis=1).to_numpy().copy()
    np.random.shuffle(matrix)

    return matrix[:, 0:-1], matrix[:, -1], held_out


def generate_test_dataset(df_list, user, ex_user, is_SVM = False):
    """Assemble a test set with equal parts valid user, extracted user and others.

    The frames in ``df_list`` are concatenated. The same number of rows is
    then sampled at random from three pools: rows labelled ``user``, rows
    labelled ``ex_user``, and all remaining rows. That number is the smallest
    of the three pool sizes. ``user`` is rewritten to 1 for the valid user and
    to 0 otherwise (-1 when ``is_SVM``). Summary counts are printed.

    Returns
    -------
    tuple
        ``(X_test, y_test, timestamps)``: the columns between the first and
        the last one, the last column as a list of ints, and the first column.
    """
    pool = pd.concat(df_list)

    n_valid = pool[pool.labels == user].shape[0]
    n_extracted = pool[pool.labels == ex_user].shape[0]
    n_others = sum(
        pool[pool.labels == label].shape[0]
        for label in pool.labels.unique()
        if label != user and label != ex_user
    )

    part_size = min(n_valid, n_extracted, n_others)

    # Sampling order (valid, extracted, others) is kept as in the parts list.
    sampled_parts = [
        pool[pool.labels == user].sample(part_size).copy(),
        pool[pool.labels == ex_user].sample(part_size).copy(),
        pool[~pool.labels.isin([user, ex_user])].sample(part_size).copy(),
    ]
    test_df = pd.concat(sampled_parts)

    negative_label = -1 if is_SVM else 0
    test_df.loc[test_df.labels == user, "user"] = 1
    test_df.loc[test_df.labels != user, "user"] = negative_label

    print("True: ", test_df[test_df.user == 1].shape)
    print("Shape: ", test_df.shape)
    for label in test_df.labels.unique():
        print("Count ", label, ": ", test_df[test_df.labels == label].shape)

    matrix = test_df.drop("labels", axis=1).to_numpy().copy()

    timestamps = matrix[:, 0].copy()
    X_test = matrix[:, 1:-1].copy()
    y_test = matrix[:, -1].copy()

    return X_test, [int(value) for value in y_test], timestamps

In [3]:
def broadcasts_model_cross_validation(results_file, model, df, model_tag, df_type, window_type, window_size):
    """Cross-validate ``model`` once per user and store the fold accuracies.

    For each distinct value of ``df.labels`` that user is treated as the valid
    user: the dataset comes from ``prepare_dataset``, the folds from
    ``get_cv_split`` wrapped in ``PredefinedSplit``, and accuracy is computed
    with ``cross_validate``. The per-fold accuracies are merged into
    ``results_file`` and summary statistics are printed.

    ``window_type`` and ``window_size`` are accepted but not used; results are
    stored under the literal keys ``"fixed"`` / ``"fixed"``.
    """
    separator = "--------------------------------------------------------------------------------"

    for user in df.labels.unique():
        print("Valid User: ", user)
        print(separator)

        X, y, group_labels = prepare_dataset(df, user)
        test_fold = get_cv_split(X, y, group_labels, user)

        cv_results = cross_validate(
            model, X, y,
            scoring='accuracy',
            cv=PredefinedSplit(test_fold=test_fold),
            n_jobs=-1,
        )
        accuracy = cv_results['test_score']

        # Wrap the payload from the innermost key outwards.
        results = {"accuracy": accuracy.tolist()}
        for key in (str(user), "valid_user", "cross_validation", model_tag, "fixed", "fixed", df_type):
            results = {key: results}

        update_file_with_results(results_file, results)

        print("CV accuracy list: ", accuracy)
        print("CV mean accuracy: ", np.mean(accuracy))
        print("CV min accuracy: ", np.min(accuracy))
        print("CV max accuracy: ", np.max(accuracy))

        print(separator)
        
        
def broadcasts_model_final_validation(results_file, model, df, model_tag, df_type, window_type, window_size, is_SVM = False):
    """Train and test ``model`` for every (valid user, extracted user) pair.

    For each ordered pair of distinct values in ``df.labels`` the model is
    fitted on ``generate_train_dataset`` and evaluated on
    ``generate_test_dataset``. True labels, predictions, scores and timestamps
    are merged into ``results_file`` and the accuracy is printed. Scores come
    from ``decision_function`` when ``is_SVM`` is set, otherwise from
    ``predict_proba``.

    ``window_type`` and ``window_size`` are accepted but not used; results are
    stored under the literal keys ``"fixed"`` / ``"fixed"``.
    """
    separator = "--------------------------------------------------------------------------------"

    for user in df.labels.unique():
        print("Valid User: ", user)
        print(separator)

        for ex_user in df.labels.unique():
            if ex_user == user:
                continue

            X, y, df_for_test = generate_train_dataset(df, user, ex_user, is_SVM)
            model.fit(X, y)

            X_test, y_test, timestamps = generate_test_dataset(df_for_test, user, ex_user, is_SVM)

            predict = model.predict(X_test)
            proba = model.decision_function(X_test) if is_SVM else model.predict_proba(X_test)

            # Wrap the payload from the innermost key outwards.
            results = {
                "test": y_test,
                "predict": predict.tolist(),
                "proba": proba.tolist(),
                "time": timestamps.tolist()
            }
            key_path = (
                str(ex_user), "extracted_user",
                str(user), "valid_user",
                "final_validation", model_tag,
                "fixed", "fixed", df_type,
            )
            for key in key_path:
                results = {key: results}

            update_file_with_results(results_file, results)

            print("Valid user = ", user, ", Extracted user = ", ex_user, "accuracy = ", accuracy_score(y_test, predict))
            print(separator)

### Learning settings
### ***

In [4]:
# Root of the extracted features. The loader reads
# <DATA_PATH>/<WINDOW_TYPE>/<WINDOW_SIZE>/<user folder>/<csv files>.
# Built with os.path.join so the path also resolves where '\\' is not a
# path separator; on Windows the value is unchanged.
DATA_PATH = os.path.join('..', '..', 'scripts', '_features_all')

# Key under which results are stored in the results file.
DATA_TYPE = "broadcasts"

# Sub-folders of DATA_PATH selecting the feature window to load.
WINDOW_TYPE = "rolling"
WINDOW_SIZE = "60s"

# Text file with one broadcast name per line, used to fit the label encoder.
BROADCASTS_LIST_FILE = "broadcasts.list"

### ***

In [5]:
# Load every user's broadcasts CSVs into one frame and make sure the JSON
# results file exists; RESULTS_FILE is the path of that file.
df, RESULTS_FILE = get_broadcasts_dataframe(DATA_PATH, WINDOW_TYPE, WINDOW_SIZE)

In [6]:
# Fit the label encoder on the lines of BROADCASTS_LIST_FILE.
broadcasts_list = get_broadcasts_list(BROADCASTS_LIST_FILE)
le = fit_label_encoder(broadcasts_list)

In [7]:
# Replace the `action` values with the integer codes of the fitted encoder.
# NOTE(review): this overwrites `action` in place, so re-running the cell
# without reloading `df` would feed already-encoded values to `le` - confirm.
df['action'] = le.transform(df.action)
# Keep the original user id in `labels`: the dataset helpers rewrite `user`
# as a binary target on copies of the frame and use `labels` for grouping.
df['labels'] = df['user']

## CatBoostClassifier CV

In [8]:
# CatBoost hyper-parameters for the cross-validation run.
model_params = dict(
    iterations=100,
    depth=6,
    loss_function='Logloss',
    l2_leaf_reg=1,
    leaf_estimation_iterations=5,
    logging_level='Silent',
)

model = CatBoostClassifier(**model_params)
broadcasts_model_cross_validation(
    RESULTS_FILE, model, df, "CatBoost", DATA_TYPE, WINDOW_TYPE, WINDOW_SIZE
)

Valid User:  1
--------------------------------------------------------------------------------
CV accuracy list:  [1.         1.         0.97938658 1.         1.         1.
 1.        ]
CV mean accuracy:  0.9970552263794721
CV min accuracy:  0.9793865846563046
CV max accuracy:  1.0
--------------------------------------------------------------------------------
Valid User:  2
--------------------------------------------------------------------------------
CV accuracy list:  [1. 1. 1. 1. 1. 1. 1.]
CV mean accuracy:  1.0
CV min accuracy:  1.0
CV max accuracy:  1.0
--------------------------------------------------------------------------------
Valid User:  3
--------------------------------------------------------------------------------
CV accuracy list:  [1. 1. 1. 1. 1. 1. 1.]
CV mean accuracy:  1.0
CV min accuracy:  1.0
CV max accuracy:  1.0
--------------------------------------------------------------------------------
Valid User:  4
------------------------------------------------

## CatBoostClassifier Final Validation

In [9]:
# CatBoost hyper-parameters for the final validation run.
model_params = dict(
    iterations=100,
    depth=6,
    loss_function='Logloss',
    l2_leaf_reg=1,
    leaf_estimation_iterations=5,
    logging_level='Silent',
)

model = CatBoostClassifier(**model_params)
broadcasts_model_final_validation(
    RESULTS_FILE, model, df, "CatBoost", DATA_TYPE, WINDOW_TYPE, WINDOW_SIZE
)

Valid User:  1
--------------------------------------------------------------------------------
True:  (3471, 4)
Shape:  (10413, 4)
Count  1 :  (3471, 4)
Count  2 :  (3471, 4)
Count  5 :  (1169, 4)
Count  6 :  (1122, 4)
Count  8 :  (192, 4)
Count  4 :  (582, 4)
Count  3 :  (277, 4)
Count  7 :  (129, 4)
Valid user =  1 , Extracted user =  2 accuracy =  0.46979736867377314
--------------------------------------------------------------------------------
True:  (3471, 4)
Shape:  (10413, 4)
Count  1 :  (3471, 4)
Count  3 :  (3471, 4)
Count  6 :  (1232, 4)
Count  4 :  (573, 4)
Count  5 :  (1222, 4)
Count  8 :  (225, 4)
Count  2 :  (95, 4)
Count  7 :  (124, 4)
Valid user =  1 , Extracted user =  3 accuracy =  0.4988956112551618
--------------------------------------------------------------------------------
True:  (3471, 4)
Shape:  (10413, 4)
Count  1 :  (3471, 4)
Count  4 :  (3471, 4)
Count  3 :  (285, 4)
Count  6 :  (1333, 4)
Count  7 :  (148, 4)
Count  5 :  (1347, 4)
Count  2 :  (143, 4)
C

True:  (4387, 4)
Shape:  (13161, 4)
Count  4 :  (4387, 4)
Count  2 :  (4387, 4)
Count  3 :  (357, 4)
Count  7 :  (182, 4)
Count  6 :  (1566, 4)
Count  5 :  (1556, 4)
Count  1 :  (468, 4)
Count  8 :  (258, 4)
Valid user =  4 , Extracted user =  2 accuracy =  0.4706329306283717
--------------------------------------------------------------------------------
True:  (6014, 4)
Shape:  (18042, 4)
Count  4 :  (6014, 4)
Count  3 :  (6014, 4)
Count  1 :  (664, 4)
Count  5 :  (2313, 4)
Count  6 :  (2225, 4)
Count  8 :  (352, 4)
Count  7 :  (229, 4)
Count  2 :  (231, 4)
Valid user =  4 , Extracted user =  3 accuracy =  0.46641170601928833
--------------------------------------------------------------------------------
True:  (6014, 4)
Shape:  (18042, 4)
Count  4 :  (6014, 4)
Count  5 :  (6014, 4)
Count  6 :  (3181, 4)
Count  1 :  (952, 4)
Count  8 :  (557, 4)
Count  2 :  (312, 4)
Count  3 :  (679, 4)
Count  7 :  (333, 4)
Valid user =  4 , Extracted user =  5 accuracy =  0.5504932934264494
-------

Valid user =  7 , Extracted user =  3 accuracy =  0.5223179326546593
--------------------------------------------------------------------------------
True:  (1277, 4)
Shape:  (3831, 4)
Count  7 :  (1277, 4)
Count  4 :  (1277, 4)
Count  5 :  (453, 4)
Count  3 :  (106, 4)
Count  6 :  (450, 4)
Count  2 :  (47, 4)
Count  8 :  (80, 4)
Count  1 :  (141, 4)
Valid user =  7 , Extracted user =  4 accuracy =  0.6570086139389193
--------------------------------------------------------------------------------
True:  (1277, 4)
Shape:  (3831, 4)
Count  7 :  (1277, 4)
Count  5 :  (1277, 4)
Count  1 :  (177, 4)
Count  2 :  (47, 4)
Count  6 :  (547, 4)
Count  4 :  (273, 4)
Count  3 :  (138, 4)
Count  8 :  (95, 4)
Valid user =  7 , Extracted user =  5 accuracy =  0.5371965544244323
--------------------------------------------------------------------------------
True:  (1277, 4)
Shape:  (3831, 4)
Count  7 :  (1277, 4)
Count  6 :  (1277, 4)
Count  5 :  (564, 4)
Count  2 :  (42, 4)
Count  3 :  (129, 4)
Cou

## CategoricalNB CV

In [10]:
# min_categories is set to the number of classes the label encoder knows.
model_params = dict(min_categories=len(le.classes_))

model = CategoricalNB(**model_params)
broadcasts_model_cross_validation(
    RESULTS_FILE, model, df, "NB", DATA_TYPE, WINDOW_TYPE, WINDOW_SIZE
)

Valid User:  1
--------------------------------------------------------------------------------
CV accuracy list:  [1.         1.         0.97564625 0.99591682 1.         1.
 0.99925271]
CV mean accuracy:  0.9958308253055019
CV min accuracy:  0.9756462471947469
CV max accuracy:  1.0
--------------------------------------------------------------------------------
Valid User:  2
--------------------------------------------------------------------------------
CV accuracy list:  [1.        1.        1.        1.        1.        1.        0.9997509]
CV mean accuracy:  0.9999644147109584
CV min accuracy:  0.9997509029767094
CV max accuracy:  1.0
--------------------------------------------------------------------------------
Valid User:  3
--------------------------------------------------------------------------------
CV accuracy list:  [0.999568   0.99680875 0.99912725 0.99981057 1.         0.99687195
 0.99937726]
CV mean accuracy:  0.9987948268648698
CV min accuracy:  0.9968087531342603


## CategoricalNB Final Validation

In [11]:
# min_categories is set to the number of classes the label encoder knows.
model_params = dict(min_categories=len(le.classes_))

model = CategoricalNB(**model_params)
broadcasts_model_final_validation(
    RESULTS_FILE, model, df, "NB", DATA_TYPE, WINDOW_TYPE, WINDOW_SIZE
)

Valid User:  1
--------------------------------------------------------------------------------
True:  (3471, 4)
Shape:  (10413, 4)
Count  1 :  (3471, 4)
Count  2 :  (3471, 4)
Count  6 :  (1200, 4)
Count  5 :  (1133, 4)
Count  4 :  (573, 4)
Count  8 :  (187, 4)
Count  3 :  (253, 4)
Count  7 :  (125, 4)
Valid user =  1 , Extracted user =  2 accuracy =  0.480745222318256
--------------------------------------------------------------------------------
True:  (3471, 4)
Shape:  (10413, 4)
Count  1 :  (3471, 4)
Count  3 :  (3471, 4)
Count  5 :  (1196, 4)
Count  4 :  (614, 4)
Count  7 :  (131, 4)
Count  2 :  (107, 4)
Count  6 :  (1209, 4)
Count  8 :  (214, 4)
Valid user =  1 , Extracted user =  3 accuracy =  0.4985114760395659
--------------------------------------------------------------------------------
True:  (3471, 4)
Shape:  (10413, 4)
Count  1 :  (3471, 4)
Count  4 :  (3471, 4)
Count  6 :  (1262, 4)
Count  8 :  (203, 4)
Count  3 :  (292, 4)
Count  7 :  (155, 4)
Count  5 :  (1421, 4)
Co

Valid user =  4 , Extracted user =  2 accuracy =  0.5401565230605577
--------------------------------------------------------------------------------
True:  (6014, 4)
Shape:  (18042, 4)
Count  4 :  (6014, 4)
Count  3 :  (6014, 4)
Count  7 :  (251, 4)
Count  5 :  (2235, 4)
Count  6 :  (2272, 4)
Count  1 :  (649, 4)
Count  2 :  (207, 4)
Count  8 :  (400, 4)
Valid user =  4 , Extracted user =  3 accuracy =  0.48647600044340983
--------------------------------------------------------------------------------
True:  (6014, 4)
Shape:  (18042, 4)
Count  4 :  (6014, 4)
Count  5 :  (6014, 4)
Count  1 :  (925, 4)
Count  6 :  (3191, 4)
Count  7 :  (330, 4)
Count  3 :  (728, 4)
Count  8 :  (563, 4)
Count  2 :  (277, 4)
Valid user =  4 , Extracted user =  5 accuracy =  0.5650149650814765
--------------------------------------------------------------------------------
True:  (6014, 4)
Shape:  (18042, 4)
Count  4 :  (6014, 4)
Count  6 :  (6014, 4)
Count  1 :  (951, 4)
Count  5 :  (3162, 4)
Count  3 : 

True:  (1277, 4)
Shape:  (3831, 4)
Count  7 :  (1277, 4)
Count  4 :  (1277, 4)
Count  5 :  (485, 4)
Count  8 :  (55, 4)
Count  6 :  (473, 4)
Count  2 :  (48, 4)
Count  3 :  (104, 4)
Count  1 :  (112, 4)
Valid user =  7 , Extracted user =  4 accuracy =  0.6048029235186635
--------------------------------------------------------------------------------
True:  (1277, 4)
Shape:  (3831, 4)
Count  7 :  (1277, 4)
Count  5 :  (1277, 4)
Count  4 :  (294, 4)
Count  1 :  (149, 4)
Count  6 :  (560, 4)
Count  3 :  (127, 4)
Count  2 :  (45, 4)
Count  8 :  (102, 4)
Valid user =  7 , Extracted user =  5 accuracy =  0.5559906029757243
--------------------------------------------------------------------------------
True:  (1277, 4)
Shape:  (3831, 4)
Count  7 :  (1277, 4)
Count  6 :  (1277, 4)
Count  8 :  (91, 4)
Count  5 :  (578, 4)
Count  1 :  (153, 4)
Count  4 :  (283, 4)
Count  3 :  (110, 4)
Count  2 :  (62, 4)
Valid user =  7 , Extracted user =  6 accuracy =  0.4685460715217959
---------------------