In [1]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from scipy import stats
from sklearn import metrics
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, plot_roc_curve, make_scorer, f1_score, roc_auc_score, det_curve
from sklearn.model_selection import cross_validate, LeaveOneGroupOut, PredefinedSplit, GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

%matplotlib inline

In [2]:
def concat_dataframes(path, df_type):
    """Load and stack the per-user feature CSVs of one data type.

    Every sub-directory of ``path`` is treated as one user's folder. Inside
    each folder, every file whose name contains ``df_type`` is read as CSV,
    its ``timestamp`` column is dropped and a ``user`` column is appended
    holding the integer found after the first underscore of the folder name
    (``user_3`` -> ``3``).

    Parameters
    ----------
    path : str
        Directory that contains one sub-directory per user.
    df_type : str
        Substring that selects the files to load.

    Returns
    -------
    pandas.DataFrame
        All selected frames concatenated with a fresh 0..n-1 index; ``user``
        is the last column.

    Raises
    ------
    ValueError
        If no file under ``path`` matches ``df_type``.
    """
    dfs_list = []

    # Sorted listings make the row order reproducible; os.listdir returns
    # entries in arbitrary, platform-dependent order.
    for user in sorted(os.listdir(path)):
        user_dir = os.path.join(path, user)
        if not os.path.isdir(user_dir):
            # Stray files next to the user folders would otherwise crash listdir.
            continue

        for file in sorted(os.listdir(user_dir)):
            if df_type not in file:
                continue

            df = pd.read_csv(os.path.join(user_dir, file))
            df = df.drop(["timestamp"], axis=1)
            df["user"] = int(user.split('_')[1])
            dfs_list.append(df)

    if not dfs_list:
        raise ValueError(
            "No files containing {!r} were found under {!r}".format(df_type, path)
        )

    return pd.concat(dfs_list, ignore_index=True)


def drop_bad_rows(df, z = 3):
    """Remove rows that hold an outlier in any feature column.

    A row is discarded when, for at least one column other than ``user``,
    its absolute z-score exceeds ``z`` either within the rows of its own
    user or within the whole frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame with a ``user`` column.
    z : float
        Absolute z-score above which a value counts as an outlier.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the outlier rows (original index labels are kept).
    """
    outlier_labels = set()
    users = df.user.unique()

    for col in df.columns:
        if col == "user":
            continue

        # Outliers relative to each user's own distribution.
        for user in users:
            user_values = df.loc[df.user == user, col]
            is_outlier = np.asarray(np.abs(stats.zscore(user_values)) > z)
            outlier_labels.update(user_values.index[is_outlier])

        # Outliers relative to the distribution over all users.
        all_values = df[col]
        is_outlier = np.asarray(np.abs(stats.zscore(all_values)) > z)
        outlier_labels.update(all_values.index[is_outlier])

    return df.drop(list(outlier_labels), axis=0)


def drop_bad_cols(df, z = 3, allowed_proportion = 0.1):
    """Remove feature columns that are almost constant or outlier-heavy.

    A column other than ``user`` is dropped when any of these holds:

    * over the whole frame, fewer than ``allowed_proportion`` of the rows
      differ from the column mean;
    * for some user, fewer than ``allowed_proportion`` of that user's rows
      differ from that user's mean;
    * for some user that passed the previous check, fewer than
      ``1 - allowed_proportion`` of the rows have an absolute z-score
      below ``z``.

    Returns
    -------
    tuple
        ``(frame_without_bad_columns, list_of_dropped_column_names)``.
    """
    flagged = set()
    users = df.user.unique()

    for col in df.columns:
        if col == "user":
            continue

        values = df[col]
        if (values != values.mean()).sum() < allowed_proportion * df.shape[0]:
            flagged.add(col)

        for user in users:
            user_values = df.loc[df.user == user, col]
            n_rows = user_values.shape[0]
            n_off_mean = (user_values != user_values.mean()).sum()

            if n_off_mean < allowed_proportion * n_rows:
                flagged.add(col)
            elif np.sum(np.abs(stats.zscore(user_values)) < z) < (1 - allowed_proportion) * n_rows:
                flagged.add(col)

    return df.drop(flagged, axis=1), list(flagged)


def split_users_into_two_classes(df, valid_user_label, intruder_label=0):
    """Relabel the ``user`` column into a binary valid/intruder target.

    Rows whose ``user`` equals ``valid_user_label`` become ``1``; every other
    row becomes ``intruder_label``. The frame is modified in place and also
    returned, so pass a copy when the original labels must survive.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``user`` column.
    valid_user_label
        Value of ``user`` identifying the legitimate user.
    intruder_label
        Label written to all other rows (defaults to ``0``).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``user`` relabelled.
    """
    # Take the mask before writing anything: relabelling the intruders first
    # and then comparing again mislabels every row when intruder_label
    # happens to equal valid_user_label.
    is_valid = df["user"] == valid_user_label
    df.loc[~is_valid, "user"] = intruder_label
    df.loc[is_valid, "user"] = 1
    return df


def prepare_dataset(df, user, is_SVM=False):
    """Build feature matrix, binary target and group labels for one user.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``labels`` column (returned separately as group labels)
        and a ``user`` column. The frame itself is not modified.
    user
        Value of ``user`` identifying the legitimate user (target ``1``).
    is_SVM : bool
        When true, intruders are labelled ``-1`` instead of ``0``.

    Returns
    -------
    tuple
        ``(X, y, group_labels)`` as NumPy arrays. ``y`` is taken from the
        last remaining column and ``X`` from all the others.
    """
    # The relabelling helper needs the intruder label explicitly; calling it
    # with only two positional arguments raised TypeError.
    intruder_label = -1 if is_SVM else 0
    df_ = split_users_into_two_classes(df.copy(), user, intruder_label)

    group_labels = df_["labels"].to_numpy().copy()
    df_ = df_.drop('labels', axis=1)

    # NOTE(review): the slicing below assumes ``user`` is the last column
    # once ``labels`` is removed - confirm against the caller's frame.
    dataset = df_.to_numpy().copy()
    X = dataset[:, :-1]
    y = dataset[:, -1]

    return X, y, group_labels


def create_file_for_results(data_type):
    """Ensure the JSON results file for ``data_type`` exists and return its path.

    The file lives in the ``_results`` folder of the current working
    directory and is named ``<data_type>_results.json``. A missing file is
    initialised with ``{"stub": null}``; an existing file is left untouched.

    Parameters
    ----------
    data_type : str
        Prefix of the results file name.

    Returns
    -------
    str
        Path of the results file.
    """
    # os.path.join yields '.\\_results' on Windows (same as before) while
    # still naming a real sub-folder on other platforms.
    res_folder = os.path.join('.', '_results')
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(res_folder, exist_ok=True)

    file = os.path.join(res_folder, data_type + '_results.json')
    if not os.path.exists(file):
        with open(file, 'w') as f:
            json.dump({'stub': None}, f)

    return file


def update_file_with_results(file_path, results_dict):
    """Deep-merge ``results_dict`` into the JSON document stored at ``file_path``.

    The file is read, nested mappings from ``results_dict`` are merged key
    by key into the stored document (non-mapping values overwrite what was
    there), and the result is written back with sorted keys and a 2-space
    indent.
    """
    from collections.abc import Mapping

    def merge_into(target, patch):
        # Recurse only into mapping values; everything else replaces the old value.
        for key, value in patch.items():
            if not isinstance(value, Mapping):
                target[key] = value
                continue
            target[key] = merge_into(target.get(key, {}), value)
        return target

    with open(file_path, 'r') as source:
        stored = json.load(source)

    merged = merge_into(stored, results_dict)

    with open(file_path, 'w') as sink:
        json.dump(merged, sink, sort_keys=True, indent=2)
        
        
def get_dict_with_results(json_path):
    """Return the parsed content of the JSON file at ``json_path``."""
    with open(json_path, 'r') as results_file:
        return json.load(results_file)


def get_dataframe(path, data_type, window_type, window_size):
    """Load the features of one window configuration and prepare its results file.

    Returns a tuple ``(features, results_file)``: the frame built by
    ``concat_dataframes`` from ``path/window_type/window_size`` and the path
    returned by ``create_file_for_results`` for ``data_type``.
    """
    features_dir = os.path.join(path, window_type, window_size)
    features = concat_dataframes(features_dir, data_type)
    results_file = create_file_for_results(data_type)
    return features, results_file


def drop_corr_columns(df, corr_coef):
    """Drop feature columns that are strongly correlated with an earlier column.

    The absolute correlation matrix of ``df`` is computed and only its strict
    upper triangle is inspected, so of two correlated columns the one that
    appears later is removed. The ``user`` column is never dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are compared pairwise.
    corr_coef : float
        Absolute correlation above which the later column is dropped.

    Returns
    -------
    tuple
        ``(frame_without_correlated_columns, list_of_dropped_column_names)``.
    """
    corr_matrix = df.corr().abs()

    # Built-in ``bool`` replaces the ``np.bool`` alias, which was removed in
    # NumPy 1.24 and raises AttributeError there.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper_tri = corr_matrix.where(upper_mask)

    corr_cols = [
        column
        for column in upper_tri.columns
        if column != "user" and (upper_tri[column] > corr_coef).any()
    ]
    return df.drop(corr_cols, axis=1), corr_cols

In [13]:
def get_train_split(df, valid_user, intruder, is_SVM = False, random_state = None):
    """Build a class-balanced, shuffled training set for one valid user.

    Rows of ``intruder`` are excluded, the remaining rows are relabelled
    (``1`` for ``valid_user``, ``0`` or ``-1`` for everyone else), the larger
    class is randomly under-sampled to the size of the smaller one and the
    rows are shuffled.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame with a ``user`` column; it is not modified.
    valid_user
        Value of ``user`` identifying the legitimate user.
    intruder
        Value of ``user`` whose rows are held out of training.
    is_SVM : bool
        When true, the negative class is labelled ``-1`` instead of ``0``.
    random_state : int, numpy.random.RandomState or None
        Seed (or generator) for under-sampling and shuffling. ``None`` keeps
        the previous behaviour of drawing from NumPy's global random state.

    Returns
    -------
    tuple
        ``(X, y)`` NumPy arrays; ``y`` is the last column of the frame and
        ``X`` all preceding columns.
    """
    intruder_label = -1 if is_SVM else 0
    train_df = split_users_into_two_classes(
        df[df.user != intruder].copy(), valid_user, intruder_label
    )

    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    valid_rows = train_df[train_df.user == 1]
    intruder_rows = train_df[train_df.user == intruder_label]
    surplus = intruder_rows.shape[0] - valid_rows.shape[0]

    # Under-sample whichever class is larger so both end up the same size.
    if surplus > 0:
        train_df = train_df.drop(intruder_rows.sample(surplus, random_state=rng).index)
    else:
        train_df = train_df.drop(valid_rows.sample(-surplus, random_state=rng).index)

    dataset = train_df.to_numpy().copy()
    if rng is None:
        np.random.shuffle(dataset)
    else:
        rng.shuffle(dataset)

    # NOTE(review): assumes ``user`` is the last column of ``df`` - confirm.
    X = dataset[:, :-1]
    y = dataset[:, -1]

    return X, y


def process_train_df(df, features, data_type, corr = 0.7, z = 3, prop = 0.1):
    """Clean the training frame and report which columns were removed.

    Steps, in order: keep only the columns listed in ``features``; drop rows
    that are entirely NaN; replace remaining NaN with 0; drop rows whose
    ``count_mean`` is 0 (when that column exists); then apply
    ``drop_bad_rows``, ``drop_bad_cols`` and ``drop_corr_columns``.

    ``data_type`` is accepted but not used by this function.

    Returns
    -------
    tuple
        ``(cleaned_frame, dropped_columns)`` where ``dropped_columns`` is the
        list from ``drop_bad_cols`` followed by the list from
        ``drop_corr_columns``.
    """
    unwanted = df.columns.difference(features)
    cleaned = df.drop(unwanted, axis=1).dropna(how='all').fillna(0)

    if 'count_mean' in cleaned.columns:
        cleaned = cleaned[cleaned.count_mean != 0]

    cleaned = drop_bad_rows(cleaned, z)
    cleaned, unstable_cols = drop_bad_cols(cleaned, z, prop)
    cleaned, correlated_cols = drop_corr_columns(cleaned, corr)

    return cleaned, unstable_cols + correlated_cols


def get_test_split(df, cols_for_drop, is_SVM = False):
    """Turn a test frame into feature matrix, target vector and event counts.

    ``events_count`` is extracted first; then ``cols_for_drop`` together with
    ``timestamp`` and ``events_count`` are removed, NaN values are replaced
    by 0 and, when ``is_SVM`` is true, every ``user`` value other than 1 is
    set to -1.

    Returns
    -------
    tuple
        ``(X_test, y_test, events_count)``; ``y_test`` is the last remaining
        column and ``X_test`` all columns before it.
    """
    events_count = df["events_count"].to_numpy()

    removed = cols_for_drop + ['timestamp', 'events_count']
    features = df.drop(columns=removed).fillna(0)

    if is_SVM:
        not_valid = features["user"] != 1
        features.loc[not_valid, "user"] = -1

    matrix = features.to_numpy().copy()
    return matrix[:, :-1], matrix[:, -1], events_count


def get_test_df(
    data_path,
    events_wnd,
    window_type,
    window_size,
    valid_user,
    intruder,
    data_type,
    index_1,
    index_2
):
    """Read one test CSV and normalise its column names.

    The file is looked up at
    ``data_path/events_wnd/window_type/window_size/user_<valid_user>/``
    ``<data_type>_<index_1>_<index_2>_<intruder>.data.csv``. Rows whose
    ``count_mean`` is 0 are removed when that column exists, and the columns
    ``user_user_agg`` / ``events_count_sum`` are renamed to ``user`` /
    ``events_count``.
    """
    user_folder = "user_" + str(valid_user)
    file_name = "_".join(map(str, (data_type, index_1, index_2, intruder))) + ".data.csv"
    csv_path = os.path.join(
        data_path, events_wnd, window_type, window_size, user_folder, file_name
    )

    loaded = pd.read_csv(csv_path)

    if 'count_mean' in loaded.columns:
        loaded = loaded[loaded['count_mean'] != 0]

    column_names = {'user_user_agg': 'user', 'events_count_sum': 'events_count'}
    return loaded.rename(columns=column_names)


def test_model(
    results_file,
    model,
    X_test,
    y_test,
    model_tag,
    df_type,
    window_type,
    window_size,
    valid_user,
    intruder,
    index_1,
    index_2,
    events_count,
    is_SVM = False
):
    try:
        predict = model.predict(X_test)
    #     if is_SVM:
    #         proba = model.decision_function(X_test)
    #     else:
        proba = model.predict_proba(X_test)

        print("TYPE: ", df_type)
        print('Model = ', model_tag, ', valid user = ', valid_user, ', intruder = ', intruder, ", ", index_1, '_', index_2)    
        print("--------------------------------------------------------------------------------")

        results = {
            df_type: {
                window_type: {
                    window_size: {
                        model_tag: {
                            "valid_user": {
                                str(valid_user): {
                                    "intruder": {
                                        str(intruder): {
                                            "valid_sample_index": {
                                                str(index_1): {
                                                    "intruder_sample_index": {
                                                        str(index_2): {
                                                            "test": y_test.tolist(),
                                                            "predict": predict.tolist(),
                                                            "proba": proba.tolist(),
                                                            "events_count": events_count.tolist()
                                                        }
                                                    }
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }    

        update_file_with_results(results_file, results)
        print('Accuracy score = ', accuracy_score(y_test, predict))
        print("--------------------------------------------------------------------------------")
        
    except Exception:
        pass
    
    
def simulate(
    users_count,
    samples_count,
    model,
    model_tag,
    data_path,
    test_data_path,
    df_type,
    events_wnd,
    window_type,
    window_size,
    is_SVM = False,
):
    """Run the valid-user / intruder simulation for one data configuration.

    For every ordered pair ``(valid_user, intruder)`` of distinct users in
    ``1..users_count``, ``model`` is refitted on a balanced training split
    that excludes the intruder and then evaluated with ``test_model`` on
    every ``(valid_user_sample, intruder_sample)`` combination in
    ``range(samples_count)``.

    Parameters
    ----------
    users_count : int
        Users are numbered ``1..users_count``.
    samples_count : int
        Number of test-sample indices per valid user and per intruder.
    model
        Estimator exposing ``fit``; the same instance is refitted per pair.
    model_tag : str
        Name under which results are stored.
    data_path, test_data_path : str
        Root folders of the training and test features.
    df_type, events_wnd, window_type, window_size : str
        Select the feature files to load.
    is_SVM : bool
        Forwarded to the split and evaluation helpers.
    """
    # Loading and cleaning the training features depends neither on the
    # valid user nor on the intruder, so do it once rather than once per pair.
    df, results_file = get_dataframe(data_path, df_type, window_type, window_size)
    features = df.columns.to_list()
    df, dropped_cols = process_train_df(df, features, df_type)

    for valid_user in range(1, users_count + 1):
        for intruder in range(1, users_count + 1):
            if valid_user == intruder:
                continue

            X_train, y_train = get_train_split(df, valid_user, intruder, is_SVM)
            model.fit(X_train, y_train)

            for valid_user_sample in range(samples_count):
                for intruder_sample in range(samples_count):

                    test_df = get_test_df(
                        test_data_path,
                        events_wnd,
                        window_type,
                        window_size,
                        valid_user,
                        intruder,
                        df_type,
                        index_1=valid_user_sample,
                        index_2=intruder_sample,
                    )

                    X_test, y_test, events_count = get_test_split(test_df, dropped_cols, is_SVM)

                    test_model(
                        results_file,
                        model,
                        X_test,
                        y_test,
                        model_tag,
                        df_type,
                        window_type,
                        window_size,
                        valid_user,
                        intruder,
                        index_1=valid_user_sample,
                        index_2=intruder_sample,
                        events_count=events_count,
                        # Forward the caller's flag instead of a hard-coded False.
                        is_SVM=is_SVM,
                    )

### Simulation settings
### ***

In [14]:
# Feature folders, relative to the notebook. Built with os.path.join so the
# same strings result on Windows while the paths also resolve elsewhere
# (the former back-slash literals only worked on Windows).
DATA_PATH = os.path.join('..', '..', 'scripts', '_features_all')
TEST_DATA_PATH = os.path.join('..', '..', 'scripts', '_features_events')

# Sub-folder of TEST_DATA_PATH from which the test samples are read.
EVENTS_WND = '15min'

# Single-configuration defaults; the driver loop iterates over the
# DATA_TYPES / WINDOW_TYPES / WINDOW_SIZES lists instead.
DATA_TYPE = "location"

WINDOW_TYPE = "rolling"
WINDOW_SIZE = "60s"

# Test samples per user, and number of users (numbered 1..USERS_COUNT).
SAMPLES_COUNT = 1
USERS_COUNT = 8

In [15]:
# Hyper-parameters of the random forest used in the simulation.
model_params = {
    "n_estimators": 100,
    "criterion": "gini",
    "max_depth": None,
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    # "auto" was deprecated and later removed by scikit-learn; for
    # classifiers it meant "sqrt", which every version accepts.
    "max_features": "sqrt",
    "n_jobs": -1,
    "class_weight": "balanced",
}

model = RandomForestClassifier(**model_params)

# Passed to simulate() as is_SVM; keep False for non-SVM models.
SVM = False

In [16]:
# Grid of configurations the driver loop iterates over.
WINDOW_TYPES = ["rolling", "sampling"]
WINDOW_SIZES = ["60s"]
DATA_TYPES = ["location", "wifi", "bt"]

### ***

In [None]:
# Run the simulation for every data type / window type / window size combination.
for data_type in DATA_TYPES:
    for wnd_type in WINDOW_TYPES:
        for wnd_size in WINDOW_SIZES:
            simulate(
                users_count=USERS_COUNT,
                samples_count=SAMPLES_COUNT,
                model=model,
                model_tag="RandomForest",
                data_path=DATA_PATH,
                test_data_path=TEST_DATA_PATH,
                df_type=data_type,
                events_wnd=EVENTS_WND,
                window_type=wnd_type,
                window_size=wnd_size,
                is_SVM=SVM,
            )

TYPE:  location
Model =  RandomForest , valid user =  3 , intruder =  6 ,  0 _ 0
--------------------------------------------------------------------------------
Accuracy score =  1.0
--------------------------------------------------------------------------------
TYPE:  location
Model =  RandomForest , valid user =  3 , intruder =  7 ,  0 _ 0
--------------------------------------------------------------------------------
Accuracy score =  0.9700996677740864
--------------------------------------------------------------------------------
TYPE:  location
Model =  RandomForest , valid user =  3 , intruder =  8 ,  0 _ 0
--------------------------------------------------------------------------------
Accuracy score =  0.993006993006993
--------------------------------------------------------------------------------
TYPE:  location
Model =  RandomForest , valid user =  4 , intruder =  1 ,  0 _ 0
--------------------------------------------------------------------------------
Accuracy score

Accuracy score =  0.9193548387096774
--------------------------------------------------------------------------------
TYPE:  location
Model =  RandomForest , valid user =  7 , intruder =  8 ,  0 _ 0
--------------------------------------------------------------------------------
Accuracy score =  0.9402985074626866
--------------------------------------------------------------------------------
TYPE:  location
Model =  RandomForest , valid user =  8 , intruder =  1 ,  0 _ 0
--------------------------------------------------------------------------------
Accuracy score =  1.0
--------------------------------------------------------------------------------
TYPE:  location
Model =  RandomForest , valid user =  8 , intruder =  2 ,  0 _ 0
--------------------------------------------------------------------------------
Accuracy score =  1.0
--------------------------------------------------------------------------------
TYPE:  location
Model =  RandomForest , valid user =  8 , intruder =  3 

Accuracy score =  0.8571428571428571
--------------------------------------------------------------------------------
TYPE:  location
Model =  RandomForest , valid user =  6 , intruder =  7 ,  0 _ 0
--------------------------------------------------------------------------------
Accuracy score =  0.7272727272727273
--------------------------------------------------------------------------------
TYPE:  location
Model =  RandomForest , valid user =  6 , intruder =  8 ,  0 _ 0
--------------------------------------------------------------------------------
Accuracy score =  0.6
--------------------------------------------------------------------------------
TYPE:  location
Model =  RandomForest , valid user =  7 , intruder =  1 ,  0 _ 0
--------------------------------------------------------------------------------
Accuracy score =  1.0
--------------------------------------------------------------------------------
TYPE:  location
Model =  RandomForest , valid user =  7 , intruder =  2 

  return (a - mns) / sstd
  return (a - mns) / sstd
  return (a - mns) / sstd
  return (a - mns) / sstd
  return (a - mns) / sstd
  return (a - mns) / sstd
  return (a - mns) / sstd
  return (a - mns) / sstd
  return (a - mns) / sstd
  return (a - mns) / sstd
  return (a - mns) / sstd
  return (a - mns) / sstd
  return (a - mns) / sstd
  return (a - mns) / sstd
  return (a - mns) / sstd
  return (a - mns) / sstd
