In [1]:
from catboost import CatBoostClassifier
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, plot_roc_curve, make_scorer
from sklearn import preprocessing
from scipy import stats
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate, LeaveOneGroupOut, PredefinedSplit, GridSearchCV
import matplotlib.pyplot as plt


%matplotlib inline

In [2]:
def make_dataframe_impl(df_count, rolling=True, data_dir=None):
    """Load the per-user CSV files, min-max scale each one and stack them.

    Files ``location_rolling_dataset_<i>.csv`` (when ``rolling is True``) or
    ``location_sampling_dataset_<i>.csv`` (otherwise) are read for
    ``i = 1 .. df_count``.

    Parameters
    ----------
    df_count : int
        Number of per-user files to read; file ``i`` is tagged ``user == i``.
    rolling : bool, default True
        Selects the "rolling" files when it is exactly ``True``, the
        "sampling" files for any other value.
    data_dir : str or path-like, optional
        Directory holding the CSV files. Defaults to ``_datasets/600s``
        relative to the current working directory.

    Returns
    -------
    pandas.DataFrame
        All files concatenated with a fresh 0..n-1 index, the ``timestamp``
        column removed and a ``user`` column appended.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when ``df_count`` is smaller than 1.
    """
    # Local import keeps this cell self-contained; pathlib builds the path with
    # the separator of the running OS instead of hard-coded backslashes.
    from pathlib import Path

    base_dir = Path("_datasets") / "600s" if data_dir is None else Path(data_dir)
    if rolling is True:
        file_prefix = "location_rolling_dataset_"
    else:
        file_prefix = "location_sampling_dataset_"

    dfs_list = []
    for i in range(1, df_count + 1):
        df = pd.read_csv(base_dir / (file_prefix + str(i) + ".csv"))
        df = df.drop(["timestamp"], axis=1)

        # Min-max scaling is done per file, i.e. per user.
        # NOTE(review): a column that is constant inside one file has
        # max == min and therefore becomes all-NaN (0 / 0) here.
        df = (df - df.min()) / (df.max() - df.min())

        df["user"] = i
        dfs_list.append(df)

    return pd.concat(dfs_list, ignore_index=True)

In [3]:
def make_common_rolling_dataframe(df_count):
    """Return the stacked frame built from the ``location_rolling_dataset_*`` files.

    ``df_count`` is forwarded unchanged to ``make_dataframe_impl``.
    """
    combined = make_dataframe_impl(df_count, rolling=True)
    return combined

def make_common_sampling_dataframe(df_count):
    """Return the stacked frame built from the ``location_sampling_dataset_*`` files.

    ``df_count`` is forwarded unchanged to ``make_dataframe_impl``.
    """
    combined = make_dataframe_impl(df_count, rolling=False)
    return combined

In [None]:
def drop_bad_rows(df):
    """Drop outlier rows from ``df`` in place and return the same frame.

    For every column except ``user`` a row is treated as an outlier when the
    absolute z-score of its value exceeds 2, measured either among the rows of
    the same user or among all rows of the frame. A row flagged for any column
    is removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``user`` column; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``, without the flagged rows.
    """
    z_limit = 2
    feature_cols = [name for name in df.columns if name != "user"]
    user_ids = df.user.unique()
    flagged = set()

    for col in feature_cols:
        # Outliers relative to each user's own distribution.
        for user_id in user_ids:
            per_user = df.loc[df.user == user_id, col]
            per_user_hits = np.abs(stats.zscore(per_user)) > z_limit
            flagged.update(per_user.index[per_user_hits].tolist())

        # Outliers relative to the whole column.
        overall_hits = np.abs(stats.zscore(df[col])) > z_limit
        flagged.update(df.index[overall_hits].tolist())

    df.drop(list(flagged), axis=0, inplace=True)
    return df

def drop_bad_cols(df):
    """Drop uninformative or outlier-heavy columns from ``df`` in place.

    A column other than ``user`` is removed when any of these holds:

    * fewer than 10% of all rows differ from the column mean;
    * for some user, fewer than 10% of that user's rows differ from the
      user's mean of the column;
    * for some user (when the previous test did not fire), fewer than 90% of
      that user's values have an absolute z-score below 2.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``user`` column; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``, without the rejected columns.
    """
    rejected = set()
    user_ids = df.user.unique()
    total_rows = df.shape[0]

    for col in df.columns:
        if col == "user":
            continue

        whole_column = df[col]
        if (whole_column != whole_column.mean()).sum() < 0.1 * total_rows:
            rejected.add(col)

        for user_id in user_ids:
            values = df.loc[df.user == user_id, col]
            n_rows = values.shape[0]

            almost_constant = (values != values.mean()).sum() < 0.1 * n_rows
            # The z-score test only runs when the column is not almost constant.
            if almost_constant or (
                np.sum(np.abs(stats.zscore(values)) < 2) < 0.9 * n_rows
            ):
                rejected.add(col)

    df.drop(list(rejected), axis=1, inplace=True)
    return df

In [None]:
def resample(df, random_state=42):
    """Oversample every ``user`` class up to the size of the largest one.

    The most frequent class is kept unchanged; each other class is replaced
    by a sample drawn with replacement, of the same size as the largest
    class. The original index labels are kept, so the result can contain
    repeated index values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``user`` column. It is not modified.
    random_state : int, default 42
        Seed passed to ``DataFrame.sample`` for every oversampled class.

    Returns
    -------
    pandas.DataFrame
        Class-wise concatenation ordered from the most to the least frequent
        class. An empty copy of ``df`` is returned when no class is present.
    """
    class_counts = df.user.value_counts()  # sorted, largest class first
    if class_counts.empty:
        # Nothing to balance; pd.concat([]) would raise on an empty list.
        return df.copy()

    target_count = class_counts.iloc[0]
    sampling_dfs = []
    for position, label in enumerate(class_counts.index):
        class_rows = df[df.user == label]
        if position > 0:
            class_rows = class_rows.sample(
                target_count, replace=True, random_state=random_state
            )
        sampling_dfs.append(class_rows)

    return pd.concat(sampling_dfs)

def extract_delayed_user(df, user_label):
    """Split ``df`` into the rows of one user and the rows of everyone else.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``user`` column.
    user_label
        Value of ``user`` identifying the rows to extract.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(rows where user == user_label, rows where user != user_label)``.
    """
    belongs_to_user = df["user"] == user_label
    everyone_else = df["user"] != user_label
    return df[belongs_to_user], df[everyone_else]

def split_users_into_two_classes(df, valid_user_label):
    """Relabel ``user`` in place: 1 for the valid user, 0 for everyone else.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``user`` column; the column is overwritten in place.
    valid_user_label
        Value of ``user`` that becomes class 1.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``.
    """
    # Compute the mask BEFORE writing anything. Assigning 0 first and testing
    # ``== valid_user_label`` afterwards turns every row into class 1 when the
    # valid label itself is 0.
    is_valid_user = df["user"] == valid_user_label
    df.loc[~is_valid_user, "user"] = 0
    df.loc[is_valid_user, "user"] = 1
    return df

def get_cv_split(X, y, group_labels, valid_user_label):
    """Build a ``test_fold`` array for ``PredefinedSplit`` from group labels.

    Samples of the group equal to ``valid_user_label`` get ``-1``. Every
    other group gets its own fold id ``0, 1, 2, ...``, numbered in sorted
    order of the distinct labels. This is the order in which
    ``LeaveOneGroupOut`` visits groups, which the earlier implementation
    relied on.

    The group is identified by comparing its label directly, so the result
    does not depend on the samples of a group being contiguous or on a group
    having more than one sample.

    Parameters
    ----------
    X, y
        Unused; kept so existing calls keep working.
    group_labels : array-like
        One group label per sample.
    valid_user_label
        Label of the group that receives fold id ``-1``.

    Returns
    -------
    numpy.ndarray
        Float array with one fold id per sample.

    Raises
    ------
    ValueError
        If ``group_labels`` contains fewer than two distinct groups (the
        same condition under which ``LeaveOneGroupOut`` refuses to split).
    """
    group_labels = np.asarray(group_labels)
    unique_groups, group_positions = np.unique(group_labels, return_inverse=True)
    if unique_groups.shape[0] < 2:
        raise ValueError(
            "group_labels contains fewer than 2 unique groups; "
            "at least 2 are required to build a leave-one-group-out split"
        )

    # Fold id of every distinct group, in sorted label order.
    fold_of_group = np.empty(unique_groups.shape[0])
    next_fold = 0
    for position, group in enumerate(unique_groups):
        if group == valid_user_label:
            fold_of_group[position] = -1
        else:
            fold_of_group[position] = next_fold
            next_fold += 1

    # Broadcast the per-group fold ids back to the individual samples.
    return fold_of_group[group_positions.reshape(-1)]

In [4]:
# Load and stack the rolling-window datasets of the first 8 users.
df = make_common_rolling_dataframe(df_count=8)

In [5]:
# Show the names of all loaded columns.
list(df.columns)

['accuracy_mean',
 'accuracy_var',
 'accuracy_median',
 'accuracy_skew',
 'accuracy_kurt',
 'accuracy_std',
 'speed_mean',
 'speed_var',
 'speed_median',
 'speed_skew',
 'speed_kurt',
 'speed_std',
 'altitude_speed_mean',
 'altitude_speed_var',
 'altitude_speed_median',
 'altitude_speed_skew',
 'altitude_speed_kurt',
 'altitude_speed_std',
 'acc_mean',
 'acc_var',
 'acc_median',
 'acc_skew',
 'acc_kurt',
 'acc_std',
 'altitude_acc_mean',
 'altitude_acc_var',
 'altitude_acc_median',
 'altitude_acc_skew',
 'altitude_acc_kurt',
 'altitude_acc_std',
 'user']

In [6]:
# Columns kept for modelling: every column of ``df`` that is not listed here is
# dropped further down. Commented-out entries are features that were
# deliberately excluded.
features = [
    "accuracy_mean",
    "accuracy_var",
    "accuracy_median",
    "accuracy_skew",
    "accuracy_kurt",
    "accuracy_std",
    "speed_mean",
    "speed_var",
    "speed_median",
    #  'speed_skew', - excluded: the model overfits and we end up with 0 accuracy
    "speed_kurt",
    "speed_std",
    "altitude_speed_mean",
    "altitude_speed_var",
    "altitude_speed_median",
    #  'altitude_speed_skew', - excluded: severe overfitting, every second result is either 1 or 0
    "altitude_speed_kurt",
    "altitude_speed_std",
    "acc_mean",
    "acc_var",
    #  'acc_median', - excluded (no reason recorded)
    #  'acc_skew', - excluded (no reason recorded)
    "acc_kurt",
    "acc_std",
    #  'altitude_acc_mean', - excluded (no reason recorded)
    "altitude_acc_var",
    "altitude_acc_median",
    #  'altitude_acc_skew', - excluded (no reason recorded)
    "altitude_acc_kurt",
    "altitude_acc_std",
    "user",
]

In [7]:
# Keep only the selected feature columns, then discard rows with missing values.
unselected_columns = df.columns.difference(features)
df = df.drop(columns=unselected_columns).dropna()

In [10]:
# Remove bad columns first, then outlier rows measured on the remaining columns.
df = drop_bad_rows(drop_bad_cols(df))

In [12]:
# Drop every feature whose absolute correlation with an earlier column exceeds 0.7.
corr_matrix = df.corr().abs()

# Keep only the strict upper triangle so that each pair is looked at once.
# The builtin ``bool`` replaces ``np.bool``, a deprecated alias that was removed
# in NumPy 1.24 and raises AttributeError there.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper_tri = corr_matrix.where(upper_mask)

# ``corr_matrix`` is already absolute, so no second ``abs`` is needed.
# The target column ``user`` is never dropped.
corr_cols = [
    column
    for column in upper_tri.columns
    if column != "user" and (upper_tri[column] > 0.7).any()
]
df = df.drop(corr_cols, axis=1)

In [None]:
# Keep a copy of the original user ids; ``user`` is later overwritten with 0/1 classes.
df["labels"] = df["user"]

In [None]:
# Grid search runs on a 20% subsample of every user's rows.
# A fixed ``random_state`` makes the subsample, and therefore the grid-search
# results, reproducible across kernel restarts.
gs_df_parts = []
for user in df.labels.unique():
    user_rows = df[df.labels == user]
    new_df = user_rows.sample(int(user_rows.shape[0] * 0.2), random_state=42).copy()
    gs_df_parts.append(new_df)

df = pd.concat(gs_df_parts)

## CatBoostClassifier GridSearchCV

In [None]:
# Grid-search CatBoost hyper-parameters once per user, treating that user as
# the positive class and all other users as the negative class.
for user in df.labels.unique():
    print("Valid User: ", user)
    print("--------------------------------------------------------------------------------")

    # Balance the users, collapse them into valid / other, then balance the two classes.
    df_ = resample(split_users_into_two_classes(resample(df.copy()), user))

    # The original user ids define the CV folds, so set them aside before
    # removing the helper column from the feature matrix.
    group_labels = df_.labels.to_numpy().copy()
    df_ = df_.drop(columns='labels')

    # Last column is the 0/1 target, everything before it is a feature.
    dataset = df_.to_numpy().copy()
    X, y = dataset[:, :-1], dataset[:, -1]

    cv_split = PredefinedSplit(
        test_fold=get_cv_split(X, y, group_labels, user)
    )

    clf = CatBoostClassifier()
    params = {
        'iterations': [100, 500],
        'depth': [6, 10],
        'loss_function': ['Logloss', 'CrossEntropy'],
        'l2_leaf_reg': [1, 10, 100],
        'leaf_estimation_iterations': [2, 5, 10],
        'logging_level': ['Silent'],
    }

    scorer = make_scorer(accuracy_score)
    clf_grid = GridSearchCV(
        estimator=clf,
        param_grid=params,
        scoring=scorer,
        cv=cv_split,
    )
    clf_grid.fit(X, y)

    best_params = clf_grid.best_params_
    print('Best params: ', best_params)

    print("--------------------------------------------------------------------------------")

## RandomForestClassifier GridSearchCV

In [None]:
# Grid-search RandomForest hyper-parameters once per user, treating that user
# as the positive class and all other users as the negative class.
for user in df.labels.unique():
    print("Valid User: ", user)
    print("--------------------------------------------------------------------------------")

    # Balance the users, collapse them into valid / other, then balance the two classes.
    df_ = resample(df.copy())
    df_ = split_users_into_two_classes(df_, user)
    df_ = resample(df_)

    # The original user ids define the CV folds.
    group_labels = df_.labels.to_numpy().copy()
    df_ = df_.drop('labels', axis=1)

    # Last column is the 0/1 target, everything before it is a feature.
    dataset = df_.to_numpy().copy()
    X = dataset[:, :-1]
    y = dataset[:, -1]

    cv_split = PredefinedSplit(test_fold=get_cv_split(X, y, group_labels, user))

    clf = RandomForestClassifier()
    # 'auto' is not listed for max_features: for classifiers it is the same as
    # 'sqrt' (so it only doubled those candidates), it was deprecated in
    # scikit-learn 1.1 and it is rejected from 1.3 on.
    params = {'n_estimators': [50, 100],
              'criterion': ['gini', 'entropy'],
              'max_depth': [10, 100, None],
              'min_samples_split': [2, 100],
              'min_samples_leaf': [1, 10, 100],
              'max_features': ['sqrt', 'log2'],
              'n_jobs': [-1],
              'class_weight': ['balanced']
             }

    scorer = make_scorer(accuracy_score)
    clf_grid = GridSearchCV(estimator=clf, param_grid=params, scoring=scorer, cv=cv_split)

    clf_grid.fit(X, y)

    best_params = clf_grid.best_params_
    print('Best params: ', best_params)

    print("--------------------------------------------------------------------------------")

## SVC GridSearchCV

In [None]:
# Grid-search SVC hyper-parameters once per user, treating that user as the
# positive class (+1) and all other users as the negative class (-1).
for user in df.labels.unique():
    print("Valid User: ", user)
    print("--------------------------------------------------------------------------------")

    # Balance the users, collapse them into valid / other, then balance the two classes.
    df_ = resample(df.copy())
    df_ = split_users_into_two_classes(df_, user)
    df_ = resample(df_)

    # Use -1 / +1 targets instead of 0 / 1.
    df_.loc[df_.user == 0, 'user'] = -1

    # The original user ids define the CV folds.
    group_labels = df_.labels.to_numpy().copy()
    df_ = df_.drop('labels', axis=1)

    # Last column is the target, everything before it is a feature.
    dataset = df_.to_numpy().copy()
    X = dataset[:, :-1]
    y = dataset[:, -1]

    cv_split = PredefinedSplit(test_fold=get_cv_split(X, y, group_labels, user))

    clf = SVC()
    # One sub-grid per kernel family. ``degree`` only affects the 'poly'
    # kernel and ``gamma`` is ignored by 'linear', so a single Cartesian grid
    # refits identical models many times (1344 candidates for 516 distinct ones).
    svc_common = {'C': [0.01, 1, 10, 100],
                  'cache_size': [2000],
                  'max_iter': [-1, 100, 1000]}
    svc_gammas = ['scale', 'auto', 0.01, 0.1, 1, 3, 5]
    params = [
        dict(svc_common, kernel=['linear']),
        dict(svc_common, kernel=['poly'], degree=[1, 2, 3, 4], gamma=svc_gammas),
        dict(svc_common, kernel=['rbf', 'sigmoid'], gamma=svc_gammas),
    ]

    scorer = make_scorer(accuracy_score)
    clf_grid = GridSearchCV(estimator=clf, param_grid=params, scoring=scorer, cv=cv_split)

    clf_grid.fit(X, y)

    best_params = clf_grid.best_params_
    print('Best params: ', best_params)

    print("--------------------------------------------------------------------------------")

## LogReg GridSearchCV

In [None]:
# Grid-search LogisticRegression hyper-parameters once per user, treating that
# user as the positive class and all other users as the negative class.
for user in df.labels.unique():
    print("Valid User: ", user)
    print("--------------------------------------------------------------------------------")

    # Balance the users, collapse them into valid / other, then balance the two classes.
    df_ = resample(df.copy())
    df_ = split_users_into_two_classes(df_, user)
    df_ = resample(df_)

    # The original user ids define the CV folds.
    group_labels = df_.labels.to_numpy().copy()
    df_ = df_.drop('labels', axis=1)

    # Last column is the 0/1 target, everything before it is a feature.
    dataset = df_.to_numpy().copy()
    X = dataset[:, :-1]
    y = dataset[:, -1]

    cv_split = PredefinedSplit(test_fold=get_cv_split(X, y, group_labels, user))

    clf = LogisticRegression()
    # Only penalty/solver pairs that LogisticRegression accepts are searched:
    # of the solvers used here, 'l1' is supported by 'saga' only, and
    # 'elasticnet' needs 'saga' plus an explicit ``l1_ratio``. The full
    # Cartesian grid spent most of its fits on combinations that raise.
    logreg_common = {'C': [0.01, 1, 100],
                     'max_iter': [500, 1000, 10000],
                     'n_jobs': [-1]}
    params = [
        dict(logreg_common, penalty=['l2'], solver=['newton-cg', 'lbfgs', 'sag', 'saga']),
        dict(logreg_common, penalty=['l1'], solver=['saga']),
        # l1_ratio=0.5 is an even L1/L2 mix; extend this list to tune it.
        dict(logreg_common, penalty=['elasticnet'], solver=['saga'], l1_ratio=[0.5]),
    ]

    scorer = make_scorer(accuracy_score)
    clf_grid = GridSearchCV(estimator=clf, param_grid=params, scoring=scorer, cv=cv_split)

    clf_grid.fit(X, y)

    best_params = clf_grid.best_params_
    print('Best params: ', best_params)

    print("--------------------------------------------------------------------------------")