In [99]:
# from pydantic import BaseSettings
from functools import lru_cache
from typing import List, Mapping

import dill
from datetime import datetime, timedelta
import random

import pandas as pd
import numpy as np

import math
import seaborn as sns


from scipy.stats import multivariate_normal, multinomial
from typing import Tuple
from numpy.random import random_sample, randn
from sklearn.datasets import make_spd_matrix
from matplotlib import pyplot as plt
from sklearn.datasets import make_blobs

from collections import namedtuple
from time import process_time

# Third Party
from sklearn.compose import make_column_transformer
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import FunctionTransformer
from scipy import stats

import warnings
from collections import OrderedDict
import os

from datetime import datetime as dt

warnings.filterwarnings("ignore")

# Named tuple to help structure results
PipelinesTuple = namedtuple("PipelinesTuple", ["approved", "rejected"])


from kuberspatiotemporal import CompoundModel, Feature, SpatialModel, KuberModel
from kuberspatiotemporal.tools.tools import check_spd, check_singular, repr_list_ndarray

from kuberspatiotemporal.tools.data import (
    FeatureSelector,
    get_column_transformer,
    split_anomaly_dataset,
)

%load_ext nb_black

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [18]:
import logging

logging.getLogger("imported_module").setLevel(logging.WARNING)

<IPython.core.display.Javascript object>

In [13]:
data = pd.read_csv(
    "data/1fefac90-9903-41fe-94bb-0ed9d9abeae1.csv",
    usecols=["timestamp", "application_uid", "auth_status", "fingerprinttimezone"],
)

data["timestamp"] = data["timestamp"].apply(lambda x: dt.fromtimestamp(x))
data["timestamp"] = data.timestamp.dt.tz_localize("UTC")
data["weekday"] = data["timestamp"].apply(lambda x: "day" + str(x.weekday()))
#df = data.copy()

data.head()

Unnamed: 0,timestamp,application_uid,auth_status,fingerprinttimezone,weekday
0,2021-03-09 00:20:44+00:00,34ddbe55360c183dcfdc6e913a23c398cc1fa77452d1b4...,expired,{value=null},day1
1,2021-03-09 00:20:18+00:00,34ddbe55360c183dcfdc6e913a23c398cc1fa77452d1b4...,approved,{value=null},day1
2,2021-03-09 00:17:49+00:00,34ddbe55360c183dcfdc6e913a23c398cc1fa77452d1b4...,approved,{value=null},day1
3,2021-03-09 00:17:00+00:00,34ddbe55360c183dcfdc6e913a23c398cc1fa77452d1b4...,approved,{value=null},day1
4,2021-03-09 00:16:30+00:00,34ddbe55360c183dcfdc6e913a23c398cc1fa77452d1b4...,approved,{value=null},day1


<IPython.core.display.Javascript object>

In [236]:
def test_feature_determination(data : pd.DataFrame , column_str : str, noise_probability = 0.005, **kwargs):
    fs = FeatureSelector(data[column_str])
    fs.select()
    X_train = data[column_str].sample(3000)
    X_train = X_train[
        np.concatenate(
            (fs.categorical_features, fs.numerical_features , fs.time_feature)
        )
    ]
    n_components = 100

    # get transformers
    column_transformer, index = get_column_transformer(fs)

    # features model
    features_cpd_model = []
    n_dim = len(fs.time_feature) + len(fs.numerical_features) + len(fs.categorical_features)

    idx_spatial = np.concatenate((index["numerical"], index["numerical_time"])).astype('int')
    idx_kuber = np.concatenate((index["categorical"], index["categorical_time"])).astype('int')

    if len(idx_spatial) > 0:

        spatial_transformed = column_transformer.fit_transform(X_train)[:, idx_spatial]

        limits = np.array(
            [
                np.min(spatial_transformed, axis=0),
                np.max(spatial_transformed, axis=0),
            ]
        )

        features_cpd_model.append(
            Feature(
                SpatialModel(
                    n_dim=len(idx_spatial),
                    n_components= n_components,
                    min_eigval=1e-10,
                    limits=limits,
                    covar_factor=np.array([np.cov(spatial_transformed[:, i], spatial_transformed[:, i])[
                                        0][0] for i in range(spatial_transformed.shape[1])]),
                    **kwargs
                    
                ),
                idx_spatial,
            )
        )

    # I create a KuberModel for each category
    spatial_transformed = column_transformer.fit_transform(X_train)[:, idx_kuber]

    for idx_cat, name in zip(idx_kuber, fs.categorical_features):
        features_cpd_model.append(
            Feature(
                KuberModel(n_components= n_components,n_symbols=len(fs.get_categories(name))),
                [idx_cat],
            )
        )
    #print(len(features_cpd_model))

    # Then, a CompoundModel is created: it includes both Spatial and Kuber models
    #print(index.values())
    kst = CompoundModel(
        n_dim=np.sum([len(x) for x in index.values()]),
        n_components= n_components, 
        n_iterations=200,
        scaling_parameter=1.1,
        nonparametric=True,
        online_learning=False,
        loa=True,
        features=features_cpd_model,
        noise_probability = noise_probability

    )
    print('fs.categorical_features', fs.categorical_features) #================================
    print('fs.numerical_features',fs.numerical_features) #=====================================
    print('fs.time_feature', fs.time_feature) #================================================

    pipeline_approved = make_pipeline(column_transformer, kst)
    pipeline_approved.fit(X_train)


    X = pipeline_approved["columntransformer"].transform(X_train)
    return (kst, pipeline_approved)


<IPython.core.display.Javascript object>

## Using 1-D UTC Data

In [34]:
data = pd.read_csv(
    "data/1fefac90-9903-41fe-94bb-0ed9d9abeae1.csv",
    usecols=["timestamp", "application_uid", "auth_status", "fingerprinttimezone"],
)

data["timestamp"] = data["timestamp"].apply(lambda x: dt.fromtimestamp(x))
data["timestamp"] = data.timestamp.dt.tz_localize("UTC")

column_str = ["timestamp"]

kst, pipeline_approved = test_feature_determination(data, column_str)

X = pipeline_approved["columntransformer"].transform(data[column_str])

print(
    "Transformed columns: ",
    "\n",
    X[1:5, :],
    "\n",
    "Sample Scores: ",
    "\n",
    kst.score_samples(X[1:5, :]),
)

DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)


fs.categorical_features []
fs.numerical_features []
fs.time_feature ['timestamp']


DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kube

Transformed columns:  
 [[0.33833333]
 [0.29694444]
 [0.28333333]
 [0.275     ]] 
 Sample Scores:  
 [0.78605056 0.7751326  0.77127357 0.76884274]


<IPython.core.display.Javascript object>

## Using 1-D non-UTC Data

In [35]:
data = pd.read_csv(
    "data/1fefac90-9903-41fe-94bb-0ed9d9abeae1.csv",
    usecols=["timestamp", "application_uid", "auth_status", "fingerprinttimezone"],
)

data["timestamp"] = data["timestamp"].apply(lambda x: dt.fromtimestamp(x))

column_str = ["timestamp"]

kst, pipeline_approved = test_feature_determination(data, column_str)

X = pipeline_approved["columntransformer"].transform(data[column_str])

print(
    "Transformed columns: ",
    "\n",
    X[1:5, :],
    "\n",
    "Sample Scores: ",
    "\n",
    kst.score_samples(X[1:5, :]),
)

DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)


fs.categorical_features []
fs.numerical_features []
fs.time_feature ['timestamp']


DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 4)


Transformed columns:  
 [[0.33833333]
 [0.29694444]
 [0.28333333]
 [0.275     ]] 
 Sample Scores:  
 [0.63818322 0.63270264 0.63087046 0.62974144]


<IPython.core.display.Javascript object>

## Using timestamp + weekday (Current KT)

In [244]:
data = pd.read_csv(
    "data/1fefac90-9903-41fe-94bb-0ed9d9abeae1.csv",
    usecols=["timestamp", "application_uid", "auth_status", "fingerprinttimezone"],
)

data["timestamp"] = data["timestamp"].apply(lambda x: dt.fromtimestamp(x))
data["weekday"] = data["timestamp"].apply(lambda x: "day" + str(x.weekday()))


column_str = ["timestamp", "weekday"]
noise_probability = 0.005

kst, pipeline_approved = test_feature_determination(
    data, column_str, noise_probability, box=1
)

X = pipeline_approved["columntransformer"].transform(data[column_str])

print(
    "Transformed columns: ",
    "\n",
    X[1:5, :],
    "\n",
    "Sample Scores: ",
    "\n",
    kst.score_samples(X[1:5, :]),
)

fs.categorical_features ['weekday']
fs.numerical_features []
fs.time_feature ['timestamp']


DEBUG:kuberspatiotemporal.spatial:(100, 3000)
DEBUG:kuberspatiotemporal.spatial:(100, 3000)
DEBUG:kuberspatiotemporal.spatial:(100, 3000)
DEBUG:kuberspatiotemporal.spatial:(100, 3000)
DEBUG:kuberspatiotemporal.spatial:(100, 3000)
DEBUG:kuberspatiotemporal.spatial:(100, 3000)
DEBUG:kuberspatiotemporal.spatial:(100, 3000)
DEBUG:kuberspatiotemporal.spatial:(100, 3000)
DEBUG:kuberspatiotemporal.spatial:(100, 3000)
DEBUG:kuberspatiotemporal.spatial:(100, 3000)
DEBUG:kuberspatiotemporal.spatial:(100, 3000)
DEBUG:kuberspatiotemporal.spatial:(100, 3000)
DEBUG:kuberspatiotemporal.spatial:(100, 3000)
DEBUG:kuberspatiotemporal.spatial:(100, 3000)
DEBUG:kuberspatiotemporal.spatial:(100, 3000)
DEBUG:kuberspatiotemporal.spatial:(100, 3000)
DEBUG:kuberspatiotemporal.spatial:(100, 3000)
DEBUG:kuberspatiotemporal.spatial:(100, 3000)
DEBUG:kuberspatiotemporal.spatial:(100, 3000)
DEBUG:kuberspatiotemporal.spatial:(100, 3000)
DEBUG:kuberspatiotemporal.spatial:(100, 3000)
DEBUG:kuberspatiotemporal.spatial:

Transformed columns:  
 [[0.33833333 1.        ]
 [0.29694444 1.        ]
 [0.28333333 1.        ]
 [0.275      1.        ]] 
 Sample Scores:  
 [0.54621568 0.53471571 0.530748   0.52827346]


<IPython.core.display.Javascript object>

In [243]:
kst.score_samples(X).mean()

DEBUG:kuberspatiotemporal.spatial:(100, 9494)


0.6948392557731956

<IPython.core.display.Javascript object>

## Using weekday + timestamp (order reversed)

In [159]:
data = pd.read_csv(
    "data/1fefac90-9903-41fe-94bb-0ed9d9abeae1.csv",
    usecols=["timestamp", "application_uid", "auth_status", "fingerprinttimezone"],
)

data["timestamp"] = data["timestamp"].apply(lambda x: dt.fromtimestamp(x))
data["weekday"] = data["timestamp"].apply(lambda x: "day" + str(x.weekday()))


column_str = ["weekday", "timestamp"]
noise_probability = 0.005

kst, pipeline_approved = test_feature_determination(
    data, column_str, noise_probability, box=1
)

X = pipeline_approved["columntransformer"].transform(data[column_str])

print(
    "Transformed columns: ",
    "\n",
    X[1:5, :],
    "\n",
    "Sample Scores: ",
    "\n",
    kst.score_samples(X[1:5, :]),
)

DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)


fs.categorical_features ['weekday']
fs.numerical_features []
fs.time_feature ['timestamp']


DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kube

Transformed columns:  
 [[0.33833333 1.        ]
 [0.29694444 1.        ]
 [0.28333333 1.        ]
 [0.275      1.        ]] 
 Sample Scores:  
 [0.55975964 0.5460452  0.54109977 0.53796358]


<IPython.core.display.Javascript object>

## Using Only weekday

In [139]:
data = pd.read_csv(
    "data/1fefac90-9903-41fe-94bb-0ed9d9abeae1.csv",
    usecols=["timestamp", "application_uid", "auth_status", "fingerprinttimezone"],
)

data["timestamp"] = data["timestamp"].apply(lambda x: dt.fromtimestamp(x))
data["weekday"] = data["timestamp"].apply(lambda x: "day" + str(x.weekday()))


column_str = ["weekday"]
noise_probability = 0.01

kst, pipeline_approved = test_feature_determination(data, column_str, noise_probability)

X = pipeline_approved["columntransformer"].transform(data[column_str])

print(
    "Transformed columns: ",
    "\n",
    X[1:5, :],
    "\n",
    "Sample Scores: ",
    "\n",
    kst.score_samples(X[1:5, :]),
)

fs.categorical_features ['weekday']
fs.numerical_features []
fs.time_feature []
Transformed columns:  
 [[1.]
 [1.]
 [1.]
 [1.]] 
 Sample Scores:  
 [0.93415185 0.93415185 0.93415185 0.93415185]


<IPython.core.display.Javascript object>

## Use timestamp + 2-D numerical

In [220]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


<IPython.core.display.Javascript object>

In [221]:
data = pd.read_csv("data/1fefac90-9903-41fe-94bb-0ed9d9abeae1.csv",)
data.head()


data["timestamp"] = data["timestamp"].apply(lambda x: dt.fromtimestamp(x))
data["weekday"] = data["timestamp"].apply(lambda x: "day" + str(x.weekday()))

data = data[~data["longitude"].isna()]

column_str = ["timestamp", "latitude", "longitude"]
noise_probability = 0.01

kst, pipeline_approved = test_feature_determination(
    data, column_str, noise_probability, box=1
)

X = pipeline_approved["columntransformer"].transform(data[column_str])

print(
    "Transformed columns: ",
    "\n",
    X[1:5, :],
    "\n",
    "Sample Scores: ",
    "\n",
    kst.score_samples(X[1:5, :]),
)

fs.categorical_features []
fs.numerical_features ['longitude' 'latitude']
fs.time_feature ['timestamp']


DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kuberspatiotemporal.spatial:(100, 300)
DEBUG:kube

Transformed columns:  
 [[  0.33833333 -97.822       37.751     ]
 [  0.29694444 -97.822       37.751     ]
 [  0.28333333 -97.822       37.751     ]
 [  0.275      -97.822       37.751     ]] 
 Sample Scores:  
 [0.67103624 0.65139676 0.64454623 0.64025488]


<IPython.core.display.Javascript object>

In [192]:
res = pd.DataFrame(X, columns=["timestamp", "lat", "long"])
res["score"] = kst.score_samples(X)
res[(res["score"] < 0.5) & (res["score"] > 0.2)]

Unnamed: 0,timestamp,lat,long,score
10,0.026667,-72.5621,42.0825,0.246008
42,23.300278,-72.5621,42.0825,0.434506
44,23.283333,-72.5621,42.0825,0.436223
50,23.176667,-72.5596,41.8363,0.209766
70,22.463056,-72.6420,41.9900,0.216921
...,...,...,...,...
9381,15.812222,-71.1349,42.3907,0.232927
9395,15.652222,-72.3196,42.1010,0.424604
9440,14.950833,-72.4070,42.2735,0.239451
9452,14.278333,-71.7719,42.2555,0.288759


<IPython.core.display.Javascript object>

## Use only 2-D Numerical 

In [245]:
data = pd.read_csv("data/1fefac90-9903-41fe-94bb-0ed9d9abeae1.csv",)
data.head()


data["timestamp"] = data["timestamp"].apply(lambda x: dt.fromtimestamp(x))
data["weekday"] = data["timestamp"].apply(lambda x: "day" + str(x.weekday()))

data = data[~data["longitude"].isna()]

column_str = ["latitude", "longitude"]
noise_probability = 0.1

kst, pipeline_approved = test_feature_determination(data, column_str, noise_probability)

X = pipeline_approved["columntransformer"].transform(data[column_str])

print(
    "Transformed columns: ",
    "\n",
    X[1:5, :],
    "\n",
    "Sample Scores: ",
    "\n",
    kst.score_samples(X[1:5, :]),
)

fs.categorical_features []
fs.numerical_features ['longitude' 'latitude']
fs.time_feature []
Transformed columns:  
 [[-97.822  37.751]
 [-97.822  37.751]
 [-97.822  37.751]
 [-97.822  37.751]] 
 Sample Scores:  
 [0.99999892 0.99999892 0.99999892 0.99999892]


<IPython.core.display.Javascript object>

## Use All

In [246]:
data = pd.read_csv("data/1fefac90-9903-41fe-94bb-0ed9d9abeae1.csv",)
data.head()


data["timestamp"] = data["timestamp"].apply(lambda x: dt.fromtimestamp(x))
data["weekday"] = data["timestamp"].apply(lambda x: "day" + str(x.weekday()))

data = data[~data["longitude"].isna()]

column_str = ["timestamp", "weekday", "latitude", "longitude"]
noise_probability = 0.005

kst, pipeline_approved = test_feature_determination(data, column_str, noise_probability)

X = pipeline_approved["columntransformer"].transform(data[column_str])

print(
    "Transformed columns: ",
    "\n",
    X[1:5, :],
    "\n",
    "Sample Scores: ",
    "\n",
    kst.score_samples(X[1:5, :]),
)

fs.categorical_features ['weekday']
fs.numerical_features ['longitude' 'latitude']
fs.time_feature ['timestamp']
Transformed columns:  
 [[  0.33833333 -97.822       37.751        1.        ]
 [  0.29694444 -97.822       37.751        1.        ]
 [  0.28333333 -97.822       37.751        1.        ]
 [  0.275      -97.822       37.751        1.        ]] 
 Sample Scores:  
 [0.99999332 0.99999271 0.99999249 0.99999235]


<IPython.core.display.Javascript object>

In [198]:
res = pd.DataFrame(X, columns=["timestamp", "latitude", "longitude", "weekday"])
res["score"] = kst.score_samples(X)
res[(res["score"] < 0.5) & (res["score"] > 0.2)]

Unnamed: 0,timestamp,latitude,longitude,weekday,score
10,0.026667,-72.5621,42.0825,1.0,0.235270
225,19.681389,-72.5508,42.1139,0.0,0.200593
232,19.469722,-72.5621,42.0825,0.0,0.229757
234,19.450278,-72.5621,42.0825,0.0,0.231588
239,19.396944,-72.7522,42.1293,0.0,0.211653
...,...,...,...,...,...
9437,15.015833,-72.5771,42.1763,4.0,0.338682
9440,14.950833,-72.4070,42.2735,4.0,0.288321
9442,14.876944,-72.4721,42.1719,4.0,0.338039
9451,14.455556,-72.4464,42.1245,4.0,0.303188


<IPython.core.display.Javascript object>