In [1]:
import os 
import pandas as pd 
import numpy as np
import torch
from dfencoder import AutoEncoder
from pathlib import Path
import json
from sklearn.model_selection import train_test_split

Big Picture: 
- For the dfp pipeline (Training)
    - Preproc
    - dfp_rolling_window
    - data prep
    - monitor module 
    - dfp training module
    



In [2]:
def load_data(train_data_dir: Path):
    i = 0 
    all_data_dfs = []
    for a_td in train_data_dir.iterdir():
        with open(a_td,'r') as f: 
            json_obj = json.load(f)
        json_normalized = pd.json_normalize(json_obj)
        all_data_dfs.append(json_normalized)
    return all_data_dfs

In [3]:
azure_training_data = Path("data/azure-training-data")
azure_inference_data = Path("data/azure-inference-data")

In [4]:
azure_training_data_df_list = load_data(azure_training_data)
azure_inference_data_df_list = load_data(azure_inference_data)
#azure_training_data_df = pd.DataFrame(azure_training_data_json)
#azure_inference_data_df = pd.DataFrame(azure_inference_data_json)

In [5]:
# concat into one big dataframe for training and inference 

azure_training_df = pd.concat(azure_training_data_df_list)
azure_inference_df = pd.concat(azure_inference_data_df_list)


In [6]:
azure_training_df['properties.userDisplayName'].unique()

array(['Thomas Price', 'Attack Target', 'Aaron Cole', 'Joseph Taylor',
       'Cynthia Perry', 'Alicia Ramirez', 'Jim Gonzalez',
       'Cassie Fernandez', 'Jacob Meyers', 'Nicholas Black',
       'Terry Proctor', 'Vanessa Ramirez', 'Keith Sheppard',
       'Jamie Watson', 'Kristen Howell', 'David Johnson',
       'Melissa Martin', 'Amy Anderson', 'Angela Kerr', 'Robert Rojas'],
      dtype=object)

In [7]:
# split dataframe into user specific dfs
unique_names = list(azure_training_df['properties.userDisplayName'].unique())
training_data_dfs = {}
inference_data_dfs = {}
for name in unique_names: 
    name_azure_training_df = azure_training_df[azure_training_df['properties.userDisplayName'] == name]
    name_azure_inf_df = azure_inference_df[azure_inference_df['properties.userDisplayName'] == name]
    
    training_data_dfs[name] = name_azure_training_df
    inference_data_dfs[name] = name_azure_inf_df

In [8]:
for name, df in training_data_dfs.items(): 
    print('Name=', name)
    print('Training samples:', len(df))
    print('Inference samples', len(inference_data_dfs[name]))

Name= Thomas Price
Training samples: 493
Inference samples 32
Name= Attack Target
Training samples: 378
Inference samples 124
Name= Aaron Cole
Training samples: 371
Inference samples 28
Name= Joseph Taylor
Training samples: 300
Inference samples 19
Name= Cynthia Perry
Training samples: 424
Inference samples 26
Name= Alicia Ramirez
Training samples: 11
Inference samples 0
Name= Jim Gonzalez
Training samples: 122
Inference samples 8
Name= Cassie Fernandez
Training samples: 126
Inference samples 12
Name= Jacob Meyers
Training samples: 123
Inference samples 8
Name= Nicholas Black
Training samples: 10
Inference samples 1
Name= Terry Proctor
Training samples: 124
Inference samples 5
Name= Vanessa Ramirez
Training samples: 130
Inference samples 5
Name= Keith Sheppard
Training samples: 117
Inference samples 13
Name= Jamie Watson
Training samples: 119
Inference samples 12
Name= Kristen Howell
Training samples: 120
Inference samples 10
Name= David Johnson
Training samples: 122
Inference samples 

Feature Preprocessing

In [9]:
# feature_list_p = [
#     'appDisplayName',
#     'clientAppUsed',
#     'deviceDetail.displayName',
#     'deviceDetail.browser',
#     'deviceDetail.operatingSystem',
#     'statusFailureReason',
#     'riskEventTypes_v2',
#     'location.countryOrRegion',
#     'location.city',
# ]

feature_list_p = [
    'appDisplayName',
    'clientAppUsed',
    'deviceDetail.displayName',
    'deviceDetail.browser',
    'deviceDetail.operatingSystem',
    'status.failureReason',
    'location.countryOrRegion',
    'location.city',
]

In [10]:
cols = azure_training_df.columns
for p in feature_list_p: 
    for c in cols: 
        if p in c: 
            print(c)

properties.appDisplayName
properties.clientAppUsed
properties.deviceDetail.displayName
properties.deviceDetail.browser
properties.deviceDetail.operatingSystem
properties.status.failureReason
properties.location.countryOrRegion
properties.location.city


In [11]:
feature_list = [('properties.' + f) for f in feature_list_p]

In [12]:
feature_list

['properties.appDisplayName',
 'properties.clientAppUsed',
 'properties.deviceDetail.displayName',
 'properties.deviceDetail.browser',
 'properties.deviceDetail.operatingSystem',
 'properties.status.failureReason',
 'properties.location.countryOrRegion',
 'properties.location.city']

In [13]:
for c in azure_training_df.columns:
    if 'riskEventTypesv2' in c: 
        print(c)

In [14]:
def convert_column_names(df):
    df.columns = [col.replace(".","_") for col in df.columns]
    return df     

In [15]:
feat_processed_training_data_dfs = {}
feat_processed_inference_data_dfs = {}

for name, train_df in training_data_dfs.items():
    feat_proc_train_df = train_df[feature_list]
    feat_proc_train_df = feat_proc_train_df.drop('properties.deviceDetail.displayName',axis=1)
    feat_proc_train_df = convert_column_names(feat_proc_train_df)
    feat_processed_training_data_dfs[name] = feat_proc_train_df

for name, inf_df in inference_data_dfs.items():
    feat_proc_inf_df = inf_df[feature_list]
    feat_proc_inf_df = convert_column_names(feat_proc_inf_df)
    feat_processed_inference_data_dfs[name] = feat_proc_inf_df




In [16]:
def train_user_model(training_df,split_size=0.2):

    mps_device = torch.device("mps:0")
    # train test split
    train_df, val_df = train_test_split(training_df, test_size=split_size)

    num_columns = len(training_df.columns) 
    model = AutoEncoder(encoder_layers=[num_columns,int(num_columns/2)],
                        decoder_layers=[num_columns],
                        activation='relu',
                        swap_p=0.2,
                        lr=0.01,
                        batch_size=32,
                        optimizer='sgd',
                        scaler='gauss_rank',
                        min_cats=1,)
    
    model.fit(training_df,20,val_df)
    return model
    

In [17]:
user_models_by_name = {}
for name, df in feat_processed_training_data_dfs.items():
    model = train_user_model(df)
    user_models_by_name[name] = model

100%|██████████| 16/16 [00:00<00:00, 179.14it/s]
100%|██████████| 16/16 [00:00<00:00, 228.75it/s]
100%|██████████| 16/16 [00:00<00:00, 222.25it/s]
100%|██████████| 16/16 [00:00<00:00, 233.22it/s]
100%|██████████| 16/16 [00:00<00:00, 230.73it/s]
100%|██████████| 16/16 [00:00<00:00, 237.59it/s]
100%|██████████| 16/16 [00:00<00:00, 235.91it/s]
100%|██████████| 16/16 [00:00<00:00, 236.76it/s]
100%|██████████| 16/16 [00:00<00:00, 238.38it/s]
100%|██████████| 16/16 [00:00<00:00, 234.13it/s]
100%|██████████| 16/16 [00:00<00:00, 237.30it/s]
100%|██████████| 16/16 [00:00<00:00, 236.13it/s]
100%|██████████| 16/16 [00:00<00:00, 236.77it/s]
100%|██████████| 16/16 [00:00<00:00, 233.33it/s]
100%|██████████| 16/16 [00:00<00:00, 197.37it/s]
100%|██████████| 16/16 [00:00<00:00, 237.97it/s]
100%|██████████| 16/16 [00:00<00:00, 237.38it/s]
100%|██████████| 16/16 [00:00<00:00, 236.27it/s]
100%|██████████| 16/16 [00:00<00:00, 239.01it/s]
100%|██████████| 16/16 [00:00<00:00, 241.15it/s]
100%|██████████| 12/

In [18]:
def inference(model,df):
    anomaly_scores = model.get_anomaly_score(df)
    return anomaly_scores

In [19]:
# implement digital fingerprinting code / implement the actual pipeline
# feat_processed_inference_data_dfs
# for name, df  in feat_processed_inference_data_dfs: 
#     model = user_models_by_name[name]
#     anomaly_scores =
    

In [20]:
tp_inf_data = feat_processed_inference_data_dfs['Thomas Price']

In [21]:
model = user_models_by_name['Thomas Price']
ae_anomaly_score = model.get_anomaly_score(tp_inf_data)
tp_inf_data['anomaly_score'] = ae_anomaly_score


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tp_inf_data['anomaly_score'] = ae_anomaly_score


In [22]:
tp_inf_data

Unnamed: 0,properties_appDisplayName,properties_clientAppUsed,properties_deviceDetail_displayName,properties_deviceDetail_browser,properties_deviceDetail_operatingSystem,properties_status_failureReason,properties_location_countryOrRegion,properties_location_city,anomaly_score
3,Adobe Identity Management,Mobile Apps and Desktop clients,THOMASPRICE-LT,Edge 118.12158,Windows 10,External security challenge was not satisfied.,XQ,Brownton,0.844241
6,Adobe Identity Management,Mobile Apps and Desktop clients,THOMASPRICE-LT,Edge 118.12158,Windows 10,External security challenge was not satisfied.,XR,Port Denisetown,0.300431
9,Adobe Identity Management,Mobile Apps and Desktop clients,THOMASPRICE-LT,Edge 118.12158,Windows 10,Other,XR,Port Kimberlytown,0.758053
17,Cisco AnyConnect,Mobile Apps and Desktop clients,THOMASPRICE-LT,Edge 118.12158,Windows 10,External security challenge was not satisfied.,XN,Littlemouth,0.526056
10,Cisco AnyConnect,Mobile Apps and Desktop clients,THOMASPRICE-LT,Edge 118.12158,Windows 10,External security challenge was not satisfied.,XQ,Brownton,0.857953
15,Cisco AnyConnect,Mobile Apps and Desktop clients,THOMASPRICE-LT,Edge 118.12158,Windows 10,External security challenge was not satisfied.,XR,Santosborough,0.505617
0,Adobe Identity Management,Mobile Apps and Desktop clients,THOMASPRICE-LT,Edge 118.12158,Windows 10,External security challenge was not satisfied.,XN,Littlemouth,0.248657
2,Adobe Identity Management,Mobile Apps and Desktop clients,THOMASPRICE-LT,Edge 118.12158,Windows 10,External security challenge was not satisfied.,XN,Littlemouth,0.248657
6,Adobe Identity Management,Mobile Apps and Desktop clients,THOMASPRICE-LT,Edge 118.12158,Windows 10,External security challenge was not satisfied.,XR,Port Denisetown,0.300431
19,Adobe Identity Management,Mobile Apps and Desktop clients,THOMASPRICE-LT,Edge 118.12158,Windows 10,External security challenge was not satisfied.,XR,Santosborough,0.372359


In [23]:
tp_inf_data.sort_values('anomaly_score',ascending=False).head(10)
tp_inf_data["zscore"] = (tp_inf_data["anomaly_score"] - tp_inf_data["anomaly_score"].mean())/tp_inf_data["anomaly_score"].std()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tp_inf_data["zscore"] = (tp_inf_data["anomaly_score"] - tp_inf_data["anomaly_score"].mean())/tp_inf_data["anomaly_score"].std()


In [24]:
# anomalous events 

tp_inf_data[tp_inf_data['zscore']> 2]

Unnamed: 0,properties_appDisplayName,properties_clientAppUsed,properties_deviceDetail_displayName,properties_deviceDetail_browser,properties_deviceDetail_operatingSystem,properties_status_failureReason,properties_location_countryOrRegion,properties_location_city,anomaly_score,zscore
17,SD ECDN,Browser,THOMASPRICE-LT,Rich Client 3.19.8.16603,Windows 10,,XQ,Veronicachester,1.784105,3.274581
18,Bipsync,Mobile Apps and Desktop clients,,,Android,,XQ,Herringburgh,1.779849,3.263678


In [25]:
anomaly_tracker = {}
for name, df  in feat_processed_inference_data_dfs.items(): 
    model = user_models_by_name[name]
    anomaly_scores = model.get_anomaly_score(df)
    #print('anomaly len=',len(anomaly_scores))
    df['anomaly_score'] = anomaly_scores
    df["zscore"] = (df["anomaly_score"] - df["anomaly_score"].mean())/df["anomaly_score"].std()
    anomaly = df[df['zscore']>2]
    anomaly_tracker[name] = anomaly

    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['anomaly_score'] = anomaly_scores
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["zscore"] = (df["anomaly_score"] - df["anomaly_score"].mean())/df["anomaly_score"].std()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['anomaly_score'] = anomaly_scores
A value is trying to be set on a copy o

In [26]:
anomaly_tracker['Aaron Cole']

Unnamed: 0,properties_appDisplayName,properties_clientAppUsed,properties_deviceDetail_displayName,properties_deviceDetail_browser,properties_deviceDetail_operatingSystem,properties_status_failureReason,properties_location_countryOrRegion,properties_location_city,anomaly_score,zscore
3,Google Cloud / G Suite Connector by Microsoft,Mobile Apps and Desktop clients,,Edge 99.14477,Windows 10,Fresh auth token is needed. Have the user re-s...,XD,Carrollstad,1.840238,2.78305


In [27]:
anomaly_tracker['Thomas Price']

Unnamed: 0,properties_appDisplayName,properties_clientAppUsed,properties_deviceDetail_displayName,properties_deviceDetail_browser,properties_deviceDetail_operatingSystem,properties_status_failureReason,properties_location_countryOrRegion,properties_location_city,anomaly_score,zscore
17,SD ECDN,Browser,THOMASPRICE-LT,Rich Client 3.19.8.16603,Windows 10,,XQ,Veronicachester,1.784105,3.274581
18,Bipsync,Mobile Apps and Desktop clients,,,Android,,XQ,Herringburgh,1.779849,3.263678
