In [None]:
# TODO - Run r4.1-1

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

In [None]:
def dateparse(x):
    """Parse a timestamp string formatted as 'MM/DD/YYYY HH:MM:SS' into a datetime."""
    return datetime.strptime(x, '%m/%d/%Y %H:%M:%S')


# columns to keep when reading the logon CSV
usecols = [
    "id",
    "date",
    "user",
    "pc",
    "activity",
]

In [None]:
# Read only the needed columns, then parse the timestamps in one vectorized call.
# read_csv's `date_parser` argument is deprecated since pandas 2.0 and invokes the
# Python callable once per row; pd.to_datetime with an explicit format accepts the
# same 'MM/DD/YYYY HH:MM:SS' strings and works on both old and new pandas.
logon_df = pd.read_csv('../data/r3.1/logon.csv', usecols=usecols)
logon_df['date'] = pd.to_datetime(logon_df['date'], format='%m/%d/%Y %H:%M:%S')

# index by timestamp in chronological order; the hour extraction and daily
# resampling further down operate on this DatetimeIndex
logon_df = logon_df.set_index('date').sort_index(axis=0)

In [None]:
# summarize the columns and dtypes; memory_usage='deep' asks pandas to inspect
# object columns so the reported memory figure is the real footprint
logon_df.info(memory_usage='deep')

In [None]:
# preview the first rows of the date-indexed logon data
logon_df.head()

In [None]:
# load the answer file: it is read as header-less, keeping columns 0, 2 and 3
# under the names activity, date and user
answers_df = pd.read_csv('../data/answers/r3.1-1.csv', header=None, names=['activity', 'date', 'user'], usecols=[0, 2, 3])
# parse the timestamps in one vectorized call; read_csv's `date_parser` argument is
# deprecated since pandas 2.0, while to_datetime with an explicit format works everywhere
answers_df['date'] = pd.to_datetime(answers_df['date'], format='%m/%d/%Y %H:%M:%S')
# index by timestamp in chronological order
answers_df = answers_df.set_index('date').sort_index(axis=0)
answers_df.head()

In [None]:
# drop rows for all but logon
# .copy() detaches the filtered rows from the original frame, so adding columns
# to the result afterwards does not trigger SettingWithCopyWarning
answers_df = answers_df[answers_df['activity'] == 'logon'].copy()
answers_df.head()

In [None]:
# add the hour of the day (taken from the timestamp index) and mark every
# remaining answer row as a malicious event
answers_df = answers_df.assign(
    hour=answers_df.index.hour,
    malicious_event=1,
)
answers_df.head()

In [None]:
# resample the answers to make it easier to process the results:
# count the malicious events per user/hour for each calendar day
answers_resampled_df = answers_df.groupby(['user','hour'], sort=False).resample('D').count()[['malicious_event']]
# we are flagging hours, not counting events, so cap the count at 1: a user/hour/day
# with one or more malicious events gets flag 1, and days with none stay 0.
# (the earlier np.where(count > 1, 0, count) did the opposite for multi-event hours:
# it reset them to 0 and so removed them from the set of true positives)
answers_resampled_df['malicious_event'] = answers_resampled_df['malicious_event'].clip(upper=1)
answers_resampled_df.head()

In [None]:
# filter the data to a specific date range
#logon_df = logon_df['2010-02-20':'2010-03-20']

In [None]:
# create a field for the hour of the day, read from the DatetimeIndex;
# it is used below as a grouping key together with the user
logon_df['hour'] = logon_df.index.hour
logon_df.head()

In [None]:
# Count events per (user, hour-of-day) group for every calendar day:
#   1. group by user and hour, 2. resample each group to daily bins,
#   3. count the rows in each bin (the 'id' column is used as the counter).
# The result holds the number of logons/logoffs for each user/hour/day.
# user/hour/day combinations that have no row in this frame are filled in
# with 0 in a later step.
# uint8 keeps the frame small; assumes fewer than 256 events per
# user/hour/day - TODO confirm
logon_resampled_df = (
    logon_df
    .groupby(['user', 'hour'], sort=False)
    .resample('D')
    .count()[['id']]
    .rename(columns={'id': 'login_count'})
    .astype({'login_count': 'uint8'})
)
logon_resampled_df.info(memory_usage='deep')

In [None]:
# Build a zero-filled frame with one row for every user/hour/date combination.
# Statistics are computed per user/hour, so dates/hours on which a user had no
# events must be present with a count of 0 rather than be missing.

# distinct values of each index level, in order of first appearance
users = logon_resampled_df.index.unique(level=0)
hours = logon_resampled_df.index.unique(level=1)
dates = logon_resampled_df.index.unique(level=2)

# cartesian product of users, hours and dates
index = pd.MultiIndex.from_product([users, hours, dates], names=["user", "hour", "date"])

# both columns default to 0; login_count is overwritten later by the true count
# from logon_resampled_df. uint8 is used to save memory.
full_df = pd.DataFrame(0, index=index, columns=['login_count', 'malicious_event']).astype('uint8')

full_df.info(memory_usage='deep')

In [None]:
# Left-join the real counts onto the zero-filled grid (both are indexed by
# user/hour/date). The join yields login_count_x (the 0 default) and
# login_count_y (the real count, NaN where the user had no row); the row-wise
# max collapses them into one login_count column, after which the two helper
# columns are dropped and the compact uint8 dtypes are restored.
daily_count_df = (
    full_df
    .merge(logon_resampled_df, how='left', left_index=True, right_index=True)
    .assign(login_count=lambda merged: merged[['login_count_x', 'login_count_y']].max(axis=1))
    .drop(columns=['login_count_x', 'login_count_y'])
    .astype({'login_count': 'uint8', 'malicious_event': 'uint8'})
)

In [None]:
# check the dtypes and the deep memory footprint of the merged user/hour/date frame
daily_count_df.info(memory_usage='deep')

In [None]:
# Left-join the answer flags onto daily_count_df (both indexed by user/hour/date).
# The join yields malicious_event_x (the 0 default) and malicious_event_y (the
# flag from the answers, NaN where there is no answer row); the row-wise max
# collapses them into one malicious_event column, after which the two helper
# columns are dropped and the uint8 dtype is restored.
daily_count_df = (
    daily_count_df
    .merge(answers_resampled_df, how='left', left_index=True, right_index=True)
    .assign(malicious_event=lambda merged: merged[['malicious_event_x', 'malicious_event_y']].max(axis=1))
    .drop(columns=['malicious_event_x', 'malicious_event_y'])
    .astype({'malicious_event': 'uint8'})
)

daily_count_df.head()

#### Start of outlier detection calculations

In [None]:
# find the average login count for each user, hour combination.
# transform('mean') returns one value per row, indexed like the frame itself, so every
# user/hour/date row receives the mean of its own user/hour group. Assigning the
# aggregated .mean() result instead would rely on aligning a 2-level (user, hour)
# index against the 3-level (user, hour, date) row index.
daily_count_df['mean_login_count'] = daily_count_df.groupby(['user','hour'], sort=False).login_count.transform('mean')
daily_count_df.head()

In [None]:
# absolute deviation of each row's login count from its user/hour mean
daily_count_df['abs_dev'] = (daily_count_df['mean_login_count'] - daily_count_df['login_count']).abs()
daily_count_df.head()

In [None]:
# calculate the mean absolute deviation for each user, hour combination.
# transform('mean') broadcasts the group mean back onto every user/hour/date row,
# so the assignment does not depend on aligning a 2-level (user, hour) result
# against the 3-level row index.
daily_count_df['mean_abs_dev'] = daily_count_df.groupby(['user','hour'], sort=False).abs_dev.transform('mean')
daily_count_df.head()

In [None]:
# calculate the lower bound and upper bound:
# mean login count -/+ (multiplier * mean absolute deviation)
multiplier = 9
daily_count_df['lower_bound'] = daily_count_df['mean_login_count'].sub(
    daily_count_df['mean_abs_dev'].mul(multiplier)
)
daily_count_df['upper_bound'] = daily_count_df['mean_login_count'].add(
    daily_count_df['mean_abs_dev'].mul(multiplier)
)

daily_count_df.head()

In [None]:
# flag a row as an outlier (1) when its login count lies below the lower bound
# or above the upper bound; every other row gets 0
daily_count_df['outlier'] = np.where(
    (daily_count_df['login_count'] < daily_count_df['lower_bound'])
    | (daily_count_df['login_count'] > daily_count_df['upper_bound']),
    1,
    0,
)
daily_count_df.head()

In [None]:
from sklearn.metrics import confusion_matrix

# confusion matrix of the answer flags (rows) against the outlier flags (columns);
# labels=[1, 0] puts the positive class in the first row and first column
cmtx = pd.DataFrame(
    data=confusion_matrix(
        y_true=daily_count_df['malicious_event'],
        y_pred=daily_count_df['outlier'],
        labels=[1, 0],
    ),
    index=['true:yes', 'true:no'],
    columns=['pred:yes', 'pred:no'],
)
print(cmtx)

In [None]:
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
# score the outlier flags against the answer flags with each metric in turn
precision, recall, f1, auc = (
    metric(daily_count_df['malicious_event'], daily_count_df['outlier'])
    for metric in (precision_score, recall_score, f1_score, roc_auc_score)
)

for line in (
    f'Precision: {precision}',
    f'Recall: {recall}',
    f'F1 Score: {f1}',
    f'ROC AUC: {auc}',
):
    print(line)

### Increasing multiplier from 9 to 22 yields a higher precision score (.0006 vs .0003) without sacrificing recall

## CMU dataset scenarios

### 1. User who did not previously use removable drives or work after hours begins logging in after hours, using a removable drive, and uploading data to wikileaks.org. Leaves the organization shortly thereafter.

### 2. User begins surfing job websites and soliciting employment from a competitor. Before leaving the company, they use a thumb drive (at markedly higher rates than their previous activity) to steal data.

### 3. System administrator becomes disgruntled. Downloads a keylogger and uses a thumb drive to transfer it to his supervisor's machine. The next day, he uses the collected key logs to log in as his supervisor and send out an alarming mass email, causing panic in the organization. He leaves the organization immediately.

### 4. A user logs into another user's machine and searches for interesting files, emailing them to their home email address. This behavior occurs more and more frequently over a 3 month period.

### 5. A member of a group decimated by layoffs uploads documents to Dropbox, planning to use them for personal gain.