In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

In [None]:
# Timestamp layout used by the 'date' column of the logon CSV.
DATE_FORMAT = '%m/%d/%Y %H:%M:%S'


def dateparse(x):
    """Parse a single timestamp string in DATE_FORMAT into a datetime."""
    return datetime.strptime(x, DATE_FORMAT)


# Only these columns are read from the file.
usecols = ["id", "date", "user", "pc", "activity"]

# Read the raw rows first, then convert the whole 'date' column in one
# vectorized call. This replaces read_csv's `date_parser=` argument, which is
# deprecated in pandas 2.x and parsed the column one value at a time.
logon_df = pd.read_csv('../data/r2/logon_test.csv', usecols=usecols)
logon_df['date'] = pd.to_datetime(logon_df['date'], format=DATE_FORMAT)

# Index by timestamp and order chronologically so date slicing and
# resampling work in later cells.
logon_df = logon_df.set_index('date').sort_index(axis=0)

In [None]:
# summarize the frame's columns, dtypes and non-null counts;
# memory_usage='deep' requests pandas' deep memory introspection
logon_df.info(memory_usage='deep')

In [None]:
# preview the first 10 rows of the logon data
logon_df.head(10)

In [None]:
# filter the data to a specific date range; label-based slicing on the
# DatetimeIndex includes both end dates.
# .copy() makes the result an independent frame, so adding columns to it
# later does not trigger SettingWithCopyWarning.
logon_df = logon_df.loc['2010-01-01':'2010-02-04'].copy()

In [None]:
# count the distinct values in the 'user' column
# (dropna=False keeps a missing value in the count, matching len(unique()))
logon_df['user'].nunique(dropna=False)

1. User who did not previously use removable drives or work after
hours begins logging in after hours, using a removable drive, and
uploading data to wikileaks.org. Leaves the organization shortly
thereafter.

In [None]:
# derive the hour of the day from each row's timestamp index and store it
# as a new 'hour' column
hour_of_day = logon_df.index.hour
logon_df['hour'] = hour_of_day
logon_df.head()

In [None]:
# TODO - how to get zero for a date/hour that has no observations?
# TODO - exclude all but the last 30-days from current day (drop older records, or use window?)

# group the rows by user and hour, then resample each group to daily bins and
# count the non-null 'id' values in each bin. No activity filter is applied
# here, so every row in logon_df is counted regardless of its 'activity'.
# The result is indexed by (user, hour, date) with a single 'login_count' column.
logon_resampled_df = (
    logon_df
    .groupby(['user', 'hour'], sort=False)
    .resample('D')
    .count()
    .loc[:, ['id']]
    .rename(columns={'id': 'login_count'})
)

In [None]:
# find the median login count for each user, hour combination.
# transform('median') returns one value per original row (the median of that
# row's group), so the result shares the frame's (user, hour, date) index.
# A plain .median() is indexed by (user, hour) only and does not align when
# assigned as a column.
logon_resampled_df['median_login_count'] = (
    logon_resampled_df
    .groupby(['user', 'hour'], sort=False)['login_count']
    .transform('median')
)
logon_resampled_df.head(30)

In [None]:
# absolute deviation of each daily count from its group's median
logon_resampled_df['abs_dev'] = (
    logon_resampled_df['login_count']
    .sub(logon_resampled_df['median_login_count'])
    .abs()
)
logon_resampled_df.head(30)

In [None]:
# calculate the median absolute deviation (MAD) for each user, hour combination.
# transform('median') broadcasts each group's median back onto every row of
# that group, keeping the frame's own index. A plain .median() is indexed by
# (user, hour) only and does not align when assigned as a column.
logon_resampled_df['median_abs_dev'] = (
    logon_resampled_df
    .groupby(['user', 'hour'], sort=False)['abs_dev']
    .transform('median')
)
logon_resampled_df.head(30)

In [None]:
# calculate the lower bound and upper bound as
# median +/- (multiplier * median absolute deviation).
# Note: where median_abs_dev is 0, both bounds equal the median.
multiplier = 9
mad_band = logon_resampled_df['median_abs_dev'] * multiplier

logon_resampled_df['lower_bound'] = logon_resampled_df['median_login_count'] - mad_band
logon_resampled_df['upper_bound'] = logon_resampled_df['median_login_count'] + mad_band

logon_resampled_df.head(30)

In [None]:
# flag a row as an outlier (1) when its count falls strictly below the lower
# bound or strictly above the upper bound; otherwise 0
below_lower = logon_resampled_df['login_count'] < logon_resampled_df['lower_bound']
above_upper = logon_resampled_df['login_count'] > logon_resampled_df['upper_bound']
logon_resampled_df['outlier'] = np.where(below_lower | above_upper, 1, 0)
logon_resampled_df.head(30)

In [None]:
# distinct values of index level 0 among the rows flagged as outliers
(
    logon_resampled_df
    .loc[logon_resampled_df['outlier'].eq(1)]
    .index
    .get_level_values(0)
    .unique()
)

In [None]:
# remove the multi-level index left by the groupby: move its levels back
# into columns, then index by 'date' and sort chronologically
logon_resampled_df = (
    logon_resampled_df
    .reset_index()
    .set_index('date')
    .sort_index(axis=0)
)

In [None]:
# size of the rolling window, in rows per (user, hour) group; requiring
# min_periods == period leaves NaN until a full window is available
period = 2
min_periods = period

# calculate the rolling mean of login_count for each user, hour combination
# over the last `period` rows. The previous version of this cell had the
# assignment commented out, so user_hour_df was never defined and the
# .head() call raised NameError on a fresh kernel.
# The result is indexed by (user, hour, date); 'login_count' holds the mean.
user_hour_df = (
    logon_resampled_df
    .groupby(['user', 'hour'])[['login_count']]
    .rolling(period, min_periods=min_periods)
    .mean()
)
user_hour_df.head(30)

In [None]:
# move the index levels back into columns, then index by 'date' in
# chronological order
user_hour_df = (
    user_hour_df
    .reset_index()
    .set_index('date')
    .sort_index(axis=0)
)
user_hour_df.head()

2. User begins surfing job websites and soliciting employment from a
competitor. Before leaving the company, they use a thumb drive (at
markedly higher rates than their previous activity) to steal data.

3. System administrator becomes disgruntled. Downloads a keylogger and
uses a thumb drive to transfer it to his supervisor's machine. The
next day, he uses the collected keylogs to log in as his supervisor
and send out an alarming mass email, causing panic in the
organization. He leaves the organization immediately.

4. A user logs into another user's machine and searches for
interesting files, emailing them to their home email address. This
behavior occurs more and more frequently over a 3-month period.

5. A member of a group decimated by layoffs uploads documents to
Dropbox, planning to use them for personal gain.