In [133]:
import numpy as np
import pandas as pd

In [134]:
path_to_dataset = "C:/Users/tabzr/Documents/CMU Dataset/r4.2/"
# path_to_dataset = "~/Documents/CMU_Dataset/r4.2/"

# Only these http columns are read from http.csv.
fields = ["date", "user", "url"]

# NOTE(review): index_col is positional in every read below — presumably it
# selects the user column of each log; confirm against the CSV headers.
print("Loading Devices", flush=True)
devices = pd.read_csv(f"{path_to_dataset}device.csv", index_col=2)

print("Loading Emails", flush=True)
emails = pd.read_csv(f"{path_to_dataset}email.csv", index_col=2)

print("Loading Files", flush=True)
files = pd.read_csv(f"{path_to_dataset}file.csv", index_col=2)

print("Loading Logons", flush=True)
logons = pd.read_csv(f"{path_to_dataset}logon.csv", index_col=2)

print("Loading Http", flush=True)
https = pd.read_csv(f"{path_to_dataset}http.csv", usecols=fields, index_col=1)

print("Finished Loading", flush=True)

Loading Devices
Loading Emails
Loading Files
Loading Logons
Loading Http
Finished Loading


In [135]:
# Every log stores its timestamps as strings in this one format.
dateFormat = "%m/%d/%Y %H:%M:%S"

print("Converting date strings to Timestamps", flush=True)

# Parse the "date" column of each frame in place, reporting progress per frame.
for _label, _frame in (
    ("devices", devices),
    ("emails", emails),
    ("files", files),
    ("logons", logons),
    ("https", https),
):
    _frame["date"] = pd.to_datetime(_frame["date"], format=dateFormat)
    print("Finished " + _label, flush=True)

print("Finished converting dates")

Converting date strings to Timestamps
Finished devices
Finished emails
Finished files
Finished logons
Finished https
Finished converting dates


In [10]:
features = 0      # next unused feature id
feature_map = {}  # feature name -> integer feature id


def addFeature(name):
    """Register `name` under the next unused integer id.

    Does nothing when `name` is already present in `feature_map`, so ids are
    handed out in first-registration order.
    """
    global features
    if name in feature_map:
        return
    feature_map[name] = features
    features += 1

In [11]:
# Logon features.
# Names are registered top to bottom, which fixes their ids; commented
# entries are fine-grained variants that are currently switched off.
for _feature_name in (
    # "Weekday_Logon_Normal",  # 9am-5pm
    # "Weekday_Logon_After",
    # "Weekend_Logon",
    "Logon",
    # "Logoff_03hr",
    # "Logoff_36hr",
    # "Logoff_69hr",
    # "Logoff_9+hr",
    "Logoff",  # Calculating length between first logon and logoff is a little tricky
):
    addFeature(_feature_name)

def logonFeatures(row):
    """Return the feature id for one row of the logon log.

    Parameters
    ----------
    row : mapping with an "activity" string and a "date" exposing
        ``weekday()`` and ``hour`` (e.g. a pandas Timestamp).

    Returns
    -------
    int
        Any activity other than "Logon" maps to "Logoff". A logon maps to
        "Weekend_Logon" (Saturday/Sunday), "Weekday_Logon_Normal"
        (hour 8 through 16) or "Weekday_Logon_After" when that name is
        registered in ``feature_map``; otherwise it maps to the coarse
        "Logon" id.

    Raises
    ------
    KeyError
        If neither the fine-grained name nor the coarse fallback is registered.
    """
    if row["activity"] != "Logon":
        return feature_map["Logoff"]

    when = row["date"]
    if when.weekday() >= 5:
        key = "Weekend_Logon"
    elif 8 <= when.hour < 17:
        key = "Weekday_Logon_Normal"
    else:
        key = "Weekday_Logon_After"

    # The fine-grained names may be switched off at registration time; fall
    # back to the coarse id instead of raising KeyError in that case.
    return feature_map[key] if key in feature_map else feature_map["Logon"]
    

In [12]:
# Device features.
# Names are registered top to bottom, which fixes their ids; commented
# entries are fine-grained variants that are currently switched off.
for _feature_name in (
    "Connect",
    # "Connect_Normal",
    # "Connect_After",
    # "Connect_Weekend",
    "Disconnect",
):
    addFeature(_feature_name)

def deviceFeatures(row):
    """Return the feature id for one row of the device log.

    Parameters
    ----------
    row : mapping with an "activity" string and a "date" exposing
        ``weekday()`` and ``hour`` (e.g. a pandas Timestamp).

    Returns
    -------
    int
        Any activity other than "Connect" maps to "Disconnect". A connect maps
        to "Connect_Weekend" (Saturday/Sunday), "Connect_Normal" (hour 8
        through 16) or "Connect_After" when that name is registered in
        ``feature_map``; otherwise it maps to the coarse "Connect" id.

    Raises
    ------
    KeyError
        If neither the fine-grained name nor the coarse fallback is registered.
    """
    if row["activity"] != "Connect":
        return feature_map["Disconnect"]

    when = row["date"]
    if when.weekday() >= 5:
        key = "Connect_Weekend"
    elif 8 <= when.hour < 17:
        key = "Connect_Normal"
    else:
        key = "Connect_After"

    # The fine-grained names may be switched off at registration time; fall
    # back to the coarse id instead of raising KeyError in that case.
    return feature_map[key] if key in feature_map else feature_map["Connect"]

In [13]:
# The http web log is too large to extract per-row features from (it takes
# too long), so a single "Website" feature is registered for it.
addFeature("Website")

def httpFeatures(row):
    """Return the "Website" feature id; the contents of `row` are not inspected."""
    website_id = feature_map["Website"]
    return website_id

In [14]:
# Email features.
# Commented entries are fine-grained variants that are currently switched off.
for _feature_name in (
    "Email",
    # "Email_In",   # All recipients are company email addresses
    # "Email_Out",  # Sent to a non-company email address
):
    addFeature(_feature_name)

def emailFeatures(row):
    """Return the feature id for one row of the email log.

    Parameters
    ----------
    row : mapping with "to", "cc" and "bcc" entries, each either a
        ";"-separated string of addresses or a null value.

    Returns
    -------
    int
        "Email_Out" if at least one recipient address does not end with
        "dtaa.com", else "Email_In" — when that name is registered in
        ``feature_map``. Otherwise the coarse "Email" id.

    Raises
    ------
    KeyError
        If neither the fine-grained name nor the coarse fallback is registered.
    """
    recipients = []
    for field in ("to", "cc", "bcc"):
        value = row[field]
        if not pd.isnull(value):
            # Strip padding around the separator and ignore empty entries
            # (e.g. from a trailing ";").
            recipients.extend(
                address.strip() for address in value.split(";") if address.strip()
            )

    # An email is outgoing when ANY recipient is outside the company domain.
    # (The previous check was inverted: it flagged company addresses.)
    outsider = any(not address.endswith("dtaa.com") for address in recipients)
    key = "Email_Out" if outsider else "Email_In"

    # The fine-grained names may be switched off at registration time; fall
    # back to the coarse id instead of raising KeyError in that case.
    return feature_map[key] if key in feature_map else feature_map["Email"]

In [15]:
# filenames = np.unique( files["filename"].apply(lambda x: x.split(".",1)[1]) )
# filenames

In [16]:
# File features.
# Commented entries are fine-grained variants that are currently switched off.
for _feature_name in (
    "File",
    # "File_exe",
    # "File_jpg",
    # "File_zip",
    # "File_txt",
    # "File_doc",  # Or pdf
):
    addFeature(_feature_name)

def fileFeatures(row):
    """Return the feature id for one row of the file log.

    Parameters
    ----------
    row : mapping with a "filename" string.

    Returns
    -------
    int
        "File_exe", "File_jpg", "File_zip" or "File_txt" according to the
        filename's extension, and "File_doc" for anything else — when that
        name is registered in ``feature_map``. Otherwise the coarse "File" id.

    Raises
    ------
    KeyError
        If neither the fine-grained name nor the coarse fallback is registered.
    """
    filename = row["filename"]

    key = "File_doc"  # default bucket for every other extension
    for extension, candidate in (
        (".exe", "File_exe"),
        (".jpg", "File_jpg"),
        (".zip", "File_zip"),
        (".txt", "File_txt"),
    ):
        if filename.endswith(extension):
            key = candidate
            break

    # The fine-grained names may be switched off at registration time; fall
    # back to the coarse id instead of raising KeyError in that case.
    return feature_map[key] if key in feature_map else feature_map["File"]

In [17]:
# Display the registered feature-name -> id mapping as a sanity check.
feature_map

{'Connect': 2,
 'Disconnect': 3,
 'Email': 5,
 'File': 6,
 'Logoff': 1,
 'Logon': 0,
 'Website': 4}

In [144]:
# Feature extraction
# Tag every event row with an integer feature id (one Python call per row via
# apply), then keep only the timestamp and that id.
# NOTE(review): each frame is rebound to its two-column slice below, so this
# cell cannot be re-run without reloading the data — the feature functions
# read columns (e.g. "activity", "filename") that are dropped here.

cols_to_keep = ["date", "feature"]

# Logons: id chosen by logonFeatures from "activity" and "date".
print("Logon", flush=True)
logons["feature"] = logons.apply(logonFeatures, axis=1)
logons = logons[cols_to_keep]
# Devices: id chosen by deviceFeatures from "activity" and "date".
print("Device", flush=True)
devices["feature"] = devices.apply(deviceFeatures, axis=1)
devices = devices[cols_to_keep]
# Email: id chosen by emailFeatures from the "to"/"cc"/"bcc" recipients.
print("Email", flush=True)
emails["feature"] = emails.apply(emailFeatures, axis=1)
emails = emails[cols_to_keep]
# Files: id chosen by fileFeatures from the "filename" extension.
print("File", flush=True)
files["feature"] = files.apply(fileFeatures, axis=1)
files = files[cols_to_keep]

Logon
Device
Email
File


In [145]:
# Http
# Every http event gets the same "Website" id, so the constant is assigned
# directly instead of applying httpFeatures row by row (kept below, disabled).
print("Http", flush=True)
# https["feature"] = https.apply(httpFeatures, axis=1)
https["feature"] = feature_map["Website"]
# Keep only the timestamp and the feature id (cols_to_keep is set in the
# feature-extraction cell).
https = https[cols_to_keep]

Http


In [146]:
# cols_to_keep = ["date", "feature"]
# logons = logons[cols_to_keep]
# devices = devices[cols_to_keep]
# https = https[cols_to_keep]
# emails = emails[cols_to_keep]
# files = files[cols_to_keep]

In [147]:
# Stack the per-source event frames into one frame, in this source order.
_event_frames = [logons, devices, https, emails, files]
joint = pd.concat(_event_frames)

In [148]:
# Consider dropping stuff earlier since this is very memory intensive
# joint.drop(["activity", "attachments", "bcc", "cc", "content", "filename", "from", "id", "pc", "size", "to", "user"], axis=1, inplace=True)

In [149]:
# Order all events chronologically, in place. The remaining sort options are
# left at their pandas defaults (axis=0, ascending, quicksort, NaT last).
joint.sort_values(by="date", inplace=True)

In [150]:
# Write to disk as hdf because feature extraction takes a very long time.
# `key` is passed by keyword: newer pandas releases deprecate, and then
# reject, positional arguments to to_hdf other than the path.
joint.to_hdf("C:/Users/tabzr/Documents/CMU Dataset/r4.2/r42_features_complex.h5", key="table", mode="w")
# joint.to_hdf("/home/tabz/Documents/CMU_Dataset/r4.2/features_simple.h5", key="table", mode="w")