# Начало работы

In [2]:
import pandas as pd
from pathlib import Path
import re
from tqdm import tqdm
from math import ceil

tqdm.pandas()

# Идеи для фичей

* Выходные/праздничные дни
* Количество аттачей в письмах (или просто их присутствие)
* Расщепить внешние письма на входящие и исходящие
* Более бережное обращение с cc и bcc в емейлах
* Размер контента?
* Построение эмбеддингов вебсайтов

In [3]:
answers_dir = Path(r"C:/Users/Mvideo/Downloads/answers")
dataset_dir = Path(r"C:\Users\Mvideo\Downloads\r4.2")

main_answers_file = answers_dir / "insiders.csv"

assert(answers_dir.is_dir())
assert(dataset_dir.is_dir())
assert(main_answers_file.is_file())

output_dir = Path('C:/Datasets/CERT_output/')
assert(output_dir.is_dir())

In [4]:
main_df = pd.read_csv(main_answers_file)
main_df = main_df[main_df['dataset'] == 4.2]

In [5]:
insider_files = list(answers_dir.glob('**/r4.2*.csv'))

In [5]:
device_cols = next(pd.read_csv(dataset_dir / 'device.csv', chunksize=10)).columns
email_cols = next(pd.read_csv(dataset_dir / 'email.csv', chunksize=10)).columns
file_cols = next(pd.read_csv(dataset_dir / 'file.csv', chunksize=10)).columns
http_cols = next(pd.read_csv(dataset_dir / 'http.csv', chunksize=10)).columns
logon_cols = next(pd.read_csv(dataset_dir / 'logon.csv', chunksize=10)).columns

In [6]:
df_ls = []

for filename in insider_files:
    df = pd.read_csv(filename, names=list(range(13)))
    df_ls.append(df)
    
df = pd.concat(df_ls, axis=0, ignore_index=True)

In [7]:
malicious_users = main_df.user.unique()

In [15]:
set(df[df[0] == 'http'][5].apply(lambda s: re.match('^https?://(www\.)?([0-9\-\w\.]+)?.+$', s).group(2)))

{'actionalert.com',
 'actualkeylogger.com',
 'aol.com',
 'best-spy-soft.com',
 'boeing.com',
 'careerbuilder.com',
 'craigslist.org',
 'dailykeylogger.com',
 'download.cnet.com',
 'harris.com',
 'hp.com',
 'indeed.com',
 'job-hunt.org',
 'jobhuntersbible.com',
 'keylogpc.com',
 'linkedin.com',
 'lockheedmartin.com',
 'monster.com',
 'northropgrumman.com',
 'raytheon.com',
 'refog.com',
 'relytec.com',
 'simplyhired.com',
 'softactivity.com',
 'spectorsoft.com',
 'webwatchernow.com',
 'wellresearchedreviews.com',
 'wikileaks.org',
 'yahoo.com'}

# Читаем датасет

## Logon

In [5]:
df = pd.read_csv(dataset_dir / 'logon.csv')

In [6]:
df['date'] = pd.to_datetime(df.date, format='%m/%d/%Y %H:%M:%S')

In [7]:
df['day'] = df['date'].dt.floor('D')

# replaces df.groupby(['user', 'day']).pc.agg(pd.Series.mode)
# it much more complicated but much, much faster
# credit to: https://stackoverflow.com/a/57179083

# result is exactly same as "df.groupby('user').pc.agg(lambda x:x.value_counts().index[0])" though

most_common_pc = df\
    .groupby(['user', 'day', 'pc'])\
    .size()\
    .to_frame('count')\
    .reset_index()\
    .sort_values('count', ascending=False)\
    .drop_duplicates(subset=['user', 'day'])\
    .drop(columns=['count'])\
    .sort_values(['user', 'day'])\
    .groupby('user')\
    .pc\
    .agg(pd.Series.mode)\
    .rename('most_common_pc')
most_common_pc

user
AAE0190    PC-8915
AAF0535    PC-2408
AAF0791    PC-7357
AAL0706    PC-5282
AAM0658    PC-9923
            ...   
ZKS0899    PC-8517
ZMC0284    PC-3947
ZSB0649    PC-5343
ZSK0258    PC-0461
ZSL0305    PC-2640
Name: most_common_pc, Length: 1000, dtype: object

In [6]:
df = df.merge(most_common_pc, left_on='user', right_on='user', )
df['is_usual_pc'] = df['most_common_pc'] == df['pc']

is_work_time = (8 <= df.date.dt.hour) & (df.date.dt.hour < 17)
df['is_work_time'] = is_work_time

df['subtype'] = df['activity']

In [7]:
df[['date', 'user', 'is_usual_pc', 'is_work_time', 'subtype']].to_csv(output_dir / 'logon_preprocessed.csv')

## Device

In [14]:
csv_name = 'device'

df = pd.read_csv(dataset_dir / f'{csv_name}.csv')
df

Unnamed: 0,id,date,user,pc,activity
0,{J1S3-L9UU75BQ-7790ATPL},01/02/2010 07:21:06,MOH0273,PC-6699,Connect
1,{N7B5-Y7BB27SI-2946PUJK},01/02/2010 07:37:41,MOH0273,PC-6699,Disconnect
2,{U1V9-Z7XT67KV-5649MYHI},01/02/2010 07:59:11,HPH0075,PC-2417,Connect
3,{H0Z7-E6GB57XZ-1603MOXD},01/02/2010 07:59:49,IIW0249,PC-0843,Connect
4,{L7P2-G4PX02RX-7999GYOY},01/02/2010 08:04:26,IIW0249,PC-0843,Disconnect
...,...,...,...,...,...
405375,{R7R7-Y9VH64MN-4427OTOU},05/16/2011 22:27:23,EIS0041,PC-0422,Disconnect
405376,{J1G6-G7KE64TX-7505AXXN},05/16/2011 22:43:49,IBB0359,PC-4176,Connect
405377,{I3V8-Q1KQ57JG-4571IXHJ},05/16/2011 22:48:39,IBB0359,PC-4176,Disconnect
405378,{W9Y8-O7VO98OA-0160JVWR},05/16/2011 23:22:29,IBB0359,PC-3620,Connect


In [15]:
df['date'] = pd.to_datetime(df.date, format='%m/%d/%Y %H:%M:%S')

df = df.merge(most_common_pc, left_on='user', right_on='user', )
df['is_usual_pc'] = df['most_common_pc'] == df['pc']

is_work_time = (8 <= df.date.dt.hour) & (df.date.dt.hour < 17)
df['is_work_time'] = is_work_time

df['subtype'] = df['activity']

In [16]:
df[['date', 'user', 'is_usual_pc', 'is_work_time', 'subtype']].to_csv(output_dir / f'{csv_name}_preprocessed.csv')

## File

In [421]:
csv_name = 'file'

df = pd.read_csv(dataset_dir / f'{csv_name}.csv', usecols=['date', 'user', 'pc', 'filename'])
df

Unnamed: 0,date,user,pc,filename
0,01/02/2010 07:23:14,MOH0273,PC-6699,EYPC9Y08.doc
1,01/02/2010 07:26:19,MOH0273,PC-6699,N3LTSU3O.pdf
2,01/02/2010 08:12:03,HPH0075,PC-2417,D3D3WC9W.doc
3,01/02/2010 08:17:00,HPH0075,PC-2417,QCSW62YS.doc
4,01/02/2010 08:24:57,HSB0196,PC-8001,AU75JV6U.jpg
...,...,...,...,...
445576,05/16/2011 23:22:31,IBB0359,PC-3620,R8DTDN2Q.txt
445577,05/16/2011 23:22:32,IBB0359,PC-3620,BLHCRL6W.pdf
445578,05/16/2011 23:22:33,IBB0359,PC-3620,ZHMDTTW0.doc
445579,05/16/2011 23:22:33,IBB0359,PC-3620,AC0QL6KF.pdf


In [422]:
df['date'] = pd.to_datetime(df.date, format='%m/%d/%Y %H:%M:%S')

df = df.merge(most_common_pc, left_on='user', right_on='user', )
df['is_usual_pc'] = df['most_common_pc'] == df['pc']

is_work_time = (8 <= df.date.dt.hour) & (df.date.dt.hour < 17)
df['is_work_time'] = is_work_time

file_extensions = df.filename.str[-4:]
df['subtype'] = file_extensions

In [423]:
df

Unnamed: 0,date,user,pc,filename,most_common_pc,is_usual_pc,is_work_time,subtype
0,2010-01-02 07:23:14,MOH0273,PC-6699,EYPC9Y08.doc,PC-6699,True,False,.doc
1,2010-01-02 07:26:19,MOH0273,PC-6699,N3LTSU3O.pdf,PC-6699,True,False,.pdf
2,2010-01-02 08:28:08,MOH0273,PC-6699,JS09VZOJ.doc,PC-6699,True,True,.doc
3,2010-01-02 08:28:24,MOH0273,PC-6699,LX0I6B1U.pdf,PC-6699,True,True,.pdf
4,2010-01-02 09:14:38,MOH0273,PC-6699,TMF79YY1.doc,PC-6699,True,True,.doc
...,...,...,...,...,...,...,...,...
445576,2011-02-28 22:56:35,WDD0366,PC-0155,YCPR5UNL.doc,PC-0155,True,False,.doc
445577,2011-02-28 22:59:57,WDD0366,PC-0155,ADGISHHH.doc,PC-0155,True,False,.doc
445578,2011-02-28 23:02:41,WDD0366,PC-0155,4GAJAQD5.doc,PC-0155,True,False,.doc
445579,2011-02-28 23:03:07,WDD0366,PC-0155,1Z7TTEM3.doc,PC-0155,True,False,.doc


In [424]:
df[['date', 'user', 'is_usual_pc', 'is_work_time', 'subtype']].to_csv(output_dir / f'{csv_name}_preprocessed.csv')

## Email

In [453]:
csv_name = 'email'

df = pd.read_csv(dataset_dir / f'{csv_name}.csv', usecols=['date', 'user', 'pc', 'to', 'cc', 'bcc', 'from'])
df = df.fillna('')
df

Unnamed: 0,date,user,pc,to,cc,bcc,from
0,01/02/2010 07:11:45,LAP0338,PC-5758,Dean.Flynn.Hines@dtaa.com;Wade_Harrison@lockhe...,Nathaniel.Hunter.Heath@dtaa.com,,Lynn.Adena.Pratt@dtaa.com
1,01/02/2010 07:12:16,MOH0273,PC-6699,Odonnell-Gage@bellsouth.net,,,MOH68@optonline.net
2,01/02/2010 07:13:00,LAP0338,PC-5758,Penelope_Colon@netzero.com,,,Lynn_A_Pratt@earthlink.net
3,01/02/2010 07:13:17,LAP0338,PC-5758,Judith_Hayden@comcast.net,,,Lynn_A_Pratt@earthlink.net
4,01/02/2010 07:13:28,MOH0273,PC-6699,Bond-Raymond@verizon.net;Alea_Ferrell@msn.com;...,,Odonnell-Gage@bellsouth.net,MOH68@optonline.net
...,...,...,...,...,...,...,...
2629974,05/16/2011 20:54:18,HRL0540,PC-1117,Hadley.Willa.Hill@dtaa.com;Tarik.Linus.Hubbard...,Hedwig.Regina.Livingston@dtaa.com,,Hedwig.Regina.Livingston@dtaa.com
2629975,05/16/2011 20:54:43,LAF0991,PC-4973,Hu.Akeem.Vincent@dtaa.com,Brianna.Rebecca.Mcintyre@dtaa.com;Kermit.Coby....,,Lucas.Ahmed.Ferrell@dtaa.com
2629976,05/16/2011 21:08:12,LAF0991,PC-4973,Doyle.Grant@netzero.com;HAV856@charter.net,,,Ferrell.Lucas@sbcglobal.net
2629977,05/16/2011 21:15:35,JMW0638,PC-1397,Eagan.Zephania.Talley@dtaa.com,,,Jonah.Merritt.Wilder@dtaa.com


In [480]:
from_domain = df['from'].str.extract('^.+@(.+$)')[0]
is_external_from = from_domain == 'dtaa.com'

In [492]:
# this lines takes ~10 mins
# to_concated = df[['to', 'cc', 'bcc']].agg(';'.join, axis=1)

# slighly slower but there is nice progress bar
to_concated = df[['to', 'cc', 'bcc']].progress_apply(lambda x: ';'.join([x.to, x.cc, x.bcc]), axis=1)

100%|██████████████████████████████████████████████████████████████████████| 2629979/2629979 [08:55<00:00, 4908.39it/s]


0          Dean.Flynn.Hines@dtaa.com;Wade_Harrison@lockhe...
1                              Odonnell-Gage@bellsouth.net;;
2                               Penelope_Colon@netzero.com;;
3                                Judith_Hayden@comcast.net;;
4          Bond-Raymond@verizon.net;Alea_Ferrell@msn.com;...
                                 ...                        
2629974    Hadley.Willa.Hill@dtaa.com;Tarik.Linus.Hubbard...
2629975    Hu.Akeem.Vincent@dtaa.com;Brianna.Rebecca.Mcin...
2629976         Doyle.Grant@netzero.com;HAV856@charter.net;;
2629977                     Eagan.Zephania.Talley@dtaa.com;;
2629978                VLB5@cox.net;Noel_Foster@netzero.com;
Length: 2629979, dtype: object

In [486]:
# yes, it's horrible but this part is somewhat fast compared to the join part

is_external_to = to_concated.progress_apply(
    lambda x: any([re.match('^.+@(.+$)', e).group(1) != 'dtaa.com' for e in x.split(';') if e != ''])
    )

100%|█████████████████████████████████████████████████████████████████████| 2629979/2629979 [01:12<00:00, 36523.16it/s]


In [487]:
is_external = is_external_to | is_external_to

In [493]:
df['date'] = pd.to_datetime(df.date, format='%m/%d/%Y %H:%M:%S')

df = df.merge(most_common_pc, left_on='user', right_on='user', )
df['is_usual_pc'] = df['most_common_pc'] == df['pc']

is_work_time = (8 <= df.date.dt.hour) & (df.date.dt.hour < 17)
df['is_work_time'] = is_work_time

df['subtype'] = is_external

In [494]:
df[['date', 'user', 'is_usual_pc', 'is_work_time', 'subtype']].to_csv(output_dir / f'{csv_name}_preprocessed.csv')

## HTTP

In [8]:
job_hunting_websites = [
    'careerbuilder.com',
    'craiglist.org',
    'indeed.com',
    'job-hunt.org',
    'jobhuntersbible.com',
    'linkedin.com',
    'monster.com',
    'simplyhired.com',
]

hacktivist_websites = [
    'actualkeylogger.com',
    'best-spy-soft.com',
    'dailykeylogger.com',
    'keylogpc.com',
    'refog.com',
    'relytec.com',
    'softactivity.com',
    'spectorsoft.com',
    'webwatchernow.com',
    'wellresearchedreviews.com',
    'wikileaks.org'
]

filesharing_websites = [
    '4shared.com'
    'dropbox.com',
    'fileserve.com',
    'filefreak.com',
    'filestube.com',
    'megaupload.com',
    'thepiratebay.org'
]

In [9]:
with open(dataset_dir / 'http.csv') as f:
    for count, l in tqdm(enumerate(f)):
        pass

28434424it [05:52, 80602.94it/s] 


In [13]:
count = 28434424

In [14]:
CHUNK_SIZE = 500000

df_iter = pd.read_csv(dataset_dir / 'http.csv', chunksize=CHUNK_SIZE, usecols=['date', 'user', 'pc', 'url'])
(output_dir / 'http_preprocessed.csv').unlink()
first_it = True
mode = 'w'

for http_df in tqdm(df_iter, total=ceil(count / CHUNK_SIZE)):
    http_df['date'] = pd.to_datetime(http_df.date, format='%m/%d/%Y %H:%M:%S')

    site_names = http_df['url'].apply(lambda s: re.match('^https?://(www)?([0-9\-\w\.]+)?.+$', s).group(2))
    http_df['site_name'] = site_names
    
    http_df['subtype'] = 0
    http_df.loc[site_names.isin(job_hunting_websites), 'subtype'] = 1
    http_df.loc[site_names.isin(hacktivist_websites), 'subtype'] = 2
    http_df.loc[site_names.isin(filesharing_websites), 'subtype'] = 3
    
    http_df = http_df.merge(most_common_pc, left_on='user', right_on='user', )
    http_df['is_usual_pc'] = http_df['most_common_pc'] == http_df['pc']

    is_work_time = (8 <= http_df.date.dt.hour) & (http_df.date.dt.hour < 17)
    http_df['is_work_time'] = is_work_time
    
    http_df.to_csv(output_dir / 'http_preprocessed.csv', header=first_it, index=False,
                   mode=mode, columns=['date', 'user', 'is_usual_pc', 'is_work_time', 'subtype', 'site_name'])
    first_it = False
    mode = 'a'

100%|██████████████████████████████████████████████████████████████████████████████████| 57/57 [23:13<00:00, 24.45s/it]
