In [1]:
import pandas as pd

# Load file.csv
file_df = pd.read_csv(
    r"C:\Users\karun\OneDrive\Documents\RIK\data\CERT-dataset\file.csv"
)

# Parse dates
file_df['date'] = pd.to_datetime(file_df['date'], errors='coerce')
file_df['date_only'] = file_df['date'].dt.date

# Convert removable columns to int
file_df['to_removable_media'] = file_df['to_removable_media'].map(
    {'True': 1, 'False': 0}
)
file_df['from_removable_media'] = file_df['from_removable_media'].map(
    {'True': 1, 'False': 0}
)

# Flag file writes & opens
file_df['is_write'] = file_df['activity'].str.lower().str.contains('write').astype(int)
file_df['is_open'] = file_df['activity'].str.lower().str.contains('open').astype(int)

# Flag decoy file access
decoy_df = pd.read_csv(
    r"C:\Users\karun\OneDrive\Documents\RIK\data\CERT-dataset\decoy_file.csv"
)
decoy_files_set = set(decoy_df['decoy_filename'].str.lower())
file_df['is_decoy'] = file_df['filename'].str.lower().isin(decoy_files_set).astype(int)

# Aggregate per user per day
daily_user_files = file_df.groupby(
    ['user', 'date_only'], as_index=False
).agg(
    file_events_per_day=('filename', 'count'),
    unique_files_per_day=('filename', pd.Series.nunique),
    usb_copies_per_day=('to_removable_media', 'sum'),
    usb_reads_per_day=('from_removable_media', 'sum'),
    file_write_events=('is_write', 'sum'),
    file_open_events=('is_open', 'sum'),
    decoy_file_access_per_day=('is_decoy', 'sum')
)

# Save output
daily_user_files.to_csv(
    '../outputs/daily_user_files_enriched.csv',
    index=False
)

daily_user_files.head()


Unnamed: 0,user,date_only,file_events_per_day,unique_files_per_day,usb_copies_per_day,usb_reads_per_day,file_write_events,file_open_events,decoy_file_access_per_day
0,AAB0162,2010-01-13,2,1,0.0,0.0,0,2,2
1,AAB0162,2010-01-18,1,1,0.0,0.0,0,1,1
2,AAB0162,2010-03-08,1,1,0.0,0.0,0,1,1
3,AAB0162,2010-05-19,1,1,0.0,0.0,0,1,1
4,AAB0162,2010-05-24,1,1,0.0,0.0,0,1,1
