In [1]:
import pandas as pd

# Load email.csv
email_df = pd.read_csv(
    r"C:\Users\karun\OneDrive\Documents\RIK\data\CERT-dataset\email.csv"
)

# Parse dates
email_df['date'] = pd.to_datetime(email_df['date'], errors='coerce')
email_df['date_only'] = email_df['date'].dt.date

# Fill NaNs
email_df['to'] = email_df['to'].fillna('')
email_df['cc'] = email_df['cc'].fillna('')
email_df['bcc'] = email_df['bcc'].fillna('')
email_df['attachments'] = pd.to_numeric(email_df['attachments'], errors='coerce').fillna(0)

# Multi-recipient, CC, BCC counts
email_df['recipient_count'] = email_df['to'].apply(lambda x: len(x.split(',')) if x != '' else 0)
email_df['cc_count'] = email_df['cc'].apply(lambda x: len(x.split(',')) if x != '' else 0)
email_df['bcc_count'] = email_df['bcc'].apply(lambda x: len(x.split(',')) if x != '' else 0)

# Average attachment size
email_df['avg_attachment_size'] = email_df['attachments'].apply(lambda x: x if pd.notnull(x) and x != 0 else 0)

# Outbound emails
email_df['outbound_email'] = (~email_df['to'].str.contains('@dtaa.com')).astype(int)

# Aggregate per user per day
daily_user_email = email_df.groupby(
    ['user', 'date_only'], as_index=False
).agg(
    emails_sent_per_day=('activity', lambda a: (a == 'Send').sum()),
    emails_viewed_per_day=('activity', lambda a: (a == 'View').sum()),
    avg_email_size_per_day=('size', 'mean'),
    attachments_per_day=('attachments', 'sum'),
    multi_recipient_emails_per_day=('recipient_count', lambda x: (x > 1).sum()),
    outbound_emails_per_day=('outbound_email', 'sum'),
    unique_recipients_per_day=('recipient_count', 'sum'),
    cc_recipients_per_day=('cc_count', 'sum'),
    bcc_recipients_per_day=('bcc_count', 'sum'),
    avg_attachment_size_per_day=('avg_attachment_size', 'mean')
)

# Save output
daily_user_email.to_csv(
    '../outputs/daily_user_email_enriched.csv',
    index=False
)

daily_user_email.head()


Unnamed: 0,user,date_only,emails_sent_per_day,emails_viewed_per_day,avg_email_size_per_day,attachments_per_day,multi_recipient_emails_per_day,outbound_emails_per_day,unique_recipients_per_day,cc_recipients_per_day,bcc_recipients_per_day,avg_attachment_size_per_day
0,AAB0162,2010-01-04,4,5,290616.555556,0.0,0,1,9,1,0,0.0
1,AAB0162,2010-01-05,5,4,320414.444444,0.0,0,1,9,6,0,0.0
2,AAB0162,2010-01-06,4,5,336477.444444,0.0,0,0,9,3,0,0.0
3,AAB0162,2010-01-07,3,6,238192.333333,0.0,0,2,9,4,0,0.0
4,AAB0162,2010-01-08,3,6,26544.0,0.0,0,2,9,7,0,0.0
