In [1]:
import pandas as pd
import glob

# Step 1: Load the raw logon events.
# NOTE(review): absolute, machine-specific path -- this cell only runs where
# this exact directory exists; consider a configurable data directory.
LOGON_CSV = r"C:\Users\karun\OneDrive\Documents\RIK\data\CERT-dataset\logon.csv"
logon_df = pd.read_csv(LOGON_CSV)

# Step 2: Parse timestamps and keep only logon events.
# errors='coerce' turns unparseable timestamps into NaT instead of raising.
logon_df['date'] = pd.to_datetime(logon_df['date'], errors='coerce')
# Case-insensitive match on the activity label.
is_logon = logon_df['activity'].str.lower() == "logon"
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignments below do not trigger SettingWithCopyWarning.
logon_df = logon_df[is_logon].copy()

# Step 3: Derive the calendar date and hour-of-day of each logon.
# (No weekday column is materialised here; the aggregation step reads the
# weekday straight from `date`.)
logon_df['date_only'] = logon_df['date'].dt.date
logon_df['hour'] = logon_df['date'].dt.hour
logon_df = logon_df.sort_values(by=['user', 'date'])

# Step 4: Aggregate logon events into one row per user per day.
WORKDAY_FIRST_HOUR = 8      # a logon with hour < 8 counts as out-of-hours
WORKDAY_LAST_HOUR = 18      # a logon with hour > 18 counts as out-of-hours
WEEKEND_WEEKDAYS = [5, 6]   # Series.dt.weekday: Monday=0 ... Saturday=5, Sunday=6

# Row-level 0/1 flags are computed once, vectorised, and then summed by the
# built-in groupby aggregator. This replaces two Python lambdas that were
# invoked once per (user, day) group and yields the same integer counts.
logon_flags = logon_df[['user', 'date_only', 'pc', 'hour']].assign(
    out_of_hours=(
        (logon_df['hour'] < WORKDAY_FIRST_HOUR)
        | (logon_df['hour'] > WORKDAY_LAST_HOUR)
    ).astype('int64'),
    on_weekend=logon_df['date'].dt.weekday.isin(WEEKEND_WEEKDAYS).astype('int64'),
)

daily_user_logons = logon_flags.groupby(
    ['user', 'date_only'], as_index=False
).agg(
    logins_per_day=('pc', 'count'),            # non-null `pc` values that day
    unique_pcs_per_day=('pc', 'nunique'),      # distinct machines that day
    logins_out_of_hours=('out_of_hours', 'sum'),
    weekend_logins=('on_weekend', 'sum'),
    earliest_login_time=('hour', 'min'),       # hour-of-day, not a timestamp
    latest_login_time=('hour', 'max'),
)

# Step 5: Trailing average of each user's daily logon count.
# The integer window covers the current row plus up to six preceding rows of
# the same user, i.e. the last 7 rows present for that user rather than the
# last 7 calendar days. min_periods=1 lets a user's first rows average over
# however many rows exist so far.
TRAILING_ROWS = 7
daily_user_logons = daily_user_logons.sort_values(by=['user', 'date_only'])
trailing_mean = (
    daily_user_logons
    .groupby('user')['logins_per_day']
    .rolling(window=TRAILING_ROWS, min_periods=1)
    .mean()
    .reset_index(level=0, drop=True)  # drop the group level, keep row labels
)
# Assignment aligns on the original row labels.
daily_user_logons['rolling_7day_avg_logins'] = trailing_mean

# Step 6: Load every LDAP CSV and keep exactly one record per user.
# NOTE(review): absolute, machine-specific path -- consider a configurable
# data directory.
LDAP_GLOB = r"C:\Users\karun\OneDrive\Documents\RIK\data\CERT-dataset\LDAP\*.csv"
# sorted() fixes the concatenation order to lexicographic filename order.
# NOTE(review): this presumes filename order is chronological -- confirm.
ldap_files = sorted(glob.glob(LDAP_GLOB))
if not ldap_files:
    # Fail with an explicit message instead of pd.concat's
    # "No objects to concatenate".
    raise FileNotFoundError(f"No LDAP CSV files matched: {LDAP_GLOB}")
ldap_df = pd.concat([pd.read_csv(f) for f in ldap_files], ignore_index=True)

# Keep each user's LAST row in concatenation order (the last-loaded file that
# lists them). De-duplicating before sorting matters: sort_values on a single
# column defaults to an unstable quicksort, so sorting first and then taking
# the tail of each group does not reliably return the last-loaded record.
# Rows without a user_id cannot match any logon user, so they are dropped.
ldap_df = (
    ldap_df
    .dropna(subset=['user_id'])
    .drop_duplicates(subset='user_id', keep='last')
    .sort_values(by='user_id')
    .reset_index(drop=True)
)

# Attach LDAP attributes to every (user, day) feature row.
# how='left' keeps all daily rows; users with no LDAP record get NaN
# attributes. validate='many_to_one' makes pandas raise MergeError if ldap_df
# ever holds more than one row per user_id, instead of silently duplicating
# daily rows.
daily_user_logons = daily_user_logons.merge(
    ldap_df,
    left_on='user',
    right_on='user_id',
    how='left',
    validate='many_to_one',
)

# Persist the enriched per-user, per-day features, then preview the first rows.
# The path is relative to the notebook's working directory.
ENRICHED_CSV = '../outputs/daily_user_logons_enriched.csv'
daily_user_logons.to_csv(ENRICHED_CSV, index=False)  # row index is not written

daily_user_logons.head()


Unnamed: 0,user,date_only,logins_per_day,unique_pcs_per_day,logins_out_of_hours,weekend_logins,earliest_login_time,latest_login_time,rolling_7day_avg_logins,employee_name,user_id,email,role,projects,business_unit,functional_unit,department,team,supervisor
0,AAB0162,2010-01-04,1,1,1,0,7,7,1.0,Amos Ahmed Burch,AAB0162,Amos.Ahmed.Burch@dtaa.com,HardwareEngineer,,1,3 - ResearchAndEngineering_Government_Domestic,3 - SoftwareManagement,3 - EmbeddedSoftware,Jeanette Macey Simpson
1,AAB0162,2010-01-05,1,1,1,0,7,7,1.0,Amos Ahmed Burch,AAB0162,Amos.Ahmed.Burch@dtaa.com,HardwareEngineer,,1,3 - ResearchAndEngineering_Government_Domestic,3 - SoftwareManagement,3 - EmbeddedSoftware,Jeanette Macey Simpson
2,AAB0162,2010-01-06,1,1,1,0,7,7,1.0,Amos Ahmed Burch,AAB0162,Amos.Ahmed.Burch@dtaa.com,HardwareEngineer,,1,3 - ResearchAndEngineering_Government_Domestic,3 - SoftwareManagement,3 - EmbeddedSoftware,Jeanette Macey Simpson
3,AAB0162,2010-01-07,1,1,1,0,7,7,1.0,Amos Ahmed Burch,AAB0162,Amos.Ahmed.Burch@dtaa.com,HardwareEngineer,,1,3 - ResearchAndEngineering_Government_Domestic,3 - SoftwareManagement,3 - EmbeddedSoftware,Jeanette Macey Simpson
4,AAB0162,2010-01-08,1,1,1,0,7,7,1.0,Amos Ahmed Burch,AAB0162,Amos.Ahmed.Burch@dtaa.com,HardwareEngineer,,1,3 - ResearchAndEngineering_Government_Domestic,3 - SoftwareManagement,3 - EmbeddedSoftware,Jeanette Macey Simpson
