In [1]:
import pandas as pd
from urllib.parse import urlparse

# Build one row per (user, calendar day) summarising HTTP activity read from
# the http.csv log: visit count, distinct domains/paths, and counts of visits
# that look suspicious (TLD, file-hosting site, protocol, outside 08:00-18:59).

# Define lists outside of the loop
suspicious_exts = ['.ru', '.cn', '.info', '.xyz']
suspicious_hosting_sites = ['dropbox.com', 'drive.google.com', 'mega.nz', 'mediafire.com']
suspicious_protocols = ['ftp://', 'file://']

# Tuple forms so str.endswith / str.startswith can test every option in one call
_suspicious_tlds = tuple(suspicious_exts)
_hosting_subdomain_suffixes = tuple('.' + site for site in suspicious_hosting_sites)
_suspicious_prefixes = tuple(suspicious_protocols)

group_keys = ['user', 'date_only']


def _split_url(url):
    """Return ``(netloc, path, hostname)`` for *url*, parsing it only once.

    ``hostname`` is lower-cased with any userinfo, port and trailing dot
    removed, so it is safe to use for suffix matching. A URL that
    ``urlparse`` rejects (ValueError) yields three empty strings instead of
    aborting the whole run.
    """
    try:
        parts = urlparse(url)
        return parts.netloc, parts.path, (parts.hostname or '').rstrip('.')
    except ValueError:
        return '', '', ''


def _has_suspicious_tld(host):
    """Return 1 if *host* ends with one of ``suspicious_exts``, else 0."""
    # Suffix match: a substring test would also flag e.g. 'www.rubber.com' for '.ru'.
    return int(host.endswith(_suspicious_tlds))


def _is_file_hosting(host):
    """Return 1 if *host* is a listed hosting site or a subdomain of one, else 0."""
    return int(host in suspicious_hosting_sites or host.endswith(_hosting_subdomain_suffixes))


def _has_suspicious_protocol(url):
    """Return 1 if *url* starts with one of ``suspicious_protocols`` (case-insensitive), else 0."""
    # Only the URL prefix counts; 'ftp://' inside a query string is not a protocol.
    return int(url.lstrip().lower().startswith(_suspicious_prefixes))


# Per-chunk partial results.
# agg_results holds additive counters, which can safely be summed across chunks.
# Distinct counts are NOT additive, so the distinct (user, day, value) rows are
# kept instead and counted once at the end. Memory therefore grows with the
# number of distinct combinations, not with the number of requests.
agg_results = []
domain_parts = []
path_parts = []

# Read in chunks
chunksize = 500_000
for chunk in pd.read_csv(
    r"C:\Users\karun\OneDrive\Documents\RIK\data\CERT-dataset\http.csv",
    chunksize=chunksize,
    usecols=['date', 'user', 'url'],  # the only columns used below
):
    # Parse dates; unparseable values become NaT and drop out of the groupby
    chunk['date'] = pd.to_datetime(chunk['date'], errors='coerce')
    chunk['date_only'] = chunk['date'].dt.date
    chunk['hour'] = chunk['date'].dt.hour

    # Fill NaNs
    chunk['url'] = chunk['url'].fillna('')

    # Extract domain, path and normalised hostname with a single parse per URL
    url_parts = [_split_url(u) for u in chunk['url']]
    chunk['domain'] = [p[0] for p in url_parts]
    chunk['path'] = [p[1] for p in url_parts]
    hosts = pd.Series([p[2] for p in url_parts], index=chunk.index, dtype=object)

    # Flag suspicious domains, hosting, protocol & after-hours
    chunk['is_suspicious_domain'] = hosts.map(_has_suspicious_tld)
    chunk['is_file_hosting'] = hosts.map(_is_file_hosting)
    chunk['is_suspicious_protocol'] = chunk['url'].map(_has_suspicious_protocol)
    # Comparisons with a NaN hour are False, so rows without a valid date score 0
    chunk['is_out_of_hours'] = ((chunk['hour'] < 8) | (chunk['hour'] > 18)).astype(int)

    # Additive counters per user and date_only
    chunk_agg = chunk.groupby(group_keys, as_index=False).agg(
        web_visits_per_day=('url', 'count'),
        suspicious_domains_per_day=('is_suspicious_domain', 'sum'),
        file_hosting_visits_per_day=('is_file_hosting', 'sum'),
        suspicious_protocol_visits_per_day=('is_suspicious_protocol', 'sum'),
        visits_out_of_hours_per_day=('is_out_of_hours', 'sum'),
    )
    agg_results.append(chunk_agg)

    # Distinct values seen in this chunk, to be de-duplicated across chunks later
    domain_parts.append(chunk[group_keys + ['domain']].drop_duplicates())
    path_parts.append(chunk[group_keys + ['path']].drop_duplicates())

# Combine additive counters; the same user-date may appear in several chunks
totals = pd.concat(agg_results, ignore_index=True).groupby(group_keys).sum()

# Count distinct domains/paths over the whole file rather than summing per-chunk
# counts, which would double-count values seen on both sides of a chunk boundary
totals['unique_domains_per_day'] = (
    pd.concat(domain_parts, ignore_index=True).groupby(group_keys)['domain'].nunique()
)
totals['unique_paths_per_day'] = (
    pd.concat(path_parts, ignore_index=True).groupby(group_keys)['path'].nunique()
)

# Restore the original column order and turn the group keys back into columns
daily_user_http = totals[[
    'web_visits_per_day',
    'unique_domains_per_day',
    'unique_paths_per_day',
    'suspicious_domains_per_day',
    'file_hosting_visits_per_day',
    'suspicious_protocol_visits_per_day',
    'visits_out_of_hours_per_day',
]].reset_index()

# Save output
daily_user_http.to_csv(
    '../outputs/daily_user_http_enriched.csv',
    index=False
)

daily_user_http.head()


Unnamed: 0,user,date_only,web_visits_per_day,unique_domains_per_day,unique_paths_per_day,suspicious_domains_per_day,file_hosting_visits_per_day,suspicious_protocol_visits_per_day,visits_out_of_hours_per_day
0,AAB0162,2010-01-04,95,16,17,0,0,0,3
1,AAB0162,2010-01-05,95,23,23,0,0,0,1
2,AAB0162,2010-01-06,95,25,25,0,0,0,2
3,AAB0162,2010-01-07,95,26,26,0,0,0,3
4,AAB0162,2010-01-08,95,15,15,0,0,0,17
