In [25]:
import urllib.request
import gzip
import os
ftp_url = "ftp://ita.ee.lbl.gov/traces/calgary_access_log.gz"
local_file = "calgary_access_log.gz"
if not os.path.exists(local_file):
    print("Downloading the file...")
    urllib.request.urlretrieve(ftp_url, local_file)
    print("Download complete!")
else:
    print("File already exists locally.")
def load_logs(file_path):
    with gzip.open(file_path, 'rt', errors='ignore') as f:
        lines = f.readlines()
    return lines
raw_logs = load_logs(local_file)
print(f"✅ Total lines loaded: {len(raw_logs)}")


File already exists locally.
✅ Total lines loaded: 726739


In [26]:
import re
from datetime import datetime

# Regex pattern to match Apache log format
log_pattern = re.compile(
    r'(\S+) (\S+) (\S+) \[(.*?)\] "(.*?)" (\d{3}) (\S+)'
)

# Store parsed logs
parsed_logs = []

for line in tqdm(raw_logs, desc="Parsing logs"):
    match = log_pattern.match(line)
    if match:
        host, rfc931, authuser, timestamp, request, status, byte_size = match.groups()

        # Parse datetime
        try:
            dt = datetime.strptime(timestamp, '%d/%b/%Y:%H:%M:%S %z')
        except ValueError:
            continue  # skip malformed dates

        # Parse request into method, filename, and protocol
        request_parts = request.split()
        if len(request_parts) == 3:
            method, filename, protocol = request_parts
        else:
            filename = None  # malformed request
            method = protocol = None

        # Parse byte size
        byte_size = int(byte_size) if byte_size != '-' else 0

        # Append cleaned data
        parsed_logs.append({
            'host': host,
            'datetime': dt,
            'method': method,
            'filename': filename,
            'protocol': protocol,
            'status': int(status),
            'bytes': byte_size
        })


Parsing logs: 100%|█████████████████████████████████████████████████████████| 726739/726739 [00:40<00:00, 17729.43it/s]


In [7]:
import pandas as pd

df = pd.DataFrame(parsed_logs)
print("✅ DataFrame created. Shape:", df.shape)
df.head()


✅ DataFrame created. Shape: (724910, 7)


Unnamed: 0,host,datetime,method,filename,protocol,status,bytes
0,local,1994-10-24 13:41:41-06:00,GET,index.html,HTTP/1.0,200,150
1,local,1994-10-24 13:41:41-06:00,GET,1.gif,HTTP/1.0,200,1210
2,local,1994-10-24 13:43:13-06:00,GET,index.html,HTTP/1.0,200,3185
3,local,1994-10-24 13:43:14-06:00,GET,2.gif,HTTP/1.0,200,2555
4,local,1994-10-24 13:43:15-06:00,GET,3.gif,HTTP/1.0,200,36403


In [9]:
print(df['datetime'].dtype)


object


In [27]:
df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')


In [28]:
df['date_str'] = df['datetime'].dt.strftime('%d-%b-%Y')
df['hour'] = df['datetime'].dt.hour
df['ext'] = df['filename'].str.extract(r'\.([a-zA-Z0-9]+)$', expand=False).fillna('none')


In [29]:
total_logs = len(df)
print("Q1:", total_logs)


Q1: 724910


In [30]:
unique_hosts = df['host'].nunique()
print("Q2:", unique_hosts)


Q2: 2


In [31]:
datewise_unique_filenames = df.groupby('date_str')['filename'].nunique().to_dict()
print("Q3:", datewise_unique_filenames)


Q3: {'01-Aug-1995': 669, '01-Jul-1995': 387, '01-Jun-1995': 590, '01-May-1995': 467, '01-Oct-1995': 552, '01-Sep-1995': 328, '02-Apr-1995': 438, '02-Aug-1995': 855, '02-Jul-1995': 397, '02-Jun-1995': 513, '02-May-1995': 701, '02-Oct-1995': 871, '02-Sep-1995': 349, '03-Apr-1995': 795, '03-Aug-1995': 582, '03-Jul-1995': 433, '03-Jun-1995': 398, '03-May-1995': 589, '03-Oct-1995': 846, '03-Sep-1995': 212, '04-Apr-1995': 821, '04-Aug-1995': 715, '04-Jul-1995': 610, '04-Jun-1995': 353, '04-May-1995': 684, '04-Oct-1995': 889, '04-Sep-1995': 340, '05-Apr-1995': 891, '05-Aug-1995': 507, '05-Jul-1995': 607, '05-Jun-1995': 494, '05-May-1995': 609, '05-Oct-1995': 846, '05-Sep-1995': 411, '06-Apr-1995': 678, '06-Aug-1995': 448, '06-Jul-1995': 522, '06-Jun-1995': 662, '06-May-1995': 517, '06-Oct-1995': 868, '06-Sep-1995': 549, '07-Apr-1995': 776, '07-Aug-1995': 608, '07-Jul-1995': 428, '07-Jun-1995': 486, '07-May-1995': 725, '07-Oct-1995': 468, '07-Sep-1995': 590, '08-Apr-1995': 542, '08-Aug-1995': 

In [32]:
q4_404_count = df[df['status'] == 404].shape[0]
print("Q4:", q4_404_count)



Q4: 23586


In [None]:
Top 15 filenames with 404 responses

In [33]:
q5_top_404_files = df[df['status'] == 404]['filename'].value_counts().head(15).items()
print("Q5:", list(q5_top_404_files))


Q5: [('index.html', 4694), ('4115.html', 902), ('1611.html', 649), ('5698.xbm', 585), ('710.txt', 408), ('2002.html', 258), ('2177.gif', 193), ('10695.ps', 161), ('6555.html', 153), ('487.gif', 152), ('151.html', 149), ('3414.gif', 148), ('488.gif', 148), ('40.html', 148), ('9678.gif', 142)]


In [None]:
Top 15 file extensions with 404 responses

In [34]:
df['ext'] = df['filename'].str.extract(r'\.([a-zA-Z0-9]+)$', expand=False).fillna('none')

q6_top_404_exts = df[df['status'] == 404]['ext'].value_counts().head(15).items()
print("Q6:", list(q6_top_404_exts))


Q6: [('html', 12142), ('gif', 7202), ('xbm', 824), ('ps', 754), ('none', 673), ('jpg', 520), ('txt', 496), ('GIF', 135), ('htm', 107), ('cgi', 77), ('com', 45), ('Z', 41), ('dvi', 40), ('ca', 36), ('hmtl', 30)]


In [35]:
df_july = df[(df['datetime'].dt.month == 7) & (df['datetime'].dt.year == 1995)]

# Filter out non-numeric bytes (i.e., '-')
df_july = df_july[df_july['bytes'].astype(str) != '-']
df_july['bytes'] = df_july['bytes'].astype(int)

q7_bandwidth_july = df_july.groupby('date_str')['bytes'].sum().to_dict()
print("Q7:", q7_bandwidth_july)


Q7: {'01-Jul-1995': 11349799, '02-Jul-1995': 8656918, '03-Jul-1995': 13596612, '04-Jul-1995': 26573988, '05-Jul-1995': 19541225, '06-Jul-1995': 19755015, '07-Jul-1995': 9427822, '08-Jul-1995': 5403491, '09-Jul-1995': 4660556, '10-Jul-1995': 14917754, '11-Jul-1995': 22507207, '12-Jul-1995': 17367065, '13-Jul-1995': 15989234, '14-Jul-1995': 19186430, '15-Jul-1995': 15773233, '16-Jul-1995': 9016378, '17-Jul-1995': 19601338, '18-Jul-1995': 17099761, '19-Jul-1995': 17851725, '20-Jul-1995': 20752623, '21-Jul-1995': 25491617, '22-Jul-1995': 8136259, '23-Jul-1995': 9593870, '24-Jul-1995': 22308265, '25-Jul-1995': 24561635, '26-Jul-1995': 24995540, '27-Jul-1995': 25969995, '28-Jul-1995': 36460693, '29-Jul-1995': 11700624, '30-Jul-1995': 23189598, '31-Jul-1995': 30730715}


In [36]:
df['hour'] = df['datetime'].dt.hour
q8_hourly_dist = df['hour'].value_counts().sort_index().to_dict()
print("Q8:", q8_hourly_dist)


Q8: {0.0: 11598, 1.0: 9913, 2.0: 9403, 3.0: 8147, 4.0: 7832, 5.0: 8283, 6.0: 9798, 7.0: 11930, 8.0: 17351, 9.0: 21683, 10.0: 25717, 11.0: 28665, 12.0: 26845, 13.0: 30089, 14.0: 29792, 15.0: 28149, 16.0: 28287, 17.0: 23332, 18.0: 17862, 19.0: 17325, 20.0: 17492, 21.0: 15969, 22.0: 14588, 23.0: 13613}


In [None]:
Top 10 most requested filenames

In [37]:
q9_top_filenames = df['filename'].value_counts().head(10).items()
print("Q9:", list(q9_top_filenames))


Q9: [('index.html', 139528), ('3.gif', 24006), ('2.gif', 23595), ('4.gif', 8018), ('244.gif', 5148), ('5.html', 5010), ('4097.gif', 4874), ('8870.jpg', 4492), ('6733.gif', 4278), ('8472.gif', 3843)]


In [38]:
q10_status_dist = df['status'].value_counts().sort_index().to_dict()
print("Q10:", q10_status_dist)


Q10: {200: 568348, 302: 30295, 304: 97792, 400: 15, 401: 46, 403: 4743, 404: 23586, 500: 42, 501: 43}
