In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict

In [8]:
# load input data
# -nc (no-clobber): skip the download when the file is already present, so
# re-running this cell does not leave hw_echo_pinatrace.txt.1, .2, ... copies
# behind while the rest of the notebook keeps reading the first one.
!wget -nc https://raw.githubusercontent.com/zuzanaSKB/machine-learning-project/refs/heads/main/hw_echo_pinatrace.txt

--2025-01-05 21:51:33--  https://raw.githubusercontent.com/zuzanaSKB/machine-learning-project/refs/heads/main/hw_echo_pinatrace.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4339498 (4.1M) [text/plain]
Saving to: ‘hw_echo_pinatrace.txt’


2025-01-05 21:51:34 (50.7 MB/s) - ‘hw_echo_pinatrace.txt’ saved [4339498/4339498]



In [14]:
# preprocess input data

def parse_pintrace_file(file_path='hw_echo_pinatrace.txt'):
    """Parse a whitespace-separated memory-trace file into a DataFrame.

    Every usable line must hold at least four fields, read positionally as
    ``<addr>: <operation> <instr_ptr> <size> [<next_addr>]``. ``addr``,
    ``instr_ptr`` and ``next_addr`` are parsed as hexadecimal, ``size`` as
    decimal. Lines with fewer than four fields (blank lines, short marker
    lines) are skipped.

    Parameters
    ----------
    file_path : str or path-like, optional
        Trace file to read. Defaults to ``'hw_echo_pinatrace.txt'``, so
        calling the function without arguments behaves as before.

    Returns
    -------
    pandas.DataFrame
        One row per parsed line with the columns ``addr``, ``operation``,
        ``instr_ptr``, ``size`` and ``next_addr``. ``next_addr`` is missing
        for lines that have only four fields. When no line can be used the
        frame is empty but still carries these five columns, so later column
        lookups do not fail with ``KeyError``.

    Raises
    ------
    OSError
        If ``file_path`` cannot be opened.
    ValueError
        If a numeric field of a line with four or more fields is not a valid
        number.
    """
    columns = ["addr", "operation", "instr_ptr", "size", "next_addr"]
    entries = []

    with open(file_path, 'r') as file:
        for line in file:
            parts = line.split()
            if len(parts) < 4:
                continue

            entries.append({
                # the first field carries a trailing ':' that must go before parsing
                "addr": int(parts[0].rstrip(':'), 16),
                "operation": parts[1],
                "instr_ptr": int(parts[2], 16),
                "size": int(parts[3]),
                # the fifth field is optional
                "next_addr": int(parts[4], 16) if len(parts) > 4 else None,
            })

    # Passing `columns` keeps the schema stable even when `entries` is empty.
    return pd.DataFrame(entries, columns=columns)

# feature extraction

# 1. Address Delta (addr_delta)
def calculate_addr_delta(df):
    """Add an ``addr_delta`` column: each ``addr`` minus the previous row's ``addr``.

    The first row has no predecessor, so its delta is filled with 0.
    The frame is modified in place; nothing is returned.
    """
    step = df['addr'].diff()
    df['addr_delta'] = step.fillna(0)

# 2. Access Frequency (freq)
def calculate_access_frequency(df):
    """Add a ``freq`` column: how many rows of the whole frame share each row's ``addr``.

    The frame is modified in place; nothing is returned.
    """
    df['freq'] = df['addr'].map(df['addr'].value_counts())

# 3. Instruction Pointer Reuse (instr_reuse)
def calculate_instr_reuse(df):
    """Add an ``instr_reuse`` column: how many rows of the whole frame share each row's ``instr_ptr``.

    The frame is modified in place; nothing is returned.
    """
    df['instr_reuse'] = df['instr_ptr'].map(df['instr_ptr'].value_counts())

# 4. Temporal Gap (temporal_gap) - number of earlier accesses to the same address (0 on first access)
def calculate_temporal_gap(df):
    """Add a ``temporal_gap`` column: the number of earlier rows with the same ``addr``.

    The first access to an address gets 0, the second 1, and so on.
    The frame is modified in place; nothing is returned.

    NOTE(review): despite the column name this is an occurrence counter, not
    the distance (in rows) since the previous access to the address --
    confirm that this is the intended feature.
    """
    by_address = df.groupby('addr')
    df['temporal_gap'] = by_address.cumcount()

# 5. Spatial Locality (spatial_locality) - page number of each address (equal values mean the same page)
def calculate_spatial_locality(df, page_size=4096):
    """Add a ``spatial_locality`` column holding the page number of each ``addr``.

    The page number is ``addr // page_size``; two rows with the same value
    lie on the same page. The frame is modified in place; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``addr`` column.
    page_size : int, optional
        Page size in the same unit as ``addr``. Must be positive.

    Raises
    ------
    ValueError
        If ``page_size`` is zero or negative.
    """
    if page_size <= 0:
        # A vectorised division by zero would silently yield inf/NaN instead of failing.
        raise ValueError(f"page_size must be positive, got {page_size!r}")

    # Vectorised floor division instead of a Python-level lambda per row.
    df['spatial_locality'] = df['addr'] // page_size

# 6. Next Address Presence (next_addr_present)
def calculate_next_addr_present(df):
    """Add a ``next_addr_present`` column: 1 where ``next_addr`` holds a value, 0 where it is missing.

    The frame is modified in place; nothing is returned.
    """
    is_missing = df['next_addr'].isna()
    df['next_addr_present'] = (~is_missing).astype(int)


def process_trace_data(file_path):
    """Parse the trace, add every derived feature column and save the result.

    Prints the first rows of the frame before and after feature extraction
    and writes the final frame to 'memory_trace_features.csv' (without the
    index). Nothing is returned.

    NOTE(review): ``file_path`` is accepted but never used --
    ``parse_pintrace_file()`` is called without arguments, so the trace that
    is read does not depend on what the caller passes here.
    """
    # Step 1: load the raw trace
    trace = parse_pintrace_file()
    print(trace.head())

    # Step 2: each step adds its column to `trace` in place; order is kept
    feature_steps = (
        calculate_addr_delta,
        calculate_access_frequency,
        calculate_instr_reuse,
        calculate_temporal_gap,
        calculate_spatial_locality,
        calculate_next_addr_present,
    )
    for add_feature in feature_steps:
        add_feature(trace)

    # Preview the frame with its new feature columns
    print(trace.head())

    # Persist the features for later cells / runs
    trace.to_csv('memory_trace_features.csv', index=False)

# Run the process on the trace file (the download cell saves it as .txt, not .out)
process_trace_data('hw_echo_pinatrace.txt')


              addr operation        instr_ptr  size        next_addr
0  123680716817731         W  140731905595736     8  123680716817736
1  123680716820948         W  140731905595728     8                0
2  123680716820980         W  140731905595720     8                0
3  123680716820982         W  140731905595712     8                0
4  123680716820984         W  140731905595704     8                0
              addr operation        instr_ptr  size        next_addr  \
0  123680716817731         W  140731905595736     8  123680716817736   
1  123680716820948         W  140731905595728     8                0   
2  123680716820980         W  140731905595720     8                0   
3  123680716820982         W  140731905595712     8                0   
4  123680716820984         W  140731905595704     8                0   

   addr_delta  freq  instr_reuse  temporal_gap  spatial_locality  \
0         0.0     1            7             0       30195487504   
1      3217.0    