In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import pandas as pd
from utils.prepro import single_preprocess,ExtraProcessCodeInfo,check_correct_dir_str,check_correct_dir,RunIdentifier
from utils.fltrace_classes import *
from utils.constants import *
from collections import defaultdict
from models.NLinear import NLinear
from functools import cached_property
import random
import torch
import numpy as np
from functools import partial
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import math
from tqdm import tqdm
import numpy.typing as npt
from typing import Sequence
from pathlib import Path
from shutil import rmtree
import pickle as pkl
import plotly.express as px
import itertools
import numpy as np
from plotly import graph_objects as go
from plotly.subplots import make_subplots
from scipy.ndimage import gaussian_filter1d

In [3]:
# Seed every random number generator used in this notebook (Python's `random`,
# PyTorch and NumPy's legacy global RNG) with the same value.
SEED = 2024
random.seed(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)

In [4]:
# Input/output locations, kept as plain strings because they are joined with `+` below.
# NOTE(review): these are absolute paths under one user's home directory, so they
# presumably only resolve on the original machine — adjust before running elsewhere.
IN_PATH = "/home/garvalov/mlp2/data/data/prediction_select_few/processed/"
# CSV that get_tt_ds and the exploration cells load.
TEST_DF = IN_PATH+"facesim/processed_500_125.csv"
OUT_PATH = "/home/garvalov/mlp2/prediction/train_outputs/"
# generic_train_loop creates one sub-directory per config identifier in here.
MODEL_BASE_DIR = OUT_PATH+"models/"

In [5]:
class Config():
    def __init__(self,config_or_dict,name:str="",name_features:list|None=None,update_dict:dict|None=None):
        self.name = name
        if name_features is None:
            name_features = ['epochs']
        elif 'epochs' not in name_features:
            name_features = ['epochs'] + name_features
        self.name_features = name_features
        if config_or_dict is None:
            config_or_dict = Config.get_default_train_config()
        if isinstance(config_or_dict,Config):
            config_dict = config_or_dict.config_dict
        else:
            config_dict = config_or_dict
        if update_dict is not None:
            config_dict.update(update_dict)
        assert 'epochs' in config_dict
        self.config_dict = config_dict
    
    def __getitem__(self, key):
        return self.config_dict[key]
    
    def __contains__(self,key):
        return key in self.config_dict
    
    def __getattr__(self,k):
        if k not in self:
            raise AttributeError()
        return self[k] # This would've otherwise raised a KeyError, which is not the expected Python behaviour when fetching a non-present attribute
    
    @staticmethod
    def get_default_train_config():
        return Config({
            'tt_split': 0.75, # Train-test split
            'bs': 8, # Training batch size
            'base_lr': 1*(10**-1), # Starting learning rate,
            'end_lr': 1*(10**-3), # Smallest lr you will converge to
            'epochs': 10, # epochs
            'warmup_epochs': 3 # number of warmup epochs
            }) 
    @cached_property
    def ident(self) -> str:
        return ('_'.join([self.name]+[str(self.config_dict[feature]) for feature in self.name_features])).lower()
    
    def update(self,other,name_merge="keep_base"):
        assert isinstance(other,Config) or isinstance(other,dict)
        assert name_merge in ["keep_base","keep_new","concat"]
        return Config(self.config_dict.update(other.config_dict if isinstance(other,Config) else other),self.name if name_merge == "keep_merge" or isinstance(other,dict) else (other.name if name_merge == "keep_new" else other.name+'_'+self.name))


In [6]:
def get_nlinear_config(deltas=False,new_only=True,shuffle=False):
    """Build the configuration used for the NLinear experiments.

    Starts from the default training configuration and adds the NLinear-specific
    entries. The three arguments are stored as-is under the keys of the same name.

    :return: a ``Config`` named ``"nlinear"`` whose identifier features are
        ``seq_len``, ``pred_len``, ``deltas``, ``new_only`` and ``shuffle``
    """
    nlinear_overrides = {
        "seq_len":10,
        "pred_len":10,
        "individual": False,
        "deltas": deltas,
        "shuffle":shuffle,
        "new_only": new_only
    }
    identifier_features = ["seq_len","pred_len","deltas","new_only","shuffle"]
    return Config(
        None,
        name="nlinear",
        name_features=identifier_features,
        update_dict=nlinear_overrides,
    )

def get_nlinear_model(config):
    """Model factory with the ``model_fn(config)`` shape expected by ``generic_train_loop``.

    :return: a new ``NLinear`` instance built from ``config``
    """
    nlinear_model = NLinear(config)
    return nlinear_model

---
# Metrics

In [7]:
class Metric():
    """A named, callable evaluation metric that knows which direction is better."""

    def __init__(self,name:str,fn,better_direction : str):
        """
        :param name: display name of the metric
        :param fn: callable doing the actual computation
        :param better_direction: ``"lower"`` or ``"higher"``
        :raises AssertionError: for any other ``better_direction``
        """
        assert better_direction in ["lower","higher"]
        self.better_direction = better_direction
        self.fn=fn
        self.name:str = name

    def __call__(self, *args, **kwds):
        # Forward everything untouched to the wrapped callable.
        outcome = self.fn(*args,**kwds)
        return outcome

In [8]:
def rmse(gt,preds):
    """Root of the mean squared difference between ``preds`` and ``gt`` (mean over all elements)."""
    residuals = preds-gt
    mean_squared_error = np.mean(residuals**2)
    return np.sqrt(mean_squared_error)

def mae(gt,preds):
    """Mean of the absolute differences between ``gt`` and ``preds`` (mean over all elements)."""
    absolute_errors = np.abs(gt - preds)
    return np.mean(absolute_errors)

def _validate_or_get_K(preds: npt.NDArray,K):
    """Return ``K``, defaulting to ``len(preds)``; an explicit ``K`` must equal ``len(preds)``."""
    n_preds = len(preds)
    if K is None:
        return n_preds
    assert K == n_preds
    return K
def precision_at_K(gt: npt.NDArray, preds: npt.NDArray, K = None):
    """Number of distinct values present in both ``preds`` and ``gt``, divided by ``K``.

    ``K`` defaults to ``len(preds)``, i.e. the size of the first axis.
    NOTE(review): ``np.intersect1d`` works on the flattened unique values, so for
    2-D inputs this is not a per-row precision — confirm that is intended.
    """
    k = _validate_or_get_K(preds,K)
    shared_values = np.intersect1d(preds,gt)
    return len(shared_values)/k

def success_at_K(gt: npt.NDArray,preds: npt.NDArray):
    """Fraction of rows of ``gt`` whose first column value occurs anywhere in ``preds``.

    :param gt: 2-D array; only column 0 is used
    :param preds: array of any shape; membership is tested against all its elements
    :raises AssertionError: if ``gt`` is not a NumPy array
    """
    assert isinstance(gt,np.ndarray), f"got `trues` of type {type(gt)}"
    first_targets = gt[:,0]
    assert isinstance(first_targets,np.ndarray)
    n_found = np.isin(first_targets,preds).sum()
    return n_found/len(first_targets)

In [9]:

# Metrics evaluated by default, each with the direction in which it improves.
ALL_METRICS = [
    Metric(name="Success", fn=success_at_K, better_direction="higher"),
    Metric(name="P@10", fn=precision_at_K, better_direction="higher"),
    Metric(name="RMSE", fn=rmse, better_direction="lower"),
    Metric(name="MAE", fn=mae, better_direction="lower"),
]

---

# Dataset

In [10]:


def build_batches(sequence:Sequence|pd.Series,history_window_size:int,output_window_size:int = -1,hist_remove_after=False,outputs_fill_after=True,translate_to_page_num = True) -> tuple[npt.NDArray,npt.NDArray|None]:
    # Given a sequence of addresses, translates them to pages, and builds a sequence of histories and optionally desired outputs.
    # The last element of the built history can be considered as the faulted page
    # That is, consider the input sequence as [A_0,A_1,...,A_i,A_i+1,...,A_n],
    # The output is 
    #[
    #   [A_0,A_1,...,A_(history_window_size-1)],  ---> corresponds to the `history_window_size` faulty addresses
    #   [A_1,...,A_(history_window_size)],
    #   [A_2,...,A_(history_window_size+1)],
    #   ...,
    #   
    #]  
    # Optionally,if output_window_size != -1, outputs a sequence of `output_window_size` next faulty addresses
    # That is, outputs
    # [
    #   [A_(history_window_size),A_(history_window_size+1),...,A_(history_window_size+output_window_size-1)],   ---> corresponds to the next `output_window_size` faulty addresses
    #   [A_(history_window_size+1),...,A_(history_window_size+output_window_size)],
    #   ...,
    #   [A_N-1,A_N,<FILLER>,...,<FILLER>].
    #   [A_N,<FILLER>,...,<FILLER>],
    #   [<FILLER>,<FILLER>,..,<FILLER>]
    # ] of same length as the history output, where <FILLER> is simply the last element, repeated.
    assert history_window_size > 0 and len(sequence) > history_window_size, f"Sequence of length {len(sequence)}, yet {history_window_size} requested as history"

    is_pd = isinstance(sequence,pd.Series)
    series = pd.Series(data=sequence) if not is_pd else sequence

    # Translate to page numbers
    if translate_to_page_num:
        series = series.swifter.progress_bar(desc="Getting page number").apply(get_page_num)
    data=series.values
    assert isinstance(data,np.ndarray)

    n = len(sequence)
    
    idx = np.arange(history_window_size)[None, :] + np.arange(n - history_window_size - (output_window_size if hist_remove_after else 0) + 1)[:, None]
    history_windows = data[idx]
    output_windows = None
    if output_window_size != -1:
        out_idx = np.arange(output_window_size)[None, :] + np.arange(history_window_size, n-(output_window_size if not outputs_fill_after else 0)+1)[:, None]
        output_windows = np.zeros(out_idx.shape)
        valid_indices = out_idx < n
        output_windows[np.where(valid_indices)] = data[out_idx[valid_indices]]
        output_windows[~valid_indices] = data[-1]
    
    assert (
            (hist_remove_after != outputs_fill_after          and len(history_windows) == len(output_windows)) or 
            (not hist_remove_after and not outputs_fill_after and len(history_windows) > len(output_windows))  or
            (hist_remove_after and outputs_fill_after         and len(history_windows) < len(output_windows))
        )

    return pd.Series(data=list(history_windows)) if is_pd else np.array(history_windows), pd.Series(data=list(output_windows)) if is_pd else np.array(output_windows)
    

In [11]:

class PageFaultDataset(Dataset):
    """Torch dataset of (history window, following window) pairs built by ``build_batches``."""

    def __init__(self,config, pd_serie,indices_split):
        """
        :param config: must expose ``seq_len`` (history length) and ``pred_len`` (output length)
        :param pd_serie: pandas Series holding the full sequence
        :param indices_split: index labels (used with ``.loc``) selecting the part of
            ``pd_serie`` this dataset covers
        """
        super().__init__()
        # hist_remove_after=True with outputs_fill_after=False keeps only the histories
        # followed by a complete output window, so x and y line up one-to-one.
        # Values are mapped through the page-number translation of build_batches.
        self.x,self.y = build_batches(pd_serie.loc[indices_split],history_window_size=config.seq_len,output_window_size=config.pred_len,outputs_fill_after=False,hist_remove_after=True,translate_to_page_num=True)
        # build_batches returns Series for a Series input; keep the underlying arrays.
        self.x = self.x.values
        self.y = self.y.values
        assert len(self.x) == len(self.y)

    def __len__(self):
        """
        :return: the number of elements in the dataset
        """
        return len(self.x)

    def __getitem__(self, index) -> tuple[torch.Tensor,torch.Tensor]:
        """
        :return: ``(x, y)`` as double tensors; ``x`` is the history window with a
            trailing axis of size 1 appended, ``y`` is the output window
        """
        return torch.tensor(self.x[index],dtype=torch.double).unsqueeze(-1),torch.tensor(self.y[index],dtype=torch.double)

In [12]:

def get_tt_ds(config):
    """Load ``TEST_DF`` and return ``(train_loader, test_loader)`` for ``config``.

    Steps: optionally keep only rows with ``flags < 32`` (``config.new_only``), take
    the ``addr`` column, optionally replace it by its first differences
    (``config.deltas``), cast to int and split chronologically by ``tt_split``.
    The train loader uses ``config.bs`` and ``config.shuffle``; the test loader
    yields one unshuffled sample at a time.
    NOTE(review): what the ``flags < 32`` threshold selects is not visible here — confirm.
    """
    frame = pd.read_csv(TEST_DF)
    if config.new_only:
        keep_rows = frame["flags"] < 32
        frame = frame[keep_rows]
    addresses = frame["addr"]
    if config.deltas:
        # The first difference is NaN and gets dropped.
        addresses = addresses.diff().dropna()
    addresses = addresses.astype(int)

    n_train = int(config["tt_split"] * len(addresses))
    train_labels = addresses.index[:n_train]
    test_labels = addresses.index[n_train:]
    train_ds = PageFaultDataset(config,addresses,train_labels)
    test_ds = PageFaultDataset(config,addresses,test_labels)

    train_loader = DataLoader(train_ds,batch_size=config.bs,shuffle=config.shuffle)
    test_loader = DataLoader(test_ds,batch_size=1,shuffle=False)
    return train_loader, test_loader

In [13]:
def validate_model(model,eval_dataloader,device,metrics: list|None=None, print_validation=True):
    model.eval()
    if metrics is None:
        metrics = ALL_METRICS
    gt = []
    preds = []
    with torch.no_grad():
        for batch in (tqdm(eval_dataloader,desc="Evaluating model",total=len(eval_dataloader)) if print_validation else eval_dataloader):
            x,y = batch
            predicted = model(x).squeeze(dim=-1).detach().cpu()
            if isinstance(y,list):
                assert isinstance(predicted,list)
                gt.extend(y)
                preds.extend(predicted.numpy().tolist())
            else:
                gt.append(y)
                preds.append(predicted)
    gt = np.array(gt).squeeze()
    preds = np.array(preds).squeeze()
    print("Computed batches for all of eval dataset")
    all_res = [metric(gt,preds) for metric in (tqdm(metrics,desc="Computing metric results",total=len(metrics)) if print_validation else metrics)]
    return all_res

def maybe_update_result(current_value,new_result,better="lower"):
    """Decide whether ``new_result`` replaces the best value seen so far.

    ``-np.inf`` as ``current_value`` means "nothing recorded yet" and is always
    replaced. Ties count as an update.

    :param better: ``"lower"`` or ``"higher"``
    :return: ``(updated, value)`` where ``value`` is ``new_result`` when updated,
        otherwise ``current_value``
    """
    assert better in ["lower","higher"]
    takes_over = (
        current_value == -np.inf
        or (better == "lower" and new_result <= current_value)
        or (better == "higher" and new_result >= current_value)
    )
    if takes_over:
        return True,new_result
    return False,current_value
                

def generic_train_loop(config:Config,model_fn,override_previous_dir=False,print_validation = True,tqdm_train=True):
    """Train a model, validate it after every epoch and keep the best weights per metric.

    For every metric in ``ALL_METRICS`` a sub-directory of
    ``MODEL_BASE_DIR/<config.ident>`` receives ``model.pt`` (state dict) and
    ``config.pkl`` each time that metric matches or beats its best value so far.
    Training stops early once the "Success" metric has failed to do so for three
    consecutive epochs.

    :param config: run configuration; reads ``epochs``, ``warmup_epochs``, ``base_lr``
        and ``end_lr`` here, the rest is consumed by ``get_tt_ds`` and ``model_fn``
    :param model_fn: callable building the model from ``config``
    :param override_previous_dir: when true, delete the directory of a previous run
        saved under the same config identifier; when false such a directory makes
        ``mkdir`` raise ``FileExistsError``
    :param print_validation: print separators/metric values and show validation progress bars
    :param tqdm_train: show a progress bar with the running mean loss during training
    :return: ``(model, save_dir, all_results, all_losses)`` where ``all_results`` maps
        each metric to its per-epoch values and ``all_losses`` lists every batch loss
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)
    if device.type == 'cuda':
        print(f"Device name: {torch.cuda.get_device_name(device)}")
        print(f"Device memory: {torch.cuda.get_device_properties(device).total_memory / 1024 ** 3} GB")
    train_dataloader, test_dataloader = get_tt_ds(config)
    epochs = config["epochs"]
    warmup_epochs = config['warmup_epochs']
    model = model_fn(config).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=config['base_lr'], eps=1e-6,amsgrad=True)
    loss_fn = torch.nn.MSELoss()

    # LR scheduler
    # We do the following technique : high LR in the beginning, low towards the end
    # starting from base_lr we decrease towards end_lr, by a factor of 1/sqrt(10) ~0.3162  k times
    fct = 1/np.sqrt(10)
    final_lr = config["end_lr"]
    end_epoch = min(32,epochs)
    num_groups = math.ceil(math.log(final_lr / config["base_lr"], fct))
    if num_groups > 0:
        group_size = end_epoch // num_groups
        milestones = [warmup_epochs+group_size*i for i in range(num_groups)]
    else:
        # end_lr >= base_lr: there is nothing to decay (and no division by zero).
        milestones = []
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,milestones,fct)


    metrics_to_use = ALL_METRICS
    # -inf marks "no result yet" for maybe_update_result
    best_results = {metric:-np.inf for metric in metrics_to_use}
    model_bases = Path(MODEL_BASE_DIR)
    model_bases.mkdir(parents=True,exist_ok=True)
    save_dir = model_bases / config.ident
    if override_previous_dir and save_dir.exists():
        assert save_dir.is_dir(),f"{save_dir.absolute().as_posix()} exists and is not a directory!"
        rmtree(save_dir.absolute().as_posix())
    save_dir.mkdir(parents = False,exist_ok=False)
    for metric in metrics_to_use:
        (save_dir/metric.name).mkdir(parents = False,exist_ok=False)

    all_losses = []
    all_results = defaultdict(list)
    worse_success_count = 0

    for epoch in range(epochs):
        if device.type == 'cuda':
            torch.cuda.empty_cache()
        model.train()
        train_iterator = tqdm(train_dataloader,desc=f"Processing epoch {epoch:02d} w/ lr ({lr_scheduler.get_last_lr()})",total=len(train_dataloader)) if tqdm_train else train_dataloader
        c = 0
        gl = 0

        # Train
        for batch in train_iterator:
            x,y = batch
            # The loaders yield CPU tensors; the model may live on the GPU.
            x,y = x.to(device),y.to(device)
            prediction = model(x).squeeze(dim=-1)
            loss = loss_fn(prediction,y)
            c+=1
            cl = loss.item()
            gl += cl
            all_losses.append(cl)
            if tqdm_train:
                # Running mean of the loss over the current epoch
                train_iterator.set_postfix({"loss": f"{gl/c:6.3e}"})
            loss.backward()
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)

        # Validate
        if print_validation:
            print(TEXT_SEPARATOR)
        all_current_results = validate_model(model,test_dataloader,device,metrics_to_use,print_validation=print_validation)
        for metric,new_metric_result in zip(metrics_to_use,all_current_results):
            if print_validation:
                print(f"{metric.name}: {new_metric_result}\n")
            all_results[metric].append(new_metric_result)
            # Save best models
            curr_metric_res = best_results[metric]
            updated,new_res = maybe_update_result(curr_metric_res,new_metric_result,metric.better_direction)
            best_results[metric] = new_res
            if updated:
                if metric.name.lower() == "success":
                    worse_success_count = 0
                # Save the model
                metric_dir:Path = save_dir/metric.name
                fname:Path = metric_dir/"model.pt"
                if fname.exists():
                    assert fname.is_file()
                    fname.unlink()
                torch.save(model.state_dict(),fname.absolute().as_posix())
                with open((metric_dir/"config.pkl").absolute().as_posix(),"wb") as f:
                    pkl.dump(config,f,pkl.HIGHEST_PROTOCOL)
            elif metric.name.lower() == "success":
                worse_success_count += 1
        if print_validation:
            print(TEXT_SEPARATOR)

        # Update LR
        lr_scheduler.step()

        if worse_success_count >=3:
            print("Reached worse success on validation set three epochs in a row, early stopping the training to avoid overfitting!")
            break

    return model,save_dir, all_results, all_losses

In [None]:
# Train NLinear with the default NLinear configuration.
# override_previous_dir=True deletes any run already saved under the same config identifier.
config = get_nlinear_config()
last_model,run_save_dir,all_results_base, all_losses_base = generic_train_loop(config,get_nlinear_model,override_previous_dir = True)

In [None]:
# Rebuild the train/test data loaders on their own, for manual inspection.
config = get_nlinear_config()
tr,tst = get_tt_ds(config)

In [None]:
# Sanity check: pull one training batch, drop the singleton axes (PageFaultDataset
# appends a trailing axis of size 1 to every input) and display the first input value.
x,y = next(iter(tr))
x = x.squeeze()
x[0,0].item()

In [16]:
# Reload the raw CSV for exploration (no `flags` filtering at this point).
df_prepro = pd.read_csv(TEST_DF)
# Size of the training share of the unfiltered frame; relies on `config` from an earlier cell.
train_tensor_size = int(config["tt_split"] * len(df_prepro))

In [None]:
# Peek at the first rows of the raw frame.
df_prepro.head()

In [None]:

# Plot the `addr` column (as integers) of the rows with flags < 32, against the row index.
df_prepro[df_prepro["flags"] < 32]["addr"].astype(int).plot()

In [None]:
# Interactive view of the difference between consecutive addresses, restricted to
# the rows with flags < 32. The copy avoids writing the new column into a view of df_prepro.
filtered_df = df_prepro[df_prepro["flags"] < 32].copy()
filtered_df['addr_diff'] = filtered_df['addr'].diff()

# One marker per row
fig = px.scatter(
    filtered_df,
    x=filtered_df.index,
    y='addr_diff',
    title="Address Differences Over Time"
)

# Add line connecting the points
fig.add_scatter(
    x=filtered_df.index,
    y=filtered_df['addr_diff'],
    mode='lines',
    line=dict(color='rgba(0,0,0,0.3)'),
    showlegend=False
)

# Axis titles; y tick labels in scientific notation
fig.update_layout(
    xaxis_title="Index",
    yaxis_title="Address Difference",
    showlegend=False,
    yaxis=dict(
        tickformat='.2e'
    )
)

fig.show()

In [14]:
def cross_validate_hyperparameters():
    """Train one NLinear model per combination of the three boolean options and plot the outcome.

    Every combination of ``deltas``, ``new_only`` and ``shuffle`` (8 runs) goes through
    ``generic_train_loop`` with ``override_previous_dir=False``, so a run directory
    left over from an earlier execution makes the training call fail.

    :return: dict mapping an experiment name such as ``"d0n1s0"`` to its
        ``'results'`` (per-metric history), ``'losses'`` (per-batch losses) and
        ``'params'`` (the three flags)
    """
    banner = '='*80
    all_experiments = {}

    for flags in itertools.product([False, True], repeat=3):
        deltas, new_only, shuffle = flags
        exp_name = f"d{int(deltas)}n{int(new_only)}s{int(shuffle)}"
        print(banner)
        print(f"Starting {exp_name}")
        print(banner)

        # Only the metric and loss histories are kept; the model and its save
        # directory are not needed for the comparison.
        run_config = get_nlinear_config(deltas, new_only, shuffle)
        _, _, metric_history, loss_history = generic_train_loop(
            run_config,
            get_nlinear_model,
            override_previous_dir=False
        )

        all_experiments[exp_name] = {
            'results': metric_history,
            'losses': loss_history,
            'params': {'deltas': deltas, 'new_only': new_only, 'shuffle': shuffle}
        }

    # Compare all runs side by side
    plot_all_results(all_experiments)
    plot_all_losses(all_experiments)

    return all_experiments

def plot_all_results(all_experiments):
    """Plot the per-epoch value of every metric of every experiment in one figure.

    Each (experiment, metric) pair gets a line, plus a red cross on its best epoch:
    the maximum when the metric's ``better_direction`` is ``"higher"``, the minimum
    otherwise. Metrics without any recorded value are skipped.

    :param all_experiments: mapping ``name -> {'results': {metric: [values...]}, ...}``
        as returned by ``cross_validate_hyperparameters``
    """
    # Create figure
    fig = go.Figure()

    # Color scale for different experiments
    colors = px.colors.qualitative.Set3

    for i, (exp_name, exp_data) in enumerate(all_experiments.items()):
        color = colors[i % len(colors)]

        for metric, results in exp_data['results'].items():
            y = np.array(results)
            if y.size == 0:
                # Nothing recorded for this metric: argmin/argmax would raise.
                continue
            x = np.arange(len(y))

            # Add line trace
            fig.add_trace(go.Scatter(
                x=x,
                y=y,
                name=f"{exp_name} - {metric.name}",
                line=dict(color=color),
                mode='lines'
            ))

            # Add best point marker, honouring the direction in which the metric improves
            higher_is_better = getattr(metric,"better_direction","lower") == "higher"
            best_idx = int(np.argmax(y)) if higher_is_better else int(np.argmin(y))
            fig.add_trace(go.Scatter(
                x=[x[best_idx]],
                y=[y[best_idx]],
                name=f"{exp_name} - {metric.name} best",
                mode='markers',
                marker=dict(
                    symbol='x',
                    size=10,
                    color='red'
                ),
                showlegend=False
            ))

    # Update layout
    fig.update_layout(
        title="Metric Results Across Experiments",
        xaxis_title="Epoch",
        yaxis_title="Metric Value",
        hovermode='x unified',
        legend=dict(
            yanchor="top",
            y=0.99,
            xanchor="left",
            x=0.01
        )
    )

    fig.show()

def plot_all_losses(all_experiments):
    """Plot the per-batch training loss of every experiment, raw (faint) and smoothed.

    The x axis counts training iterations; its tick labels are epoch numbers derived
    from the experiment with the longest loss trace (the one spanning the whole
    axis), assuming its iterations are spread evenly over its epochs. Experiments
    without any recorded loss are skipped, and the default ticks are kept when no
    epoch information is available.

    :param all_experiments: mapping ``name -> {'losses': [...], 'results': {metric: [...]}}``
        as returned by ``cross_validate_hyperparameters``
    """
    # Create figure
    fig = go.Figure()

    # Color scale for different experiments
    colors = px.colors.qualitative.Set3

    # Defined up-front so the layout below also works with no (usable) experiment.
    tick_vals = []
    tick_text = []
    longest_trace = 0

    for i, (exp_name, exp_data) in enumerate(all_experiments.items()):
        color = colors[i % len(colors)]
        y = np.array(exp_data['losses'])
        if y.size == 0:
            continue
        x = np.arange(len(y))

        # Add raw loss trace with high transparency
        fig.add_trace(go.Scatter(
            x=x,
            y=y,
            name=f"{exp_name} - Raw",
            line=dict(color=color, width=1),
            opacity=0.3,
            showlegend=False
        ))

        # Add smoothed loss trace
        y_smoothed = gaussian_filter1d(y, sigma=max(len(y)//200, 15))
        fig.add_trace(go.Scatter(
            x=x,
            y=y_smoothed,
            name=f"{exp_name}",
            line=dict(color=color, width=2)
        ))

        # Number of epochs = number of values recorded for any one metric
        num_epochs = len(next(iter(exp_data['results'].values()), []))
        if num_epochs == 0 or len(y) <= longest_trace:
            continue
        longest_trace = len(y)
        iterations_per_epoch = len(y) // num_epochs

        # Create custom tick labels for epochs (at most 10 of them)
        if num_epochs <= 10:
            tick_indices = range(num_epochs)
        else:
            tick_indices = np.linspace(0, num_epochs - 1, 10, dtype=int)
        tick_vals = [int(epoch_idx) * iterations_per_epoch for epoch_idx in tick_indices]
        tick_text = [int(epoch_idx) for epoch_idx in tick_indices]

    xaxis_layout = dict(title="Epoch")
    if tick_vals:
        xaxis_layout.update(ticktext=tick_text, tickvals=tick_vals)

    # Update layout
    fig.update_layout(
        title="Training Loss Across Experiments",
        xaxis=xaxis_layout,
        yaxis_title="Loss",
        hovermode='x unified',
        legend=dict(
            yanchor="top",
            y=0.99,
            xanchor="left",
            x=0.01
        )
    )

    fig.show()

In [None]:
experiments = cross_validate_hyperparameters()