In [1]:
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv('resources/data.csv')
df_copy = df.loc[:, (df != 0).any(axis=0)]
df_copy = df_copy.drop(labels=['Sprint', 'Severity of the threat'], axis=1)

In [2]:
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 516 entries, 0 to 515
Data columns (total 12 columns):
 #   Column                                    Non-Null Count  Dtype 
---  ------                                    --------------  ----- 
 0   Change code line number                   516 non-null    int64 
 1   Number of vulnerable modules              516 non-null    int64 
 2   Number of people involved in development  516 non-null    int64 
 3   Time to complete each version             516 non-null    object
 4   Commit frequency                          516 non-null    object
 5   Type of environment                       516 non-null    object
 6   Number of libraries detected errors       516 non-null    int64 
 7   Scan date                                 516 non-null    object
 8   Number of potential weaknesses            516 non-null    int64 
 9   Số lượng lỗ hổng cấu hình môi trường      516 non-null    int64 
 10  Evaluate                                  516 non-

In [3]:
df_copy

Unnamed: 0,Change code line number,Number of vulnerable modules,Number of people involved in development,Time to complete each version,Commit frequency,Type of environment,Number of libraries detected errors,Scan date,Number of potential weaknesses,Số lượng lỗ hổng cấu hình môi trường,Evaluate,Repo
0,19,1,2,5 days,Daily,Cloud,22,2025-02-17 00:00:00,26,4,4,admin-jlpt
1,1152,1,2,5 days,Daily,Cloud,22,2025-02-18 00:00:00,26,4,4,admin-jlpt
2,100,1,2,5 days,Daily,Cloud,22,2025-02-19 00:00:00,26,4,4,admin-jlpt
3,221,1,2,5 days,Daily,Cloud,22,2025-02-20 00:00:00,26,4,4,admin-jlpt
4,96,1,2,5 days,Daily,Cloud,22,2025-02-22 00:00:00,26,4,4,admin-jlpt
...,...,...,...,...,...,...,...,...,...,...,...,...
511,530,1,8,5 days,Weekly,Cloud,40,2025-01-09 00:00:00,42,2,2,payment-system
512,1996,1,6,5 days,Daily,Cloud,39,2025-02-04 00:00:00,40,1,3,user-management
513,1986,1,8,7 days,Weekly,Cloud,14,2025-01-15 00:00:00,16,2,3,payment-system
514,435,1,7,10 days,Monthly,Cloud,29,2025-03-14 00:00:00,32,3,1,admin-jlpt


In [4]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

def preprocess_timeseries_dataframe(df: pd.DataFrame):
    df = df.copy()
    df = df.loc[:, (df != 0).any(axis=0)]
    df = df.drop(labels=['Sprint', 'Severity of the threat'], axis=1)
    df['Scan date'] = pd.to_datetime(df['Scan date'])

    categorical_columns = ['Time to complete each version', 'Commit frequency', 'Type of environment', 'Repo']
    group_sum_col = 'Change code line number'
    target_col = 'Evaluate'
    date_col = 'Scan date'

    all_dfs = []
    for repo, group in df.groupby('Repo'):
        # group by date to avoid duplicates before reindexing
        group = group.groupby(date_col).agg({
            group_sum_col: 'sum',
            'Number of vulnerable modules': 'mean',
            'Number of people involved in development': 'mean',
            'Number of libraries detected errors': 'mean',
            'Number of potential weaknesses': 'mean',
            'Số lượng lỗ hổng cấu hình môi trường': 'mean',
            target_col: 'first',
            **{col: 'first' for col in categorical_columns}
        }).sort_index()

        # create full date index and reindex
        all_dates = pd.date_range(start=group.index.min(), end=group.index.max(), freq='D')
        group = group.reindex(all_dates, method='ffill')
        group[date_col] = group.index
        group['Repo'] = repo

        # identify filled-in rows and set code line to 0
        original_dates = df[df['Repo'] == repo]['Scan date'].unique()
        group[group_sum_col] = group[group_sum_col].where(group.index.isin(original_dates), 0)

        all_dfs.append(group)

    df_filled = pd.concat(all_dfs).reset_index(drop=True)

    # encode categoricals
    label_encoders = {}
    for col in categorical_columns:
        le = LabelEncoder()
        df_filled[col] = le.fit_transform(df_filled[col])
        label_encoders[col] = le

    # normalize features (excluding Scan date, Repo, Evaluate)
    feature_cols = df_filled.columns.difference([date_col, 'Repo', target_col])
    scaler = StandardScaler()
    df_filled[feature_cols] = scaler.fit_transform(df_filled[feature_cols])

    # final outputs
    X = df_filled.drop(columns=[date_col, 'Repo', target_col])
    y = df_filled[target_col]
    meta = df_filled[[date_col, 'Repo']]

    return X, y, meta, label_encoders, scaler

In [5]:
X, y, meta, encoders, scaler = preprocess_timeseries_dataframe(df)
X

Unnamed: 0,Change code line number,Number of vulnerable modules,Number of people involved in development,Number of libraries detected errors,Number of potential weaknesses,Số lượng lỗ hổng cấu hình môi trường,Time to complete each version,Commit frequency,Type of environment
0,1.078302,0.0,-0.649854,0.488260,0.552000,0.950836,-1.198149,0.096718,0.0
1,0.010589,0.0,-0.067334,-1.500259,-1.611377,-1.724206,-1.198149,-1.099074,0.0
2,-0.064960,0.0,-0.566637,-0.601339,-0.556731,0.568687,1.233915,-1.099074,0.0
3,0.430454,0.0,1.929876,0.706180,0.579042,-1.724206,0.017883,1.292511,0.0
4,0.766747,0.0,-0.316985,0.542740,0.538479,-0.004536,1.233915,0.096718,0.0
...,...,...,...,...,...,...,...,...,...
267,1.052897,0.0,0.431968,-0.002059,0.038198,0.568687,-1.198149,-1.099074,0.0
268,1.032839,0.0,-1.731676,-0.383419,-0.475604,-1.342057,1.233915,1.292511,0.0
269,0.007915,0.0,-1.565242,1.278220,1.146929,-1.724206,-1.198149,1.292511,0.0
270,-0.400584,0.0,0.931271,1.931980,1.958195,0.568687,-1.198149,1.292511,0.0


In [6]:
y

0      4
1      3
2      2
3      3
4      2
      ..
267    4
268    4
269    1
270    1
271    3
Name: Evaluate, Length: 272, dtype: int64

In [7]:
meta

Unnamed: 0,Scan date,Repo
0,2025-01-01,0
1,2025-01-02,0
2,2025-01-03,0
3,2025-01-04,0
4,2025-01-05,0
...,...,...
267,2025-03-27,2
268,2025-03-28,2
269,2025-03-29,2
270,2025-03-30,2


In [8]:
ori_X = scaler.inverse_transform(X[list(scaler.feature_names_in_)]).astype(int)
ori_X = pd.DataFrame(data=ori_X, columns=list(scaler.feature_names_in_))

In [9]:
for col in ('Time to complete each version', 'Commit frequency', 'Type of environment'):
    ori_X[col] = encoders[col].inverse_transform(ori_X[col])

In [10]:
ori_X

Unnamed: 0,Change code line number,Commit frequency,Number of libraries detected errors,Number of people involved in development,Number of potential weaknesses,Number of vulnerable modules,Số lượng lỗ hổng cấu hình môi trường,Time to complete each version,Type of environment
0,3494,Monthly,31,3,34,1,3,10 days,Cloud
1,1897,Daily,7,5,8,1,1,10 days,Cloud
2,1784,Daily,18,4,21,1,3,7 days,Cloud
3,2525,Weekly,34,9,35,1,1,5 days,Cloud
4,3028,Monthly,32,4,34,1,2,7 days,Cloud
...,...,...,...,...,...,...,...,...,...
267,3456,Daily,25,6,28,1,3,10 days,Cloud
268,3426,Weekly,20,1,22,1,1,7 days,Cloud
269,1893,Weekly,41,2,42,1,1,10 days,Cloud
270,1282,Weekly,49,7,52,1,3,10 days,Cloud


In [11]:
ori_meta  = meta.copy()
ori_meta['Repo'] = encoders['Repo'].inverse_transform(ori_meta['Repo'])
ori_meta

Unnamed: 0,Scan date,Repo
0,2025-01-01,admin-jlpt
1,2025-01-02,admin-jlpt
2,2025-01-03,admin-jlpt
3,2025-01-04,admin-jlpt
4,2025-01-05,admin-jlpt
...,...,...
267,2025-03-27,user-management
268,2025-03-28,user-management
269,2025-03-29,user-management
270,2025-03-30,user-management


In [12]:
ori_y = pd.DataFrame(data=y, columns=['Evaluate'])
ori_y

Unnamed: 0,Evaluate
0,4
1,3
2,2
3,3
4,2
...,...
267,4
268,4
269,1
270,1


In [13]:
col_names = list(ori_X.columns) + list(ori_meta.columns) + list(ori_y.columns)
ori_data = pd.concat([ori_X, ori_meta, ori_y], ignore_index=True, axis=1)
ori_data.columns = col_names
ori_data = ori_data[df_copy.columns.tolist()]
ori_data.to_csv('resources/data_cleaned.csv', index=False)

### Model

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal
from sklearn.metrics import precision_recall_fscore_support
from tqdm import tqdm
import numpy as np

class BayesianDense(nn.Module):
    def __init__(self, in_features, out_features):
        super().__init__()
        self.w_mu = nn.Parameter(torch.Tensor(out_features, in_features))
        self.w_sigma = nn.Parameter(torch.Tensor(out_features, in_features))
        self.b_mu = nn.Parameter(torch.Tensor(out_features))
        self.b_sigma = nn.Parameter(torch.Tensor(out_features))
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.kaiming_normal_(self.w_mu, mode='fan_in')
        nn.init.constant_(self.w_sigma, -3)
        nn.init.constant_(self.b_mu, 0.1)
        nn.init.constant_(self.b_sigma, -3)

    def forward(self, x):
        w = Normal(self.w_mu, torch.exp(self.w_sigma)).rsample()
        b = Normal(self.b_mu, torch.exp(self.b_sigma)).rsample()
        return F.linear(x, w, b)

class CustomModel(nn.Module):
    def __init__(self, input_features, input_dim, lstm_config, bayesian_layers, final_layer, dropout=0.1):
        super().__init__()
        self.swish = lambda x: x * torch.sigmoid(x)

        self.input_dense = input_dim
        self.lstm = lstm_config
        self.bayesian_layers = nn.ModuleList(bayesian_layers)
        self.final_dense = final_layer
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.swish(self.input_dense(x))  # (batch, T, hidden)
        if self.lstm:
            x, _ = self.lstm(x)  # (batch, T, hidden)
        for layer in self.bayesian_layers:
            x = self.swish(layer(x))
        x = self.dropout(x)
        return self.final_dense(x)  # (batch, T, num_classes)

    
def build_model(config_id, input_features=9, num_classes=5):
    if config_id == 'A0':
        return CustomModel(
            input_features,
            BayesianDense(input_features, 32),
            nn.LSTM(32, 64, num_layers=2, batch_first=True),
            [BayesianDense(64, 64), BayesianDense(64, 48), BayesianDense(48, 48), BayesianDense(48, 32)],
            BayesianDense(32, num_classes)
        )
    if config_id == 'A1':
        return CustomModel(
            input_features,
            BayesianDense(input_features, 32),
            nn.LSTM(32, 64, num_layers=2, batch_first=True),
            [BayesianDense(64, 32)],
            BayesianDense(32, num_classes)
        )
    if config_id == 'A2':
        return CustomModel(
            input_features,
            BayesianDense(input_features, 32),
            nn.LSTM(32, 64, num_layers=1, batch_first=True),
            [BayesianDense(64, 32)],
            BayesianDense(32, num_classes)
        )
    if config_id == 'A3':
        return CustomModel(
            input_features,
            BayesianDense(input_features, 32),
            None,
            [BayesianDense(32, 48), BayesianDense(48, 32)],
            BayesianDense(32, num_classes)
        )
    if config_id == 'A4':
        return CustomModel(
            input_features,
            nn.Linear(input_features, 32),
            nn.LSTM(32, 64, num_layers=2, batch_first=True),
            [nn.Linear(64, 32)],
            nn.Linear(32, num_classes)
        )
    if config_id == 'A5':
        return CustomModel(
            input_features,
            BayesianDense(input_features, 16),
            nn.LSTM(16, 32, num_layers=2, batch_first=True),
            [BayesianDense(32, 16)],
            BayesianDense(16, num_classes)
        )
    if config_id == 'A6':
        return CustomModel(
            input_features,
            nn.Identity(),
            nn.LSTM(input_features, 64, num_layers=2, batch_first=True),
            [BayesianDense(64, 32)],
            BayesianDense(32, num_classes)
        )
    if config_id == 'A7':
        return CustomModel(
            input_features,
            BayesianDense(input_features, 32),
            nn.LSTM(32, 64, num_layers=2, batch_first=True),
            [BayesianDense(64, 48), nn.Linear(48, 32)],
            BayesianDense(32, num_classes)
        )
    if config_id == 'A8':
        return CustomModel(
            input_features,
            BayesianDense(input_features, 32),
            nn.LSTM(32, 64, num_layers=2, batch_first=True),
            [BayesianDense(64, 64), BayesianDense(64, 32)],
            BayesianDense(32, num_classes),
            dropout=0.5
        )
    if config_id == 'A9':
        return CustomModel(
            input_features,
            BayesianDense(input_features, 32),
            nn.LSTM(32, 128, num_layers=2, batch_first=True),
            [BayesianDense(128, 64), BayesianDense(64, 32)],
            BayesianDense(32, num_classes)
        )
    else:
        raise ValueError('Unknown config')
    

def train_model(model, train_loader, val_loader, device, epochs=200, lr=0.0001, verbose=True, patience=50):
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    train_losses, val_accuracies = [], []

    best_acc = 0
    best_metrics = {'acc': 0, 'precision': 0, 'recall': 0, 'f1': 0}
    best_epoch = 0
    no_improve = 0

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for x_batch, y_batch in train_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            output = model(x_batch)
            loss = criterion(output.view(-1, output.shape[-1]), y_batch.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        train_losses.append(avg_loss)

        # eval
        model.eval()
        all_preds, all_labels = [], []
        with torch.no_grad():
            for x_batch, y_batch in val_loader:
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)
                output = model(x_batch)
                preds = output.argmax(dim=-1)
                all_preds.append(preds.flatten())
                all_labels.append(y_batch.flatten())
        all_preds = torch.cat(all_preds)
        all_labels = torch.cat(all_labels)

        acc = (all_preds == all_labels).float().mean().item()
        precision, recall, f1, _ = precision_recall_fscore_support(
            all_labels.cpu(), all_preds.cpu(), average='weighted', zero_division=0
        )

        val_accuracies.append(acc)
        if verbose:
            print(f'Epoch {epoch+1}: loss = {avg_loss:.4f}, A = {acc:.4f}, P = {precision:.4f}, R = {recall:.4f}, F1 = {f1:.4f}')

        if acc > best_acc:
            best_acc = acc
            best_metrics = {'acc': acc, 'precision': precision, 'recall': recall, 'f1': f1}
            best_epoch = epoch
            no_improve = 0
        else:
            no_improve += 1

        if no_improve >= patience:
            if verbose:
                print(f"Early stopping at epoch {epoch+1}, best epoch was {best_epoch+1}")
            break

    return train_losses, val_accuracies, best_metrics['acc'], best_metrics['precision'], best_metrics['recall'], best_metrics['f1']


def run_ablation(config_list, train_loader, val_loader, runs=5, verbose=True, device='cuda'):
    results = {}
    for config_id in config_list:
        print(f'\n=== Running config: {config_id} ===')
        accs, precisions, recalls, f1s = [], [], [], []
        for r in range(runs):
            if verbose:
                print(f'Run {r+1}/{runs}')
            model = build_model(config_id)
            _, _, acc, precision, recall, f1 = train_model(model, train_loader, val_loader, device, verbose=verbose)
            accs.append(acc)
            precisions.append(precision)
            recalls.append(recall)
            f1s.append(f1)
        accs, precisions, recalls, f1s = map(np.array, (accs, precisions, recalls, f1s))
        results[config_id] = {
            'mean_accuracy': float(accs.mean()),
            'std_accuracy': float(accs.std()),
            'mean_precision': float(precisions.mean()),
            'mean_recall': float(recalls.mean()),
            'mean_f1': float(f1s.mean()),
            'runs': runs
        }
        print(f'{config_id} : A = {accs.mean():.4f} ± {accs.std():.4f}, P = {precisions.mean():.4f}, R = {recalls.mean():.4f}, F1 = {f1s.mean():.4f}')
    return results

In [15]:
from torch.utils.data import Dataset, DataLoader, random_split

class RepoSplitTimeSeriesDataset(Dataset):
    def __init__(self, X_df, y_series, meta_df, timesteps=5, mode='train', split_ratio=0.8):
        self.samples = []
        assert mode in ['train', 'val'], "mode must be 'train' or 'val'"

        grouped = meta_df.groupby('Repo')

        for repo_id, group_indices in grouped.groups.items():
            # sort by time within each repo
            group = meta_df.loc[group_indices].sort_values('Scan date')
            sorted_idx = group.index.tolist()

            total = len(sorted_idx)
            split_point = int(total * split_ratio)

            if total < timesteps:
                continue  # skip short repos

            if mode == 'train':
                use_idx = sorted_idx[:split_point]
            else:  # val
                use_idx = sorted_idx[split_point:]

            # re-slide within the selected portion
            for i in range(len(use_idx) - timesteps + 1):
                window_idx = use_idx[i:i + timesteps]
                x_seq = torch.tensor(X_df.loc[window_idx].values, dtype=torch.float32)
                y_seq = torch.tensor(y_series.loc[window_idx].values, dtype=torch.long)
                meta_seq = meta_df.loc[window_idx].iloc[-1].to_dict()
                self.samples.append((x_seq, y_seq))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]
    

timesteps = 5
train_dataset = RepoSplitTimeSeriesDataset(X, y, meta, timesteps=timesteps, mode='train', split_ratio=0.8)
val_dataset = RepoSplitTimeSeriesDataset(X, y, meta, timesteps=timesteps, mode='val', split_ratio=0.8)

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [16]:
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = 'cpu'
results = run_ablation([f'A{x}' for x in range(10)], train_loader, val_loader, verbose=False, device=device)


=== Running config: A0 ===
A0 : A = 0.4327 ± 0.0225, P = 0.2876, R = 0.4327, F1 = 0.3079

=== Running config: A1 ===
A1 : A = 0.4300 ± 0.0110, P = 0.2659, R = 0.4300, F1 = 0.3078

=== Running config: A2 ===
A2 : A = 0.4427 ± 0.0179, P = 0.3237, R = 0.4427, F1 = 0.3401

=== Running config: A3 ===
A3 : A = 0.5045 ± 0.0091, P = 0.3665, R = 0.5045, F1 = 0.4204

=== Running config: A4 ===
A4 : A = 0.4118 ± 0.0055, P = 0.2152, R = 0.4118, F1 = 0.2629

=== Running config: A5 ===
A5 : A = 0.4164 ± 0.0022, P = 0.3354, R = 0.4164, F1 = 0.2758

=== Running config: A6 ===
A6 : A = 0.4273 ± 0.0081, P = 0.2797, R = 0.4273, F1 = 0.3042

=== Running config: A7 ===
A7 : A = 0.4245 ± 0.0110, P = 0.3110, R = 0.4245, F1 = 0.2916

=== Running config: A8 ===
A8 : A = 0.4300 ± 0.0148, P = 0.3309, R = 0.4300, F1 = 0.3075

=== Running config: A9 ===
A9 : A = 0.4400 ± 0.0253, P = 0.3360, R = 0.4400, F1 = 0.3182
