In [1]:
import pandas as pd
# Load the raw scan records, then derive the reduced frame that is displayed
# below and later used to order the columns of the exported CSV.
df = pd.read_csv('resources/data.csv')
has_nonzero_value = (df != 0).any(axis=0)  # False for columns that are 0 in every row
df_copy = (
    df.loc[:, has_nonzero_value]
    .drop(columns=['Sprint', 'Severity of the threat'])
)

In [2]:
# Summarise the reduced frame: dtype and non-null count of every remaining column.
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 516 entries, 0 to 515
Data columns (total 12 columns):
 #   Column                                    Non-Null Count  Dtype 
---  ------                                    --------------  ----- 
 0   Change code line number                   516 non-null    int64 
 1   Number of vulnerable modules              516 non-null    int64 
 2   Number of people involved in development  516 non-null    int64 
 3   Time to complete each version             516 non-null    object
 4   Commit frequency                          516 non-null    object
 5   Type of environment                       516 non-null    object
 6   Number of libraries detected errors       516 non-null    int64 
 7   Scan date                                 516 non-null    object
 8   Number of potential weaknesses            516 non-null    int64 
 9   Số lượng lỗ hổng cấu hình môi trường      516 non-null    int64 
 10  Evaluate                                  516 non-

In [3]:
# Display the reduced frame for a visual sanity check.
df_copy

Unnamed: 0,Change code line number,Number of vulnerable modules,Number of people involved in development,Time to complete each version,Commit frequency,Type of environment,Number of libraries detected errors,Scan date,Number of potential weaknesses,Số lượng lỗ hổng cấu hình môi trường,Evaluate,Repo
0,19,1,2,5 days,Daily,Cloud,22,2025-02-17 00:00:00,26,4,4,admin-jlpt
1,1152,1,2,5 days,Daily,Cloud,22,2025-02-18 00:00:00,26,4,4,admin-jlpt
2,100,1,2,5 days,Daily,Cloud,22,2025-02-19 00:00:00,26,4,4,admin-jlpt
3,221,1,2,5 days,Daily,Cloud,22,2025-02-20 00:00:00,26,4,4,admin-jlpt
4,96,1,2,5 days,Daily,Cloud,22,2025-02-22 00:00:00,26,4,4,admin-jlpt
...,...,...,...,...,...,...,...,...,...,...,...,...
511,530,1,8,5 days,Weekly,Cloud,40,2025-01-09 00:00:00,42,2,2,payment-system
512,1996,1,6,5 days,Daily,Cloud,39,2025-02-04 00:00:00,40,1,3,user-management
513,1986,1,8,7 days,Weekly,Cloud,14,2025-01-15 00:00:00,16,2,3,payment-system
514,435,1,7,10 days,Monthly,Cloud,29,2025-03-14 00:00:00,32,3,1,admin-jlpt


In [4]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

def preprocess_timeseries_dataframe(df: pd.DataFrame):
    """Turn raw scan records into a daily, per-repository feature table.

    Steps, in order:
      1. Drop columns that are zero in every row, plus 'Sprint' and
         'Severity of the threat' when they are still present.
      2. Per repository, collapse rows that share a scan date: changed lines
         are summed, the other numeric columns are averaged, and the target
         and categorical columns keep their first value.
      3. Reindex each repository onto a gap-free daily calendar, forward-filling
         every column; on the synthetic days the changed-line count is set to 0.
      4. Label-encode the categorical columns (including 'Repo').
      5. Standardise every column except the date, 'Repo' and the target.

    NOTE(review): encoders and scaler are fitted on all rows, so any train/test
    split made afterwards shares their statistics — confirm this is acceptable.

    Args:
        df: Raw records. Must contain 'Repo', 'Scan date', 'Evaluate' and the
            numeric/categorical columns named in the body. Not modified.

    Returns:
        Tuple ``(X, y, meta, label_encoders, scaler)``:
          * ``X`` - standardised feature columns.
          * ``y`` - the 'Evaluate' column (forward-filled on synthetic days).
          * ``meta`` - 'Scan date' and the label-encoded 'Repo'.
          * ``label_encoders`` - dict mapping each categorical column name to
            its fitted ``LabelEncoder``.
          * ``scaler`` - the fitted ``StandardScaler``; it was fitted on the
            feature columns in sorted-name order (``Index.difference`` sorts).
    """
    categorical_columns = ['Time to complete each version', 'Commit frequency', 'Type of environment', 'Repo']
    group_sum_col = 'Change code line number'
    target_col = 'Evaluate'
    date_col = 'Scan date'

    df = df.copy()
    df = df.loc[:, (df != 0).any(axis=0)]
    # errors='ignore': the all-zero filter above may already have removed one of
    # these columns, in which case a strict drop would raise KeyError.
    df = df.drop(columns=['Sprint', 'Severity of the threat'], errors='ignore')
    df[date_col] = pd.to_datetime(df[date_col])

    # How rows sharing a (repo, date) are collapsed. Key order fixes the column
    # order of the output, so it is kept stable.
    agg_spec = {
        group_sum_col: 'sum',
        'Number of vulnerable modules': 'mean',
        'Number of people involved in development': 'mean',
        'Number of libraries detected errors': 'mean',
        'Number of potential weaknesses': 'mean',
        'Số lượng lỗ hổng cấu hình môi trường': 'mean',
        target_col: 'first',
        **{col: 'first' for col in categorical_columns}
    }

    all_dfs = []
    for repo, group in df.groupby('Repo'):
        # group by date to avoid duplicates before reindexing
        group = group.groupby(date_col).agg(agg_spec).sort_index()

        # After aggregation the index holds exactly the dates with real scans;
        # remember them so the rows added by the reindex can be told apart.
        original_dates = group.index

        # create full date index and reindex
        all_dates = pd.date_range(start=original_dates.min(), end=original_dates.max(), freq='D')
        group = group.reindex(all_dates, method='ffill')
        group[date_col] = group.index
        group['Repo'] = repo

        # filled-in days carry no code change: set their changed-line count to 0
        group[group_sum_col] = group[group_sum_col].where(group.index.isin(original_dates), 0)

        all_dfs.append(group)

    df_filled = pd.concat(all_dfs).reset_index(drop=True)

    # encode categoricals
    label_encoders = {}
    for col in categorical_columns:
        le = LabelEncoder()
        df_filled[col] = le.fit_transform(df_filled[col])
        label_encoders[col] = le

    # normalize features (excluding Scan date, Repo, Evaluate)
    feature_cols = df_filled.columns.difference([date_col, 'Repo', target_col])
    scaler = StandardScaler()
    df_filled[feature_cols] = scaler.fit_transform(df_filled[feature_cols])

    # final outputs
    X = df_filled.drop(columns=[date_col, 'Repo', target_col])
    y = df_filled[target_col]
    meta = df_filled[[date_col, 'Repo']]

    return X, y, meta, label_encoders, scaler

In [5]:
# Run the full preprocessing on the raw frame and preview the scaled feature matrix.
X, y, meta, encoders, scaler = preprocess_timeseries_dataframe(df)
X

Unnamed: 0,Change code line number,Number of vulnerable modules,Number of people involved in development,Number of libraries detected errors,Number of potential weaknesses,Số lượng lỗ hổng cấu hình môi trường,Time to complete each version,Commit frequency,Type of environment
0,1.078302,0.0,-0.649854,0.488260,0.552000,0.950836,-1.198149,0.096718,0.0
1,0.010589,0.0,-0.067334,-1.500259,-1.611377,-1.724206,-1.198149,-1.099074,0.0
2,-0.064960,0.0,-0.566637,-0.601339,-0.556731,0.568687,1.233915,-1.099074,0.0
3,0.430454,0.0,1.929876,0.706180,0.579042,-1.724206,0.017883,1.292511,0.0
4,0.766747,0.0,-0.316985,0.542740,0.538479,-0.004536,1.233915,0.096718,0.0
...,...,...,...,...,...,...,...,...,...
267,1.052897,0.0,0.431968,-0.002059,0.038198,0.568687,-1.198149,-1.099074,0.0
268,1.032839,0.0,-1.731676,-0.383419,-0.475604,-1.342057,1.233915,1.292511,0.0
269,0.007915,0.0,-1.565242,1.278220,1.146929,-1.724206,-1.198149,1.292511,0.0
270,-0.400584,0.0,0.931271,1.931980,1.958195,0.568687,-1.198149,1.292511,0.0


In [6]:
# Preview the target series ('Evaluate') returned by the preprocessing.
y

0      4
1      3
2      2
3      3
4      2
      ..
267    4
268    4
269    1
270    1
271    3
Name: Evaluate, Length: 272, dtype: int64

In [7]:
# Preview the metadata: scan date plus the label-encoded repository id.
meta

Unnamed: 0,Scan date,Repo
0,2025-01-01,0
1,2025-01-02,0
2,2025-01-03,0
3,2025-01-04,0
4,2025-01-05,0
...,...,...
267,2025-03-27,2
268,2025-03-28,2
269,2025-03-29,2
270,2025-03-30,2


In [8]:
# Undo the standardisation to recover values in their original units.
# Round before the integer cast: the scale/unscale round trip is not exact in
# floating point, and a bare astype(int) truncates e.g. 2.9999999 to 2, which
# would shift label-encoded categories to the wrong class when they are decoded.
ori_X = scaler.inverse_transform(X[list(scaler.feature_names_in_)]).round().astype(int)
ori_X = pd.DataFrame(data=ori_X, columns=list(scaler.feature_names_in_))

In [9]:
# Turn the integer category codes back into their original string labels.
# Mutates ori_X in place, so this cell is meant to run once per kernel session.
encoded_feature_columns = ['Time to complete each version', 'Commit frequency', 'Type of environment']
for col in encoded_feature_columns:
    fitted_encoder = encoders[col]
    ori_X[col] = fitted_encoder.inverse_transform(ori_X[col])

In [10]:
# Inspect the features after un-scaling and category decoding.
ori_X

Unnamed: 0,Change code line number,Commit frequency,Number of libraries detected errors,Number of people involved in development,Number of potential weaknesses,Number of vulnerable modules,Số lượng lỗ hổng cấu hình môi trường,Time to complete each version,Type of environment
0,3494,Monthly,31,3,34,1,3,10 days,Cloud
1,1897,Daily,7,5,8,1,1,10 days,Cloud
2,1784,Daily,18,4,21,1,3,7 days,Cloud
3,2525,Weekly,34,9,35,1,1,5 days,Cloud
4,3028,Monthly,32,4,34,1,2,7 days,Cloud
...,...,...,...,...,...,...,...,...,...
267,3456,Daily,25,6,28,1,3,10 days,Cloud
268,3426,Weekly,20,1,22,1,1,7 days,Cloud
269,1893,Weekly,41,2,42,1,1,10 days,Cloud
270,1282,Weekly,49,7,52,1,3,10 days,Cloud


In [11]:
# Map the integer repository ids back to repository names on a copy,
# leaving `meta` itself untouched for the modelling cells.
repo_encoder = encoders['Repo']
ori_meta = meta.copy()
ori_meta['Repo'] = repo_encoder.inverse_transform(ori_meta['Repo'])
ori_meta

Unnamed: 0,Scan date,Repo
0,2025-01-01,admin-jlpt
1,2025-01-02,admin-jlpt
2,2025-01-03,admin-jlpt
3,2025-01-04,admin-jlpt
4,2025-01-05,admin-jlpt
...,...,...
267,2025-03-27,user-management
268,2025-03-28,user-management
269,2025-03-29,user-management
270,2025-03-30,user-management


In [12]:
# Wrap the target series in a one-column frame so it can be concatenated
# with the feature and metadata frames.
ori_y = pd.DataFrame(y, columns=['Evaluate'])
ori_y

Unnamed: 0,Evaluate
0,4
1,3
2,2
3,3
4,2
...,...
267,4
268,4
269,1
270,1


In [13]:
# Put features, metadata and target side by side, restore the column order of
# df_copy, and export the gap-filled data.
parts = [ori_X, ori_meta, ori_y]
col_names = [name for part in parts for name in part.columns]
ori_data = pd.concat(parts, axis=1, ignore_index=True)
ori_data.columns = col_names  # ignore_index replaced the labels with 0..n-1
ori_data = ori_data[list(df_copy.columns)]
ori_data.to_csv('resources/data_cleaned.csv', index=False)

### Model

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal

class BayesianDense(nn.Module):
    """Linear layer whose weight and bias are re-sampled on every forward pass.

    Each entry is drawn from a Normal distribution with a learnable mean
    (``w_mu`` / ``b_mu``). The ``w_sigma`` / ``b_sigma`` parameters are passed
    through ``exp`` before being used as the scale, so they hold
    log-standard-deviations rather than standard deviations.

    Sampling happens in ``forward`` regardless of train/eval mode.
    """

    def __init__(self, in_features, out_features):
        super().__init__()
        # Registration order (w_mu, w_sigma, b_mu, b_sigma) determines the
        # order of parameters() and of the state_dict keys.
        # Tensors start uninitialised; reset_parameters() fills all of them.
        self.w_mu = nn.Parameter(torch.Tensor(out_features, in_features))
        self.w_sigma = nn.Parameter(torch.Tensor(out_features, in_features))
        self.b_mu = nn.Parameter(torch.Tensor(out_features))
        self.b_sigma = nn.Parameter(torch.Tensor(out_features))
        self.reset_parameters()

    def reset_parameters(self):
        """Kaiming-normal weight means, bias means at 0.1, log-scales at -3."""
        initial_log_scale = -3  # exp(-3) is roughly 0.05
        nn.init.kaiming_normal_(self.w_mu, mode='fan_in')
        nn.init.constant_(self.w_sigma, initial_log_scale)
        nn.init.constant_(self.b_mu, 0.1)
        nn.init.constant_(self.b_sigma, initial_log_scale)

    def forward(self, x):
        """Apply ``x @ W.T + b`` with freshly sampled ``W`` and ``b``."""
        # rsample() keeps the draw differentiable w.r.t. mean and log-scale.
        weight_dist = Normal(self.w_mu, self.w_sigma.exp())
        sampled_weight = weight_dist.rsample()
        bias_dist = Normal(self.b_mu, self.b_sigma.exp())
        sampled_bias = bias_dist.rsample()
        return F.linear(x, sampled_weight, sampled_bias)

class MyModel(nn.Module):
    """Per-timestep sequence classifier.

    Pipeline: Bayesian dense projection (features -> 32) -> 2-layer LSTM
    (32 -> 64, batch-first) -> four Bayesian dense layers with Swish
    activations (64 -> 64 -> 48 -> 48 -> 32) -> dropout -> Bayesian dense
    output layer producing one logit vector per timestep.

    Args:
        input_features: Number of features per timestep. Currently restricted
            to 9 by an assertion.
        timesteps: Accepted for interface compatibility; not used by any layer.
        num_classes: Size of the logit vector produced for every timestep.
    """

    def __init__(self, input_features=9, timesteps=5, num_classes=5):
        super().__init__()
        assert input_features == 9, 'only 9 features supported right now'

        self.input_dense = BayesianDense(input_features, 32)
        self.lstm = nn.LSTM(32, 64, num_layers=2, batch_first=True)

        self.bayesian_layers = nn.ModuleList([
            BayesianDense(64, 64),
            BayesianDense(64, 48),
            BayesianDense(48, 48),
            BayesianDense(48, 32)
        ])

        self.dropout = nn.Dropout(0.1)
        self.final_dense = BayesianDense(32, num_classes)

        # Swish (x * sigmoid(x)) is what PyTorch ships as SiLU. Storing the
        # library function instead of a lambda keeps the module picklable.
        self.swish = F.silu

    def forward(self, x):
        """Return logits of shape (batch, T, num_classes) for x of shape (batch, T, features).

        Raises:
            ValueError: if ``x`` is not a 3-D tensor.
        """
        if x.dim() != 3:
            raise ValueError(
                f"expected input of shape (batch, T, features), got {tuple(x.shape)}"
            )

        x = self.swish(self.input_dense(x))  # (batch, T, 32)
        lstm_out, _ = self.lstm(x)  # (batch, T, 64)

        x = lstm_out
        for layer in self.bayesian_layers:
            x = self.swish(layer(x))

        x = self.dropout(x)
        logits = self.final_dense(x)  # (batch, T, num_classes)
        return logits

def train(model, loader, optimizer, criterion, epochs=100):
    """Run a plain training loop and print the summed batch loss after each epoch.

    Args:
        model: Module mapping a batch to logits whose last axis is the class axis.
        loader: Iterable yielding ``(inputs, targets)`` batches.
        optimizer: Optimizer already bound to the model's parameters.
        criterion: Loss taking ``(N, num_classes)`` logits and ``(N,)`` targets.
        epochs: Number of passes over ``loader``.

    The printed value is the sum (not the mean) of the per-batch losses, so it
    scales with the number of batches. Returns nothing; the model is left in
    training mode.
    """
    model.train()
    for epoch_idx in range(epochs):
        running_loss = 0
        for inputs, targets in loader:
            optimizer.zero_grad()
            logits = model(inputs)  # shape: (batch, T, num_classes)
            num_classes = logits.shape[-1]
            # Flatten batch and time so every timestep counts as one sample.
            flat_logits = logits.view(-1, num_classes)
            flat_targets = targets.view(-1)
            batch_loss = criterion(flat_logits, flat_targets)
            batch_loss.backward()
            optimizer.step()
            running_loss += batch_loss.item()
        print(f"epoch {epoch_idx+1}: loss = {running_loss:.4f}")

def evaluate(model, loader):
    """Print the per-timestep accuracy of ``model`` over every batch in ``loader``.

    Predictions are the argmax over the last axis of the model output; every
    timestep of every window counts as one prediction. The model is switched
    to eval mode and left there. Returns nothing.
    """
    model.eval()
    predicted_chunks = []
    label_chunks = []
    with torch.no_grad():
        for inputs, targets in loader:
            logits = model(inputs)  # (batch, T, num_classes)
            predicted_chunks.append(torch.argmax(logits, dim=-1).flatten())
            label_chunks.append(targets.flatten())
    predicted = torch.cat(predicted_chunks)
    expected = torch.cat(label_chunks)
    hits = (predicted == expected).float()
    acc = hits.mean()
    print(f"accuracy: {acc:.4f}")

In [15]:
from torch.utils.data import Dataset, DataLoader, random_split

class RepoDataset(Dataset):
    """Sliding-window dataset whose windows never span two repositories.

    Rows are grouped by ``meta_df['Repo']`` and ordered by 'Scan date'. Every
    run of ``timesteps`` consecutive rows (stride 1) becomes one sample
    ``(x_seq, y_seq)`` with ``x_seq`` float32 of shape (timesteps, n_features)
    and ``y_seq`` int64 of shape (timesteps,). Repositories with fewer than
    ``timesteps`` rows contribute nothing.

    ``X_df`` and ``y_series`` are looked up by index label, so they must share
    the index labels of ``meta_df``.
    """

    def __init__(self, X_df, y_series, meta_df, timesteps=5):
        self.samples = []
        self.meta = []  # never populated by this class

        for row_labels in meta_df.groupby("Repo").groups.values():
            ordered_labels = meta_df.loc[row_labels].sort_values("Scan date").index.tolist()
            if len(ordered_labels) < timesteps:
                continue
            window_count = len(ordered_labels) - timesteps + 1
            for start in range(window_count):
                window_labels = ordered_labels[start:start + timesteps]
                features = torch.tensor(X_df.loc[window_labels].values, dtype=torch.float32)
                targets = torch.tensor(y_series.loc[window_labels].values, dtype=torch.long)
                self.samples.append((features, targets))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]
    
# Hold out one repository entirely: encoded repos 0 and 1 are used for
# training, encoded repo 2 for testing.
train_repos = [0, 1]
test_repos = [2]

train_mask = meta['Repo'].isin(train_repos)
test_mask = meta['Repo'].isin(test_repos)

# Boolean selection keeps the original index labels, which RepoDataset needs
# for its label-based lookups into X and y.
X_train, y_train, meta_train = X.loc[train_mask], y.loc[train_mask], meta.loc[train_mask]
X_test, y_test, meta_test = X.loc[test_mask], y.loc[test_mask], meta.loc[test_mask]

timesteps = 5
batch_size = 32

train_dataset = RepoDataset(X_train, y_train, meta_train, timesteps=timesteps)
test_dataset = RepoDataset(X_test, y_test, meta_test, timesteps=timesteps)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [16]:
# Train a fresh model with the loaders currently in scope, then report
# accuracy on the test loader.
criterion = nn.CrossEntropyLoss()
model = MyModel(input_features=9, timesteps=timesteps, num_classes=5)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

train(model, train_loader, optimizer, criterion, epochs=200)
evaluate(model, test_loader)

epoch 1: loss = 9.2175
epoch 2: loss = 8.9229
epoch 3: loss = 8.6839
epoch 4: loss = 8.4472
epoch 5: loss = 8.1220
epoch 6: loss = 7.9434
epoch 7: loss = 8.0347
epoch 8: loss = 7.7828
epoch 9: loss = 7.6618
epoch 10: loss = 7.5757
epoch 11: loss = 7.2816
epoch 12: loss = 7.1627
epoch 13: loss = 7.0268
epoch 14: loss = 6.6891
epoch 15: loss = 6.6129
epoch 16: loss = 6.4402
epoch 17: loss = 6.1686
epoch 18: loss = 5.9492
epoch 19: loss = 5.7311
epoch 20: loss = 5.4098
epoch 21: loss = 5.4925
epoch 22: loss = 5.1575
epoch 23: loss = 4.9886
epoch 24: loss = 4.6503
epoch 25: loss = 4.9369
epoch 26: loss = 4.3318
epoch 27: loss = 4.0405
epoch 28: loss = 3.8696
epoch 29: loss = 3.6775
epoch 30: loss = 3.8185
epoch 31: loss = 3.2927
epoch 32: loss = 3.0477
epoch 33: loss = 2.8241
epoch 34: loss = 2.8574
epoch 35: loss = 2.5914
epoch 36: loss = 2.4674
epoch 37: loss = 2.1074
epoch 38: loss = 2.0872
epoch 39: loss = 1.9880
epoch 40: loss = 1.7698
epoch 41: loss = 1.8294
epoch 42: loss = 1.6176
e

In [17]:
class RandomDataset(Dataset):
    """Pool of fixed-length windows, built repository by repository.

    For each value of ``meta_df['Repo']`` the rows are sorted by 'Scan date'
    and cut into overlapping windows of ``timesteps`` rows (stride 1). A sample
    is ``(x_seq, y_seq)``: float32 features of shape (timesteps, n_features)
    and int64 targets of shape (timesteps,). Groups shorter than ``timesteps``
    are skipped.

    Lookups into ``X_df`` / ``y_series`` are by index label, so both must use
    the same index labels as ``meta_df``.
    """

    def __init__(self, X_df, y_series, meta_df, timesteps=5):
        self.samples = []

        groups_by_repo = meta_df.groupby("Repo").groups
        for member_labels in groups_by_repo.values():
            by_date = meta_df.loc[member_labels].sort_values("Scan date")
            labels_in_order = by_date.index.tolist()
            if len(labels_in_order) < timesteps:
                continue
            last_start = len(labels_in_order) - timesteps
            for first in range(last_start + 1):
                picked = labels_in_order[first:first + timesteps]
                x_window = torch.tensor(X_df.loc[picked].values, dtype=torch.float32)
                y_window = torch.tensor(y_series.loc[picked].values, dtype=torch.long)
                self.samples.append((x_window, y_window))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]
    
# Alternative split: pool the windows of every repository and split them
# 80/20 at random.
# NOTE(review): consecutive windows share timesteps-1 rows, so a random split
# places overlapping windows on both sides; no seed is set, so the split also
# changes from run to run.
full_dataset = RandomDataset(X, y, meta, timesteps=5)

window_total = len(full_dataset)
train_size = int(0.8 * window_total)
test_size = window_total - train_size
train_dataset, test_dataset = random_split(full_dataset, [train_size, test_size])

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [18]:
# Same training recipe as before, applied to whichever loaders are currently
# in scope; `timesteps` is the notebook-level value set in an earlier cell.
criterion = nn.CrossEntropyLoss()
model = MyModel(input_features=9, timesteps=timesteps, num_classes=5)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

train(model, train_loader, optimizer, criterion, epochs=200)
evaluate(model, test_loader)

epoch 1: loss = 10.8703
epoch 2: loss = 10.3623
epoch 3: loss = 10.1612
epoch 4: loss = 9.8144
epoch 5: loss = 9.7533
epoch 6: loss = 9.7633
epoch 7: loss = 9.5348
epoch 8: loss = 9.4457
epoch 9: loss = 9.3403
epoch 10: loss = 9.3017
epoch 11: loss = 9.1886
epoch 12: loss = 9.1033
epoch 13: loss = 9.1084
epoch 14: loss = 8.9766
epoch 15: loss = 8.8328
epoch 16: loss = 8.7461
epoch 17: loss = 8.5548
epoch 18: loss = 8.2844
epoch 19: loss = 8.2749
epoch 20: loss = 7.8189
epoch 21: loss = 7.7771
epoch 22: loss = 7.6276
epoch 23: loss = 7.3487
epoch 24: loss = 7.1341
epoch 25: loss = 7.0690
epoch 26: loss = 6.7025
epoch 27: loss = 6.4188
epoch 28: loss = 6.0604
epoch 29: loss = 5.6850
epoch 30: loss = 5.4569
epoch 31: loss = 5.0305
epoch 32: loss = 4.7903
epoch 33: loss = 4.4256
epoch 34: loss = 4.5210
epoch 35: loss = 4.4685
epoch 36: loss = 3.8809
epoch 37: loss = 3.4626
epoch 38: loss = 3.2622
epoch 39: loss = 3.1760
epoch 40: loss = 2.6398
epoch 41: loss = 2.7510
epoch 42: loss = 2.539

In [19]:
import pandas as pd
# Class balance of the target: fraction of rows carrying each 'Evaluate' label.
print(y.value_counts(normalize=True))

Evaluate
4    0.352941
1    0.253676
3    0.205882
2    0.187500
Name: proportion, dtype: float64
