Auto-Learn Your Own Data Augmentation  
Code based on https://pytorch.org/tutorials/beginner/nn_tutorial.html

In [1]:
import torch
from torch import nn
from torch import optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.nn import functional as F

from sklearn.model_selection import train_test_split
import gc

In [2]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')

In [142]:
class IssueCloseTimeData(Dataset):
    def __init__(self, filename):
        super().__init__()
        path = '.'
        
        df = pd.read_csv(f'{path}/{filename}.csv')
        df.drop(['Unnamed: 0', 'bugID'], axis=1, inplace=True)
        _df = df[['s1', 's2', 's3', 's4', 's5', 's6', 's8', 'y']]
        _df['s70'] = df['s7'].apply(lambda x: eval(x)[0])
        _df['s71'] = df['s7'].apply(lambda x: eval(x)[1])
        _df['s72'] = df['s7'].apply(lambda x: eval(x)[2])
        _df['s90'] = df['s9'].apply(lambda x: eval(x)[0])
        _df['s91'] = df['s9'].apply(lambda x: eval(x)[1])
        self.x = _df.drop('y', axis=1)
        self.y = _df['y']
        
        for col in self.x.columns:
            self.x[f'{col}_mean'] = np.mean(self.x[col])
            self.x[f'{col}_std'] = 1./np.std(self.x[col])
        
        self.x = np.array(self.x)
        
        if filename == 'firefox':
            self.y = np.where(self.y < 4, 0, 1)
        elif filename == 'chromium':
            self.y = np.where(self.y < 5, 0, 1)
        else:
            self.y = np.where(self.y < 6, 0, 1)
        
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.x, self.y)
        
        self.pos_x = self.x_train[np.where(self.y_train == 1)[0]]
        self.pos_y = self.y_train[np.where(self.y_train == 1)[0]]
        self.neg_x = self.x_train[np.where(self.y_train == 0)[0]]
        self.neg_y = self.y_train[np.where(self.y_train == 0)[0]]
        
        self.pos_x_test = self.x_test[np.where(self.y_test == 1)[0]]
        self.pos_y_test = self.y_test[np.where(self.y_test == 1)[0]]
        self.neg_x_test = self.x_test[np.where(self.y_test == 0)[0]]
        self.neg_y_test = self.y_test[np.where(self.y_test == 0)[0]]
    
    def __getitem__(self, i):
        return self.x_train.iloc[i,:], self.y[i]
    
    def __len__(self):
        return self.y_train.shape[0]
    
    def __iter__(self):
        return next(self.x), next(self.y)

In [189]:
augment_ratio=[1., 1.1]

In [190]:
class ICTPositiveData:
    def __init__(self, filename, batch_size=64):
        self.filename = filename
        self.bs = batch_size
        
        ict_data = IssueCloseTimeData(filename)
        train_ds = TensorDataset(torch.FloatTensor(ict_data.pos_x), 
                                 torch.FloatTensor(ict_data.pos_y))
        test_ds = TensorDataset(torch.FloatTensor(ict_data.pos_x_test),
                                torch.FloatTensor(ict_data.pos_y_test))
        
        self.train_dl = DataLoader(train_ds, batch_size=self.bs, drop_last=True)
        self.test_dl = DataLoader(test_ds, batch_size=self.bs, drop_last=True)
    
    def get_data(self):
        return self.train_dl, self.test_dl

In [191]:
class ICTNegativeData:
    def __init__(self, filename, batch_size=64):
        self.filename = filename
        self.bs = batch_size
        
        ict_data = IssueCloseTimeData(filename)
        train_ds = TensorDataset(torch.FloatTensor(ict_data.neg_x), 
                                 torch.FloatTensor(ict_data.neg_y))
        test_ds = TensorDataset(torch.FloatTensor(ict_data.neg_x_test),
                                torch.FloatTensor(ict_data.neg_y_test))
        
        self.train_dl = DataLoader(train_ds, batch_size=self.bs, drop_last=True)
        self.test_dl = DataLoader(test_ds, batch_size=self.bs, drop_last=True)
        
    def get_data(self):    
        return self.train_dl, self.test_dl

In [213]:
class DataAugmenter(nn.Module):
    def __init__(self, n_samples, ratio=1.):
        super().__init__()
        self.ratio = ratio
        self.n_samples = n_samples
        self.out_size = int(ratio * n_samples)
        
        self.layer1 = nn.Linear(in_features=n_samples, out_features=n_samples)
        self.layer2 = nn.Linear(in_features=n_samples, out_features=n_samples)
        self.augmented = nn.Linear(in_features=n_samples, out_features=self.out_size)
    
    def forward(self, x):
        layer1_out = F.relu(self.layer1(torch.transpose(x, 0, 1)))
        layer2_out = F.relu(self.layer2(layer1_out))
        return torch.transpose(F.relu(self.augmented(layer2_out)), 0, 1)

In [212]:
class MainNet(nn.Module):
    def __init__(self, input_shape:tuple, n_layers:int=2):
        super().__init__()
        self.n_features = input_shape[1]
        self.n_samples = input_shape[0]
        self.n_layers = n_layers
        
        cur_in = n_features
        cur_out = int(cur_in // 2)
                                
        self.layers = []
        while True:
            self.layers.append(nn.Linear(in_features=cur_in, out_features=cur_out))
            self.layers.append(nn.ReLU())
            
            cur_in = cur_out
            cur_out = int(cur_in // 2)
            
            if cur_in == 2 or cur_in == 3:
                break
        
        self.layers.append(nn.Linear(in_features=cur_in, out_features=1))
                
    def forward(self, x):
        data = x
        for layer in self.layers:
            data = layer(data)
        
        return data

In [194]:
class AlYoda(nn.Module):
    def __init__(self, input_shape, filename='firefox', augment_ratio=augment_ratio, n_layers=3):
        """
        The Al YODA net.
        
        :param filename - str. Filename of dataset.
        :param augment_ratio - Iterable. (positive_augment_ratio, negative_augment_ratio)
        :param n_layers - int. Number of layers in the main network
        """
        super().__init__()
        self.filename = filename
        self.n_layers = n_layers
        self.paused = False
        
        n_features = input_shape[1]
        n_samples = input_shape[0]
                        
        # Augment the data
        self.pos_augmenter = DataAugmenter(n_samples, ratio=augment_ratio[0])
        self.neg_augmenter = DataAugmenter(n_samples, ratio=augment_ratio[1])
        
        main_net_input_shape = (self.pos_augmenter.out_size + self.neg_augmenter.out_size,
                                n_features)
        
        # Get the main network
        self.main_net = MainNet(input_shape=input_shape,
                                n_layers=n_layers)
    
    def forward(self, x_pos, x_neg):                                
        # Train the augmenters one step
        x_pos = self.pos_augmenter.forward(x_pos)
        x_neg = self.neg_augmenter.forward(x_neg)
        
        # Concatenate
        x_concatenated = torch.cat((x_pos, x_neg), dim=0)
        
        # Train the main network one step
        return self.main_net(x_concatenated)
    
    def toggle_pause(self):
        self.paused = not self.paused
        self.pos_augmenter.eval()
        self.neg_augmenter.eval()

In [195]:
# Get the data
filename = 'firefox'
pos_train_dl, pos_test_dl = ICTPositiveData(filename).get_data()
neg_train_dl, neg_test_dl = ICTNegativeData(filename).get_data()

In [196]:
gc.collect()

341

In [197]:
n_features = pos_train_dl.dataset[0][0].numel()
n_pos_samples = pos_train_dl.dataset.tensors[1].numel()
n_neg_samples = neg_train_dl.dataset.tensors[1].numel()

In [198]:
n_features

36

In [199]:
model = AlYoda(input_shape=(64, n_features))
optimizer = optim.Adam(model.parameters())

In [200]:
n_epochs = 60

In [201]:
loss_func = nn.BCEWithLogitsLoss()

In [202]:
for x, y in pos_train_dl:
    print(x.shape, y.shape)
    break

torch.Size([64, 36]) torch.Size([64])


In [203]:
for epoch in range(n_epochs):
    if epoch == n_epochs // 2:
        model.toggle_pause()
        
    model.train()
    for (xb_pos, _), (xb_neg, _) in zip(pos_train_dl, neg_train_dl):
        preds = model(xb_pos, xb_neg).view(-1)
        pos_shape = int(augment_ratio[0] * xb_pos.shape[0])
        neg_shape = int(augment_ratio[1] * xb_neg.shape[0])
        yb = torch.cat((torch.ones(pos_shape), torch.zeros(neg_shape)), dim=0)
        train_loss = loss_func(preds, yb)
        
        train_loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    
    model.eval()
    valid_loss = 0.
    with torch.no_grad():
        for (xb_pos, _), (xb_neg, _) in zip(pos_test_dl, neg_test_dl):
            preds = model(xb_pos, xb_neg).view(-1)
            pos_shape = int(augment_ratio[0] * xb_pos.shape[0])
            neg_shape = int(augment_ratio[1] * xb_neg.shape[0])
            yb = torch.cat((torch.ones(pos_shape), torch.zeros(neg_shape)), dim=0)
            valid_loss += loss_func(preds, yb)

    # Log
    print(f'Epoch {epoch+1}/{n_epochs}: train_loss={train_loss} | val_loss={valid_loss}')

Epoch 1/60: train_loss=0.4446515440940857 | val_loss=58.68923568725586
Epoch 2/60: train_loss=0.444627046585083 | val_loss=58.68886947631836
Epoch 3/60: train_loss=0.4446198046207428 | val_loss=58.68876266479492
Epoch 4/60: train_loss=0.4446163475513458 | val_loss=58.6887321472168
Epoch 5/60: train_loss=0.4446142911911011 | val_loss=58.68871307373047
Epoch 6/60: train_loss=0.4333550035953522 | val_loss=57.20277786254883
Epoch 7/60: train_loss=0.43335458636283875 | val_loss=57.20277786254883
Epoch 8/60: train_loss=0.43335437774658203 | val_loss=57.20277786254883
Epoch 9/60: train_loss=0.43335431814193726 | val_loss=57.20277786254883
Epoch 10/60: train_loss=0.4333541989326477 | val_loss=57.20277786254883
Epoch 11/60: train_loss=0.4333541691303253 | val_loss=57.20277404785156
Epoch 12/60: train_loss=0.43335410952568054 | val_loss=57.20277404785156
Epoch 13/60: train_loss=0.43335404992103577 | val_loss=57.20277404785156
Epoch 14/60: train_loss=0.433353990316391 | val_loss=57.20277404785156

In [204]:
valid_loss = 0.
preds = []
ys = []
with torch.no_grad():
    for (xb_pos, _), (xb_neg, _) in zip(pos_test_dl, neg_test_dl):
        pos_shape = int(augment_ratio[0] * xb_pos.shape[0])
        neg_shape = int(augment_ratio[1] * xb_neg.shape[0])
        ys.extend(np.ones(pos_shape))
        ys.extend(np.zeros(neg_shape))
        preds.extend(model(xb_pos, xb_neg).numpy())

In [205]:
ys = np.array(ys).squeeze()
preds = np.array(preds).squeeze()

In [206]:
ys.shape, preds.shape

((17688,), (17688,))

In [207]:
from raise_utils.metrics import ClassificationMetrics

In [208]:
def sigmoid(x):
    return np.exp(x) / sum(np.exp(x-max(x))) <= .5

In [209]:
metr = ClassificationMetrics(ys, sigmoid(preds))
metr.add_metrics(['accuracy', 'pd', 'pf', 'd2h'])

In [210]:
metr.get_metrics()

[0.5223880597014925, 0.0, 0.0, 0.0]