Aim here is to train a supervised CNN to recognise duplications and deletions
1) Generate some event data to classify
2) Define the network
3) Do some training 
4) Test the prediction


In [2]:
import numpy as np
import numpy.random as rng
import torch
from torch import nn
import torch.nn.functional as F

rng.seed(1001)
torch.manual_seed(1001)

<torch._C.Generator at 0x7ff06ebf00b0>

In [3]:
# 1) Generate some data

def data_generator(ndat,nL,sd):
    data = np.zeros((ndat,nL)) 
    labs = np.zeros((ndat,)) 
    for i in range(0,ndat):
        # choose dup or del
        evnt = rng.choice([-1,1])
        #labs[i,] = int( (evnt+1)/2 )
        if evnt == -1:
            labs[i,] = 0
        else:
            labs[i,] = 1
    
        # choose start
        start = rng.randint(0,nL-1)
        #print(start,"\t",type)
        #start = 4
    
        # create event and add noise
        data[i, start:(start+2)] = evnt
        data[i,] += rng.normal(0,sd,(nL))
    
        #print(data[i,])

    return [data, labs]
        
ndat = 10000
nL = 10 #length 
sd = 0.5 #sd of noise 
data, labs = data_generator(ndat, nL, sd)
x_train = torch.from_numpy(data).float()
y_train = torch.from_numpy(labs).long()
#print(labs)
#print(data[0:5,])

In [4]:
# 2) Define a model. We will start with logistic regression
class logisticRegression(nn.Module):
    def __init__(self, in_dim, n_class):
        super(logisticRegression, self).__init__()
        self.linear = nn.Linear(in_dim, n_class)
        
    def forward(self, x):
        out = self.linear(x)
        
        #out = F.softmax(self.linear(x),dim=1) # this is the dimension of the array
        # softmax is not needed here as the cross-entropy loss handles
        
        return out
    
model = logisticRegression(nL, 2)
#print(model)
#print(model(x_train[0:10,]))


In [8]:
# 3) Do some training
batch_size = 100
learning_rate = 1e-3
num_epochs = int(ndat/batch_size)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

i = 0
for epoch in range(num_epochs):
    #inds = rng.choice(range(0,ndat),size=batch_size, replace=False)
    inds = range(i,i+batch_size)
    inputs = x_train[inds,]
    target = y_train[inds,]

    i += batch_size
    
    # forward
    out = model(inputs)
    #print(target.shape)
    #print(out.shape)
    loss = criterion(out, target)
    
    # backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch+1) % 1 == 0:
        print(f'Epoch[{epoch+1}/{num_epochs}], loss: {loss.item():.6f}')
print(out)

print('Finished Training')

Epoch[1/100], loss: 0.691040
Epoch[2/100], loss: 0.726043
Epoch[3/100], loss: 0.694123
Epoch[4/100], loss: 0.722817
Epoch[5/100], loss: 0.697034
Epoch[6/100], loss: 0.710685
Epoch[7/100], loss: 0.706742
Epoch[8/100], loss: 0.715904
Epoch[9/100], loss: 0.709402
Epoch[10/100], loss: 0.662896
Epoch[11/100], loss: 0.715847
Epoch[12/100], loss: 0.721435
Epoch[13/100], loss: 0.714630
Epoch[14/100], loss: 0.740611
Epoch[15/100], loss: 0.713838
Epoch[16/100], loss: 0.685212
Epoch[17/100], loss: 0.691550
Epoch[18/100], loss: 0.693416
Epoch[19/100], loss: 0.694904
Epoch[20/100], loss: 0.700551
Epoch[21/100], loss: 0.698162
Epoch[22/100], loss: 0.693012
Epoch[23/100], loss: 0.705024
Epoch[24/100], loss: 0.692445
Epoch[25/100], loss: 0.735523
Epoch[26/100], loss: 0.724380
Epoch[27/100], loss: 0.741035
Epoch[28/100], loss: 0.706593
Epoch[29/100], loss: 0.684846
Epoch[30/100], loss: 0.687066
Epoch[31/100], loss: 0.707014
Epoch[32/100], loss: 0.740043
Epoch[33/100], loss: 0.704833
Epoch[34/100], loss

In [47]:
# 4) Test the model prediction
#outputs = model(x_train[0:10,])
#print(outputs)
#print(labs[0:10])
#_, predicted = torch.max(outputs.data, 1)
#print(predicted, labs[0:10])

In [9]:
ntest = 1000
tdata, tlabs = data_generator(ntest, nL, sd)
x_test = torch.from_numpy(tdata).float()
correct = 0
total = 0
with torch.no_grad():
    outputs = model(x_test)
    _, predicted = torch.max(outputs.data, 1)
    #print( abs(predicted-labs) )
    incorrect = (abs(predicted-tlabs)).sum()

print('Accuracy of the network on the test images: %d %%' % (100 * (ntest-incorrect) / float(ntest)) )

Accuracy of the network on the test images: 55 %


In [7]:
# examine the parameters of the fitted model
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)

linear.weight tensor([[-0.0730, -0.2251,  0.2118, -0.2222, -0.2056, -0.1443, -0.1919,  0.0838,
          0.2506, -0.1956],
        [-0.2334, -0.0082, -0.1015,  0.0927, -0.2110, -0.0133, -0.2665,  0.1894,
          0.0745, -0.2849]])
linear.bias tensor([ 0.0026, -0.2017])
