### Imports

In [167]:
!pip install numpy 
!pip install pandas
!pip install torch
!pip install sklearn

import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from torch.utils.data import DataLoader, Dataset
from torch.nn import functional as F
from sklearn.metrics import accuracy_score
from torch.autograd import Variable
import torch.optim as optim



### Manipulating data
* Import csv we want
* Map our features to usuable integers
* Map our labels to a one hot encoding

In [168]:
# FIXME: Give a proper path/subset of data
df = pd.read_csv('C:/Projects/ClinBall/out1.tsv', sep='\t')

# The more negative the score, the more confident the predictor is about the damaging prediction
# The more positive the score, the more confident the predictor is about the non-damaging prediction
# A score of zero is unknown.
features_mapping = {
    'SIFT_pred': {
        'T': 1,
        'D': -1,
    },
    'LRT_pred': {
        'N': 1,
        'D': -1,
        'U': 0,
    },
    'MutationTaster_pred': {
        'N': 1,
        'D': -1,
        'A': -2,
        'P': 2,
    },
    'FATHMM_pred': {
        'T': 1,
        'D': -1,
    }
}

clinvar_map = {
    'clinvar_clnsig': {
        'Pathogenic': 0,
        'Likely_pathogenic': 0,
        'Pathogenic/Likely_pathogenic': 0,
        'Benign/Likely_benign': 1,
        'Likely_benign': 1,
        'Benign': 1
    }
}

# In dbNSFP, entries with no value have a period, change this for a 0 to match our schema
df = df.replace({'.': 0})

# Apply mappings
df = df.replace(features_mapping)
df = df.replace(clinvar_map)

# Replace all strange/ambiguous ClinVar clnsig with a common integer
df = df.replace({'\D': -1}, regex=True)
# Drop all rows which have a strange/ambiguous clinvar clnsig
df = df[df.clinvar_clnsig != -1]

# Split our data into two separate sets for training purposes
train, test = train_test_split(df, shuffle=False)

print(test.head())

       SIFT_pred  LRT_pred  MutationTaster_pred  FATHMM_pred  clinvar_clnsig
12223         -1         1                   -1            1               0
12226          0         0                   -1            0               0
12227          1         0                    1           -1               1
12229         -1         0                   -1           -1               0
12236         -1         0                   -1           -1               0


### DataLoader
* Implement the Dataset class supplied by PyTorch
  * Used Map style
  * API: https://pytorch.org/docs/stable/data.html#map-style-datasets
* Instanciate DataLoader using Dataset instance

In [169]:
class DataSetdbNSFP(Dataset):
    def __init__(self, data):
        self.data = torch.FloatTensor(data.values.astype('float'))
        
    def __len__(self):
        return len(self.data)
    def __getitem__(self, index):
            target = self.data[index][-1]
            data_val = self.data[index] [:-1]
            return data_val,target
        
train_dataset = DataSetdbNSFP(train)
valid_dataset = DataSetdbNSFP(test)

train_loader = DataLoader(train_dataset, batch_size=1000, shuffle=True)
test_loader = DataLoader(train_dataset, batch_size=1000, shuffle=True)

### Model
* This model is a simple implementation
* Based on https://github.com/ieee8023/NeuralNetwork-Examples/blob/master/pytorch/pytorch-mnist.ipynb

In [170]:

class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        # In: nb of features, out: nb of examples
        self.fc = nn.Linear(4, 1000)
        # In: nb of examples, out: nb of predictions
        self.fc2 = nn.Linear(1000, 2)

    def forward(self, x):
        x = x.view((-1, 4))
        h = F.relu(self.fc(x))
        h = self.fc2(h)
        # Softmax to get the actual labels
        return F.softmax(h, dim=1)    
    
model = Model()
# if cuda:
#     model.cuda()

# Optimizer based on Adam algorithm, uses a slightly lower rate
optimizer = optim.Adam(model.parameters(), lr=1e-4)

### Training
* Run a fixed number of epochs
* After each epoch, run the testing set to adjust learning
* Print results
* Based on example: https://github.com/ieee8023/NeuralNetwork-Examples/blob/master/pytorch/pytorch-mnist.ipynb

In [172]:
EPOCHS = 20

model.train()
for epoch in range(EPOCHS):
    for batch_idx, (data, target) in enumerate(train_loader):

        data, target = Variable(data), Variable(target)
        
#         if cuda:
#             data, target = data.cuda(), target.cuda()
        
        optimizer.zero_grad()
        y_pred = model(data) 
        
        # Calculate and appy loss
        loss = F.cross_entropy(y_pred, target.long())
        loss.backward()
        optimizer.step()

        print('\r Train Epoch: {}/{} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch+1,
            EPOCHS,
            batch_idx * len(data), 
            len(train_loader.dataset),
            100. * batch_idx / len(train_loader), 
            loss.cpu().data.item()))
    
    # NOTE: The Variable class is a wrapper class around a tensor with the added functionalities of back propagation
    # For x, take all cols of datafram except the last one
    evaluate_x = Variable(test_loader.dataset.data[:, 0:-1].type_as(torch.FloatTensor()))
    # For y, take last col
    evaluate_y = Variable(test_loader.dataset.data[:, -1:])
#     if cuda:
#         evaluate_x, evaluate_y = evaluate_x.cuda(), evaluate_y.cuda()
    model.eval()
    output = model(evaluate_x)
    pred = output.data.max(1)[1]
    
    y_labels = torch.flatten(evaluate_y.data)
    print('Predictions:', pred)
    print('Actual values:', y_labels)
    d = pred.eq(y_labels).cpu()
    accuracy = d.sum().item()*1./d.size()[0]
    
    print('\r Train Epoch: {}/{} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\t Test Accuracy: {:.2f}%'.format(
        epoch+1,
        EPOCHS,
        len(train_loader.dataset), 
        len(train_loader.dataset),
        100. * batch_idx / len(train_loader), 
        loss.cpu().data.item(),
        accuracy*100))

Predictions tensor([1, 0, 1,  ..., 0, 0, 1])
Actual values: tensor([1., 0., 1.,  ..., 0., 0., 1.])
Predictions tensor([1, 0, 1,  ..., 0, 0, 1])
Actual values: tensor([1., 0., 1.,  ..., 0., 0., 1.])
Predictions tensor([1, 0, 1,  ..., 0, 0, 1])
Actual values: tensor([1., 0., 1.,  ..., 0., 0., 1.])
Predictions tensor([1, 0, 1,  ..., 0, 0, 1])
Actual values: tensor([1., 0., 1.,  ..., 0., 0., 1.])
Predictions tensor([1, 0, 1,  ..., 0, 0, 1])
Actual values: tensor([1., 0., 1.,  ..., 0., 0., 1.])
Predictions tensor([1, 0, 1,  ..., 0, 0, 1])
Actual values: tensor([1., 0., 1.,  ..., 0., 0., 1.])
Predictions tensor([1, 0, 1,  ..., 0, 0, 1])
Actual values: tensor([1., 0., 1.,  ..., 0., 0., 1.])
Predictions tensor([1, 0, 1,  ..., 0, 0, 1])
Actual values: tensor([1., 0., 1.,  ..., 0., 0., 1.])
Predictions tensor([1, 0, 1,  ..., 0, 0, 1])
Actual values: tensor([1., 0., 1.,  ..., 0., 0., 1.])
Predictions tensor([1, 0, 1,  ..., 0, 0, 1])
Actual values: tensor([1., 0., 1.,  ..., 0., 0., 1.])
Prediction

Predictions tensor([1, 0, 1,  ..., 0, 0, 1])
Actual values: tensor([1., 0., 1.,  ..., 0., 0., 1.])
