In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt


In [None]:
import os
import pandas as pd
from torchvision.io import read_image

class CustomDNADataset(Dataset):
    """Dataset that turns DNA barcode strings into 1x36x36 float32 "images".

    Both CSV files are read with pandas; column 0 is treated as an id and
    column 1 as the payload (the DNA string in the feature file, the class
    label in the label file).

    Args:
        feature_file: path of the CSV holding the DNA sequences.
        label_file: path of the CSV holding the labels. Its row count defines
            the dataset length.
        transform: optional callable applied to the encoded image tensor.
        target_transform: optional callable applied to the label.
    """

    # Each sequence is laid out on a square grid so it can be fed to Conv2d.
    # 36 * 36 = 1296 positions; the original author reports the longest
    # sequence seen as roughly 1058 characters, so this leaves head-room.
    IMAGE_SIDE = 36
    SEQUENCE_LENGTH = IMAGE_SIDE * IMAGE_SIDE

    # Code used for any character that is missing from the encoding table.
    UNKNOWN_CODE = 1.0

    def __init__(self, feature_file,label_file, transform=None, target_transform=None):
        self.img_labels = pd.read_csv(label_file)
        self.img_features = pd.read_csv(feature_file)
        self.transform = transform
        self.target_transform = target_transform
        # Encoding of the four nucleotides as distinct powers of two; gaps
        # ('-') and undetermined bases ('N') share the padding value 0.
        # Background on the nucleotide alphabet:
        # http://ircamera.as.arizona.edu/Astr2016/text/nucleicacid1.htm
        self.dic= {'-':0,'A' :2**2 , 'C':2**3,'T':2**4,'G':2**5,'N':0}

    def __len__(self):
        # Number of samples = number of rows in the label file.
        return len(self.img_labels)

    def __getitem__(self, idx):
        """Return `(image, label)` for row `idx`.

        `image` is a float32 tensor of shape (1, 36, 36): the encoded sequence,
        zero-padded at the end, or truncated when it is longer than 1296
        characters (previously such a row made the reshape fail).
        """
        DNA = self.img_features.iloc[idx,1]

        # Start from an all-zero buffer so the tail padding is implicit.
        # float32 is requested explicitly because numpy defaults to float64
        # while the network weights are float32.
        encoded = np.zeros(self.SEQUENCE_LENGTH, dtype=np.float32)
        codes = [self.dic.get(base, self.UNKNOWN_CODE)
                 for base in DNA[:self.SEQUENCE_LENGTH]]
        encoded[:len(codes)] = codes

        # (channel, height, width) with a single channel, like a grayscale image.
        DNA_image = torch.from_numpy(encoded).reshape(
            1, self.IMAGE_SIDE, self.IMAGE_SIDE)

        # Labels live in a separate file, aligned with the features by row.
        label = self.img_labels.iloc[idx, 1]

        # The transforms were accepted but silently ignored before; apply them
        # when supplied (the default None keeps the previous behaviour).
        if self.transform is not None:
            DNA_image = self.transform(DNA_image)
        if self.target_transform is not None:
            label = self.target_transform(label)

        return DNA_image, label

In [None]:
class NN(nn.Module):
  """Fully connected classifier: four ReLU hidden layers and a linear output.

  Layer widths are input_size -> 50 -> 500 -> 250 -> 1000 -> num_classes.
  The output is raw class scores (no softmax is applied here).
  """

  def __init__ (self,input_size , num_classes):
    super().__init__()
    widths = (input_size, 50, 500, 250, 1000, num_classes)
    # Register the layers under the names fc1 .. fc5, created in that order.
    for position in range(1, len(widths)):
      layer = nn.Linear(widths[position - 1], widths[position])
      setattr(self, f'fc{position}', layer)

  def forward(self,x):
    hidden = x
    # Every layer except the last is followed by a ReLU.
    for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
      hidden = F.relu(layer(hidden))
    return self.fc5(hidden)



In [None]:
 # model = NN(36*36,1202)
 # x = torch.randn(64,36*36)
 # print(model(x).shape)

In [None]:
# Single place to point the notebook at the competition files; change this one
# line when the data lives somewhere else.
DATA_DIR = '/content/drive/MyDrive/deep learning course/dna-barcode-classification'

# Labelled training data: sequences and labels come from two separate CSV files.
full_dataset = CustomDNADataset(
    feature_file=f'{DATA_DIR}/train_features.csv',
    label_file=f'{DATA_DIR}/train_labels.csv',
)

# The test set has no real labels. CustomDNADataset still needs a label file
# (its length is taken from it), so a trimmed copy of the training labels is
# passed as a placeholder. Those labels are fake and must not be used to score
# the model.
Test_dataset = CustomDNADataset(
    feature_file=f'{DATA_DIR}/test_features.csv',
    label_file=f'{DATA_DIR}/train_labels - Copy.csv',
)

In [None]:
# Number of labelled training samples (the dataset length is the row count of its label file).
len(full_dataset)

12906

In [None]:
# Hold out 10% of the labelled data for validation; the rest is used for training.
train_size = int(0.9 * len(full_dataset))
validation_size = len(full_dataset) - train_size

# Seed the split so the same samples end up in the validation set on every
# run; otherwise results are not comparable between kernel restarts.
split_generator = torch.Generator().manual_seed(0)
train_dataset, validation_dataset = torch.utils.data.random_split(
    full_dataset, [train_size, validation_size], generator=split_generator)


In [None]:
class CNN(nn.Module):
  """Small convolutional classifier for 36x36 inputs.

  Two 3x3 convolutions (9 and 18 feature maps), each followed by ReLU and a
  2x2 max-pool, then one linear layer producing raw class scores.

  Args:
    in_channels: number of channels of the input image (1 for the DNA images).
    num_classes: number of output scores. Required in practice: the default
      None is kept only for signature compatibility and cannot build the
      final linear layer.
  """

  def __init__(self,in_channels = 1 , num_classes =None):
    super(CNN,self).__init__()
    # in_channels used to be ignored (hard-coded to 1); honour the argument.
    self.conv1 = nn.Conv2d(in_channels, 9, kernel_size=(3,3))
    self.pool = nn.MaxPool2d(kernel_size=(2,2),stride=(2,2))
    self.conv2 = nn.Conv2d(9 ,18 , kernel_size=(3,3))
    # Spatial size for a 36x36 input: 36 -conv-> 34 -pool-> 17 -conv-> 15 -pool-> 7,
    # hence 18 feature maps of 7x7 reach the linear layer.
    self.fc1 = nn.Linear(18*7*7, num_classes)

  def forward(self,x):
    # x: (batch, in_channels, 36, 36); other spatial sizes do not match fc1.
    x = self.pool(F.relu(self.conv1(x)))
    x = self.pool(F.relu(self.conv2(x)))
    # Flatten everything but the batch dimension for the linear layer.
    x = x.reshape(x.shape[0],-1)
    x = self.fc1(x)

    return x






In [None]:
#model = CNN()
#x = torch.randn(120,1,36,36)
#print(model(x).shape)
# Inspect the training labels to decide how many outputs the network needs.
# value_counts() shows fewer distinct classes than the largest label value
# (1213), so the labels are not contiguous. The network is therefore sized by
# the largest label + 1 = 1214 outputs; a few outputs are never used, which
# is harmless.
data_labels = pd.read_csv(
    "/content/drive/MyDrive/deep learning course/dna-barcode-classification/train_labels.csv"
)
label_column = data_labels['labels']

label_column.value_counts()
label_column.max()

1213

In [None]:
# Device: use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Hyperparameters
input_size = 36 * 36      # length of one flattened 36x36 input
in_channel = 1            # the DNA images have a single channel
num_classes = 1214        # largest label value (1213) + 1
learning_rate = 1e-3
# Batch size turned out to have a strong effect on how well training works;
# 15 was found to be a good value.
batch_size = 15
num_epochs = 3


In [None]:
# Build the convolutional classifier, then move its parameters to the selected device.
model = CNN(in_channels=1, num_classes=num_classes)
model = model.to(device)

In [None]:
#model = NN(input_size,num_classes).to(device)

In [None]:
# Data loaders
# Train on the training split only. The loader previously wrapped full_dataset,
# which also contains the validation samples, so validation accuracy was
# measured on data the model had already been trained on.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Evaluation order does not matter, so no shuffling for validation and test.
Validation_loader = DataLoader(validation_dataset, batch_size=batch_size,
                               shuffle=False)

# One sample per batch for the test set.
test_loader = DataLoader(Test_dataset, batch_size=1, shuffle=False)

In [None]:
# Optimizer and loss
# Adam updates every parameter of the model with the configured learning rate.
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)
# Cross-entropy is computed from the raw class scores and the integer class labels.
criterion = nn.CrossEntropyLoss()

In [None]:

# Sanity check before training: push ONE batch through the untrained model and
# look at the scores, the target shape and the loss. The earlier version looped
# over every batch of every epoch, ran the forward pass twice per batch and
# tracked gradients, all just to print.
with torch.no_grad():
  data, targets = next(iter(train_loader))
  data = data.to(device = device)
  targets = targets.to(device = device)

  scores = model(data)
  print(scores[0])                      # class scores of the first sample
  print(targets.shape)                  # one label per sample in the batch
  print(criterion(scores , targets))    # loss of the untrained model


    


In [None]:
def check_accuracy (loader,model):
  """Print how many samples of `loader` the model classifies correctly.

  The model is put in eval mode for the pass and switched back to train mode
  afterwards, even when the pass raises. Nothing is returned.
  """
  num_correct = 0
  num_samples = 0
  model.eval()
  try:
    with torch.no_grad():
      for x,y in loader:
        x = x.to(device = device )
        y = y.to(device = device)

        scores = model(x)
        # Index of the highest score along the class dimension = predicted class.
        _,prediction = scores.max(1)
        # .item() keeps the counter a plain int so the message below does not
        # show a tensor repr.
        num_correct += (prediction == y).sum().item()
        num_samples += prediction.size(0)

    # Guard against an empty loader instead of dividing by zero.
    accuracy = float(num_correct)/float(num_samples)*100 if num_samples else 0.0
    print(f'for{num_correct}/{num_samples} with accuracy {accuracy:.2f}')
  finally:
    model.train()


# train Network
for epoch in range (num_epochs):
  for batch_ind ,(data, targets) in enumerate(train_loader):
    # Copy the batch to the device the model lives on.
    data = data.to(device = device)
    targets = targets.to(device = device)

    # forward: images in, one score per class out for every sample
    scores = model(data)
    # Cross-entropy between the class scores and the integer labels.
    loss = criterion(scores , targets)

    # backward: clear stale gradients, then compute fresh ones
    optimizer.zero_grad()
    loss.backward()

    # Adam update step
    optimizer.step()

  # Report accuracy after every epoch. This measures the data behind
  # train_loader; pass Validation_loader to measure held-out data.
  check_accuracy(train_loader,model)
  #check_accuracy(Validation_loader , model)



In [None]:
# Persist only the learned parameters (the state_dict), as recommended in
# https://pytorch.org/tutorials/beginner/saving_loading_models.html
checkpoint_path = '/content/99_33model.pt'
torch.save(model.state_dict(), checkpoint_path)

In [None]:
  def check_accuracy (loader,model):
    """Evaluate `model` on `loader` and append every prediction to 'newResult.csv'.

    This redefines the earlier check_accuracy and adds the prediction log.
    Predictions are written one per line by np.savetxt in its default number
    format. The file is opened in append mode, so running this twice adds a
    second set of rows; delete the file first for a clean result.
    The model is restored to train mode afterwards, even on error.
    """
    num_correct = 0
    num_samples = 0
    model.eval()
    try:
      # `with` closes the file even if the evaluation raises part-way through.
      with open('newResult.csv', 'ab') as your_file, torch.no_grad():
        for x,y in loader:
          x = x.to(device = device )
          y = y.to(device = device)

          scores = model(x)
          # Predicted class = index of the highest score.
          _,prediction = scores.max(1)
          # .item() keeps the counter a plain int for the message below.
          num_correct += (prediction == y).sum().item()
          np.savetxt(your_file,prediction.cpu().numpy())
          num_samples += prediction.size(0)

      # Guard against an empty loader instead of dividing by zero.
      accuracy = float(num_correct)/float(num_samples)*100 if num_samples else 0.0
      print(f'for{num_correct}/{num_samples} with accuracy {accuracy:.2f}')
    finally:
      model.train()

  # NOTE: the test dataset was built with placeholder labels, so the accuracy
  # printed here is not meaningful; the useful output is the prediction file.
  check_accuracy(test_loader,model)
  

In [None]:
def write_csv_results (loader,model, threshold=0.75, output_path='resultsWithforeign_75.csv'):
    """Append the model's predictions for `loader` to a text file, one per line.

    A sample whose highest softmax probability is below `threshold` is written
    as -1 instead of its predicted class index. The decision is made per
    sample, so any batch size works (it used to look at the maximum over the
    whole batch, which is only right for batch_size=1).

    Args:
        loader: iterable of `(inputs, labels)` batches; the labels are ignored.
        model: network returning scores of shape (batch, num_classes).
        threshold: minimum probability for a prediction to be kept.
        output_path: file to append to. It is opened in append mode, so delete
            it first if a fresh file is wanted.

    Values are written by np.savetxt in its default number format.
    The model is restored to train mode afterwards, even on error.
    """
    # Send the inputs to wherever the model's parameters live; fall back to
    # the CPU for a parameter-free model.
    try:
        model_device = next(model.parameters()).device
    except StopIteration:
        model_device = torch.device('cpu')

    model.eval()
    try:
        # `with` closes the file even if inference raises part-way through.
        with open(output_path, 'ab') as your_file, torch.no_grad():
            for x,_ in loader:
                x = x.to(device = model_device)

                scores = model(x)
                _,prediction = scores.max(1)
                # Probability of the winning class of every sample; dim=1 is
                # the class dimension.
                confidence = torch.softmax(scores, dim=1).max(dim=1).values
                # Mark low-confidence samples with -1.
                prediction = prediction.masked_fill(confidence < threshold, -1)
                # Move back to the CPU before handing the data to numpy.
                np.savetxt(your_file,prediction.cpu().numpy())
    finally:
        model.train()

# Write the thresholded predictions for the test set to disk.
write_csv_results(loader=test_loader, model=model)


    
