In [None]:
##importing libraries
import torch
import torch.optim as optim
import torch.nn as nn
import os
from torch.utils.data import Dataset, DataLoader , random_split
from torchvision import transforms , datasets
from PIL import Image
import pandas as pd
import re
import random
import shutil
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
import numpy as np


In [4]:
# Custom Dataset: one numeric text file per sample, grouped in one sub-folder per class
class GeneticArrayDataset(Dataset):
    """
    Dataset of numeric arrays stored as text files under per-class sub-folders.

    Expected layout: ``root_folder/<class_label>/<sample file>``. Every
    sub-directory of ``root_folder`` is one class; class indices follow the
    alphabetical order of the sub-directory names.
    """

    # Pattern used to pull the "coefficient" out of a sample's path.
    # NOTE(review): it is matched against the FULL path (not just the file
    # name), so underscores in directory names also count towards the three
    # "_"-separated fields that precede the captured group -- confirm this is
    # intended for the data layout in use.
    _COEFFICIENT_PATTERN = re.compile(r".+?[_].+?[_].+?[_](.+?)[.][t][x][t]")

    def __init__(self, root_folder, transform=None):
        """
        Initialize the dataset.

        Parameters:
        - root_folder (str): The root directory containing one sub-folder per class.
        - transform (callable, optional): Applied to the normalized tensor of
          every sample before it is returned. ``None`` (default) returns the
          tensor unchanged.

        """
        self.root_folder = root_folder
        self.transform = transform

        # Only sub-directories are classes. Stray files in the root would
        # otherwise be given a class index and break the directory walk.
        self.class_labels = sorted(
            entry for entry in os.listdir(root_folder)
            if os.path.isdir(os.path.join(root_folder, entry))
        )

        # Create a mapping from class labels to indices
        self.class_to_idx = {label: idx for idx, label in enumerate(self.class_labels)}

        # Build the list of file paths
        self.file_list = self._build_file_list()

    def _build_file_list(self):
        """
        Build a list of file paths for all samples in the dataset.

        Entries are sorted inside each class because ``os.listdir`` returns
        names in arbitrary order; a stable order keeps index-based splits
        (e.g. a seeded ``random_split``) reproducible between runs and machines.

        Returns:
        - file_list (list): A list of file paths, grouped by class.

        """
        file_list = []
        for class_label in self.class_labels:
            class_path = os.path.join(self.root_folder, class_label)
            for file_name in sorted(os.listdir(class_path)):
                sample_path = os.path.join(class_path, file_name)
                # Skip nested directories; only regular files are samples.
                if os.path.isfile(sample_path):
                    file_list.append(sample_path)
        return file_list

    def __len__(self):
        """
        Get the total number of samples in the dataset.

        Returns:
        - length (int): Total number of samples.

        """
        return len(self.file_list)

    def __getitem__(self, index):
        """
        Get a sample from the dataset by its index.

        Parameters:
        - index (int): Index of the sample to retrieve.

        Returns:
        - sample (dict): ``'array'`` is the standardized data as a tensor,
          ``'label'`` is the integer class index and ``'coefficient'`` is the
          string captured from the file path.

        Raises:
        - ValueError: If the file path does not match the coefficient pattern.

        """
        file_path = self.file_list[index]

        # Load array data from file
        array = np.loadtxt(file_path)

        # Standardize the array with its own mean and standard deviation
        mean = np.mean(array)
        std = np.std(array)
        array = array - mean
        # A constant array has std == 0; dividing would fill it with NaN, so
        # it is left centred (all zeros) instead.
        if std > 0:
            array = array / std

        # The class label is the name of the folder that holds the file
        class_label = os.path.basename(os.path.dirname(file_path))
        label = self.class_to_idx[class_label]

        # Extract coefficient from file path using regular expression
        find = self._COEFFICIENT_PATTERN.match(file_path)
        if find is None:
            raise ValueError(
                f"Cannot extract coefficient from file path: {file_path!r}"
            )

        # Convert array to PyTorch tensor
        array = torch.from_numpy(array)

        if self.transform is not None:
            array = self.transform(array)

        # Return the sample data along with its label and coefficient
        return {'array': array, 'label': label, 'coefficient': find[1]}


In [5]:

## Build the dataset, split it 80/20 and wrap both parts in DataLoaders
DATA_ROOT = "/home/jaskaran/data_arrays"  ## root folder with one sub-folder per class
BATCH_SIZE = 64

data = GeneticArrayDataset(DATA_ROOT) ## creating instance of genetic array dataset
generator = torch.Generator().manual_seed(46) ## dedicated generator so the split does not depend on the global RNG state
length_data = len(data) ## length of total data
training_size = int(0.8*length_data) ## 80 percent for training
testing_size = length_data - training_size ## remainder (about 20 percent) for testing
assert training_size + testing_size == length_data

## Splitting the total dataset into training and testing
train_data , test_data = random_split(data,[training_size, testing_size],generator=generator)

## Training batches are reshuffled every epoch; the last partial batch is dropped
training_dataloader = DataLoader(train_data,batch_size=BATCH_SIZE,shuffle=True, drop_last=True)

## Evaluation must see every held-out sample exactly once: no shuffling is
## needed and the last partial batch is kept, otherwise up to BATCH_SIZE - 1
## test samples would be silently left out of the reported accuracy.
testing_dataloader = DataLoader(test_data,batch_size=BATCH_SIZE,shuffle=False, drop_last=False)

In [31]:
## CNN classifier: one convolutional stage followed by a three-layer MLP head
class CNN(nn.Module):
    """
    Single-channel 2-D convolutional network that outputs two class logits.

    Pipeline: Conv2d(1 -> 32, 9x9, padding 9) -> ReLU -> MaxPool(2) -> Flatten
    -> Linear(96800 -> 256) -> ReLU -> Dropout(0.2)
    -> Linear(256 -> 128) -> ReLU -> Dropout(0.2) -> Linear(128 -> 2).
    """

    def __init__(self):
        super().__init__()

        # Convolutional feature extractor
        self.conv1 = nn.Conv2d(1, 32, kernel_size=9, stride=1, padding=9)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size=2)
        self.flatten = nn.Flatten(start_dim=1)

        # Fully connected head. 96800 == 32 * 55 * 55, i.e. 32 feature maps of
        # 55x55 after pooling -- presumably from 100x100 inputs; TODO confirm
        # against the actual array size.
        self.fc1 = nn.Linear(96800, 256)
        self.dropout = nn.Dropout(p=0.2)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 2)

    def forward(self, x):
        """
        Compute class logits for a batch.

        Parameters:
        - x (Tensor): Batch with an explicit channel dimension of size 1;
          it is cast to float32 before the convolution.

        Returns:
        - logits (Tensor): Unnormalized scores with two entries per sample.
        """
        feature_maps = self.maxpool(self.relu(self.conv1(x.float())))
        hidden = self.flatten(feature_maps)

        # Both hidden layers share the same Linear -> ReLU -> Dropout pattern
        for dense in (self.fc1, self.fc2):
            hidden = self.dropout(self.relu(dense(hidden)))

        return self.fc3(hidden)
         

In [None]:
## Training the CNN model
torch.manual_seed(45)  # seed before building the model so weight initialisation is repeatable

# Use the GPU index this notebook was written for when it exists; otherwise
# fall back to the first GPU, or to the CPU, instead of crashing on
# machines with fewer devices.
if torch.cuda.is_available():
    device = torch.device("cuda:2" if torch.cuda.device_count() > 2 else "cuda:0")
else:
    device = torch.device("cpu")

model = CNN()
model.to(device)

if torch.cuda.is_available():
    torch.cuda.manual_seed(45)
    torch.cuda.manual_seed_all(45)
    # Ask cuDNN for deterministic kernels and disable its auto-tuner
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(),lr = 0.0001)

num_epoch = 20

for epoch in range(num_epoch):

    # ---- training pass ----
    model.train()
    total_loss = 0.0
    samples_seen = 0

    for batch in training_dataloader:
        # unsqueeze(1) inserts the single channel dimension Conv2d expects
        arrays = batch["array"].unsqueeze(1).to(device)
        labels = batch["label"].to(device)
        optimizer.zero_grad()
        output = model(arrays)
        loss = criterion(output,labels)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean, so weight it by the batch size
        total_loss += loss.item() * arrays.size(0)
        samples_seen += arrays.size(0)

    # Average over the samples actually processed. The loader may drop the
    # last partial batch, so dividing by len(dataset) would under-report the loss.
    average_loss = total_loss / samples_seen if samples_seen else float("nan")

    # ---- evaluation pass ----
    model.eval()
    total = 0
    correct = 0
    with torch.no_grad():
        for batch in testing_dataloader:
            arrays = batch["array"].unsqueeze(1).to(device)
            labels = batch["label"].to(device)
            output = model(arrays)
            _ , predicted = torch.max(output,1)  # index of the largest logit = predicted class
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Guard against an empty evaluation loader (e.g. fewer test samples than
    # one batch while drop_last is enabled).
    accuracy = correct/total if total else float("nan")
    print(f"Epoch : {epoch} , Loss : {average_loss:.2f} , Accuracy : {accuracy:.2%}")

print("Model is trained")




In [33]:
### Persist only the learned parameters (the state dict), not the whole module object
checkpoint_path = 'model_state_dict_array_filtering_lr0.0001_32_deep_leanrning_drop_0.2_9*9.pth'
torch.save(model.state_dict(), checkpoint_path)