In [5]:
import os
import random
import re
import shutil

import pandas as pd
import torch
from torch.utils.data import Dataset , DataLoader , random_split

In [3]:

class GeneticDataset(Dataset):
    """Image dataset stored as ``root_folder/<class_label>/<name>_<coefficient>.png``.

    Every sub-directory of ``root_folder`` is one class; class indices follow
    the alphabetical order of the directory names. The numeric coefficient of
    a sample is the last underscore-separated token of its file name.

    NOTE(review): ``__getitem__`` needs ``Image`` (presumably ``PIL.Image``)
    to be in scope; it is not imported in the visible part of the notebook.

    Args:
        root_folder: Directory containing one sub-directory per class.
        transform: Optional callable applied to the RGB image in
            ``__getitem__``.
    """

    def __init__(self,root_folder, transform =None):
        self.root_folder = root_folder
        self.transform = transform
        # Only directories are classes. Stray files such as ".DS_Store" would
        # otherwise shift the label indices and break the listing below.
        self.class_labels = sorted(
            entry for entry in os.listdir(root_folder)
            if os.path.isdir(os.path.join(root_folder, entry))
        )
        self.class_to_idx = {label: idx for idx, label in enumerate(self.class_labels)}
        self.file_list = self._build_file_list()

    def _build_file_list(self):
        """Return the paths of all ``.png`` files, grouped by class and sorted.

        Sorting makes the sample order independent of the file system, so a
        seeded ``random_split`` yields the same partition on every machine.
        """
        file_list = []
        for class_label in self.class_labels:
            class_path = os.path.join(self.root_folder, class_label)
            for image_name in sorted(os.listdir(class_path)):
                image_path = os.path.join(class_path, image_name)
                if image_name.lower().endswith(".png") and os.path.isfile(image_path):
                    file_list.append(image_path)
        return file_list

    @staticmethod
    def _parse_coefficient(file_path):
        """Extract the coefficient encoded at the end of an image file name.

        Only the base name is inspected, so underscores in directory names
        (``training_data``, ``test_data`` ...) cannot change the result.

        Raises:
            ValueError: If the last ``_``-separated token is not a number.
        """
        stem = os.path.splitext(os.path.basename(file_path))[0]
        token = stem.rsplit("_", 1)[-1]
        try:
            return float(token)
        except ValueError:
            raise ValueError(
                f"Cannot parse a coefficient from file name: {file_path!r}"
            ) from None

    def __len__(self):
        """Return the number of image files found under ``root_folder``."""
        return len(self.file_list)

    def __getitem__(self,index):
        """Load one sample.

        Returns:
            dict with keys ``'image'`` (RGB image, transformed when a
            transform was given), ``'label'`` (integer class index) and
            ``'coefficient'`` (float parsed from the file name).
        """
        file_path = self.file_list[index]
        # The context manager releases the file handle once the pixel data
        # has been copied by convert().
        with Image.open(file_path) as raw_image:
            image = raw_image.convert("RGB")
        class_label = os.path.basename(os.path.dirname(file_path))
        label = self.class_to_idx[class_label]
        coefficient = self._parse_coefficient(file_path)
        if self.transform:
            image = self.transform(image)
        return {'image': image, 'label': label, "coefficient" : coefficient }

In [None]:
# Split the selection images into a training and a test folder by moving files.
# Define paths
source_folder = 'data/images'
train_folder = 'data/training_data/training_images/selection'
test_folder = 'data/test_data/selection'
n_train_images = 800  # images moved to the training folder; the rest go to test
split_seed = 46       # fixed seed so the split is reproducible

# Create train and test folders if they don't exist
os.makedirs(train_folder, exist_ok=True)
os.makedirs(test_folder, exist_ok=True)

# List regular files only (skips e.g. ".ipynb_checkpoints") and sort them so
# the seeded sample does not depend on the file system's listing order.
all_images = sorted(
    name for name in os.listdir(source_folder)
    if os.path.isfile(os.path.join(source_folder, name))
)

# Fail with an explicit message instead of random.sample's generic one, e.g.
# when the cell is re-run after the images have already been moved.
if len(all_images) < n_train_images:
    raise ValueError(
        f"{source_folder!r} holds {len(all_images)} images, "
        f"fewer than the {n_train_images} required for training"
    )

# Randomly select the training images
selected_images = random.Random(split_seed).sample(all_images, n_train_images)

# Move selected images to train folder
for image in selected_images:
    shutil.move(os.path.join(source_folder, image), os.path.join(train_folder, image))

# Move remaining images to test folder
selected_lookup = set(selected_images)
remaining_images = [name for name in all_images if name not in selected_lookup]
for image in remaining_images:
    shutil.move(os.path.join(source_folder, image), os.path.join(test_folder, image))

In [None]:

# Split the bottleneck images into a training and a test folder by moving files.
# Define paths
source_folder = 'data/data_bottleneck/images'
train_folder = 'data/training_data/training_images/bottleneck'
test_folder = 'data/test_data/bottleneck'
n_train_images = 750  # images moved to the training folder; the rest go to test
split_seed = 46       # fixed seed so the split is reproducible

# Create train and test folders if they don't exist
os.makedirs(train_folder, exist_ok=True)
os.makedirs(test_folder, exist_ok=True)

# List regular files only (skips e.g. ".ipynb_checkpoints") and sort them so
# the seeded sample does not depend on the file system's listing order.
all_images = sorted(
    name for name in os.listdir(source_folder)
    if os.path.isfile(os.path.join(source_folder, name))
)

# Fail with an explicit message instead of random.sample's generic one, e.g.
# when the cell is re-run after the images have already been moved.
if len(all_images) < n_train_images:
    raise ValueError(
        f"{source_folder!r} holds {len(all_images)} images, "
        f"fewer than the {n_train_images} required for training"
    )

# Randomly select the training images
selected_images = random.Random(split_seed).sample(all_images, n_train_images)

# Move selected images to train folder
for image in selected_images:
    shutil.move(os.path.join(source_folder, image), os.path.join(train_folder, image))

# Move remaining images to test folder
selected_lookup = set(selected_images)
remaining_images = [name for name in all_images if name not in selected_lookup]
for image in remaining_images:
    shutil.move(os.path.join(source_folder, image), os.path.join(test_folder, image))


In [9]:
# Index every image below "data_images" and count the samples.
data = GeneticDataset("data_images")
length_data = len(data)

# 80 % of the samples go to training, whatever is left goes to testing.
training_size = int(0.8*length_data)
testing_size = length_data - training_size
split_sizes = [training_size, testing_size]

# A dedicated, seeded generator keeps the partition identical between runs.
generator = torch.Generator().manual_seed(46)
train_data , test_data = random_split(data, split_sizes, generator=generator)

# Show both subset sizes: the first via print, the second as the cell result.
print(len(train_data))
len(test_data)

7976


1995

In [None]:

# Add binned site-frequency-spectrum (SFS) columns to the selection table:
# each text file is parsed, its allele-frequency counts are summed into bins,
# and the sums are written to the row whose selection coefficient matches the
# one encoded in the file name.
data = pd.read_table("data/data/data_table_selection.txt")
txt_folder = "data/data/txt_file_selection"

# (slot in `category`, slice start, slice stop) for every bin.
# NOTE(review): slot 3 is never filled, so "category_sfs_4" is always 0. This
# reproduces the original layout - confirm whether the gap is intended.
sfs_bins = [(0, 0, 1), (1, 1, 2), (2, 2, 4), (4, 4, 7),
            (5, 7, 10), (6, 10, 20), (7, 20, 50), (8, 50, None)]

for file_name in sorted(os.listdir(txt_folder)):
    # Skip anything that is not a result file (e.g. ".ipynb_checkpoints").
    if not file_name.endswith(".txt"):
        continue
    file_path = os.path.join(txt_folder, file_name)
    print(file_path)
    with open(file_path, "r") as handle:
        lines = handle.read().split("\n")

    # The counts sit between these two marker lines.
    start_index = lines.index(" The allele frequency numbers are :-  ") + 1
    end_index = next(i for i, line in enumerate(lines) if line.startswith("1. The Tajima'D for the given sequence is"))

    # Drop the first and last character (presumably the enclosing brackets -
    # TODO confirm) before splitting on whitespace.
    sfs_text = " ".join(lines[start_index:end_index])
    sfs = list(map(float, sfs_text[1:-1].split()))

    category = [0]*9
    for slot, start, stop in sfs_bins:
        category[slot] = sum(sfs[start:stop])

    # Read the coefficient from the file name only. Matching against the whole
    # path made the result depend on underscores in the directory names, so
    # float() received "selection/<name>" for this folder.
    coef_token = os.path.splitext(file_name)[0].rsplit("_", 1)[-1]
    try:
        coef = float(coef_token)
    except ValueError:
        raise ValueError(f"Cannot parse a coefficient from file name: {file_path!r}") from None

    for value in category:
        print(value)
    print(coef)

    # NOTE(review): exact float comparison - relies on the table and the file
    # name spelling the coefficient identically.
    row_mask = data["selection_coefficient"]==coef
    for j, value in enumerate(category, start=1):
        data.loc[row_mask,f"category_sfs_{j}"] = value
    

In [None]:

# Add binned site-frequency-spectrum (SFS) columns to the bottleneck table:
# each text file is parsed, its allele-frequency counts are summed into bins,
# and the sums are written to the row whose bottleneck intensity matches the
# one encoded in the file name.
data1 = pd.read_table("data/training_data/data_table_bottleneck.txt")
txt_folder = "data/training_data/txt_file_bottleneck"

# (slot in `category`, slice start, slice stop) for every bin.
# NOTE(review): slot 3 is never filled, so "category_sfs_4" is always 0. This
# reproduces the original layout - confirm whether the gap is intended.
sfs_bins = [(0, 0, 1), (1, 1, 2), (2, 2, 4), (4, 4, 7),
            (5, 7, 10), (6, 10, 20), (7, 20, 50), (8, 50, None)]

for file_name in sorted(os.listdir(txt_folder)):
    # Skip anything that is not a result file (e.g. ".ipynb_checkpoints").
    if not file_name.endswith(".txt"):
        continue
    file_path = os.path.join(txt_folder, file_name)
    print(file_path)
    with open(file_path, "r") as handle:
        lines = handle.read().split("\n")

    # The counts sit between these two marker lines.
    start_index = lines.index(" The allele frequency numbers are :-  ") + 1
    end_index = next(i for i, line in enumerate(lines) if line.startswith("1. The Tajima'D for the given sequence is"))

    # Drop the first and last character (presumably the enclosing brackets -
    # TODO confirm) before splitting on whitespace.
    sfs_text = " ".join(lines[start_index:end_index])
    sfs = list(map(float, sfs_text[1:-1].split()))

    category = [0]*9
    for slot, start, stop in sfs_bins:
        category[slot] = sum(sfs[start:stop])

    # Read the intensity from the file name only. Matching against the whole
    # path only worked while the directory names held exactly three
    # underscores.
    coef_token = os.path.splitext(file_name)[0].rsplit("_", 1)[-1]
    try:
        coef = float(coef_token)
    except ValueError:
        raise ValueError(f"Cannot parse a coefficient from file name: {file_path!r}") from None

    for value in category:
        print(value)
    print(coef)

    # NOTE(review): exact float comparison - relies on the table and the file
    # name spelling the intensity identically.
    row_mask = data1["bottleneck_intensity"]==coef
    for j, value in enumerate(category, start=1):
        data1.loc[row_mask,f"category_sfs_{j}"] = value
    


In [None]:
data_training , data_testing = 