In [1]:
# create dataset
from torch.utils.data import Dataset
import pickle
import numpy as np
from sklearn.model_selection import train_test_split



In [2]:
# Input files: one peptide per line, positives and negatives kept separately.
path_pos = 'pos_A0201.txt'
path_neg = 'neg_A0201.txt'

# Size of the amino alphabet and the expected number of aminos per peptide.
num_of_aminos = 20
peptide_length = 9

# Maps each single-letter amino code to its slot in the one-hot vector.
# The position of a letter in the string below is its index (A -> 0, ..., V -> 19).
amino_vocab = {
    letter: slot for slot, letter in enumerate('ARNDCQEGHILKMFPSTWYV')
}

In [3]:
def peptide_to_one_hot(peptide):
    """Encode a peptide as a single flat one-hot list.

    Every element of ``peptide`` is encoded with ``amino_to_one_hot`` and the
    per-amino vectors are concatenated in order, so the result has
    ``len(peptide) * num_of_aminos`` entries.
    """
    return [
        bit
        for residue in peptide
        for bit in amino_to_one_hot(residue)
    ]


def amino_to_one_hot(amino):
    """Return the one-hot vector (list of ``num_of_aminos`` ints) for one amino.

    The position set to 1 is looked up in ``amino_vocab``; a letter missing
    from the vocabulary raises ``KeyError``.
    """
    vector = [0 for _ in range(num_of_aminos)]
    hot_slot = amino_vocab[amino]
    vector[hot_slot] = 1
    return vector


def encode_data(data):
    """One-hot encode every peptide in ``data``.

    Returns a list with one flat encoding (see ``peptide_to_one_hot``) per
    input peptide, in the same order.
    """
    return [peptide_to_one_hot(sequence) for sequence in data]


def read_file(file_name):
    """Read a text file and return its non-empty lines without surrounding whitespace.

    Args:
        file_name: path of the text file to read (one entry per line).

    Returns:
        list of str, in file order. Blank and whitespace-only lines are
        skipped so they do not turn into empty entries that would later be
        counted as samples and encoded as zero-length vectors.

    Raises:
        OSError: if the file cannot be opened.
    """
    with open(file_name) as file:
        # Strip both ends: stray leading whitespace would otherwise reach the
        # vocabulary lookup as an unknown character.
        stripped_lines = (line.strip() for line in file)
        return [line for line in stripped_lines if line]


In [4]:
class MyDataset:
    """ex1 dataset: one-hot encoded peptides with binary labels.

    Attributes:
        data: list of flat one-hot encodings; positives come first, then
            negatives, until ``shuffle`` is called.
        labels: list of ints aligned with ``data`` (1 = positive, 0 = negative).
        num_of_positive: number of peptides read from the positive file.
        num_of_negative: number of peptides read from the negative file.
        X_train, Y_train, X_test, Y_test: empty until ``split_train_test`` runs.
    """

    def __init__(self, path_pos, path_neg):
        """Load and encode both files.

        Args:
            path_pos: path of the file holding positive peptides.
            path_neg: path of the file holding negative peptides.
        """
        self.num_of_positive = 0
        self.num_of_negative = 0
        # Must be called through ``self``: it is a method and also fills in
        # the two counters above.
        self.data, self.labels = self.get_dataset_as_array(path_pos, path_neg)
        self.X_train = []
        self.Y_train = []
        self.X_test = []
        self.Y_test = []

    def __str__(self):
        """Return a three-line summary: total size and per-class count/percentage."""
        total = len(self.data)
        # Guard the ratios so that printing an empty dataset does not raise.
        positive_ratio = self.num_of_positive / total if total else 0.0
        negative_ratio = self.num_of_negative / total if total else 0.0
        return (
            f"number of samples in the data: {total}\n"
            f"number of positive samples in the data: "
            f"{self.num_of_positive} ({positive_ratio:.2%})\n"
            f"number of negative samples in the data: "
            f"{self.num_of_negative} ({negative_ratio:.2%})"
        )

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the encoded sample at ``idx`` (the label is in ``self.labels[idx]``)."""
        return self.data[idx]

    def split_train_test(self, test_size=0.1, random_state=1):
        """Split ``data``/``labels`` into the train and test attributes.

        Args:
            test_size: forwarded to ``train_test_split`` (default 0.1).
            random_state: forwarded to ``train_test_split`` (default 1).
        """
        self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(
            self.data, self.labels, test_size=test_size, random_state=random_state
        )

    @staticmethod
    def _pair_samples(features, targets):
        """Build an ``(n, 2)`` object array whose rows are ``(sample, target)``."""
        # Filled cell by cell: letting NumPy infer the shape from
        # (sequence, scalar) tuples is ragged and version dependent.
        pairs = np.empty((len(features), 2), dtype=object)
        for row, (sample, target) in enumerate(zip(features, targets)):
            pairs[row, 0] = sample
            pairs[row, 1] = target
        return pairs

    def get_train_data(self):
        """Return the training split as an ``(n, 2)`` object array of (x, y) rows."""
        return self._pair_samples(self.X_train, self.Y_train)

    def get_test_data(self):
        """Return the test split as an ``(n, 2)`` object array of (x, y) rows."""
        return self._pair_samples(self.X_test, self.Y_test)

    def shuffle(self):
        """Randomly reorder ``data`` and ``labels`` with the same permutation."""
        order = np.random.permutation(len(self.data))
        # ``data`` and ``labels`` are plain lists, so they are rebuilt element
        # by element; both use ``order`` to keep samples and labels aligned.
        self.data = [self.data[position] for position in order]
        self.labels = [self.labels[position] for position in order]

    def get_dataset_as_array(self, path_pos, path_neg):
        """Read both files, encode the peptides and build the label list.

        Also stores the per-class counts in ``num_of_positive`` and
        ``num_of_negative``.

        Returns:
            tuple ``(data_encoded, labels)`` where positives (label 1) come
            before negatives (label 0).
        """
        pos = read_file(path_pos)
        neg = read_file(path_neg)
        self.num_of_positive = len(pos)
        self.num_of_negative = len(neg)
        data_encoded = encode_data(pos + neg)
        labels = [1] * len(pos) + [0] * len(neg)
        return (data_encoded, labels)

    def get_weights(self, x=1, y=1):
        """Return one sampling weight per sample.

        Each class is weighted by 1 minus its share of the data, so the rarer
        class gets the larger weight.

        Args:
            x: extra factor applied to the weight of label 0 (negative).
            y: extra factor applied to the weight of label 1 (positive).

        Returns:
            list of floats aligned with ``labels``; empty for an empty dataset.
        """
        total = len(self.data)
        if total == 0:
            return []
        negative_share = self.num_of_negative / total
        # Indexed by label: [weight of label 0, weight of label 1].
        class_weights = [(1 - negative_share) * x, negative_share * y]
        return [class_weights[label] for label in self.labels]


In [5]:
def label_names():
    """Return a new dict mapping each numeric label to its readable name."""
    names_by_label = dict(enumerate(('negative', 'positive')))
    return names_by_label



In [6]:
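# useful for using data-loaders
# NOTE: disabled and out of date -- it calls get_dataset_as_array(path) and
# MyDataset(dataset_as_array), which do not match the signatures defined in
# this notebook (MyDataset takes path_pos and path_neg). Update before re-enabling.
# def get_dataset_as_torch_dataset(path='./data/dataset.pickle'):
#     dataset_as_array = get_dataset_as_array(path)
#     dataset = MyDataset(dataset_as_array)
#     return dataset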





' hi \n moshe'