In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder


In [None]:
def one_hot1(sequence):
    """One-hot encode a DNA sequence using a fixed A/C/G/T lookup table.

    Parameters
    ----------
    sequence : str or iterable of str
        DNA bases. Lookup is case-insensitive ('a' is treated as 'A').

    Returns
    -------
    numpy.ndarray
        Integer array of shape (len(sequence), 4) whose columns are ordered
        A, C, G, T. An empty sequence yields an array of shape (0, 4).

    Raises
    ------
    KeyError
        If a base is not one of A, C, G or T.
    """
    mapping = {'A': [1, 0, 0, 0],
               'C': [0, 1, 0, 0],
               'G': [0, 0, 1, 0],
               'T': [0, 0, 0, 1]}

    # Turn each base of the sequence into its corresponding vector
    rows = [mapping[base.upper()] for base in sequence]

    # The explicit dtype and reshape keep the result two-dimensional (and
    # integer-typed) even when the sequence is empty.
    return np.array(rows, dtype=int).reshape(-1, 4)

In [72]:
def one_hot2(sequence):
    """One-hot encode a DNA sequence with scikit-learn's OneHotEncoder.

    The encoder is given the fixed category list A, C, G, T, so every call
    returns four columns in that order. Letting the encoder learn its
    categories from each sequence separately would drop the column of any
    base that happens to be absent from that sequence.

    Parameters
    ----------
    sequence : str or iterable of str
        DNA bases. Input is upper-cased before encoding.

    Returns
    -------
    numpy.ndarray
        Float array of shape (len(sequence), 4) whose columns are ordered
        A, C, G, T. An empty sequence yields an array of shape (0, 4).

    Raises
    ------
    ValueError
        If the sequence contains a symbol other than A, C, G or T (the
        encoder's default handling of unknown categories).
    """
    seq_array = np.array(list(sequence))

    # The encoder cannot be fitted on zero samples, so answer directly.
    if seq_array.size == 0:
        return np.zeros((0, 4))

    # Upper-case so lowercase input maps onto the same fixed categories,
    # then shape as one column of samples as the encoder expects.
    bases = np.char.upper(seq_array).reshape(-1, 1)

    oe = OneHotEncoder(categories=[['A', 'C', 'G', 'T']], sparse_output=False)
    onehot_e = oe.fit_transform(bases)

    return onehot_e

In [100]:
def process_csv(file_path, function):
    """Read a CSV of DNA sequences and one-hot encode every sequence.

    Parameters
    ----------
    file_path : str or path-like
        CSV file to read. Its first line is used as the header row, and it
        must provide a 'sequence' column and a 'label' column.
    function : int
        Encoder selector: 1 uses ``one_hot1``, 2 uses ``one_hot2``.

    Returns
    -------
    tuple
        ``(encoded_sequences, encoded_df)``: the array of flattened encodings
        (one row per sequence) and a DataFrame holding the same values plus a
        trailing 'label' column.

    Raises
    ------
    ValueError
        If ``function`` is neither 1 nor 2.
    """
    # Reject an unknown selector before any I/O; otherwise no branch would
    # assign the encodings and the failure would be an obscure
    # UnboundLocalError further down.
    if function not in (1, 2):
        raise ValueError(
            f"function must be 1 (one_hot1) or 2 (one_hot2), got {function!r}"
        )

    # Read the CSV; the first line is taken as the column header
    df = pd.read_csv(file_path)

    encoder = one_hot1 if function == 1 else one_hot2

    # Apply the one-hot encoding to each sequence and flatten it into one row.
    # NOTE(review): stacking into a 2-D array assumes all sequences share the
    # same length - confirm against the input data.
    encoded_sequences = np.array([encoder(seq).flatten() for seq in df['sequence']])

    encoded_df = pd.DataFrame(encoded_sequences)

    # Append the label column to the DataFrame
    encoded_df['label'] = df['label'].values

    return encoded_sequences, encoded_df


In [101]:
# Usage example: encode the sequences in 'teste.csv' with one_hot1 (function=1)
csv_file = 'teste.csv'
encoded_sequences, encoded_df = process_csv(csv_file, 1)

In [102]:
# Display the DataFrame returned by process_csv (encoded columns plus 'label')
encoded_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,395,396,397,398,399,400,401,402,403,label
0,0,0,0,1,0,1,0,0,0,1,...,0,0,1,0,0,0,0,1,0,1
1,0,0,0,1,0,0,1,0,0,1,...,0,0,0,1,0,0,1,0,0,1
2,0,1,0,0,1,0,0,0,1,0,...,1,1,0,0,0,0,0,0,1,1
3,0,1,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,1,0,1
4,0,1,0,0,0,1,0,0,1,0,...,0,1,0,0,0,0,0,1,0,1
5,0,0,1,0,0,1,0,0,0,0,...,0,0,0,1,0,0,1,0,0,1
6,0,0,1,0,0,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0,1
7,0,1,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,1
8,0,1,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,1,0,0,1
9,0,0,0,1,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,1


In [103]:
# Encode the same CSV again, this time with one_hot2 (function=2)
encoded_sequences2, encoded_df2 = process_csv(csv_file, 2)

In [104]:
# Element-wise comparison of the arrays produced by the two encoders.
# NOTE(review): the displayed result is elided, so it does not by itself
# confirm that every entry matches; np.array_equal(encoded_sequences2,
# encoded_sequences) would reduce this to a single boolean.
encoded_sequences2 == encoded_sequences

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [105]:
# Element-wise comparison of the DataFrames produced by the two encoders.
# NOTE(review): the displayed result is elided, so it does not by itself
# confirm that every cell matches; (encoded_df2 == encoded_df).all().all()
# would reduce this to a single boolean.
encoded_df2 == encoded_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,395,396,397,398,399,400,401,402,403,label
0,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
5,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
6,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
7,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
8,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
9,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
