In [1]:
##############################################
#Geração de dados.
#Tiago Tambonis
#2018/2019
##############################################

In [2]:
from Bio import SeqIO #Para leitura das sequências
import numpy as np
from pydpi.pypro import PyPro
from pydpi.pypro import CTD
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler

protein = PyPro() #Global
kmaxlag = 19

In [3]:
#Geração das características

def getcaracteristicas(seqs, label): 
    
    #Composição de Dipeptídeos
    def getcaracteristicasDPComp(pep, kseq):

        protein.ReadProteinSequence(pep)

        kDPComp = protein.GetDPComp()
        kDPComp = pd.DataFrame(kDPComp.items(), columns=['PepAAComp', kseq])
        kDPComp = kDPComp.set_index('PepAAComp')

        return(kDPComp)

    for i in range(len(seqs)):

        if i==0: DPCompData = getcaracteristicasDPComp(seqs[i], kseq=str(label+str(i+1)))
        else: DPCompData = DPCompData.merge(getcaracteristicasDPComp(seqs[i], kseq=str(label+str(i+1))), 
                                left_on='PepAAComp', right_on='PepAAComp', how='inner')
                
    Dados = [DPCompData] #Quando for colocar outros descritores é só adicionar aqui
    Dados = pd.concat(Dados)
    Dados = Dados.T #Transpor para adequar aos pacotes
    
    return(Dados)

# Leitura das sequências positivas

In [4]:
#Leitura das sequências FASTA

seqspositivas = []
for record in SeqIO.parse("Dados/LBtope_Fixed_non_redundant_Positive_pattern.txt.fasta", "fasta"):
    #print(record.seq)
    seqspositivas.append(record.seq)
#seqspositivas = seqspositivas[0:100]

In [5]:
TabelaDadosPositivas = getcaracteristicas(seqs=seqspositivas, label="SeqPos")

In [6]:
TabelaDadosPositivas['Classe'] = np.repeat(1, TabelaDadosPositivas.shape[0]).tolist()

# Leitura das sequências negativas

In [7]:
#Leitura das sequências FASTA

seqsnegativas = []
for record in SeqIO.parse("Dados/LBtope_Fixed_non_redundant_Negative_pattern.txt.fasta", "fasta"):
    #print(record.seq)
    seqsnegativas.append(record.seq)
#seqsnegativas = seqsnegativas[0:100]

In [8]:
TabelaDadosNegativas = getcaracteristicas(seqs=seqsnegativas, label="SeqNeg")

In [9]:
TabelaDadosNegativas['Classe'] = np.repeat(-1, TabelaDadosNegativas.shape[0]).tolist()

# Binding dados

In [10]:
TabelaDadosPositivas.head()

PepAAComp,GW,GV,GT,GS,GR,GQ,GP,GY,GG,GF,...,AQ,AP,AS,AR,AT,AW,AV,AY,VK,Classe
SeqPos1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
SeqPos2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
SeqPos3,0.0,0.0,0.0,5.26,0.0,0.0,0.0,5.26,10.53,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.26,0.0,0.0,1
SeqPos4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.26,...,0.0,0.0,5.26,0.0,0.0,0.0,0.0,0.0,0.0,1
SeqPos5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [11]:
TabelaDadosNegativas.tail()

PepAAComp,GW,GV,GT,GS,GR,GQ,GP,GY,GG,GF,...,AQ,AP,AS,AR,AT,AW,AV,AY,VK,Classe
SeqNeg7847,0.0,0.0,0.0,0.0,0.0,21.05,0.0,0.0,0.0,0.0,...,0.0,0.0,5.26,0.0,0.0,0.0,0.0,0.0,0.0,-1
SeqNeg7848,0.0,0.0,0.0,0.0,0.0,10.53,0.0,5.26,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1
SeqNeg7849,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.26,0.0,0.0,0.0,0.0,0.0,0.0,-1
SeqNeg7850,0.0,0.0,5.26,0.0,0.0,0.0,5.26,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1
SeqNeg7851,0.0,0.0,5.26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1


In [12]:
TabelaDados = [TabelaDadosPositivas, TabelaDadosNegativas]
TabelaDados = pd.concat(TabelaDados)

In [13]:
TabelaDados.head()

PepAAComp,GW,GV,GT,GS,GR,GQ,GP,GY,GG,GF,...,AQ,AP,AS,AR,AT,AW,AV,AY,VK,Classe
SeqPos1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
SeqPos2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
SeqPos3,0.0,0.0,0.0,5.26,0.0,0.0,0.0,5.26,10.53,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.26,0.0,0.0,1
SeqPos4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.26,...,0.0,0.0,5.26,0.0,0.0,0.0,0.0,0.0,0.0,1
SeqPos5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [14]:
TabelaDadosPositivas.head()

PepAAComp,GW,GV,GT,GS,GR,GQ,GP,GY,GG,GF,...,AQ,AP,AS,AR,AT,AW,AV,AY,VK,Classe
SeqPos1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
SeqPos2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
SeqPos3,0.0,0.0,0.0,5.26,0.0,0.0,0.0,5.26,10.53,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.26,0.0,0.0,1
SeqPos4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.26,...,0.0,0.0,5.26,0.0,0.0,0.0,0.0,0.0,0.0,1
SeqPos5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [15]:
TabelaDados.tail()

PepAAComp,GW,GV,GT,GS,GR,GQ,GP,GY,GG,GF,...,AQ,AP,AS,AR,AT,AW,AV,AY,VK,Classe
SeqNeg7847,0.0,0.0,0.0,0.0,0.0,21.05,0.0,0.0,0.0,0.0,...,0.0,0.0,5.26,0.0,0.0,0.0,0.0,0.0,0.0,-1
SeqNeg7848,0.0,0.0,0.0,0.0,0.0,10.53,0.0,5.26,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1
SeqNeg7849,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.26,0.0,0.0,0.0,0.0,0.0,0.0,-1
SeqNeg7850,0.0,0.0,5.26,0.0,0.0,0.0,5.26,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1
SeqNeg7851,0.0,0.0,5.26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1


In [16]:
TabelaDadosNegativas.tail()

PepAAComp,GW,GV,GT,GS,GR,GQ,GP,GY,GG,GF,...,AQ,AP,AS,AR,AT,AW,AV,AY,VK,Classe
SeqNeg7847,0.0,0.0,0.0,0.0,0.0,21.05,0.0,0.0,0.0,0.0,...,0.0,0.0,5.26,0.0,0.0,0.0,0.0,0.0,0.0,-1
SeqNeg7848,0.0,0.0,0.0,0.0,0.0,10.53,0.0,5.26,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1
SeqNeg7849,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.26,0.0,0.0,0.0,0.0,0.0,0.0,-1
SeqNeg7850,0.0,0.0,5.26,0.0,0.0,0.0,5.26,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1
SeqNeg7851,0.0,0.0,5.26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1


# Split treino-teste

In [17]:
X = np.array(TabelaDados.drop(['Classe'], 1))
y = np.array(TabelaDados['Classe'])

In [18]:
X.shape

(15671, 400)

In [19]:
y.shape

(15671,)

In [20]:
treinoteste = 0.1
from sklearn.model_selection import train_test_split  
X_treino, X_teste, y_treino, y_teste = train_test_split(X, y, test_size=treinoteste,
                                                        stratify=y, random_state=101)

# Saves

In [21]:
#Features 

features = TabelaDados.columns.values
features[np.where(features=="NA")[0]]="NAA"

In [22]:
DadosTreino = pd.DataFrame(np.column_stack((X_treino, y_treino)), 
                              columns=features)

DadosTeste = pd.DataFrame(np.column_stack((X_teste, y_teste)), 
                              columns=features)

In [23]:
print(DadosTreino.shape)
print(DadosTeste.shape)

(14103, 401)
(1568, 401)


In [24]:
#Salvar tabela de dados principal 

with open("Dados/DadosTreino", "wb") as fp:   #Pickling
    pickle.dump(DadosTreino, fp)   
with open("Dados/DadosTeste", "wb") as fp:   #Pickling
    pickle.dump(DadosTeste, fp) 

In [25]:
#Salvar no formato adequado am mRMR.

if True: 
    
    #Manipular as features para adequar ao mRMR.
    
    featuresmrmr = []
    featuresmrmr.append(DadosTreino.shape[1]-1)
    for i in range(DadosTreino.shape[1]-1):
        featuresmrmr.append(i)

    featuresmrmr = DadosTreino.columns.values[featuresmrmr]
    
    DadosTreinomrmrCSV = DadosTreino[featuresmrmr]

    normalizar = False #Normalizar para usar no libsvm
  
    if normalizar: 
        
        scaler = StandardScaler()

        y_mrmr = np.array(DadosTreinomrmrCSV['Classe'])
        x_mrmr = np.array(DadosTreinomrmrCSV.drop(['Classe'], 1))
        DadosTreinomrmrCSV = pd.DataFrame(np.column_stack((y_mrmr, 
                             scaler.fit_transform(x_mrmr))), 
                             columns=DadosTreinomrmrCSV.columns)

    #Salvar no formato adequado ao mrmr. 
    DadosTreinomrmrCSV.to_csv("Dados/DadosTreinomrmr.csv", header=True, index=False)
    
    DadosTreinomrmrCSV.head()

In [26]:
DadosTreinomrmrCSV.head()

Unnamed: 0,Classe,GW,GV,GT,GS,GR,GQ,GP,GY,GG,...,AN,AQ,AP,AS,AR,AT,AW,AV,AY,VK
0,1.0,0.0,5.26,5.26,0.0,0.0,0.0,0.0,0.0,5.26,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.26,0.0
1,1.0,0.0,5.26,0.0,0.0,0.0,0.0,0.0,0.0,5.26,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,5.26,0.0,0.0,...,0.0,0.0,5.26,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
DadosTreino.head()

Unnamed: 0,GW,GV,GT,GS,GR,GQ,GP,GY,GG,GF,...,AQ,AP,AS,AR,AT,AW,AV,AY,VK,Classe
0,0.0,5.26,5.26,0.0,0.0,0.0,0.0,0.0,5.26,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.26,0.0,1.0
1,0.0,5.26,0.0,0.0,0.0,0.0,0.0,0.0,5.26,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,5.26,0.0,0.0,0.0,...,0.0,5.26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.26,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [28]:
print("OK.")

OK.
