In [1]:
##############################################
#Geração de dados.
#Tiago Tambonis
#2018/2019
##############################################

In [2]:
import pickle

import numpy as np
import pandas as pd
from Bio import SeqIO #Para leitura das sequências
from pydpi.pypro import CTD
from pydpi.pypro import PyPro
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#Single PyPro instance shared by the descriptor helpers in this notebook; each
#helper loads a sequence into it with ReadProteinSequence before reading features.
protein = PyPro() #Global
#Default maximum lag handed to GetSOCN/GetQSO by the descriptor template at the
#end of the notebook.
kmaxlag = 19

### Vou deixar a composição de aminoácidos para facilitar o entendimento do código quando for adicionar outro descritor

In [3]:
#Geração das características

def getcaracteristicas(seqs, label):
    """Build the descriptor table for a list of peptide sequences.

    For every sequence, the dipeptide composition (``protein.GetDPComp``) and
    the amino-acid composition (``protein.GetAAComp``) are read from the
    module-level ``protein`` object and combined into a single table.

    Parameters
    ----------
    seqs : sequence
        Peptide sequences; each one is passed unchanged to
        ``protein.ReadProteinSequence``.
    label : str
        Prefix for the row names: the i-th sequence (1-based) is named
        ``label + str(i)``.

    Returns
    -------
    pandas.DataFrame
        One row per sequence and one column per descriptor, dipeptide
        descriptors first, followed by the amino-acid descriptors.

    Raises
    ------
    ValueError
        If ``seqs`` is empty.
    """
    if len(seqs) == 0:
        raise ValueError("seqs must contain at least one sequence")

    def _descritor(calcular, pep, kseq):
        #Load the peptide into the shared PyPro object and return the chosen
        #descriptor as a one-column frame indexed by descriptor name.
        protein.ReadProteinSequence(pep)
        valores = calcular()
        coluna = pd.DataFrame(list(valores.items()), columns=['PepAAComp', kseq])
        return coluna.set_index('PepAAComp')

    def _tabela(calcular):
        #One column per sequence, aligned on the descriptor names with a single
        #inner join instead of one merge per sequence.
        colunas = [_descritor(calcular, pep, kseq=label + str(i + 1))
                   for i, pep in enumerate(seqs)]
        return pd.concat(colunas, axis=1, join='inner')

    #Dipeptide composition
    DPCompData = _tabela(protein.GetDPComp)

    #Amino-acid composition
    AACompData = _tabela(protein.GetAAComp)

    #To add another descriptor, build its table with _tabela and append it here.
    Dados = pd.concat([DPCompData, AACompData])

    #Transpose so that rows are sequences and columns are descriptors.
    return Dados.T

# Leitura das sequências positivas

In [4]:
#Read the positive FASTA sequences and keep only the first 100 records.

seqspositivas = [
    record.seq
    for record in SeqIO.parse("LBtope_Fixed_non_redundant_Positive_pattern.txt.fasta", "fasta")
][:100]

In [5]:
#Descriptor table for the positive sequences; rows are named SeqPos1, SeqPos2, ...
TabelaDadosPositivas = getcaracteristicas(seqspositivas, "SeqPos")

# Início da comparação - Somente comparação

In [6]:
def getcaracteristicasDPComp(pep, kseq):
    """Return ``protein.GetDPComp()`` for ``pep`` as a one-column DataFrame.

    The peptide is loaded into the module-level ``protein`` object first. The
    result is indexed by 'PepAAComp' (the descriptor names) and its only
    column is named ``kseq``.
    """
    protein.ReadProteinSequence(pep)
    pares = protein.GetDPComp().items()
    return pd.DataFrame(pares, columns=['PepAAComp', kseq]).set_index('PepAAComp')

#Join the per-sequence dipeptide columns on the descriptor index, one sequence at a time.
for i, pep in enumerate(seqspositivas):
    coluna = getcaracteristicasDPComp(pep, kseq="Seq" + str(i + 1))
    if i > 0:
        DPCompData = DPCompData.merge(coluna, left_on='PepAAComp',
                                      right_on='PepAAComp', how='inner')
    else:
        DPCompData = coluna

In [7]:
def getcaracteristicasAAComp(pep, kseq):
    """Return ``protein.GetAAComp()`` for ``pep`` as a one-column DataFrame.

    The peptide is loaded into the module-level ``protein`` object first. The
    result is indexed by 'PepAAComp' (the descriptor names) and its only
    column is named ``kseq``.
    """
    protein.ReadProteinSequence(pep)
    pares = protein.GetAAComp().items()
    return pd.DataFrame(pares, columns=['PepAAComp', kseq]).set_index('PepAAComp')

#Join the per-sequence amino-acid columns on the descriptor index, one sequence at a time.
for i, pep in enumerate(seqspositivas):
    coluna = getcaracteristicasAAComp(pep, kseq="Seq" + str(i + 1))
    if i > 0:
        AACompData = AACompData.merge(coluna, left_on='PepAAComp',
                                      right_on='PepAAComp', how='inner')
    else:
        AACompData = coluna

In [8]:
#Preview the dipeptide table transposed, so that each row is one sequence.
DPCompData.T.head()

PepAAComp,GW,GV,GT,GS,GR,GQ,GP,GY,GG,GF,...,AN,AQ,AP,AS,AR,AT,AW,AV,AY,VK
Seq1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Seq2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.26,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Seq3,0.0,0.0,0.0,5.26,0.0,0.0,0.0,5.26,10.53,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.26,0.0,0.0
Seq4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.26,...,5.26,0.0,0.0,5.26,0.0,0.0,0.0,0.0,0.0,0.0
Seq5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
#Preview the amino-acid table transposed, so that each row is one sequence.
AACompData.T.head()

PepAAComp,A,C,E,D,G,F,I,H,K,M,L,N,Q,P,S,R,T,W,V,Y
Seq1,65.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,10.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,5.0
Seq2,35.0,0.0,30.0,5.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,5.0,10.0,0.0,0.0,0.0,5.0,0.0
Seq3,30.0,0.0,0.0,0.0,30.0,0.0,0.0,0.0,0.0,10.0,10.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,10.0,5.0
Seq4,35.0,0.0,0.0,5.0,5.0,5.0,10.0,0.0,5.0,5.0,0.0,5.0,5.0,5.0,10.0,0.0,5.0,0.0,0.0,0.0
Seq5,55.0,0.0,0.0,0.0,15.0,5.0,0.0,0.0,10.0,0.0,5.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0


In [10]:
#Table produced by getcaracteristicas, to compare against the two previews above.
TabelaDadosPositivas.head()

PepAAComp,GW,GV,GT,GS,GR,GQ,GP,GY,GG,GF,...,L,N,Q,P,S,R,T,W,V,Y
SeqPos1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,5.0
SeqPos2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,10.0,0.0,0.0,0.0,5.0,0.0
SeqPos3,0.0,0.0,0.0,5.26,0.0,0.0,0.0,5.26,10.53,0.0,...,10.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,10.0,5.0
SeqPos4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.26,...,0.0,5.0,5.0,5.0,10.0,0.0,5.0,0.0,0.0,0.0
SeqPos5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0


# Fim da comparação - Está ótimo! Concatenação correta

In [11]:
#Label every positive sequence with class 1.
TabelaDadosPositivas['Classe'] = [1] * TabelaDadosPositivas.shape[0]

In [12]:
#Report the table size (rows = sequences, columns = descriptors plus 'Classe').
#The former tail() call was dropped: it was not the last expression of the
#cell, so its result was discarded and never displayed.
print(TabelaDadosPositivas.shape)

(100, 421)


# Leitura das sequências negativas

In [13]:
#Read the negative FASTA sequences and keep only the first 100 records.

seqsnegativas = [
    record.seq
    for record in SeqIO.parse("LBtope_Fixed_non_redundant_Negative_pattern.txt.fasta", "fasta")
][:100]

In [14]:
#Descriptor table for the negative sequences; rows are named SeqNeg1, SeqNeg2, ...
TabelaDadosNegativas = getcaracteristicas(seqsnegativas, "SeqNeg")

In [15]:
#Label every negative sequence with class -1 and report the table size.
TabelaDadosNegativas['Classe'] = [-1] * TabelaDadosNegativas.shape[0]
print(TabelaDadosNegativas.shape)

(100, 421)


# Binding dados

In [16]:
#First rows of the positive table, now including the 'Classe' column.
TabelaDadosPositivas.head()

PepAAComp,GW,GV,GT,GS,GR,GQ,GP,GY,GG,GF,...,N,Q,P,S,R,T,W,V,Y,Classe
SeqPos1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,5.0,1
SeqPos2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,10.0,0.0,0.0,0.0,5.0,0.0,1
SeqPos3,0.0,0.0,0.0,5.26,0.0,0.0,0.0,5.26,10.53,0.0,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,10.0,5.0,1
SeqPos4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.26,...,5.0,5.0,5.0,10.0,0.0,5.0,0.0,0.0,0.0,1
SeqPos5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,1


In [17]:
#Last rows of the negative table, including the 'Classe' column.
TabelaDadosNegativas.tail()

PepAAComp,GW,GV,GT,GS,GR,GQ,GP,GY,GG,GF,...,N,Q,P,S,R,T,W,V,Y,Classe
SeqNeg96,0.0,0.0,0.0,0.0,0.0,5.26,0.0,0.0,0.0,0.0,...,15.0,5.0,5.0,10.0,0.0,15.0,0.0,0.0,0.0,-1
SeqNeg97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.26,...,10.0,0.0,5.0,5.0,0.0,20.0,0.0,10.0,5.0,-1
SeqNeg98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,10.0,5.0,0.0,0.0,5.0,0.0,-1
SeqNeg99,0.0,0.0,0.0,5.26,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,5.0,0.0,10.0,0.0,-1
SeqNeg100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,5.0,0.0,15.0,5.0,10.0,0.0,0.0,10.0,-1


In [18]:
#Stack the positive rows on top of the negative rows in a single table.
TabelaDados = pd.concat([TabelaDadosPositivas, TabelaDadosNegativas])

In [19]:
#Size of the combined table (positive rows plus negative rows).
TabelaDados.shape

(200, 421)

In [20]:
#The combined table starts with the positive sequences.
TabelaDados.head()

PepAAComp,GW,GV,GT,GS,GR,GQ,GP,GY,GG,GF,...,N,Q,P,S,R,T,W,V,Y,Classe
SeqPos1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,5.0,1
SeqPos2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,10.0,0.0,0.0,0.0,5.0,0.0,1
SeqPos3,0.0,0.0,0.0,5.26,0.0,0.0,0.0,5.26,10.53,0.0,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,10.0,5.0,1
SeqPos4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.26,...,5.0,5.0,5.0,10.0,0.0,5.0,0.0,0.0,0.0,1
SeqPos5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,1


In [21]:
#The combined table ends with the negative sequences.
TabelaDados.tail()

PepAAComp,GW,GV,GT,GS,GR,GQ,GP,GY,GG,GF,...,N,Q,P,S,R,T,W,V,Y,Classe
SeqNeg96,0.0,0.0,0.0,0.0,0.0,5.26,0.0,0.0,0.0,0.0,...,15.0,5.0,5.0,10.0,0.0,15.0,0.0,0.0,0.0,-1
SeqNeg97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.26,...,10.0,0.0,5.0,5.0,0.0,20.0,0.0,10.0,5.0,-1
SeqNeg98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,10.0,5.0,0.0,0.0,5.0,0.0,-1
SeqNeg99,0.0,0.0,0.0,5.26,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,5.0,0.0,10.0,0.0,-1
SeqNeg100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,5.0,0.0,15.0,5.0,10.0,0.0,0.0,10.0,-1


# Split treino-teste

In [22]:
#Feature matrix (every column except the label) and label vector.
#drop() is called with the `columns` keyword because current pandas no longer
#accepts the axis as a second positional argument.
X = np.array(TabelaDados.drop(columns=['Classe']))
y = np.array(TabelaDados['Classe'])

In [23]:
#Rows = sequences, columns = descriptors (label column removed).
X.shape

(200, 420)

In [24]:
#One label per sequence.
y.shape

(200,)

In [25]:
#Fraction of the rows held out for testing.
treinoteste = 0.1
#Stratify on y so both splits keep the class proportions; the fixed
#random_state makes the split reproducible. train_test_split is imported in
#the import cell at the top of the notebook.
X_treino, X_teste, y_treino, y_teste = train_test_split(X, y, test_size=treinoteste, stratify=y,
                                                       random_state=101)

# Saves

In [26]:
#Features 

In [27]:
#Work on a copy of the column labels: writing into TabelaDados.columns.values
#directly would modify the array backing the (nominally immutable) column Index.
features = TabelaDados.columns.values.copy()
#Rename the dipeptide column "NA" to "NAA".
#NOTE(review): presumably so that CSV readers do not parse the header "NA" as a missing value — confirm.
features[features == "NA"] = "NAA"

In [28]:
#Rebuild a labelled DataFrame for each split: descriptor columns followed by the class column.
DadosTreino = pd.DataFrame(np.hstack((X_treino, y_treino.reshape(-1, 1))), columns=features)

DadosTeste = pd.DataFrame(np.hstack((X_teste, y_teste.reshape(-1, 1))), columns=features)

In [29]:
#Report the size of the training and test tables, in that order.
for tabela in (DadosTreino, DadosTeste):
    print(tabela.shape)

(180, 421)
(20, 421)


In [30]:
#Save the main training and test tables as pickle files.

for nome_arquivo, tabela in (("DadosTreino", DadosTreino), ("DadosTeste", DadosTeste)):
    with open(nome_arquivo, "wb") as fp:   #Pickling
        pickle.dump(tabela, fp)

In [31]:
#Save the training table in the layout used for mRMR.

#Reorder the columns for mRMR: the last column ('Classe') goes first and the
#remaining columns keep their original order.
ultima = DadosTreino.shape[1] - 1
featuresmrmr = DadosTreino.columns.values[[ultima] + list(range(ultima))]

DadosTreinomrmrCSV = DadosTreino[featuresmrmr]

normalizar = False #Standardize the features (for use with libsvm)

if normalizar:

    scaler = StandardScaler()

    y_mrmr = np.array(DadosTreinomrmrCSV['Classe'])
    #Keyword form: current pandas no longer accepts the axis of drop() positionally.
    x_mrmr = np.array(DadosTreinomrmrCSV.drop(columns=['Classe']))
    #Only the descriptor columns are scaled; the class column is put back in front.
    DadosTreinomrmrCSV = pd.DataFrame(np.column_stack((y_mrmr,
                         scaler.fit_transform(x_mrmr))),
                         columns=DadosTreinomrmrCSV.columns)

#Write the CSV with a header row and without the row index.
DadosTreinomrmrCSV.to_csv("DadosTreinoCruCSVmrmr.csv", header=True, index=False)

In [32]:
#Check the mRMR layout: 'Classe' must be the first column.
DadosTreinomrmrCSV.head()

Unnamed: 0,Classe,GW,GV,GT,GS,GR,GQ,GP,GY,GG,...,L,N,Q,P,S,R,T,W,V,Y
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,15.0,0.0,0.0,5.0,5.0,0.0,20.0,0.0,0.0,0.0
1,1.0,0.0,5.26,0.0,10.53,0.0,0.0,0.0,0.0,5.26,...,0.0,0.0,15.0,0.0,10.0,0.0,0.0,10.0,5.0,5.0
2,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,5.26,0.0,0.0,...,5.0,10.0,0.0,10.0,10.0,0.0,15.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.26,0.0,...,10.0,5.0,0.0,10.0,0.0,0.0,5.0,0.0,5.0,5.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.26,...,10.0,0.0,0.0,0.0,5.0,10.0,10.0,0.0,5.0,0.0


In [35]:
#Original training table for comparison: 'Classe' is the last column here.
DadosTreino.head()

Unnamed: 0,GW,GV,GT,GS,GR,GQ,GP,GY,GG,GF,...,N,Q,P,S,R,T,W,V,Y,Classe
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.26,...,0.0,0.0,5.0,5.0,0.0,20.0,0.0,0.0,0.0,1.0
1,0.0,5.26,0.0,10.53,0.0,0.0,0.0,0.0,5.26,0.0,...,0.0,15.0,0.0,10.0,0.0,0.0,10.0,5.0,5.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,5.26,0.0,0.0,0.0,...,10.0,0.0,10.0,10.0,0.0,15.0,0.0,0.0,0.0,-1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.26,0.0,0.0,...,5.0,0.0,10.0,0.0,0.0,5.0,0.0,5.0,5.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.26,0.0,...,0.0,0.0,0.0,5.0,10.0,10.0,0.0,5.0,0.0,1.0


In [36]:
#Completion marker printed by the last code cell.
print("OK.")

OK.


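# Template para adicionar mais descritores. Converta esta célula de markdown para código para executá-la; a variável `seqs` (lista de sequências) precisa estar definida antes.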

if True:

    def getcaracteristicasDPComp(pep, kseq):
        """Return ``protein.GetDPComp()`` for ``pep`` as a one-column DataFrame
        indexed by 'PepAAComp', with the column named ``kseq``."""
        protein.ReadProteinSequence(pep)
        pares = protein.GetDPComp().items()
        return pd.DataFrame(pares, columns=['PepAAComp', kseq]).set_index('PepAAComp')

    #Join the per-sequence columns on the descriptor index, one sequence at a time.
    for i, pep in enumerate(seqs):
        coluna = getcaracteristicasDPComp(pep, kseq="Seq" + str(i + 1))
        if i > 0:
            DPCompData = DPCompData.merge(coluna, left_on='PepAAComp',
                                          right_on='PepAAComp', how='inner')
        else:
            DPCompData = coluna

    print(DPCompData.shape)
    
if False:

    def getcaracteristicasAAComp(pep, kseq):
        """Return ``protein.GetAAComp()`` for ``pep`` as a one-column DataFrame
        indexed by 'PepAAComp', with the column named ``kseq``."""
        protein.ReadProteinSequence(pep)
        pares = protein.GetAAComp().items()
        return pd.DataFrame(pares, columns=['PepAAComp', kseq]).set_index('PepAAComp')

    #Join the per-sequence columns on the descriptor index, one sequence at a time.
    for i, pep in enumerate(seqs):
        coluna = getcaracteristicasAAComp(pep, kseq="Seq" + str(i + 1))
        if i > 0:
            AACompData = AACompData.merge(coluna, left_on='PepAAComp',
                                          right_on='PepAAComp', how='inner')
        else:
            AACompData = coluna

    print(AACompData.shape)


    def getcaracteristicasMoreauBrotoAuto(pep, kseq):
        """Return ``protein.GetMoreauBrotoAuto()`` for ``pep`` as a one-column
        DataFrame indexed by 'PepAAComp', with the column named ``kseq``."""
        protein.ReadProteinSequence(pep)
        pares = protein.GetMoreauBrotoAuto().items()
        return pd.DataFrame(pares, columns=['PepAAComp', kseq]).set_index('PepAAComp')

    #Join the per-sequence columns on the descriptor index, one sequence at a time.
    for i, pep in enumerate(seqs):
        coluna = getcaracteristicasMoreauBrotoAuto(pep, kseq="Seq" + str(i + 1))
        if i > 0:
            MoreauBrotoAutoData = MoreauBrotoAutoData.merge(coluna, left_on='PepAAComp',
                                                            right_on='PepAAComp', how='inner')
        else:
            MoreauBrotoAutoData = coluna

    print(MoreauBrotoAutoData.shape)

    def getcaracteristicasMoranAuto(pep, kseq):
        """Return ``protein.GetMoranAuto()`` for ``pep`` as a one-column
        DataFrame indexed by 'PepAAComp', with the column named ``kseq``."""
        protein.ReadProteinSequence(pep)
        pares = protein.GetMoranAuto().items()
        return pd.DataFrame(pares, columns=['PepAAComp', kseq]).set_index('PepAAComp')

    #Join the per-sequence columns on the descriptor index, one sequence at a time.
    for i, pep in enumerate(seqs):
        coluna = getcaracteristicasMoranAuto(pep, kseq="Seq" + str(i + 1))
        if i > 0:
            MoranAutoData = MoranAutoData.merge(coluna, left_on='PepAAComp',
                                                right_on='PepAAComp', how='inner')
        else:
            MoranAutoData = coluna

    print(MoranAutoData.shape)

    def getcaracteristicasGearyAuto(pep, kseq):
        """Return ``protein.GetGearyAuto()`` for ``pep`` as a one-column
        DataFrame indexed by 'PepAAComp', with the column named ``kseq``."""
        protein.ReadProteinSequence(pep)
        pares = protein.GetGearyAuto().items()
        return pd.DataFrame(pares, columns=['PepAAComp', kseq]).set_index('PepAAComp')

    #Join the per-sequence columns on the descriptor index, one sequence at a time.
    for i, pep in enumerate(seqs):
        coluna = getcaracteristicasGearyAuto(pep, kseq="Seq" + str(i + 1))
        if i > 0:
            GearyAutoData = GearyAutoData.merge(coluna, left_on='PepAAComp',
                                                right_on='PepAAComp', how='inner')
        else:
            GearyAutoData = coluna

    print(GearyAutoData.shape)

    def getcaracteristicasCTD(pep, kseq):
        """Return ``CTD.CalculateCTD(str(pep))`` as a one-column DataFrame
        indexed by 'PepAAComp', with the column named ``kseq``.

        Unlike the other helpers, this one does not go through ``protein``;
        the sequence is converted to ``str`` and handed to ``CTD`` directly.
        """
        pares = CTD.CalculateCTD(str(pep)).items()
        return pd.DataFrame(pares, columns=['PepAAComp', kseq]).set_index('PepAAComp')

    #Join the per-sequence columns on the descriptor index, one sequence at a time.
    for i, pep in enumerate(seqs):
        coluna = getcaracteristicasCTD(pep, kseq="Seq" + str(i + 1))
        if i > 0:
            CTDData = CTDData.merge(coluna, left_on='PepAAComp',
                                    right_on='PepAAComp', how='inner')
        else:
            CTDData = coluna

    print(CTDData.shape)

    def getcaracteristicasSOCN(pep, kseq, kmaxlag=kmaxlag):
        """Return ``protein.GetSOCN(maxlag=kmaxlag)`` for ``pep`` as a
        one-column DataFrame indexed by 'PepAAComp', with the column named
        ``kseq``. ``kmaxlag`` defaults to the module-level ``kmaxlag``."""
        protein.ReadProteinSequence(pep)
        pares = protein.GetSOCN(maxlag=kmaxlag).items()
        return pd.DataFrame(pares, columns=['PepAAComp', kseq]).set_index('PepAAComp')

    #Join the per-sequence columns on the descriptor index, one sequence at a time.
    for i, pep in enumerate(seqs):
        coluna = getcaracteristicasSOCN(pep, kseq="Seq" + str(i + 1))
        if i > 0:
            SOCNData = SOCNData.merge(coluna, left_on='PepAAComp',
                                      right_on='PepAAComp', how='inner')
        else:
            SOCNData = coluna

    print(SOCNData.shape)

    def getcaracteristicasQSO(pep, kseq, kmaxlag=kmaxlag):
        """Return ``protein.GetQSO(maxlag=kmaxlag)`` for ``pep`` as a
        one-column DataFrame indexed by 'PepAAComp', with the column named
        ``kseq``. ``kmaxlag`` defaults to the module-level ``kmaxlag``."""
        protein.ReadProteinSequence(pep)
        pares = protein.GetQSO(maxlag=kmaxlag).items()
        return pd.DataFrame(pares, columns=['PepAAComp', kseq]).set_index('PepAAComp')

    #Join the per-sequence columns on the descriptor index, one sequence at a time.
    for i, pep in enumerate(seqs):
        coluna = getcaracteristicasQSO(pep, kseq="Seq" + str(i + 1))
        if i > 0:
            QSOData = QSOData.merge(coluna, left_on='PepAAComp',
                                    right_on='PepAAComp', how='inner')
        else:
            QSOData = coluna

    print(QSOData.shape)