# Building the Dataset for Machine Learning

In [1]:
# Import necessary packages
import pandas as pd
import numpy as np

from propythia.protein.sequence import ReadSequence
from propythia.protein.descriptors import ProteinDescritors

## Building the Positive and Negative CD-HIT Datasets

In [2]:
# Set a function to parse a fasta file
def parse_fasta(fasta_file):
    """
    Parses a FASTA file and returns a list of tuples containing accession IDs and sequences.

    The accession ID is the first whitespace-delimited token after '>' on a
    header line. The stripped sequence lines that follow a header are
    concatenated into a single string. Blank lines are skipped, and any lines
    that appear before the first header are ignored.

    Parameters:
    fasta_file (str): Path to the FASTA file.

    Returns:
    list: A list of (accession_id, sequence) tuples, in file order.

    Raises:
    ValueError: If a header line contains no identifier after '>'.
    """
    records = []
    current_id = None
    chunks = []  # sequence pieces of the record being read; joined once per record
    with open(fasta_file, 'r') as handle:
        for line_number, raw_line in enumerate(handle, start=1):
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith('>'):
                # A new header closes the record that was being accumulated
                if current_id is not None:
                    records.append((current_id, ''.join(chunks)))
                header_tokens = line[1:].split()
                if not header_tokens:
                    # Report the offending line instead of failing with a bare IndexError
                    raise ValueError(
                        f"{fasta_file}: header on line {line_number} has no identifier after '>'"
                    )
                current_id = header_tokens[0]  # first word after '>'
                chunks = []
            elif current_id is not None:
                chunks.append(line)
    # Flush the final record, which has no following header to close it
    if current_id is not None:
        records.append((current_id, ''.join(chunks)))
    return records

# Shared builder used by the positive and negative DataFrame functions below
def _create_labeled_dataframe(sequences, target):
    """
    Builds a DataFrame from (accession ID, sequence) tuples and labels every row.

    Parameters:
    sequences (list): A list of tuples, where each tuple contains an accession ID and a sequence.
    target (str): Label written to the 'target' column of every row.

    Returns:
    DataFrame: A pandas DataFrame with columns 'accession_id', 'sequence', and 'target'.
    """
    df = pd.DataFrame(sequences, columns=['accession_id', 'sequence'])
    df['target'] = target
    return df

# Set a function to create the positive DataFrame
def create_positive_dataframe(sequences):
    """
    Creates the positive-class DataFrame from a list of sequences.

    Parameters:
    sequences (list): A list of tuples, where each tuple contains an accession ID and a sequence.

    Returns:
    DataFrame: A pandas DataFrame with columns 'accession_id', 'sequence', and 'target',
    where 'target' is 'endolysin' for every row.
    """
    return _create_labeled_dataframe(sequences, 'endolysin')

# Set a function to create the negative DataFrame
def create_negative_dataframe(sequences):
    """
    Creates the negative-class DataFrame from a list of sequences.

    Parameters:
    sequences (list): A list of tuples, where each tuple contains an accession ID and a sequence.

    Returns:
    DataFrame: A pandas DataFrame with columns 'accession_id', 'sequence', and 'target',
    where 'target' is 'not_endolysin' for every row.
    """
    return _create_labeled_dataframe(sequences, 'not_endolysin')

### Positive dataset

In [3]:
# FASTA file holding the positive (endolysin) CD-HIT dataset
fasta_file_name = 'positive_dataset_cdhit.fasta'

# Read the (accession_id, sequence) records and label them as the positive class
sequences = parse_fasta(fasta_file_name)
df_positive = create_positive_dataframe(sequences)

# Report the (rows, columns) shape, then preview the first rows as the cell output
print("DataFrame dimensions:", df_positive.shape)
df_positive.head(5)

DataFrame dimensions: (2758, 3)


Unnamed: 0,accession_id,sequence,target
0,XBA09021.1,MKLSQRGVETLGITDAVDISPYITTETTQNQFDALTSLATDIGIET...,endolysin
1,XAV38393.1,MFKFSQKSLNNLKGVKPQLVKVVERALQLSPVDFGVREGLRTVEQQ...,endolysin
2,XAV38170.1,MKITKDILITGTGCTTDRAIKWLDDVQAAMDKFHIESPRAIAAYLA...,endolysin
3,WWT34990.1,MVEIINKTVTRGVAGRRPGAVKGVVFHNTWGNSTAKQEANRLAAMN...,endolysin
4,WVX91681.1,MYLNDYVGKFIKEDNYYGYQSTDLVSNYVQRLTLGRYKTKLNANKM...,endolysin


### Negative Datasets

Original dataset

In [4]:
# FASTA file holding the negative (non-endolysin) CD-HIT dataset
# NOTE: `fasta_file_name` and `sequences` are rebound here; they no longer refer to the positive data
fasta_file_name = 'negative_dataset_cdhit.fasta'

# Read the (accession_id, sequence) records and label them as the negative class
sequences = parse_fasta(fasta_file_name)
df_negative = create_negative_dataframe(sequences)

# Report the (rows, columns) shape, then preview the first rows as the cell output
print("DataFrame dimensions:", df_negative.shape)
df_negative.head(5)

DataFrame dimensions: (57389, 3)


Unnamed: 0,accession_id,sequence,target
0,QBR30802.1,MIGMDRHTGQPISGIEHLRQSVADILGTPLLSRRERPEYGSKLRRM...,not_endolysin
1,QBR30574.1,MIGIDRDSGATVDDWLQFVQRATRALTTPLGTRQKRPLYGSLIPTL...,not_endolysin
2,QBR21635.1,MIYKNTAVHFDVNAQVKRSVSANIQFSTQDIGTAKLSFNLTKDGVP...,not_endolysin
3,QBR19749.1,MIYKDTDIHFSINSQIKRSIAANIQFSTQDIDTAKLTFSLTKDGIP...,not_endolysin
4,WP_068457916.1,MLDKPEREEIRYGVTPYGFRRKLYAEALAERMSRAKEVFGVNIDLS...,not_endolysin


Randomly sampled negative dataset with twice as many entries as the positive dataset

In [5]:
# Size of the negative subset: twice the number of positive entries
num_rows_to_select = len(df_positive) * 2

# Draw that many rows from the full negative dataset; the fixed seed makes the draw reproducible
df_negative_double = df_negative.sample(random_state=1, n=num_rows_to_select)

In [6]:
# Report the (rows, columns) shape of the double-sized negative sample
print("Double negative entries DataFrame dimensions:", df_negative_double.shape)

# Preview the first sampled rows; they keep their row labels from df_negative
df_negative_double.head(5)

Double negative entries DataFrame dimensions: (5516, 3)


Unnamed: 0,accession_id,sequence,target
39795,WP_099022634.1,MSDELAQLLSVLDADQARLAQLEKYATGTQPLAFLSPEAKAALGNR...,not_endolysin
14008,WP_042871121.1,MSQTKTPQAEKCLNHYNTLLAKAFGVPENALAKQFSVSEPMEMVLR...,not_endolysin
37748,WP_109024242.1,MKMSTIPTLLGPDGMTSLREYAGYHGGGSGFGGQLRAWNPPGESVD...,not_endolysin
26466,AZF89591.1,MVKKKRPPIQFNDEQLLLQASNVADIYHQLALDLFDNVVERVTERG...,not_endolysin
31876,TBW66254.1,MKQGKQRALGRIKSSFLKWLGVPISLTDGSFWSAWGGMGSSSGETV...,not_endolysin


Randomly sampled negative dataset with the same number of entries as the positive dataset

In [7]:
# Size of the negative subset: the same number of entries as the positive dataset
# NOTE: this rebinds `num_rows_to_select`, which previously held the double-sized count
num_rows_to_select = len(df_positive)

# Draw that many rows from the full negative dataset using the same seed as the double-sized sample
# NOTE(review): with an identical seed the displayed head rows match the double-sized sample,
# so the two negative subsets appear to overlap - confirm this is intended
df_negative_equal = df_negative.sample(random_state=1, n=num_rows_to_select)

In [8]:
# Report the (rows, columns) shape of the equal-sized negative sample
print("Equal negative entries DataFrame dimensions:", df_negative_equal.shape)

# Preview the first sampled rows; they keep their row labels from df_negative
df_negative_equal.head(5)

Equal negative entries DataFrame dimensions: (2758, 3)


Unnamed: 0,accession_id,sequence,target
39795,WP_099022634.1,MSDELAQLLSVLDADQARLAQLEKYATGTQPLAFLSPEAKAALGNR...,not_endolysin
14008,WP_042871121.1,MSQTKTPQAEKCLNHYNTLLAKAFGVPENALAKQFSVSEPMEMVLR...,not_endolysin
37748,WP_109024242.1,MKMSTIPTLLGPDGMTSLREYAGYHGGGSGFGGQLRAWNPPGESVD...,not_endolysin
26466,AZF89591.1,MVKKKRPPIQFNDEQLLLQASNVADIYHQLALDLFDNVVERVTERG...,not_endolysin
31876,TBW66254.1,MKQGKQRALGRIKSSFLKWLGVPISLTDGSFWSAWGGMGSSSGETV...,not_endolysin


## Generate Descriptors with **Propythia**

- Application of Propythia on both positive and negative CD-HIT datasets

Positive Dataset

In [9]:
# Preprocess the positive sequences: residue codes B, Z, U, O and J are mapped to
# N, Q, C, K and I respectively, and X is mapped to the empty string
# NOTE(review): `res` is only displayed; the next cell builds descriptors from `df_positive`.
# That only uses the preprocessed sequences if par_preprocessing modifies the frame in place - confirm.
read_seqs = ReadSequence()
res = read_seqs.par_preprocessing(
    dataset=df_positive,
    col='sequence',
    B='N',
    Z='Q',
    U='C',
    O='K',
    J='I',
    X='',
)
res

Unnamed: 0,accession_id,sequence,target
0,XBA09021.1,MKLSQRGVETLGITDAVDISPYITTETTQNQFDALTSLATDIGIET...,endolysin
1,XAV38393.1,MFKFSQKSLNNLKGVKPQLVKVVERALQLSPVDFGVREGLRTVEQQ...,endolysin
2,XAV38170.1,MKITKDILITGTGCTTDRAIKWLDDVQAAMDKFHIESPRAIAAYLA...,endolysin
3,WWT34990.1,MVEIINKTVTRGVAGRRPGAVKGVVFHNTWGNSTAKQEANRLAAMN...,endolysin
4,WVX91681.1,MYLNDYVGKFIKEDNYYGYQSTDLVSNYVQRLTLGRYKTKLNANKM...,endolysin
...,...,...,...
2753,AFU63124.1,MNKNTLVASAAVVAGLALGVQTVSADTVTVQAGDTVSEIAHEHGTS...,endolysin
2754,AFQ96195.1,MALTEAWLLEKANRRLNEKGMLKEVSDKTRAVIKEMAKQGIYINVA...,endolysin
2755,AFO72163.1,MKICITVGHSILKNGSCTSADGVVNEYKYNKSLAPVLADTFRKEGH...,endolysin
2756,ACR45925.1,MTNLKSGGFEVYHWPSFNDRLSDKLSKKTIHRQVIHEPYSRTANKV...,endolysin


In [10]:
# Wrap the positive dataset in a Propythia ProteinDescritors object, pointing it at the sequence column
descriptors_df = ProteinDescritors(
    dataset=df_positive,
    col='sequence',
)

# Show the dataset held by the descriptor object
descriptors_df.dataset

Unnamed: 0,accession_id,sequence,target
0,XBA09021.1,MKLSQRGVETLGITDAVDISPYITTETTQNQFDALTSLATDIGIET...,endolysin
1,XAV38393.1,MFKFSQKSLNNLKGVKPQLVKVVERALQLSPVDFGVREGLRTVEQQ...,endolysin
2,XAV38170.1,MKITKDILITGTGCTTDRAIKWLDDVQAAMDKFHIESPRAIAAYLA...,endolysin
3,WWT34990.1,MVEIINKTVTRGVAGRRPGAVKGVVFHNTWGNSTAKQEANRLAAMN...,endolysin
4,WVX91681.1,MYLNDYVGKFIKEDNYYGYQSTDLVSNYVQRLTLGRYKTKLNANKM...,endolysin
...,...,...,...
2753,AFU63124.1,MNKNTLVASAAVVAGLALGVQTVSADTVTVQAGDTVSEIAHEHGTS...,endolysin
2754,AFQ96195.1,MALTEAWLLEKANRRLNEKGMLKEVSDKTRAVIKEMAKQGIYINVA...,endolysin
2755,AFO72163.1,MKICITVGHSILKNGSCTSADGVVNEYKYNKSLAPVLADTFRKEGH...,endolysin
2756,ACR45925.1,MTNLKSGGFEVYHWPSFNDRLSDKLSKKTIHRQVIHEPYSRTANKV...,endolysin


In [11]:
# Compute the physicochemical descriptors of the positive dataset (ph=7, amide=False, n_jobs=4)
# NOTE: `descriptors_df` is rebound by later cells, so this cell must run right after
# the cell that builds it from df_positive
descriptors_df_positive = descriptors_df.get_all_physicochemical(
    ph=7,
    amide=False,
    n_jobs=4,
)
descriptors_df_positive

Unnamed: 0,accession_id,sequence,target,length,charge,chargedensity,formulaC,formulaH,formulaN,formulaO,...,IsoelectricPoint,Instability_index,SecStruct_helix,SecStruct_turn,SecStruct_sheet,Molar_extinction_coefficient_reduced,Molar_extinction_coefficient_oxidized,aliphatic_index,bomanindex,hydrophobic_ratio
0,XBA09021.1,MKLSQRGVETLGITDAVDISPYITTETTQNQFDALTSLATDIGIET...,endolysin,92.0,1.064,0.000102,465,726,118,143,...,7.761671,17.880435,0.336957,0.206522,0.434783,16960,17085,82.717391,1.471087,0.358696
1,XAV38393.1,MFKFSQKSLNNLKGVKPQLVKVVERALQLSPVDFGVREGLRTVEQQ...,endolysin,131.0,5.203,0.000353,655,1039,177,197,...,9.644100,36.342748,0.312977,0.297710,0.366412,17990,17990,93.587786,1.597939,0.358779
2,XAV38170.1,MKITKDILITGTGCTTDRAIKWLDDVQAAMDKFHIESPRAIAAYLA...,endolysin,220.0,4.001,0.000165,1055,1658,302,324,...,8.972855,32.338182,0.313636,0.322727,0.318182,39420,39545,80.818182,1.677636,0.381818
3,WWT34990.1,MVEIINKTVTRGVAGRRPGAVKGVVFHNTWGNSTAKQEANRLAAMN...,endolysin,341.0,14.853,0.000395,1647,2528,468,515,...,9.569832,26.605279,0.269795,0.316716,0.328446,56840,56965,70.117302,1.828974,0.322581
4,WVX91681.1,MYLNDYVGKFIKEDNYYGYQSTDLVSNYVQRLTLGRYKTKLNANKM...,endolysin,169.0,7.200,0.000355,917,1418,238,273,...,9.271988,23.445562,0.313609,0.266272,0.396450,29340,29340,76.627219,2.511006,0.289941
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2753,AFU63124.1,MNKNTLVASAAVVAGLALGVQTVSADTVTVQAGDTVSEIAHEHGTS...,endolysin,232.0,-4.692,-0.000193,998,1594,276,398,...,5.176975,45.820259,0.254310,0.353448,0.314655,16960,16960,73.491379,1.768276,0.301724
2754,AFQ96195.1,MALTEAWLLEKANRRLNEKGMLKEVSDKTRAVIKEMAKQGIYINVA...,endolysin,315.0,25.258,0.000738,1523,2376,416,455,...,9.857425,20.384762,0.323810,0.339683,0.304762,53860,53985,64.444444,1.519079,0.314286
2755,AFO72163.1,MKICITVGHSILKNGSCTSADGVVNEYKYNKSLAPVLADTFRKEGH...,endolysin,282.0,6.003,0.000190,1416,2238,364,428,...,8.743154,27.191525,0.326241,0.297872,0.368794,29340,29715,84.290780,1.588298,0.336879
2756,ACR45925.1,MTNLKSGGFEVYHWPSFNDRLSDKLSKKTIHRQVIHEPYSRTANKV...,endolysin,508.0,-3.336,-0.000057,2561,4013,677,829,...,6.104360,44.725020,0.303150,0.273622,0.381890,68760,68760,88.090551,2.102717,0.320866


Negative Dataset (Double)

In [12]:
# Preprocess the double-sized negative sample: residue codes B, Z, U, O and J are mapped to
# N, Q, C, K and I respectively, and X is mapped to the empty string
# NOTE(review): `res` is only displayed; the next cell builds descriptors from `df_negative_double`.
# That only uses the preprocessed sequences if par_preprocessing modifies the frame in place - confirm.
read_seqs = ReadSequence()
res = read_seqs.par_preprocessing(
    dataset=df_negative_double,
    col='sequence',
    B='N',
    Z='Q',
    U='C',
    O='K',
    J='I',
    X='',
)
res

Unnamed: 0,accession_id,sequence,target
39795,WP_099022634.1,MSDELAQLLSVLDADQARLAQLEKYATGTQPLAFLSPEAKAALGNR...,not_endolysin
14008,WP_042871121.1,MSQTKTPQAEKCLNHYNTLLAKAFGVPENALAKQFSVSEPMEMVLR...,not_endolysin
37748,WP_109024242.1,MKMSTIPTLLGPDGMTSLREYAGYHGGGSGFGGQLRAWNPPGESVD...,not_endolysin
26466,AZF89591.1,MVKKKRPPIQFNDEQLLLQASNVADIYHQLALDLFDNVVERVTERG...,not_endolysin
31876,TBW66254.1,MKQGKQRALGRIKSSFLKWLGVPISLTDGSFWSAWGGMGSSSGETV...,not_endolysin
...,...,...,...
35258,WP_118376275.1,MAAIFWRGAAMPRIMSGMSIRERMANMWGAVKRSANRVSSVATAPF...,not_endolysin
9398,AME02668.1,MSELRHRITILRPVADTDEEGNILSSPVVEVEKAWALVLPFAAKIS...,not_endolysin
5378,RHO66229.1,MTGRVKIIRVTTEIKEGRKEPTTEVFYECWCDVQSLGTNEKYTALQ...,not_endolysin
55507,SQN63542.1,MHRIDTKTAQKDKFGAGKNGFTRGNPQTGTPATDLDDDYFDMLQEE...,not_endolysin


In [13]:
# Wrap the double-sized negative sample in a Propythia ProteinDescritors object
# NOTE: this rebinds `descriptors_df`, which previously wrapped the positive dataset
descriptors_df = ProteinDescritors(
    dataset=df_negative_double,
    col='sequence',
)

# Show the dataset held by the descriptor object
descriptors_df.dataset

Unnamed: 0,accession_id,sequence,target
39795,WP_099022634.1,MSDELAQLLSVLDADQARLAQLEKYATGTQPLAFLSPEAKAALGNR...,not_endolysin
14008,WP_042871121.1,MSQTKTPQAEKCLNHYNTLLAKAFGVPENALAKQFSVSEPMEMVLR...,not_endolysin
37748,WP_109024242.1,MKMSTIPTLLGPDGMTSLREYAGYHGGGSGFGGQLRAWNPPGESVD...,not_endolysin
26466,AZF89591.1,MVKKKRPPIQFNDEQLLLQASNVADIYHQLALDLFDNVVERVTERG...,not_endolysin
31876,TBW66254.1,MKQGKQRALGRIKSSFLKWLGVPISLTDGSFWSAWGGMGSSSGETV...,not_endolysin
...,...,...,...
35258,WP_118376275.1,MAAIFWRGAAMPRIMSGMSIRERMANMWGAVKRSANRVSSVATAPF...,not_endolysin
9398,AME02668.1,MSELRHRITILRPVADTDEEGNILSSPVVEVEKAWALVLPFAAKIS...,not_endolysin
5378,RHO66229.1,MTGRVKIIRVTTEIKEGRKEPTTEVFYECWCDVQSLGTNEKYTALQ...,not_endolysin
55507,SQN63542.1,MHRIDTKTAQKDKFGAGKNGFTRGNPQTGTPATDLDDDYFDMLQEE...,not_endolysin


In [14]:
# Compute the physicochemical descriptors of the double-sized negative sample (ph=7, amide=False, n_jobs=4)
# NOTE: relies on `descriptors_df` having just been built from df_negative_double
descriptors_df_negative_double = descriptors_df.get_all_physicochemical(
    ph=7,
    amide=False,
    n_jobs=4,
)
descriptors_df_negative_double

Unnamed: 0,accession_id,sequence,target,length,charge,chargedensity,formulaC,formulaH,formulaN,formulaO,...,IsoelectricPoint,Instability_index,SecStruct_helix,SecStruct_turn,SecStruct_sheet,Molar_extinction_coefficient_reduced,Molar_extinction_coefficient_oxidized,aliphatic_index,bomanindex,hydrophobic_ratio
0,WP_099022634.1,MSDELAQLLSVLDADQARLAQLEKYATGTQPLAFLSPEAKAALGNR...,not_endolysin,453.0,-18.555,-0.000380,2110,3375,603,698,...,4.775123,39.023179,0.342163,0.295806,0.333333,47440,47440,90.750552,1.843907,0.392936
1,WP_042871121.1,MSQTKTPQAEKCLNHYNTLLAKAFGVPENALAKQFSVSEPMEMVLR...,not_endolysin,365.0,-11.555,-0.000289,1746,2684,466,570,...,5.091091,29.594795,0.315068,0.298630,0.339726,43890,44140,76.520548,1.641534,0.367123
2,WP_109024242.1,MKMSTIPTLLGPDGMTSLREYAGYHGGGSGFGGQLRAWNPPGESVD...,not_endolysin,135.0,10.492,0.000713,652,1024,196,184,...,10.720528,42.673333,0.303704,0.355556,0.303704,16960,16960,75.259259,1.689926,0.340741
3,AZF89591.1,MVKKKRPPIQFNDEQLLLQASNVADIYHQLALDLFDNVVERVTERG...,not_endolysin,559.0,2.332,0.000037,2784,4396,768,877,...,7.685280,34.572290,0.347048,0.266547,0.330948,64290,64415,82.343470,2.127871,0.332737
4,TBW66254.1,MKQGKQRALGRIKSSFLKWLGVPISLTDGSFWSAWGGMGSSSGETV...,not_endolysin,437.0,5.419,0.000113,2104,3311,577,662,...,8.979431,40.738947,0.345538,0.256293,0.368421,71390,71515,84.004577,1.521716,0.372998
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5511,WP_118376275.1,MAAIFWRGAAMPRIMSGMSIRERMANMWGAVKRSANRVSSVATAPF...,not_endolysin,390.0,-3.736,-0.000086,1899,3007,527,589,...,5.693528,43.704872,0.333333,0.287179,0.338462,51910,51910,84.846154,1.805333,0.382051
5512,AME02668.1,MSELRHRITILRPVADTDEEGNILSSPVVEVEKAWALVLPFAAKIS...,not_endolysin,108.0,-0.960,-0.000078,552,883,151,163,...,5.852109,66.590741,0.305556,0.240741,0.407407,22460,22460,101.851852,1.880093,0.370370
5513,RHO66229.1,MTGRVKIIRVTTEIKEGRKEPTTEVFYECWCDVQSLGTNEKYTALQ...,not_endolysin,105.0,3.736,0.000304,542,856,146,161,...,8.911997,43.979238,0.333333,0.180952,0.390476,18450,18700,66.761905,2.168476,0.361905
5514,SQN63542.1,MHRIDTKTAQKDKFGAGKNGFTRGNPQTGTPATDLDDDYFDMLQEE...,not_endolysin,150.0,5.033,0.000314,679,1108,208,222,...,9.263414,45.931333,0.320000,0.340000,0.273333,1490,1740,71.066667,2.181000,0.353333


Negative Dataset (Equal)

In [15]:
# Preprocess the equal-sized negative sample: residue codes B, Z, U, O and J are mapped to
# N, Q, C, K and I respectively, and X is mapped to the empty string
# NOTE(review): `res` is only displayed; the next cell builds descriptors from `df_negative_equal`.
# That only uses the preprocessed sequences if par_preprocessing modifies the frame in place - confirm.
read_seqs = ReadSequence()
res = read_seqs.par_preprocessing(
    dataset=df_negative_equal,
    col='sequence',
    B='N',
    Z='Q',
    U='C',
    O='K',
    J='I',
    X='',
)
res

Unnamed: 0,accession_id,sequence,target
39795,WP_099022634.1,MSDELAQLLSVLDADQARLAQLEKYATGTQPLAFLSPEAKAALGNR...,not_endolysin
14008,WP_042871121.1,MSQTKTPQAEKCLNHYNTLLAKAFGVPENALAKQFSVSEPMEMVLR...,not_endolysin
37748,WP_109024242.1,MKMSTIPTLLGPDGMTSLREYAGYHGGGSGFGGQLRAWNPPGESVD...,not_endolysin
26466,AZF89591.1,MVKKKRPPIQFNDEQLLLQASNVADIYHQLALDLFDNVVERVTERG...,not_endolysin
31876,TBW66254.1,MKQGKQRALGRIKSSFLKWLGVPISLTDGSFWSAWGGMGSSSGETV...,not_endolysin
...,...,...,...
29272,AWK13748.1,MAIHTDLQRLLPGNRIRLFEVDGTQFNADILRFHADTLAHTPEELT...,not_endolysin
18509,WP_029003701.1,MTSSLSAPETKVAVPESRSALDTLMHTFETYKEANEIRLADLERRG...,not_endolysin
37764,WP_108404793.1,MPFPAPNTPWPLPQWEPVRTMVAEAALWWEGNTAQLASTYHTQYRP...,not_endolysin
50424,RNC65271.1,MPTYLHPGVYVEEIPSGSRPIEGVGTSTAAFVGYTTKGPVAEPTLL...,not_endolysin


In [16]:
# Wrap the equal-sized negative sample in a Propythia ProteinDescritors object
# NOTE: this rebinds `descriptors_df`, which previously wrapped the double-sized negative sample
descriptors_df = ProteinDescritors(
    dataset=df_negative_equal,
    col='sequence',
)

# Show the dataset held by the descriptor object
descriptors_df.dataset

Unnamed: 0,accession_id,sequence,target
39795,WP_099022634.1,MSDELAQLLSVLDADQARLAQLEKYATGTQPLAFLSPEAKAALGNR...,not_endolysin
14008,WP_042871121.1,MSQTKTPQAEKCLNHYNTLLAKAFGVPENALAKQFSVSEPMEMVLR...,not_endolysin
37748,WP_109024242.1,MKMSTIPTLLGPDGMTSLREYAGYHGGGSGFGGQLRAWNPPGESVD...,not_endolysin
26466,AZF89591.1,MVKKKRPPIQFNDEQLLLQASNVADIYHQLALDLFDNVVERVTERG...,not_endolysin
31876,TBW66254.1,MKQGKQRALGRIKSSFLKWLGVPISLTDGSFWSAWGGMGSSSGETV...,not_endolysin
...,...,...,...
29272,AWK13748.1,MAIHTDLQRLLPGNRIRLFEVDGTQFNADILRFHADTLAHTPEELT...,not_endolysin
18509,WP_029003701.1,MTSSLSAPETKVAVPESRSALDTLMHTFETYKEANEIRLADLERRG...,not_endolysin
37764,WP_108404793.1,MPFPAPNTPWPLPQWEPVRTMVAEAALWWEGNTAQLASTYHTQYRP...,not_endolysin
50424,RNC65271.1,MPTYLHPGVYVEEIPSGSRPIEGVGTSTAAFVGYTTKGPVAEPTLL...,not_endolysin


In [17]:
# Compute the physicochemical descriptors of the equal-sized negative sample (ph=7, amide=False, n_jobs=4)
# NOTE: relies on `descriptors_df` having just been built from df_negative_equal
descriptors_df_negative_equal = descriptors_df.get_all_physicochemical(
    ph=7,
    amide=False,
    n_jobs=4,
)
descriptors_df_negative_equal

Unnamed: 0,accession_id,sequence,target,length,charge,chargedensity,formulaC,formulaH,formulaN,formulaO,...,IsoelectricPoint,Instability_index,SecStruct_helix,SecStruct_turn,SecStruct_sheet,Molar_extinction_coefficient_reduced,Molar_extinction_coefficient_oxidized,aliphatic_index,bomanindex,hydrophobic_ratio
0,WP_099022634.1,MSDELAQLLSVLDADQARLAQLEKYATGTQPLAFLSPEAKAALGNR...,not_endolysin,453.0,-18.555,-0.000380,2110,3375,603,698,...,4.775123,39.023179,0.342163,0.295806,0.333333,47440,47440,90.750552,1.843907,0.392936
1,WP_042871121.1,MSQTKTPQAEKCLNHYNTLLAKAFGVPENALAKQFSVSEPMEMVLR...,not_endolysin,365.0,-11.555,-0.000289,1746,2684,466,570,...,5.091091,29.594795,0.315068,0.298630,0.339726,43890,44140,76.520548,1.641534,0.367123
2,WP_109024242.1,MKMSTIPTLLGPDGMTSLREYAGYHGGGSGFGGQLRAWNPPGESVD...,not_endolysin,135.0,10.492,0.000713,652,1024,196,184,...,10.720528,42.673333,0.303704,0.355556,0.303704,16960,16960,75.259259,1.689926,0.340741
3,AZF89591.1,MVKKKRPPIQFNDEQLLLQASNVADIYHQLALDLFDNVVERVTERG...,not_endolysin,559.0,2.332,0.000037,2784,4396,768,877,...,7.685280,34.572290,0.347048,0.266547,0.330948,64290,64415,82.343470,2.127871,0.332737
4,TBW66254.1,MKQGKQRALGRIKSSFLKWLGVPISLTDGSFWSAWGGMGSSSGETV...,not_endolysin,437.0,5.419,0.000113,2104,3311,577,662,...,8.979431,40.738947,0.345538,0.256293,0.368421,71390,71515,84.004577,1.521716,0.372998
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2753,AWK13748.1,MAIHTDLQRLLPGNRIRLFEVDGTQFNADILRFHADTLAHTPEELT...,not_endolysin,250.0,-8.725,-0.000312,1217,1875,327,397,...,5.171519,35.894400,0.308000,0.292000,0.344000,30940,31190,74.600000,1.910120,0.344000
2754,WP_029003701.1,MTSSLSAPETKVAVPESRSALDTLMHTFETYKEANEIRLADLERRG...,not_endolysin,415.0,-4.676,-0.000107,1944,3030,528,599,...,5.509028,38.683398,0.385542,0.289157,0.318072,49850,49850,79.927711,1.248313,0.412048
2755,WP_108404793.1,MPFPAPNTPWPLPQWEPVRTMVAEAALWWEGNTAQLASTYHTQYRP...,not_endolysin,506.0,-18.004,-0.000319,2485,3805,679,770,...,5.042380,37.640711,0.288538,0.286561,0.357708,102900,103025,79.288538,1.709704,0.371542
2756,RNC65271.1,MPTYLHPGVYVEEIPSGSRPIEGVGTSTAAFVGYTTKGPVAEPTLL...,not_endolysin,638.0,-27.924,-0.000409,3017,4660,790,966,...,4.582382,25.063652,0.291536,0.313480,0.368339,81820,82070,85.329154,1.216395,0.394984


- Merge the CD-HIT positive and negative (double) datasets

In [18]:
# Stack the positive descriptors on top of the double-sized negative descriptors;
# ignore_index=True renumbers the rows 0..n-1 instead of keeping each frame's own labels
merged_dataset_double = pd.concat(
    objs=[descriptors_df_positive, descriptors_df_negative_double],
    ignore_index=True,
)
merged_dataset_double

Unnamed: 0,accession_id,sequence,target,length,charge,chargedensity,formulaC,formulaH,formulaN,formulaO,...,IsoelectricPoint,Instability_index,SecStruct_helix,SecStruct_turn,SecStruct_sheet,Molar_extinction_coefficient_reduced,Molar_extinction_coefficient_oxidized,aliphatic_index,bomanindex,hydrophobic_ratio
0,XBA09021.1,MKLSQRGVETLGITDAVDISPYITTETTQNQFDALTSLATDIGIET...,endolysin,92.0,1.064,0.000102,465,726,118,143,...,7.761671,17.880435,0.336957,0.206522,0.434783,16960,17085,82.717391,1.471087,0.358696
1,XAV38393.1,MFKFSQKSLNNLKGVKPQLVKVVERALQLSPVDFGVREGLRTVEQQ...,endolysin,131.0,5.203,0.000353,655,1039,177,197,...,9.644100,36.342748,0.312977,0.297710,0.366412,17990,17990,93.587786,1.597939,0.358779
2,XAV38170.1,MKITKDILITGTGCTTDRAIKWLDDVQAAMDKFHIESPRAIAAYLA...,endolysin,220.0,4.001,0.000165,1055,1658,302,324,...,8.972855,32.338182,0.313636,0.322727,0.318182,39420,39545,80.818182,1.677636,0.381818
3,WWT34990.1,MVEIINKTVTRGVAGRRPGAVKGVVFHNTWGNSTAKQEANRLAAMN...,endolysin,341.0,14.853,0.000395,1647,2528,468,515,...,9.569832,26.605279,0.269795,0.316716,0.328446,56840,56965,70.117302,1.828974,0.322581
4,WVX91681.1,MYLNDYVGKFIKEDNYYGYQSTDLVSNYVQRLTLGRYKTKLNANKM...,endolysin,169.0,7.200,0.000355,917,1418,238,273,...,9.271988,23.445562,0.313609,0.266272,0.396450,29340,29340,76.627219,2.511006,0.289941
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8269,WP_118376275.1,MAAIFWRGAAMPRIMSGMSIRERMANMWGAVKRSANRVSSVATAPF...,not_endolysin,390.0,-3.736,-0.000086,1899,3007,527,589,...,5.693528,43.704872,0.333333,0.287179,0.338462,51910,51910,84.846154,1.805333,0.382051
8270,AME02668.1,MSELRHRITILRPVADTDEEGNILSSPVVEVEKAWALVLPFAAKIS...,not_endolysin,108.0,-0.960,-0.000078,552,883,151,163,...,5.852109,66.590741,0.305556,0.240741,0.407407,22460,22460,101.851852,1.880093,0.370370
8271,RHO66229.1,MTGRVKIIRVTTEIKEGRKEPTTEVFYECWCDVQSLGTNEKYTALQ...,not_endolysin,105.0,3.736,0.000304,542,856,146,161,...,8.911997,43.979238,0.333333,0.180952,0.390476,18450,18700,66.761905,2.168476,0.361905
8272,SQN63542.1,MHRIDTKTAQKDKFGAGKNGFTRGNPQTGTPATDLDDDYFDMLQEE...,not_endolysin,150.0,5.033,0.000314,679,1108,208,222,...,9.263414,45.931333,0.320000,0.340000,0.273333,1490,1740,71.066667,2.181000,0.353333


- Merge the CD-HIT positive and negative (equal) datasets

In [19]:
# Stack the positive descriptors on top of the equal-sized negative descriptors;
# ignore_index=True renumbers the rows 0..n-1 instead of keeping each frame's own labels
merged_dataset_equal = pd.concat(
    objs=[descriptors_df_positive, descriptors_df_negative_equal],
    ignore_index=True,
)
merged_dataset_equal

Unnamed: 0,accession_id,sequence,target,length,charge,chargedensity,formulaC,formulaH,formulaN,formulaO,...,IsoelectricPoint,Instability_index,SecStruct_helix,SecStruct_turn,SecStruct_sheet,Molar_extinction_coefficient_reduced,Molar_extinction_coefficient_oxidized,aliphatic_index,bomanindex,hydrophobic_ratio
0,XBA09021.1,MKLSQRGVETLGITDAVDISPYITTETTQNQFDALTSLATDIGIET...,endolysin,92.0,1.064,0.000102,465,726,118,143,...,7.761671,17.880435,0.336957,0.206522,0.434783,16960,17085,82.717391,1.471087,0.358696
1,XAV38393.1,MFKFSQKSLNNLKGVKPQLVKVVERALQLSPVDFGVREGLRTVEQQ...,endolysin,131.0,5.203,0.000353,655,1039,177,197,...,9.644100,36.342748,0.312977,0.297710,0.366412,17990,17990,93.587786,1.597939,0.358779
2,XAV38170.1,MKITKDILITGTGCTTDRAIKWLDDVQAAMDKFHIESPRAIAAYLA...,endolysin,220.0,4.001,0.000165,1055,1658,302,324,...,8.972855,32.338182,0.313636,0.322727,0.318182,39420,39545,80.818182,1.677636,0.381818
3,WWT34990.1,MVEIINKTVTRGVAGRRPGAVKGVVFHNTWGNSTAKQEANRLAAMN...,endolysin,341.0,14.853,0.000395,1647,2528,468,515,...,9.569832,26.605279,0.269795,0.316716,0.328446,56840,56965,70.117302,1.828974,0.322581
4,WVX91681.1,MYLNDYVGKFIKEDNYYGYQSTDLVSNYVQRLTLGRYKTKLNANKM...,endolysin,169.0,7.200,0.000355,917,1418,238,273,...,9.271988,23.445562,0.313609,0.266272,0.396450,29340,29340,76.627219,2.511006,0.289941
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5511,AWK13748.1,MAIHTDLQRLLPGNRIRLFEVDGTQFNADILRFHADTLAHTPEELT...,not_endolysin,250.0,-8.725,-0.000312,1217,1875,327,397,...,5.171519,35.894400,0.308000,0.292000,0.344000,30940,31190,74.600000,1.910120,0.344000
5512,WP_029003701.1,MTSSLSAPETKVAVPESRSALDTLMHTFETYKEANEIRLADLERRG...,not_endolysin,415.0,-4.676,-0.000107,1944,3030,528,599,...,5.509028,38.683398,0.385542,0.289157,0.318072,49850,49850,79.927711,1.248313,0.412048
5513,WP_108404793.1,MPFPAPNTPWPLPQWEPVRTMVAEAALWWEGNTAQLASTYHTQYRP...,not_endolysin,506.0,-18.004,-0.000319,2485,3805,679,770,...,5.042380,37.640711,0.288538,0.286561,0.357708,102900,103025,79.288538,1.709704,0.371542
5514,RNC65271.1,MPTYLHPGVYVEEIPSGSRPIEGVGTSTAAFVGYTTKGPVAEPTLL...,not_endolysin,638.0,-27.924,-0.000409,3017,4660,790,966,...,4.582382,25.063652,0.291536,0.313480,0.368339,81820,82070,85.329154,1.216395,0.394984


- Set the 'accession_id' column as the DataFrame index

In [20]:
# Move 'accession_id' from a regular column into the index.
# The guard keeps the cell re-runnable: once the column has been moved, a second
# set_index call would raise KeyError because the column no longer exists.
if 'accession_id' in merged_dataset_double.columns:
    merged_dataset_double = merged_dataset_double.set_index('accession_id')
merged_dataset_double

Unnamed: 0_level_0,sequence,target,length,charge,chargedensity,formulaC,formulaH,formulaN,formulaO,formulaS,...,IsoelectricPoint,Instability_index,SecStruct_helix,SecStruct_turn,SecStruct_sheet,Molar_extinction_coefficient_reduced,Molar_extinction_coefficient_oxidized,aliphatic_index,bomanindex,hydrophobic_ratio
accession_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
XBA09021.1,MKLSQRGVETLGITDAVDISPYITTETTQNQFDALTSLATDIGIET...,endolysin,92.0,1.064,0.000102,465,726,118,143,3,...,7.761671,17.880435,0.336957,0.206522,0.434783,16960,17085,82.717391,1.471087,0.358696
XAV38393.1,MFKFSQKSLNNLKGVKPQLVKVVERALQLSPVDFGVREGLRTVEQQ...,endolysin,131.0,5.203,0.000353,655,1039,177,197,2,...,9.644100,36.342748,0.312977,0.297710,0.366412,17990,17990,93.587786,1.597939,0.358779
XAV38170.1,MKITKDILITGTGCTTDRAIKWLDDVQAAMDKFHIESPRAIAAYLA...,endolysin,220.0,4.001,0.000165,1055,1658,302,324,11,...,8.972855,32.338182,0.313636,0.322727,0.318182,39420,39545,80.818182,1.677636,0.381818
WWT34990.1,MVEIINKTVTRGVAGRRPGAVKGVVFHNTWGNSTAKQEANRLAAMN...,endolysin,341.0,14.853,0.000395,1647,2528,468,515,6,...,9.569832,26.605279,0.269795,0.316716,0.328446,56840,56965,70.117302,1.828974,0.322581
WVX91681.1,MYLNDYVGKFIKEDNYYGYQSTDLVSNYVQRLTLGRYKTKLNANKM...,endolysin,169.0,7.200,0.000355,917,1418,238,273,3,...,9.271988,23.445562,0.313609,0.266272,0.396450,29340,29340,76.627219,2.511006,0.289941
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WP_118376275.1,MAAIFWRGAAMPRIMSGMSIRERMANMWGAVKRSANRVSSVATAPF...,not_endolysin,390.0,-3.736,-0.000086,1899,3007,527,589,16,...,5.693528,43.704872,0.333333,0.287179,0.338462,51910,51910,84.846154,1.805333,0.382051
AME02668.1,MSELRHRITILRPVADTDEEGNILSSPVVEVEKAWALVLPFAAKIS...,not_endolysin,108.0,-0.960,-0.000078,552,883,151,163,2,...,5.852109,66.590741,0.305556,0.240741,0.407407,22460,22460,101.851852,1.880093,0.370370
RHO66229.1,MTGRVKIIRVTTEIKEGRKEPTTEVFYECWCDVQSLGTNEKYTALQ...,not_endolysin,105.0,3.736,0.000304,542,856,146,161,8,...,8.911997,43.979238,0.333333,0.180952,0.390476,18450,18700,66.761905,2.168476,0.361905
SQN63542.1,MHRIDTKTAQKDKFGAGKNGFTRGNPQTGTPATDLDDDYFDMLQEE...,not_endolysin,150.0,5.033,0.000314,679,1108,208,222,7,...,9.263414,45.931333,0.320000,0.340000,0.273333,1490,1740,71.066667,2.181000,0.353333


In [21]:
# Move 'accession_id' from a regular column into the index.
# The guard keeps the cell re-runnable: once the column has been moved, a second
# set_index call would raise KeyError because the column no longer exists.
if 'accession_id' in merged_dataset_equal.columns:
    merged_dataset_equal = merged_dataset_equal.set_index('accession_id')
merged_dataset_equal

Unnamed: 0_level_0,sequence,target,length,charge,chargedensity,formulaC,formulaH,formulaN,formulaO,formulaS,...,IsoelectricPoint,Instability_index,SecStruct_helix,SecStruct_turn,SecStruct_sheet,Molar_extinction_coefficient_reduced,Molar_extinction_coefficient_oxidized,aliphatic_index,bomanindex,hydrophobic_ratio
accession_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
XBA09021.1,MKLSQRGVETLGITDAVDISPYITTETTQNQFDALTSLATDIGIET...,endolysin,92.0,1.064,0.000102,465,726,118,143,3,...,7.761671,17.880435,0.336957,0.206522,0.434783,16960,17085,82.717391,1.471087,0.358696
XAV38393.1,MFKFSQKSLNNLKGVKPQLVKVVERALQLSPVDFGVREGLRTVEQQ...,endolysin,131.0,5.203,0.000353,655,1039,177,197,2,...,9.644100,36.342748,0.312977,0.297710,0.366412,17990,17990,93.587786,1.597939,0.358779
XAV38170.1,MKITKDILITGTGCTTDRAIKWLDDVQAAMDKFHIESPRAIAAYLA...,endolysin,220.0,4.001,0.000165,1055,1658,302,324,11,...,8.972855,32.338182,0.313636,0.322727,0.318182,39420,39545,80.818182,1.677636,0.381818
WWT34990.1,MVEIINKTVTRGVAGRRPGAVKGVVFHNTWGNSTAKQEANRLAAMN...,endolysin,341.0,14.853,0.000395,1647,2528,468,515,6,...,9.569832,26.605279,0.269795,0.316716,0.328446,56840,56965,70.117302,1.828974,0.322581
WVX91681.1,MYLNDYVGKFIKEDNYYGYQSTDLVSNYVQRLTLGRYKTKLNANKM...,endolysin,169.0,7.200,0.000355,917,1418,238,273,3,...,9.271988,23.445562,0.313609,0.266272,0.396450,29340,29340,76.627219,2.511006,0.289941
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AWK13748.1,MAIHTDLQRLLPGNRIRLFEVDGTQFNADILRFHADTLAHTPEELT...,not_endolysin,250.0,-8.725,-0.000312,1217,1875,327,397,10,...,5.171519,35.894400,0.308000,0.292000,0.344000,30940,31190,74.600000,1.910120,0.344000
WP_029003701.1,MTSSLSAPETKVAVPESRSALDTLMHTFETYKEANEIRLADLERRG...,not_endolysin,415.0,-4.676,-0.000107,1944,3030,528,599,10,...,5.509028,38.683398,0.385542,0.289157,0.318072,49850,49850,79.927711,1.248313,0.412048
WP_108404793.1,MPFPAPNTPWPLPQWEPVRTMVAEAALWWEGNTAQLASTYHTQYRP...,not_endolysin,506.0,-18.004,-0.000319,2485,3805,679,770,18,...,5.042380,37.640711,0.288538,0.286561,0.357708,102900,103025,79.288538,1.709704,0.371542
RNC65271.1,MPTYLHPGVYVEEIPSGSRPIEGVGTSTAAFVGYTTKGPVAEPTLL...,not_endolysin,638.0,-27.924,-0.000409,3017,4660,790,966,17,...,4.582382,25.063652,0.291536,0.313480,0.368339,81820,82070,85.329154,1.216395,0.394984


- Conversion of 'target' column values from non-numeric to numeric

0 - not_endolysin

1 - endolysin

In [22]:
# Encode the class labels numerically: endolysin -> 1, not_endolysin -> 0
target_encoding = {'endolysin': 1, 'not_endolysin': 0}

# Series.map turns every value that is not a key of the mapping into NaN, so running
# this cell a second time (when the column already holds 0/1) would wipe the labels.
# Encode only while the column is not already fully numeric.
if not merged_dataset_double['target'].isin(list(target_encoding.values())).all():
    encoded_target = merged_dataset_double['target'].map(target_encoding)
    # Fail loudly rather than carry NaN labels forward
    assert encoded_target.notna().all(), "unexpected label(s) in the 'target' column"
    merged_dataset_double['target'] = encoded_target
merged_dataset_double

Unnamed: 0_level_0,sequence,target,length,charge,chargedensity,formulaC,formulaH,formulaN,formulaO,formulaS,...,IsoelectricPoint,Instability_index,SecStruct_helix,SecStruct_turn,SecStruct_sheet,Molar_extinction_coefficient_reduced,Molar_extinction_coefficient_oxidized,aliphatic_index,bomanindex,hydrophobic_ratio
accession_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
XBA09021.1,MKLSQRGVETLGITDAVDISPYITTETTQNQFDALTSLATDIGIET...,1,92.0,1.064,0.000102,465,726,118,143,3,...,7.761671,17.880435,0.336957,0.206522,0.434783,16960,17085,82.717391,1.471087,0.358696
XAV38393.1,MFKFSQKSLNNLKGVKPQLVKVVERALQLSPVDFGVREGLRTVEQQ...,1,131.0,5.203,0.000353,655,1039,177,197,2,...,9.644100,36.342748,0.312977,0.297710,0.366412,17990,17990,93.587786,1.597939,0.358779
XAV38170.1,MKITKDILITGTGCTTDRAIKWLDDVQAAMDKFHIESPRAIAAYLA...,1,220.0,4.001,0.000165,1055,1658,302,324,11,...,8.972855,32.338182,0.313636,0.322727,0.318182,39420,39545,80.818182,1.677636,0.381818
WWT34990.1,MVEIINKTVTRGVAGRRPGAVKGVVFHNTWGNSTAKQEANRLAAMN...,1,341.0,14.853,0.000395,1647,2528,468,515,6,...,9.569832,26.605279,0.269795,0.316716,0.328446,56840,56965,70.117302,1.828974,0.322581
WVX91681.1,MYLNDYVGKFIKEDNYYGYQSTDLVSNYVQRLTLGRYKTKLNANKM...,1,169.0,7.200,0.000355,917,1418,238,273,3,...,9.271988,23.445562,0.313609,0.266272,0.396450,29340,29340,76.627219,2.511006,0.289941
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WP_118376275.1,MAAIFWRGAAMPRIMSGMSIRERMANMWGAVKRSANRVSSVATAPF...,0,390.0,-3.736,-0.000086,1899,3007,527,589,16,...,5.693528,43.704872,0.333333,0.287179,0.338462,51910,51910,84.846154,1.805333,0.382051
AME02668.1,MSELRHRITILRPVADTDEEGNILSSPVVEVEKAWALVLPFAAKIS...,0,108.0,-0.960,-0.000078,552,883,151,163,2,...,5.852109,66.590741,0.305556,0.240741,0.407407,22460,22460,101.851852,1.880093,0.370370
RHO66229.1,MTGRVKIIRVTTEIKEGRKEPTTEVFYECWCDVQSLGTNEKYTALQ...,0,105.0,3.736,0.000304,542,856,146,161,8,...,8.911997,43.979238,0.333333,0.180952,0.390476,18450,18700,66.761905,2.168476,0.361905
SQN63542.1,MHRIDTKTAQKDKFGAGKNGFTRGNPQTGTPATDLDDDYFDMLQEE...,0,150.0,5.033,0.000314,679,1108,208,222,7,...,9.263414,45.931333,0.320000,0.340000,0.273333,1490,1740,71.066667,2.181000,0.353333


In [23]:
# Encode the class labels as integers for the ML models: endolysin -> 1, not_endolysin -> 0.
# The identity entries (1 -> 1, 0 -> 0) keep this cell safe to re-run: without them, a second
# execution would turn the already-encoded integers into NaN, because Series.map() yields NaN
# for every value that is not a key of the mapping dict.
merged_dataset_equal['target'] = merged_dataset_equal['target'].map(
    {'endolysin': 1, 'not_endolysin': 0, 1: 1, 0: 0}
)
merged_dataset_equal

Unnamed: 0_level_0,sequence,target,length,charge,chargedensity,formulaC,formulaH,formulaN,formulaO,formulaS,...,IsoelectricPoint,Instability_index,SecStruct_helix,SecStruct_turn,SecStruct_sheet,Molar_extinction_coefficient_reduced,Molar_extinction_coefficient_oxidized,aliphatic_index,bomanindex,hydrophobic_ratio
accession_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
XBA09021.1,MKLSQRGVETLGITDAVDISPYITTETTQNQFDALTSLATDIGIET...,1,92.0,1.064,0.000102,465,726,118,143,3,...,7.761671,17.880435,0.336957,0.206522,0.434783,16960,17085,82.717391,1.471087,0.358696
XAV38393.1,MFKFSQKSLNNLKGVKPQLVKVVERALQLSPVDFGVREGLRTVEQQ...,1,131.0,5.203,0.000353,655,1039,177,197,2,...,9.644100,36.342748,0.312977,0.297710,0.366412,17990,17990,93.587786,1.597939,0.358779
XAV38170.1,MKITKDILITGTGCTTDRAIKWLDDVQAAMDKFHIESPRAIAAYLA...,1,220.0,4.001,0.000165,1055,1658,302,324,11,...,8.972855,32.338182,0.313636,0.322727,0.318182,39420,39545,80.818182,1.677636,0.381818
WWT34990.1,MVEIINKTVTRGVAGRRPGAVKGVVFHNTWGNSTAKQEANRLAAMN...,1,341.0,14.853,0.000395,1647,2528,468,515,6,...,9.569832,26.605279,0.269795,0.316716,0.328446,56840,56965,70.117302,1.828974,0.322581
WVX91681.1,MYLNDYVGKFIKEDNYYGYQSTDLVSNYVQRLTLGRYKTKLNANKM...,1,169.0,7.200,0.000355,917,1418,238,273,3,...,9.271988,23.445562,0.313609,0.266272,0.396450,29340,29340,76.627219,2.511006,0.289941
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AWK13748.1,MAIHTDLQRLLPGNRIRLFEVDGTQFNADILRFHADTLAHTPEELT...,0,250.0,-8.725,-0.000312,1217,1875,327,397,10,...,5.171519,35.894400,0.308000,0.292000,0.344000,30940,31190,74.600000,1.910120,0.344000
WP_029003701.1,MTSSLSAPETKVAVPESRSALDTLMHTFETYKEANEIRLADLERRG...,0,415.0,-4.676,-0.000107,1944,3030,528,599,10,...,5.509028,38.683398,0.385542,0.289157,0.318072,49850,49850,79.927711,1.248313,0.412048
WP_108404793.1,MPFPAPNTPWPLPQWEPVRTMVAEAALWWEGNTAQLASTYHTQYRP...,0,506.0,-18.004,-0.000319,2485,3805,679,770,18,...,5.042380,37.640711,0.288538,0.286561,0.357708,102900,103025,79.288538,1.709704,0.371542
RNC65271.1,MPTYLHPGVYVEEIPSGSRPIEGVGTSTAAFVGYTTKGPVAEPTLL...,0,638.0,-27.924,-0.000409,3017,4660,790,966,17,...,4.582382,25.063652,0.291536,0.313480,0.368339,81820,82070,85.329154,1.216395,0.394984


- Removal of the non-numeric 'sequence' column, so that only the target label and the numeric descriptors remain in each dataset

In [24]:
# Drop the raw 'sequence' string column from the "double" dataset.
# errors='ignore' makes the cell idempotent: merged_dataset_double is reassigned to the
# result, so re-running the cell once the column is already gone would otherwise raise KeyError.
merged_dataset_double = merged_dataset_double.drop(columns=['sequence'], errors='ignore')
merged_dataset_double.head()

Unnamed: 0_level_0,target,length,charge,chargedensity,formulaC,formulaH,formulaN,formulaO,formulaS,tot,...,IsoelectricPoint,Instability_index,SecStruct_helix,SecStruct_turn,SecStruct_sheet,Molar_extinction_coefficient_reduced,Molar_extinction_coefficient_oxidized,aliphatic_index,bomanindex,hydrophobic_ratio
accession_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
XBA09021.1,1,92.0,1.064,0.000102,465,726,118,143,3,1687,...,7.761671,17.880435,0.336957,0.206522,0.434783,16960,17085,82.717391,1.471087,0.358696
XAV38393.1,1,131.0,5.203,0.000353,655,1039,177,197,2,2388,...,9.6441,36.342748,0.312977,0.29771,0.366412,17990,17990,93.587786,1.597939,0.358779
XAV38170.1,1,220.0,4.001,0.000165,1055,1658,302,324,11,3882,...,8.972855,32.338182,0.313636,0.322727,0.318182,39420,39545,80.818182,1.677636,0.381818
WWT34990.1,1,341.0,14.853,0.000395,1647,2528,468,515,6,6007,...,9.569832,26.605279,0.269795,0.316716,0.328446,56840,56965,70.117302,1.828974,0.322581
WVX91681.1,1,169.0,7.2,0.000355,917,1418,238,273,3,3231,...,9.271988,23.445562,0.313609,0.266272,0.39645,29340,29340,76.627219,2.511006,0.289941


In [25]:
# Drop the raw 'sequence' string column from the "equal" dataset.
# errors='ignore' makes the cell idempotent: merged_dataset_equal is reassigned to the
# result, so re-running the cell once the column is already gone would otherwise raise KeyError.
merged_dataset_equal = merged_dataset_equal.drop(columns=['sequence'], errors='ignore')
merged_dataset_equal.head()

Unnamed: 0_level_0,target,length,charge,chargedensity,formulaC,formulaH,formulaN,formulaO,formulaS,tot,...,IsoelectricPoint,Instability_index,SecStruct_helix,SecStruct_turn,SecStruct_sheet,Molar_extinction_coefficient_reduced,Molar_extinction_coefficient_oxidized,aliphatic_index,bomanindex,hydrophobic_ratio
accession_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
XBA09021.1,1,92.0,1.064,0.000102,465,726,118,143,3,1687,...,7.761671,17.880435,0.336957,0.206522,0.434783,16960,17085,82.717391,1.471087,0.358696
XAV38393.1,1,131.0,5.203,0.000353,655,1039,177,197,2,2388,...,9.6441,36.342748,0.312977,0.29771,0.366412,17990,17990,93.587786,1.597939,0.358779
XAV38170.1,1,220.0,4.001,0.000165,1055,1658,302,324,11,3882,...,8.972855,32.338182,0.313636,0.322727,0.318182,39420,39545,80.818182,1.677636,0.381818
WWT34990.1,1,341.0,14.853,0.000395,1647,2528,468,515,6,6007,...,9.569832,26.605279,0.269795,0.316716,0.328446,56840,56965,70.117302,1.828974,0.322581
WVX91681.1,1,169.0,7.2,0.000355,917,1418,238,273,3,3231,...,9.271988,23.445562,0.313609,0.266272,0.39645,29340,29340,76.627219,2.511006,0.289941


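Datasets Export: both processed datasets (double and equal) are written to CSV files, with `accession_id` kept as the index column.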

In [26]:
# Write the processed "double" dataset to a CSV file; the index is written along with the data
merged_dataset_double.to_csv(path_or_buf='ML_dataset_double.csv', index=True)

In [27]:
# Write the processed "equal" dataset to a CSV file; the index is written along with the data
merged_dataset_equal.to_csv(path_or_buf='ML_dataset_equal.csv', index=True)