In [28]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, entry) for entry in files):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cobmineddb/combined_protein.csv
/kaggle/input/cobmineddb/combined_peptides.csv
/kaggle/input/toxin-database/peptide.csv
/kaggle/input/toxin-database/protein_test1002.csv
/kaggle/input/toxin-database/protein_train1002.csv


In [29]:
import pandas as pd
# Load the combined protein table from the Kaggle input directory.
# The rendered output below shows the columns name, sequence, label and sequence_length.
# NOTE(review): the output also shows an 'Unnamed: 0' column, which looks like a
# saved index - consider pd.read_csv(file_path, index_col=0); confirm against the CSV.
file_path = '/kaggle/input/cobmineddb/combined_protein.csv'
protein_data = pd.read_csv(file_path)
# Bare last expression so the notebook renders the frame
protein_data

Unnamed: 0,name,sequence,label,sequence_length
0,>HPCL1_XENTR,MGKQNSKLRPEVLQDLRENTEFTDHELQEWYKGFLKDCPTGHLTVE...,0,193
1,>SIX4_HOTTS,DGYIKGNKGCKVSCVINNVFCNSMCKSSGGSYGYCWSWGLACWCEG...,1,62
2,>TAG2L_AGEOR,MRAIISLLLISAMVFSIIEAVPEEEGLQLSEDERGGCLPHNRFCNA...,1,70
3,>P2011_DANRE,MKTKFTKKTVLKFFGILFAILLLSVLILFSVVIGRTFTFKVNRELG...,0,515
4,>O165_CONTE,MKLTCMVIVAVLFLTAWTFVTAITSNGLENLFPNAHHEMKNPEASK...,1,76
...,...,...,...,...
9051,>EMC2B_XENLA,MSKVSDLYDVTWEDMRDKMKTWREDNYRNSEQIVDVGEELINEHAS...,0,297
9052,>COLI_LITCT,MLQPVWHACILAILGVFIFHVGEVRSQCWESNKCTDLSSEDGILEC...,0,263
9053,>MED30_DANRE,MTTPPLAQFSGQQQQQTQAARDVNTASLCRIGQETVQDIVLRTMEI...,0,174
9054,>RB87F_DROME,MAEQNDSNGNYDDGEEITEPEQLRKLFIGGLDYRTTDDGLKAHFEK...,0,385


In [30]:
# Load the combined peptide table from the Kaggle input directory.
# The rendered output below shows the columns name, sequence, label and sequence_length.
# NOTE(review): the output also shows an 'Unnamed: 0' column, which looks like a
# saved index - consider pd.read_csv(file_path, index_col=0); confirm against the CSV.
# `file_path` is reused here and now points at the peptide file.
file_path = '/kaggle/input/cobmineddb/combined_peptides.csv'
peptide_data = pd.read_csv(file_path)
# Bare last expression so the notebook renders the frame
peptide_data

Unnamed: 0,name,sequence,label,sequence_length
0,>COC2C_CONCL,DVCDSLVGGRCIHNGCYCERDAPNGNCCNTDGCTARWWCPGTKWD,1,45
1,>NA15_ANTXA,GVSCLCDSDGPSVSGNTLSGIIWLAGCPSGWHNCKAHGPNIGWCCKK,1,47
2,>PPK6_SHELA,SESEVPGMWFGPRL,0,14
3,>SK1_BLAGE,EQFDDYGHMRF,0,11
4,>COMB_CONMA,AATCTHWALIYFKTVQLFGWHFNYQVDATYCPQFQPCMP,1,39
...,...,...,...,...
5616,,EDDHHHHHHHHHGVGGGGGGGGGG,0,24
5617,,MTTNTQYIYPIFTVRWLAVHALAVPTVFFLGSISAMQFIQR,0,41
5618,,GILDVAKTLVGKLRNVLGI,0,19
5619,,MKVLSSLASAKTRYPDCQVVRRRGRVYVICKSNPRFKAVQGRKKRR,0,46


# Train Test Split

In [31]:
%%capture
!pip install lazypredict
import pandas as pd
import numpy as np
from collections import Counter  # Import Counter
import itertools
from lazypredict.Supervised import LazyClassifier
from sklearn.preprocessing import StandardScaler

# Protein Prediction

In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
import itertools
from collections import Counter
from lazypredict.Supervised import LazyClassifier

# Work on the protein table loaded earlier; the feature-extraction code below
# refers to it as `data`. This binds a second name to the same frame, not a copy.
data = protein_data

# Define the feature extraction functions
def calculate_aac(sequence):
    """Return the amino-acid composition (AAC) of a sequence.

    For each of the 20 residues in the fixed order ``ACDEFGHIKLMNPQRSTVWY``,
    the result holds the fraction of positions in ``sequence`` occupied by
    that residue. Characters outside that alphabet still count towards the
    length, so the fractions can sum to less than 1.

    Args:
        sequence: Sequence of one-letter residue codes (a string).

    Returns:
        list[float]: 20 fractions, one per residue; all zeros when
        ``sequence`` is empty.
    """
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    sequence_length = len(sequence)
    if sequence_length == 0:
        # Nothing to count - return zeros instead of dividing by zero.
        return [0.0] * len(amino_acids)
    aa_count = Counter(sequence)
    return [aa_count[aa] / sequence_length for aa in amino_acids]

def calculate_dpc(sequence):
    """Return the dipeptide composition (DPC) of a sequence.

    Counts every overlapping pair of adjacent characters and divides by the
    number of such pairs (``len(sequence) - 1``). The 400 values follow the
    order produced by ``itertools.product('ACDEFGHIKLMNPQRSTVWY', repeat=2)``
    (``AA``, ``AC``, ``AD``, ...). Pairs containing a character outside that
    alphabet still count towards the denominator.

    Args:
        sequence: Sequence of one-letter residue codes (a string).

    Returns:
        list[float]: 400 fractions; all zeros when the sequence has fewer
        than two residues and therefore contains no dipeptide.
    """
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    dipeptides = [''.join(pair) for pair in itertools.product(amino_acids, repeat=2)]
    pair_total = len(sequence) - 1
    if pair_total < 1:
        # A 0- or 1-residue sequence has no adjacent pairs; without this guard
        # the division below would be by 0 (or by -1 for an empty string).
        return [0.0] * len(dipeptides)
    dipeptide_count = Counter(sequence[i:i + 2] for i in range(pair_total))
    return [dipeptide_count[dp] / pair_total for dp in dipeptides]

# Extract features for a given dataset with renamed columns to avoid overlap
def extract_features(data):
    """Build the AAC + DPC feature table for every sequence in ``data``.

    Args:
        data: DataFrame with a ``'sequence'`` column; each value is passed to
            ``calculate_aac`` and ``calculate_dpc``.

    Returns:
        pandas.DataFrame: one row per input row, with columns
        ``aac_0`` .. ``aac_19`` followed by ``dpc_0`` .. ``dpc_399``.
        The result carries ``data.index``, so index-aligned assignments such
        as ``features['label'] = data['label']`` pair each feature row with
        its own label even when ``data`` has a non-default index.
    """
    aac_features = data['sequence'].apply(calculate_aac)
    dpc_features = data['sequence'].apply(calculate_dpc)

    column_names = [f'aac_{i}' for i in range(20)] + [f'dpc_{i}' for i in range(400)]
    # Concatenate the two per-row lists and build the frame once, keeping the
    # caller's index instead of silently resetting it to 0..n-1.
    rows = [aac + dpc for aac, dpc in zip(aac_features, dpc_features)]
    features = pd.DataFrame(rows, index=data.index, columns=column_names)

    return features

# Build the feature table, attach the class labels (aligned on index),
# and persist the result so the next cell can reload it from disk
features = extract_features(data).assign(label=data['label'])
features.to_csv('/kaggle/working/protein_features.csv', index=False)



In [33]:
# Read the persisted protein feature table back from disk
loaded_data = pd.read_csv('/kaggle/working/protein_features.csv')

# 60/20/20 partition: hold out 40% first, then halve the hold-out
# into validation and test (both splits seeded with 42)
train_data, temp_data = train_test_split(loaded_data, test_size=0.4, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Peel the target column off each partition
X_train, X_val, X_test = (
    part.drop(columns=['label']) for part in (train_data, val_data, test_data)
)
y_train, y_val, y_test = (
    part['label'] for part in (train_data, val_data, test_data)
)

# Run LazyPredict's classifier suite with the training and validation partitions
clf = LazyClassifier()
models, predictions = clf.fit(X_train, X_val, y_train, y_val)

# Show the resulting comparison table
print(models)


 97%|█████████▋| 28/29 [01:39<00:03,  3.11s/it]

[LightGBM] [Info] Number of positive: 2053, number of negative: 3380
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.054198 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 105697
[LightGBM] [Info] Number of data points in the train set: 5433, number of used features: 420
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.377876 -> initscore=-0.498574
[LightGBM] [Info] Start training from score -0.498574


100%|██████████| 29/29 [01:48<00:00,  3.74s/it]

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
LGBMClassifier                     0.95               0.95     0.95      0.95   
XGBClassifier                      0.95               0.95     0.95      0.95   
SVC                                0.95               0.95     0.95      0.95   
ExtraTreesClassifier               0.95               0.94     0.94      0.95   
RandomForestClassifier             0.94               0.94     0.94      0.94   
BaggingClassifier                  0.93               0.92     0.92      0.93   
LinearDiscriminantAnalysis         0.93               0.92     0.92      0.93   
RidgeClassifierCV                  0.93               0.91     0.91      0.93   
RidgeClassifier                    0.93               0.91     0.91      0.93   
LogisticRegression                 0.92               0.91     0.91      0.92   
QuadraticDiscriminantAnalysi




In [43]:
# Display the validation labels.
# NOTE(review): the saved output reports Length: 1124 under execution count 43,
# which suggests this cell was last run after the peptide split further down -
# re-run the notebook top to bottom to refresh it; confirm.
y_val

1720    0
856     1
1617    1
2127    1
2133    1
       ..
205     1
3825    0
1175    1
3153    1
1345    0
Name: label, Length: 1124, dtype: int64

In [35]:
# Display the held-out test labels
y_test

4844    0
4970    1
748     0
3105    0
7425    0
       ..
7457    1
1586    1
1242    0
8840    0
8254    0
Name: label, Length: 1812, dtype: int64

In [36]:
# Name in the first row of the LazyPredict results table (its index holds the model names).
# NOTE(review): `best_model` is a string here; a later cell rebinds the same
# name to a fitted estimator object.
best_model = models.index[0]
best_model

'LGBMClassifier'

In [37]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Estimators this cell knows how to rebuild, keyed by the names LazyPredict reports
model_dict = {
    'LGBMClassifier': LGBMClassifier(),
    'XGBClassifier': XGBClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
}

# Pick the highest-ranked LazyPredict entry that is also present in model_dict.
# Indexing model_dict with models.index[0] directly raises a bare KeyError as
# soon as the table is topped by a model outside the dictionary (e.g. SVC).
best_model_name = next((name for name in models.index if name in model_dict), None)
if best_model_name is None:
    raise KeyError(
        f"None of the models ranked by LazyPredict ({list(models.index)}) "
        f"is available in model_dict ({list(model_dict)})"
    )
if best_model_name != models.index[0]:
    print(f"Note: top-ranked model {models.index[0]} is not in model_dict; "
          f"using {best_model_name} instead")

# Fresh, unfitted estimator for the selected name
best_model = model_dict[best_model_name]

# Fit the selected model on the training data
best_model.fit(X_train, y_train)

# Accuracy on the validation data: share of predictions equal to the true label
val_predictions = best_model.predict(X_val)
val_accuracy = (val_predictions == y_val).mean()

# Accuracy on the test data, computed the same way
test_predictions = best_model.predict(X_test)
test_accuracy = (test_predictions == y_test).mean()

# Print the accuracy results
print(f"Best Model: {best_model_name}")
print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")


[LightGBM] [Info] Number of positive: 2053, number of negative: 3380
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.039291 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 105521
[LightGBM] [Info] Number of data points in the train set: 5433, number of used features: 420
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.377876 -> initscore=-0.498574
[LightGBM] [Info] Start training from score -0.498574
Best Model: LGBMClassifier
Validation Accuracy: 0.9531
Test Accuracy: 0.9570


# Peptide Prediction

In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split
import itertools
from collections import Counter
from lazypredict.Supervised import LazyClassifier

# Switch to the peptide table loaded earlier; the feature-extraction code below
# refers to it as `data`. This rebinds `data` (previously the protein table)
# to the same frame as `peptide_data`, not a copy.
data = peptide_data

# Define the feature extraction functions
def calculate_aac(sequence):
    """Return the amino-acid composition (AAC) of a sequence.

    Re-defines (shadows) the function of the same name from the protein
    section of this notebook.

    For each of the 20 residues in the fixed order ``ACDEFGHIKLMNPQRSTVWY``,
    the result holds the fraction of positions in ``sequence`` occupied by
    that residue. Characters outside that alphabet still count towards the
    length, so the fractions can sum to less than 1.

    Args:
        sequence: Sequence of one-letter residue codes (a string).

    Returns:
        list[float]: 20 fractions, one per residue; all zeros when
        ``sequence`` is empty.
    """
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    sequence_length = len(sequence)
    if sequence_length == 0:
        # Nothing to count - return zeros instead of dividing by zero.
        return [0.0] * len(amino_acids)
    aa_count = Counter(sequence)
    return [aa_count[aa] / sequence_length for aa in amino_acids]

def calculate_dpc(sequence):
    """Return the dipeptide composition (DPC) of a sequence.

    Re-defines (shadows) the function of the same name from the protein
    section of this notebook.

    Counts every overlapping pair of adjacent characters and divides by the
    number of such pairs (``len(sequence) - 1``). The 400 values follow the
    order produced by ``itertools.product('ACDEFGHIKLMNPQRSTVWY', repeat=2)``
    (``AA``, ``AC``, ``AD``, ...). Pairs containing a character outside that
    alphabet still count towards the denominator.

    Args:
        sequence: Sequence of one-letter residue codes (a string).

    Returns:
        list[float]: 400 fractions; all zeros when the sequence has fewer
        than two residues and therefore contains no dipeptide.
    """
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    dipeptides = [''.join(pair) for pair in itertools.product(amino_acids, repeat=2)]
    pair_total = len(sequence) - 1
    if pair_total < 1:
        # A 0- or 1-residue sequence has no adjacent pairs; without this guard
        # the division below would be by 0 (or by -1 for an empty string).
        return [0.0] * len(dipeptides)
    dipeptide_count = Counter(sequence[i:i + 2] for i in range(pair_total))
    return [dipeptide_count[dp] / pair_total for dp in dipeptides]

# Extract features for a given dataset with renamed columns to avoid overlap
def extract_features(data):
    """Build the AAC + DPC feature table for every sequence in ``data``.

    Re-defines (shadows) the function of the same name from the protein
    section of this notebook.

    Args:
        data: DataFrame with a ``'sequence'`` column; each value is passed to
            ``calculate_aac`` and ``calculate_dpc``.

    Returns:
        pandas.DataFrame: one row per input row, with columns
        ``aac_0`` .. ``aac_19`` followed by ``dpc_0`` .. ``dpc_399``.
        The result carries ``data.index``, so index-aligned assignments such
        as ``features['label'] = data['label']`` pair each feature row with
        its own label even when ``data`` has a non-default index.
    """
    aac_features = data['sequence'].apply(calculate_aac)
    dpc_features = data['sequence'].apply(calculate_dpc)

    column_names = [f'aac_{i}' for i in range(20)] + [f'dpc_{i}' for i in range(400)]
    # Concatenate the two per-row lists and build the frame once, keeping the
    # caller's index instead of silently resetting it to 0..n-1.
    rows = [aac + dpc for aac, dpc in zip(aac_features, dpc_features)]
    features = pd.DataFrame(rows, index=data.index, columns=column_names)

    return features

# Build the feature table, attach the class labels (aligned on index),
# and persist the result so the next cell can reload it from disk
features = extract_features(data).assign(label=data['label'])
features.to_csv('/kaggle/working/peptide_features.csv', index=False)



In [39]:
# Read the persisted peptide feature table back from disk
loaded_data = pd.read_csv('/kaggle/working/peptide_features.csv')

# 60/20/20 partition: hold out 40% first, then halve the hold-out
# into validation and test (both splits seeded with 42)
train_data, temp_data = train_test_split(loaded_data, test_size=0.4, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Peel the target column off each partition
X_train, X_val, X_test = (
    part.drop(columns=['label']) for part in (train_data, val_data, test_data)
)
y_train, y_val, y_test = (
    part['label'] for part in (train_data, val_data, test_data)
)

# Run LazyPredict's classifier suite with the training and validation partitions
clf = LazyClassifier()
models, predictions = clf.fit(X_train, X_val, y_train, y_val)

# Show the resulting comparison table
print(models)


 97%|█████████▋| 28/29 [00:34<00:01,  1.07s/it]

[LightGBM] [Info] Number of positive: 1812, number of negative: 1560
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013425 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 15926
[LightGBM] [Info] Number of data points in the train set: 3372, number of used features: 419
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.537367 -> initscore=0.149745
[LightGBM] [Info] Start training from score 0.149745


100%|██████████| 29/29 [00:35<00:00,  1.22s/it]

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
XGBClassifier                      0.94               0.94     0.94      0.94   
LGBMClassifier                     0.93               0.93     0.93      0.93   
ExtraTreesClassifier               0.93               0.93     0.93      0.93   
RandomForestClassifier             0.93               0.93     0.93      0.93   
SVC                                0.92               0.92     0.92      0.92   
BaggingClassifier                  0.92               0.92     0.92      0.92   
QuadraticDiscriminantAnalysis      0.91               0.91     0.91      0.91   
DecisionTreeClassifier             0.89               0.89     0.89      0.89   
NuSVC                              0.88               0.88     0.88      0.88   
LogisticRegression                 0.88               0.88     0.88      0.88   
LinearDiscriminantAnalysis  




In [40]:
# Name in the first row of the LazyPredict results table (its index holds the model names).
# NOTE(review): `best_model` is a string here; the next cell rebinds the same
# name to a fitted estimator object.
best_model = models.index[0]
best_model

'XGBClassifier'

In [41]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Estimators this cell knows how to rebuild, keyed by the names LazyPredict reports
model_dict = {
    'LGBMClassifier': LGBMClassifier(),
    'XGBClassifier': XGBClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
}

# Pick the highest-ranked LazyPredict entry that is also present in model_dict.
# Indexing model_dict with models.index[0] directly raises a bare KeyError as
# soon as the table is topped by a model outside the dictionary (e.g. SVC).
best_model_name = next((name for name in models.index if name in model_dict), None)
if best_model_name is None:
    raise KeyError(
        f"None of the models ranked by LazyPredict ({list(models.index)}) "
        f"is available in model_dict ({list(model_dict)})"
    )
if best_model_name != models.index[0]:
    print(f"Note: top-ranked model {models.index[0]} is not in model_dict; "
          f"using {best_model_name} instead")

# Fresh, unfitted estimator for the selected name
best_model = model_dict[best_model_name]

# Fit the selected model on the training data
best_model.fit(X_train, y_train)

# Accuracy on the validation data: share of predictions equal to the true label
val_predictions = best_model.predict(X_val)
val_accuracy = (val_predictions == y_val).mean()

# Accuracy on the test data, computed the same way
test_predictions = best_model.predict(X_test)
test_accuracy = (test_predictions == y_test).mean()

# Print the accuracy results
print(f"Best Model: {best_model_name}")
print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")


Best Model: XGBClassifier
Validation Accuracy: 0.9359
Test Accuracy: 0.9298
