In [None]:
pip install transformers pandas scikit-learn torch lazypredict


In [None]:
import pandas as pd

## Peptides: ProtBERT embeddings and classifier comparison

In [None]:
import torch
from transformers import BertForMaskedLM, BertTokenizer, BertModel

# Read the peptide table; later statements use its 'sequence' and 'label' columns
data = pd.read_csv('/kaggle/input/new-combined-data/combined_peptides.csv')

# ProtBERT tokenizer (case preserved) and encoder; the encoder is placed on the GPU right away
tokenizer = BertTokenizer.from_pretrained('Rostlab/prot_bert', do_lower_case=False)
model = BertModel.from_pretrained('Rostlab/prot_bert').to('cuda')

def add_spaces(s):
    """Return `s` with a single space placed between consecutive characters."""
    return ' '.join(char for char in s)

# Labels are kept as a plain list, aligned row-for-row with the sequences
labels = data['label'].tolist()
# Space-separate every residue of every sequence before tokenization
sequences = [add_spaces(raw_sequence) for raw_sequence in data['sequence'].tolist()]

# Tokenize sequences
def tokenize_sequences(sequences, tokenizer, max_length=512):
    """Encode each sequence separately with `tokenizer`.

    Every call requests PyTorch tensors, pads to `max_length` and truncates
    anything longer than `max_length`.

    Returns:
        A list with one tokenizer output per input sequence, in input order.
    """
    encode_options = dict(
        return_tensors='pt',
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )
    return [tokenizer(seq, **encode_options) for seq in sequences]

# Encode all spaced sequences using the function's default max_length
tokenized_sequences = tokenize_sequences(sequences=sequences, tokenizer=tokenizer)

# Extract features
def extract_features(tokenized_sequences, model):
    """Embed each tokenized sequence as the mean of its non-padding token vectors.

    Args:
        tokenized_sequences: Iterable of tokenizer outputs, each a mapping of
            tensor name -> tensor (e.g. ``input_ids``, ``attention_mask``).
        model: Encoder whose output exposes ``last_hidden_state``.

    Returns:
        A list of NumPy arrays, one per encoding, in input order. With one
        sequence per encoding each array is 1-D (the hidden dimension).
    """
    model.eval()
    # Follow the device the model already lives on instead of assuming 'cuda'.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    all_embeddings = []
    with torch.no_grad():
        for tokenized_sequence in tokenized_sequences:
            tokenized_sequence = {key: val.to(device) for key, val in tokenized_sequence.items()}
            outputs = model(**tokenized_sequence)
            hidden = outputs.last_hidden_state

            attention_mask = tokenized_sequence.get('attention_mask')
            if attention_mask is None:
                # No mask available: fall back to a plain average over all positions.
                pooled = hidden.mean(dim=1)
            else:
                # Sequences are padded to a fixed length upstream; averaging the
                # padding positions would swamp short sequences, so weight each
                # position by its attention mask and divide by the real length.
                weights = attention_mask.unsqueeze(-1).to(hidden.dtype)
                pooled = (hidden * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)

            # Drop only the batch axis (size 1 for one sequence per encoding).
            all_embeddings.append(pooled.squeeze(0).cpu().numpy())
    return all_embeddings

# One pooled ProtBERT vector per tokenized sequence
sequence_embeddings = extract_features(tokenized_sequences=tokenized_sequences, model=model)

# Rows are sequences, columns are embedding dimensions; the label goes in as the last column
embedding_df = pd.DataFrame(data=sequence_embeddings)
embedding_df['label'] = labels

# Persist without the index so later cells can reload the table as-is
embedding_df.to_csv('/kaggle/working/embeddings.csv', index=False)


In [None]:
from sklearn.model_selection import train_test_split

# Reload the embedding table written by the feature-extraction cell
embedding_df = pd.read_csv('/kaggle/working/embeddings.csv')

# Target column on one side, every remaining column as the feature matrix
y = embedding_df['label']
X = embedding_df.drop(columns=['label'])

# Hold out 40 % first, then halve the hold-out: 60 % train, 20 % validation, 20 % test
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Write each partition to disk, features next to their labels
X_train.to_csv('/kaggle/working/X_train.csv', index=False)
y_train.to_csv('/kaggle/working/y_train.csv', index=False)
X_val.to_csv('/kaggle/working/X_val.csv', index=False)
y_val.to_csv('/kaggle/working/y_val.csv', index=False)
X_test.to_csv('/kaggle/working/X_test.csv', index=False)
y_test.to_csv('/kaggle/working/y_test.csv', index=False)


In [None]:
from lazypredict.Supervised import LazyClassifier
from sklearn.metrics import accuracy_score

# Reload the persisted partitions: feature frames as DataFrames, labels flattened to 1-D arrays
X_train, X_val, X_test = (
    pd.read_csv(path)
    for path in (
        '/kaggle/working/X_train.csv',
        '/kaggle/working/X_val.csv',
        '/kaggle/working/X_test.csv',
    )
)
y_train, y_val, y_test = (
    pd.read_csv(path).values.ravel()
    for path in (
        '/kaggle/working/y_train.csv',
        '/kaggle/working/y_val.csv',
        '/kaggle/working/y_test.csv',
    )
)

# Quiet LazyClassifier run with its default metrics only
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

# Fit on the training partition and score on the validation partition (the test set is not used here)
models, predictions = clf.fit(X_train, X_val, y_train, y_val)

# Show the score table
print(models)

In [None]:
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Candidate classifiers keyed by display name. The stochastic estimators get a
# fixed seed (same value as the train/val/test split) so that the reported
# accuracies and the selected model are reproducible across runs.
model_dict = {
    'LGBMClassifier': LGBMClassifier(random_state=42),
    'XGBClassifier': XGBClassifier(random_state=42),
    'RandomForestClassifier': RandomForestClassifier(random_state=42),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(),
    'SVC': SVC(),
    'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
}

# Fit and evaluate each model; selection uses validation accuracy only,
# test accuracy is printed for reference.
best_model_name = None
best_accuracy = 0

# The loop variable is deliberately not called `model`: that name holds the
# ProtBERT encoder in this notebook and must not be overwritten by a classifier.
for model_name, classifier in model_dict.items():
    classifier.fit(X_train, y_train)
    val_predictions = classifier.predict(X_val)
    test_predictions = classifier.predict(X_test)
    val_accuracy = accuracy_score(y_val, val_predictions)
    test_accuracy = accuracy_score(y_test, test_predictions)
    print(f"{model_name} validation accuracy: {val_accuracy:.4f}, test accuracy: {test_accuracy:.4f}")

    # Strict '>' keeps the first model in dictionary order on ties
    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        best_model_name = model_name

print(f"\nBest Model Name: {best_model_name}")


## Proteins: ProtBERT embeddings and classifier comparison

In [None]:
import torch
from transformers import BertForMaskedLM, BertTokenizer, BertModel

# Read the protein table; later statements use its 'sequence' and 'label' columns
data = pd.read_csv('/kaggle/input/new-combined-data/combined_protein.csv')

# ProtBERT tokenizer (case preserved) and encoder; the encoder is placed on the GPU right away
tokenizer = BertTokenizer.from_pretrained('Rostlab/prot_bert', do_lower_case=False)
model = BertModel.from_pretrained('Rostlab/prot_bert').to('cuda')

def add_spaces(s):
    """Return `s` with a single space placed between consecutive characters."""
    return ' '.join(char for char in s)

# Labels are kept as a plain list, aligned row-for-row with the sequences
labels = data['label'].tolist()
# Space-separate every residue of every sequence before tokenization
sequences = [add_spaces(raw_sequence) for raw_sequence in data['sequence'].tolist()]

# Tokenize sequences
def tokenize_sequences(sequences, tokenizer, max_length=512):
    """Encode each sequence separately with `tokenizer`.

    Every call requests PyTorch tensors, pads to `max_length` and truncates
    anything longer than `max_length`.

    Returns:
        A list with one tokenizer output per input sequence, in input order.
    """
    encode_options = dict(
        return_tensors='pt',
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )
    return [tokenizer(seq, **encode_options) for seq in sequences]

# Encode all spaced sequences using the function's default max_length
tokenized_sequences = tokenize_sequences(sequences=sequences, tokenizer=tokenizer)

# Extract features
def extract_features(tokenized_sequences, model):
    """Embed each tokenized sequence as the mean of its non-padding token vectors.

    Args:
        tokenized_sequences: Iterable of tokenizer outputs, each a mapping of
            tensor name -> tensor (e.g. ``input_ids``, ``attention_mask``).
        model: Encoder whose output exposes ``last_hidden_state``.

    Returns:
        A list of NumPy arrays, one per encoding, in input order. With one
        sequence per encoding each array is 1-D (the hidden dimension).
    """
    model.eval()
    # Follow the device the model already lives on instead of assuming 'cuda'.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    all_embeddings = []
    with torch.no_grad():
        for tokenized_sequence in tokenized_sequences:
            tokenized_sequence = {key: val.to(device) for key, val in tokenized_sequence.items()}
            outputs = model(**tokenized_sequence)
            hidden = outputs.last_hidden_state

            attention_mask = tokenized_sequence.get('attention_mask')
            if attention_mask is None:
                # No mask available: fall back to a plain average over all positions.
                pooled = hidden.mean(dim=1)
            else:
                # Sequences are padded to a fixed length upstream; averaging the
                # padding positions would swamp short sequences, so weight each
                # position by its attention mask and divide by the real length.
                weights = attention_mask.unsqueeze(-1).to(hidden.dtype)
                pooled = (hidden * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)

            # Drop only the batch axis (size 1 for one sequence per encoding).
            all_embeddings.append(pooled.squeeze(0).cpu().numpy())
    return all_embeddings

# One pooled ProtBERT vector per tokenized sequence
sequence_embeddings = extract_features(tokenized_sequences=tokenized_sequences, model=model)

# Rows are sequences, columns are embedding dimensions; the label goes in as the last column
embedding_df = pd.DataFrame(data=sequence_embeddings)
embedding_df['label'] = labels

# Persist without the index so later cells can reload the table as-is
embedding_df.to_csv('/kaggle/working/embeddings_pro.csv', index=False)


In [None]:
from sklearn.model_selection import train_test_split

# Reload the embedding table written by the feature-extraction cell
embedding_df = pd.read_csv('/kaggle/working/embeddings_pro.csv')

# Target column on one side, every remaining column as the feature matrix
y = embedding_df['label']
X = embedding_df.drop(columns=['label'])

# Hold out 40 % first, then halve the hold-out: 60 % train, 20 % validation, 20 % test
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Write each partition to disk, features next to their labels
X_train.to_csv('/kaggle/working/X_train_pro.csv', index=False)
y_train.to_csv('/kaggle/working/y_train_pro.csv', index=False)
X_val.to_csv('/kaggle/working/X_val_pro.csv', index=False)
y_val.to_csv('/kaggle/working/y_val_pro.csv', index=False)
X_test.to_csv('/kaggle/working/X_test_pro.csv', index=False)
y_test.to_csv('/kaggle/working/y_test_pro.csv', index=False)


In [None]:
from lazypredict.Supervised import LazyClassifier
from sklearn.metrics import accuracy_score

# Reload the persisted partitions: feature frames as DataFrames, labels flattened to 1-D arrays
X_train, X_val, X_test = (
    pd.read_csv(path)
    for path in (
        '/kaggle/working/X_train_pro.csv',
        '/kaggle/working/X_val_pro.csv',
        '/kaggle/working/X_test_pro.csv',
    )
)
y_train, y_val, y_test = (
    pd.read_csv(path).values.ravel()
    for path in (
        '/kaggle/working/y_train_pro.csv',
        '/kaggle/working/y_val_pro.csv',
        '/kaggle/working/y_test_pro.csv',
    )
)

# Quiet LazyClassifier run with its default metrics only
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

# Fit on the training partition and score on the validation partition (the test set is not used here)
models, predictions = clf.fit(X_train, X_val, y_train, y_val)

# Show the score table
print(models)

In [None]:
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Candidate classifiers keyed by display name. The stochastic estimators get a
# fixed seed (same value as the train/val/test split) so that the reported
# accuracies and the selected model are reproducible across runs.
model_dict = {
    'LGBMClassifier': LGBMClassifier(random_state=42),
    'XGBClassifier': XGBClassifier(random_state=42),
    'RandomForestClassifier': RandomForestClassifier(random_state=42),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(),
    'SVC': SVC(),
    'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
}

# Fit and evaluate each model; selection uses validation accuracy only,
# test accuracy is printed for reference.
best_model_name = None
best_accuracy = 0

# The loop variable is deliberately not called `model`: that name holds the
# ProtBERT encoder in this notebook and must not be overwritten by a classifier.
for model_name, classifier in model_dict.items():
    classifier.fit(X_train, y_train)
    val_predictions = classifier.predict(X_val)
    test_predictions = classifier.predict(X_test)
    val_accuracy = accuracy_score(y_val, val_predictions)
    test_accuracy = accuracy_score(y_test, test_predictions)
    print(f"{model_name} validation accuracy: {val_accuracy:.4f}, test accuracy: {test_accuracy:.4f}")

    # Strict '>' keeps the first model in dictionary order on ties
    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        best_model_name = model_name

print(f"\nBest Model Name: {best_model_name}")
