In [7]:
import pandas as pd
import torch
import torch.nn as nn
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK, pyll
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics.pairwise import cosine_similarity
from pandarallel import pandarallel
import joblib

pandarallel.initialize(progress_bar=True)


INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [8]:
df = pd.read_pickle("../../data/esm_v1_embeddings.pkl")
print(df.LABEL.value_counts())
df.head()

LABEL
NEUTRAL    83924
LOF        25376
GOF         3137
Name: count, dtype: int64


Unnamed: 0,VARIANTKEY,LABEL,ENSG,GENE_SYMBOL,AA_POSITION,PROTEIN_REF,PROTEIN_ALT,REF_EMBEDDING_ESM1v,ALT_EMBEDDING_ESM1v
0,1-100196274-A-C,LOF,ENSG00000137992,DBT,477,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,"[0.023711795, 0.17607689, -0.09241844, 0.00010...","[0.023188237, 0.17757455, -0.08903169, 0.00287..."
1,1-100196286-T-C,NEUTRAL,ENSG00000137992,DBT,473,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,"[0.023711795, 0.17607689, -0.09241844, 0.00010...","[0.017270943, 0.17832978, -0.089323506, -0.000..."
2,1-100196349-T-C,LOF,ENSG00000137992,DBT,452,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,"[0.023711795, 0.17607689, -0.09241844, 0.00010...","[0.020151792, 0.18066218, -0.09096193, 0.00096..."
3,1-100206470-G-A,LOF,ENSG00000137992,DBT,395,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,"[0.023711795, 0.17607689, -0.09241844, 0.00010...","[0.02822395, 0.17860094, -0.08821337, 0.002231..."
4,1-100206621-C-T,LOF,ENSG00000137992,DBT,345,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,"[0.023711795, 0.17607689, -0.09241844, 0.00010...","[0.01695501, 0.17416799, -0.088790074, 0.00124..."


In [9]:
# Preprocess the data
label_mapping = {'NEUTRAL': 0, 'LOF': 1, 'GOF': 2}
df['LABEL_MAP'] = df['LABEL'].map(label_mapping)


In [10]:
ref_embedding_df = df['REF_EMBEDDING_ESM1v'].apply(pd.Series)
ref_embedding_df.columns = ['ref_' + str(col) for col in ref_embedding_df.columns]

alt_embedding_df = df['ALT_EMBEDDING_ESM1v'].apply(pd.Series)
alt_embedding_df.columns = ['alt_' + str(col) for col in alt_embedding_df.columns]

X = pd.concat([ref_embedding_df, alt_embedding_df], axis=1)
y = df['LABEL_MAP']


In [11]:
TEST_SIZE = 0.1
VALIDATION_SIZE = 0.1
VALIDATION_RATIO = VALIDATION_SIZE / (1 - TEST_SIZE)

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=VALIDATION_RATIO, random_state=42)

print(f"Training set: {len(X_train)} samples")
print(f"Validation set: {len(X_val)} samples")
print(f"Testing set: {len(X_test)} samples")
X_train

Training set: 89949 samples
Validation set: 11244 samples
Testing set: 11244 samples


Unnamed: 0,ref_0,ref_1,ref_2,ref_3,ref_4,ref_5,ref_6,ref_7,ref_8,ref_9,...,alt_1270,alt_1271,alt_1272,alt_1273,alt_1274,alt_1275,alt_1276,alt_1277,alt_1278,alt_1279
20269,-0.146173,0.156376,0.000888,-0.230072,0.101029,0.101044,0.252387,-0.066951,0.097575,-0.065676,...,0.079426,-0.212979,-0.087866,0.041466,-0.093521,-0.027821,0.006296,-0.097347,0.006004,-0.173182
25081,0.029672,-0.051521,-0.033328,-0.299323,0.122467,-0.031444,0.084524,-0.034774,-0.108854,0.105211,...,0.024459,0.048786,-0.164407,-0.088154,-0.242851,-0.015880,0.074064,0.210618,0.101236,-0.196951
18979,0.060891,0.197601,0.043066,-0.039801,-0.333359,0.030640,0.048553,-0.112229,0.297236,-0.162119,...,-0.011876,-0.153362,-0.365641,-0.186887,-0.150051,-0.126997,0.129966,0.069477,0.077457,-0.122997
82624,-0.170117,0.194081,0.025448,-0.360824,-0.298668,0.042658,0.072794,-0.061644,0.150384,0.108005,...,-0.084211,-0.021815,-0.088528,-0.245504,-0.206662,-0.176167,0.015710,-0.156966,0.208727,0.089556
12669,-0.338375,0.136773,0.202468,-0.246902,-0.079970,0.020851,0.328304,-0.062548,0.109150,-0.203248,...,-0.034403,-0.201885,-0.134612,0.000358,-0.023993,-0.244789,0.084039,-0.163325,0.088148,-0.199793
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105470,-0.284750,-0.075759,0.025080,-0.025060,-0.174668,0.102968,0.017929,0.099065,0.263065,-0.345071,...,-0.189804,-0.271070,-0.075504,0.129895,0.018593,-0.257288,-0.017744,-0.149134,0.161583,-0.056928
86161,-0.356369,0.118840,0.091371,-0.330445,-0.230696,-0.080591,0.224906,-0.027180,0.174413,-0.225150,...,0.120394,0.058010,0.017630,-0.164787,-0.270029,-0.198696,-0.212453,-0.128897,0.179976,-0.053019
107790,-0.069459,0.197073,-0.066983,-0.010813,-0.276706,0.100385,0.010662,-0.006500,0.223018,-0.182063,...,0.091189,-0.084389,-0.200827,-0.032496,-0.122004,-0.139325,0.107268,0.047707,0.163671,0.078454
23934,-0.156729,0.128466,0.004666,-0.101992,-0.154433,-0.058414,-0.106457,0.037389,0.136786,-0.113681,...,0.006366,0.070988,-0.160017,0.104846,-0.141966,-0.126741,0.050888,0.047158,0.175657,0.068590


In [12]:
# Train a logistic regression model
linear_model = LogisticRegression()
linear_model.fit(X_train, y_train)
# Evaluate the model
lr_pred = linear_model.predict(X_test)
print(classification_report(y_test, lr_pred, target_names=label_mapping.keys()))

              precision    recall  f1-score   support

     NEUTRAL       0.85      0.92      0.88      8412
         LOF       0.67      0.52      0.58      2543
         GOF       0.56      0.29      0.38       289

    accuracy                           0.81     11244
   macro avg       0.69      0.58      0.62     11244
weighted avg       0.80      0.81      0.80     11244



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
# Train a random forest classification model
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
# Evaluate the model
rf_pred = rf_model.predict(X_test)
print(classification_report(y_test, rf_pred, target_names=label_mapping.keys()))

              precision    recall  f1-score   support

     NEUTRAL       0.92      0.94      0.93      8412
         LOF       0.78      0.74      0.76      2543
         GOF       0.85      0.77      0.81       289

    accuracy                           0.89     11244
   macro avg       0.85      0.82      0.83     11244
weighted avg       0.89      0.89      0.89     11244



In [14]:
# Train an XGBoost model
xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)

# Evaluate the XGBoost model
xgb_pred = xgb_model.predict(X_test)
print(classification_report(y_test, xgb_pred, target_names=label_mapping.keys()))

              precision    recall  f1-score   support

     NEUTRAL       0.90      0.94      0.92      8412
         LOF       0.79      0.67      0.72      2543
         GOF       0.87      0.65      0.75       289

    accuracy                           0.87     11244
   macro avg       0.85      0.76      0.80     11244
weighted avg       0.87      0.87      0.87     11244



In [15]:
# Train a LightGBM model
lgb_model = LGBMClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
lgb_model.fit(X_train, y_train)

# Evaluate the LightGBM model
lgb_pred = lgb_model.predict(X_test)
print(classification_report(y_test, lgb_pred, target_names=label_mapping.keys()))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.970470 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 652800
[LightGBM] [Info] Number of data points in the train set: 89949, number of used features: 2560
[LightGBM] [Info] Start training from score -0.291971
[LightGBM] [Info] Start training from score -1.491335
[LightGBM] [Info] Start training from score -3.570628
              precision    recall  f1-score   support

     NEUTRAL       0.90      0.94      0.92      8412
         LOF       0.78      0.67      0.72      2543
         GOF       0.81      0.73      0.77       289

    accuracy                           0.87     11244
   macro avg       0.83      0.78      0.80     11244
weighted avg       0.87      0.87      0.87     11244



In [16]:
# Convert data to PyTorch tensors
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32).to(device)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long).to(device)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long).to(device)

# Define the deep learning model
class ProteinClassifier(nn.Module):
    def __init__(self, input_size, hidden_sizes, num_classes, dropout_rate=0.5):
        super(ProteinClassifier, self).__init__()
        self.hidden_layers = nn.ModuleList()
        self.hidden_layers.append(nn.Linear(input_size, hidden_sizes[0]))
        for i in range(1, len(hidden_sizes)):
            self.hidden_layers.append(nn.Linear(hidden_sizes[i-1], hidden_sizes[i]))
        self.output_layer = nn.Linear(hidden_sizes[-1], num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        for layer in self.hidden_layers:
            x = self.relu(layer(x))
            x = self.dropout(x)
        x = self.output_layer(x)
        return x

# Set hyperparameters
input_size = X_train_tensor.shape[1]
hidden_sizes = [256, 128]  # Increased hidden layer sizes
num_classes = 3
num_epochs = 100  # Increased number of epochs
batch_size = 32
learning_rate = 0.001
weight_decay = 0.001  # Added weight decay regularization

# Initialize the model
model = ProteinClassifier(input_size, hidden_sizes, num_classes).to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)  # Changed optimizer to AdamW

# Train the model
for epoch in range(num_epochs):
    for i in range(0, len(X_train_tensor), batch_size):
        batch_X = X_train_tensor[i:i+batch_size]
        batch_y = y_train_tensor[i:i+batch_size]

        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

#  Evaluate the model
with torch.no_grad():
    dl_outputs = model(X_test_tensor)
    _, dl_predicted = torch.max(dl_outputs.data, 1)

print(classification_report(y_test_tensor.cpu(), dl_predicted.cpu(), target_names=label_mapping.keys()))

Epoch [10/100], Loss: 0.3478
Epoch [20/100], Loss: 0.2518
Epoch [30/100], Loss: 0.2463
Epoch [40/100], Loss: 0.2646
Epoch [50/100], Loss: 0.2372
Epoch [60/100], Loss: 0.2349
Epoch [70/100], Loss: 0.2761
Epoch [80/100], Loss: 0.2998
Epoch [90/100], Loss: 0.3425
Epoch [100/100], Loss: 0.2613
              precision    recall  f1-score   support

     NEUTRAL       0.90      0.91      0.90      8412
         LOF       0.71      0.68      0.69      2543
         GOF       0.58      0.57      0.58       289

    accuracy                           0.85     11244
   macro avg       0.73      0.72      0.72     11244
weighted avg       0.84      0.85      0.85     11244



In [17]:
def objective(params):
    lgb_model = LGBMClassifier(**params, random_state=42)
    scores = cross_val_score(lgb_model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
    return {'loss': -scores.mean(), 'status': STATUS_OK}

# Define the hyperparameter search space
space = {
    'num_leaves': pyll.scope.int(hp.quniform('num_leaves', 31, 127, 1)),
    'max_depth': pyll.scope.int(hp.quniform('max_depth', 5, 9, 1)),
    'learning_rate': hp.loguniform('learning_rate', -5, 0),
    'n_estimators': pyll.scope.int(hp.quniform('n_estimators', 50, 200, 1)),
    'subsample': hp.uniform('subsample', 0.6, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0)
}

# Perform Bayesian optimization
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=50, trials=trials)

best_params = {
  "num_leaves": int(best['num_leaves']),
  "max_depth": int(best['max_depth']),
  "n_estimators": int(best['n_estimators']),
  **{k: v for k, v in best.items() if k not in ["num_leaves", "max_depth", "n_estimators"]}
}

print(best_params)

best_lgb_model = LGBMClassifier(**best_params, random_state=42)
best_lgb_model.fit(X_train, y_train)

# Evaluate the tuned LightGBM model
best_lgb_pred = best_lgb_model.predict(X_test)
print(classification_report(y_test, best_lgb_pred, target_names=label_mapping.keys()))
joblib.dump(best_lgb_model, '../../data/lgb_esm1.pkl')

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?][LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.915930 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 652800
[LightGBM] [Info] Number of data points in the train set: 71959, number of used features: 2560
[LightGBM] [Info] Start training from score -0.291958
[LightGBM] [Info] Start training from score -1.491332
[LightGBM] [Info] Start training from score -3.571021
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.371273 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 652800
[LightGBM] [Info] Number of data points in the train set: 71960, number of used features: 2560
[LightGBM] [Info] Start training from score -0.291971
[LightGBM] [Info] Start training from score -1.491346
[LightGBM] [Info] Start training from score -3.570541
[LightGBM] [Info] Auto-choosing col-

['../../data/lgb_esm1.pkl']

In [18]:
df["PREDICTION"] = best_lgb_model.predict(X)
df["PREDICTED_LABEL"] = df["PREDICTION"].map({v: k for k, v in label_mapping.items()})

In [19]:
# Relationship between cosine similarity and label
def cosine_similarity_score(row):
    return cosine_similarity([row["REF_EMBEDDING_ESM1v"]], [row["ALT_EMBEDDING_ESM1v"]])[0][0]

df['COSINE_SIMILARITY'] = df.parallel_apply(cosine_similarity_score, axis=1)
df['COSINE_DISTANCE'] = 1 - df['COSINE_SIMILARITY']

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=11244), Label(value='0 / 11244')))…

In [20]:
df.head()

Unnamed: 0,VARIANTKEY,LABEL,ENSG,GENE_SYMBOL,AA_POSITION,PROTEIN_REF,PROTEIN_ALT,REF_EMBEDDING_ESM1v,ALT_EMBEDDING_ESM1v,LABEL_MAP,PREDICTION,PREDICTED_LABEL,COSINE_SIMILARITY,COSINE_DISTANCE
0,1-100196274-A-C,LOF,ENSG00000137992,DBT,477,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,"[0.023711795, 0.17607689, -0.09241844, 0.00010...","[0.023188237, 0.17757455, -0.08903169, 0.00287...",1,1,LOF,0.999989,1.1e-05
1,1-100196286-T-C,NEUTRAL,ENSG00000137992,DBT,473,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,"[0.023711795, 0.17607689, -0.09241844, 0.00010...","[0.017270943, 0.17832978, -0.089323506, -0.000...",0,0,NEUTRAL,0.999986,1.4e-05
2,1-100196349-T-C,LOF,ENSG00000137992,DBT,452,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,"[0.023711795, 0.17607689, -0.09241844, 0.00010...","[0.020151792, 0.18066218, -0.09096193, 0.00096...",1,1,LOF,0.999994,6e-06
3,1-100206470-G-A,LOF,ENSG00000137992,DBT,395,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,"[0.023711795, 0.17607689, -0.09241844, 0.00010...","[0.02822395, 0.17860094, -0.08821337, 0.002231...",1,1,LOF,0.999993,7e-06
4,1-100206621-C-T,LOF,ENSG00000137992,DBT,345,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,"[0.023711795, 0.17607689, -0.09241844, 0.00010...","[0.01695501, 0.17416799, -0.088790074, 0.00124...",1,1,LOF,0.999985,1.5e-05


In [21]:
df.to_pickle("../../data/esm_v1_embeddings_with_predictions.pkl")

In [23]:
train = X_train.copy()
train["label"] = y_train
train["PREDICTION"] = best_lgb_model.predict(X_train)
train["PREDICTED_LABEL"] = train["PREDICTION"].map({v: k for k, v in label_mapping.items()})

test = X_test.copy()
test["label"] = y_test
test["PREDICTION"] = best_lgb_model.predict(X_test)
test["PREDICTED_LABEL"] = test["PREDICTION"].map({v: k for k, v in label_mapping.items()})

validation = X_val.copy()
validation["label"] = y_val
validation["PREDICTION"] = best_lgb_model.predict(X_val)
validation["PREDICTED_LABEL"] = validation["PREDICTION"].map({v: k for k, v in label_mapping.items()})

train.to_pickle("../../data/esm_v1_embeddings_train.pkl")
test.to_pickle("../../data/esm_v1_embeddings_test.pkl")
validation.to_pickle("../../data/esm_v1_embeddings_validation.pkl")
