In [1]:
import pandas as pd
import torch
import torch.nn as nn
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK, pyll
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics.pairwise import cosine_similarity
from pandarallel import pandarallel
import joblib

pandarallel.initialize(progress_bar=True)


INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
df = pd.read_pickle("../../data/esm2_embeddings.pkl")
print(df.LABEL.value_counts())
df.head()

LABEL
NEUTRAL    83924
LOF        25376
GOF         3137
Name: count, dtype: int64


Unnamed: 0,VARIANTKEY,LABEL,ENSG,GENE_SYMBOL,AA_POSITION,PROTEIN_REF,PROTEIN_ALT,REF_EMBEDDING_ESM2,ALT_EMBEDDING_ESM2
0,1-100196274-A-C,LOF,ENSG00000137992,DBT,477,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,"[-0.34903002, -1.0991553, -5.5619345, -0.78340...","[-0.35547394, -1.0680466, -5.513677, -0.804930..."
1,1-100196286-T-C,NEUTRAL,ENSG00000137992,DBT,473,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,"[-0.34903002, -1.0991553, -5.5619345, -0.78340...","[-0.34180462, -1.1007842, -5.556662, -0.767595..."
2,1-100196349-T-C,LOF,ENSG00000137992,DBT,452,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,"[-0.34903002, -1.0991553, -5.5619345, -0.78340...","[-0.37278727, -1.1201291, -5.4481387, -0.74900..."
3,1-100206470-G-A,LOF,ENSG00000137992,DBT,395,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,"[-0.34903002, -1.0991553, -5.5619345, -0.78340...","[-0.3627783, -1.0958921, -5.580294, -0.7817215..."
4,1-100206621-C-T,LOF,ENSG00000137992,DBT,345,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,"[-0.34903002, -1.0991553, -5.5619345, -0.78340...","[-0.3289509, -1.0981828, -5.574082, -0.8473001..."


In [3]:
# Preprocess the data
label_mapping = {'NEUTRAL': 0, 'LOF': 1, 'GOF': 2}
df['LABEL_MAP'] = df['LABEL'].map(label_mapping)


In [4]:
ref_embedding_df = df['REF_EMBEDDING_ESM2'].apply(pd.Series)
ref_embedding_df.columns = ['ref_' + str(col) for col in ref_embedding_df.columns]

alt_embedding_df = df['ALT_EMBEDDING_ESM2'].apply(pd.Series)
alt_embedding_df.columns = ['alt_' + str(col) for col in alt_embedding_df.columns]

X = pd.concat([ref_embedding_df, alt_embedding_df], axis=1)
y = df['LABEL_MAP']


In [5]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train

Unnamed: 0,ref_0,ref_1,ref_2,ref_3,ref_4,ref_5,ref_6,ref_7,ref_8,ref_9,...,alt_1270,alt_1271,alt_1272,alt_1273,alt_1274,alt_1275,alt_1276,alt_1277,alt_1278,alt_1279
97954,1.007159,0.625890,-1.147749,-0.868848,-0.398689,0.180315,3.564031,2.325544,2.305351,2.573489,...,3.042873,-0.673230,-5.108484,0.487474,1.192972,-3.002266,1.256073,-1.461449,2.503974,1.683331
42165,-2.241435,1.962986,-1.438145,-1.125100,-1.529073,1.308071,2.555913,-7.813066,2.284248,3.498311,...,3.884404,2.940055,-4.310533,-1.498106,1.020644,2.346589,-3.815270,-1.028258,-1.908015,-3.430133
24865,1.319570,-3.345788,-0.619051,1.600786,4.437977,-5.445426,1.858398,0.188545,-1.095894,0.880065,...,6.449509,1.558457,-4.582923,-5.528811,-0.091863,-6.554604,1.724669,0.204377,2.391893,1.059249
95444,2.955713,-2.664345,-1.761680,-3.133312,-3.024415,-1.363234,3.160049,1.948040,3.395301,7.876840,...,3.997331,-2.330583,-5.338358,-5.821898,-0.683085,2.740407,-0.086905,-1.680465,2.469670,-1.644204
56201,0.699221,-1.364980,2.463659,0.971319,-1.628326,-1.182589,2.951009,0.353827,0.955772,1.752395,...,3.813994,1.182792,-2.976541,-1.034894,2.737365,-5.111894,-4.260856,1.795724,3.894674,3.697846
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76820,-0.080988,2.473662,-0.126151,-1.560510,2.889087,-0.289816,4.128846,-0.811071,-2.659663,2.503237,...,6.120043,-0.689029,-1.760294,-1.060103,0.623009,-1.475985,1.091164,-0.568905,-0.509177,2.380261
110268,1.034920,-3.714515,-3.785233,2.337017,-3.315877,-1.621261,1.621308,-2.648594,2.212569,-0.123004,...,3.749042,3.388148,2.179872,-4.475160,-1.750455,0.799229,-0.229550,3.592467,2.306698,-0.336641
103694,9.674859,-0.483721,4.663747,3.516311,-2.619012,-9.629010,-4.015301,-0.107326,-0.223116,5.709130,...,15.438075,2.647180,-4.780652,-2.466497,-3.827278,-5.084875,-0.864906,-1.133607,3.089062,-5.357327
860,0.788752,-1.672394,1.632381,0.160317,2.809713,-2.452529,-2.461202,3.469492,2.983599,5.904491,...,3.213947,2.201417,-4.456237,0.749377,-5.141415,5.032283,-1.418648,-1.070773,-6.819933,-2.118454


In [8]:
# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)
# Evaluate the model
lr_pred = model.predict(X_test)
print(classification_report(y_test, lr_pred, target_names=label_mapping.keys()))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

     NEUTRAL       0.85      0.92      0.89     16880
         LOF       0.68      0.52      0.59      5027
         GOF       0.48      0.24      0.32       581

    accuracy                           0.82     22488
   macro avg       0.67      0.56      0.60     22488
weighted avg       0.80      0.82      0.80     22488



In [9]:
# Train an XGBoost model
xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)

# Evaluate the XGBoost model
xgb_pred = xgb_model.predict(X_test)
print(classification_report(y_test, xgb_pred, target_names=label_mapping.keys()))

              precision    recall  f1-score   support

     NEUTRAL       0.90      0.93      0.91     16880
         LOF       0.76      0.67      0.71      5027
         GOF       0.76      0.61      0.68       581

    accuracy                           0.86     22488
   macro avg       0.81      0.74      0.77     22488
weighted avg       0.86      0.86      0.86     22488



In [10]:
# Train a LightGBM model
lgb_model = LGBMClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
lgb_model.fit(X_train, y_train)

# Evaluate the LightGBM model
lgb_pred = lgb_model.predict(X_test)
print(classification_report(y_test, lgb_pred, target_names=label_mapping.keys()))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.634313 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 652640
[LightGBM] [Info] Number of data points in the train set: 89949, number of used features: 2560
[LightGBM] [Info] Start training from score -0.293894
[LightGBM] [Info] Start training from score -1.486211
[LightGBM] [Info] Start training from score -3.560799
              precision    recall  f1-score   support

     NEUTRAL       0.90      0.93      0.91     16880
         LOF       0.75      0.68      0.71      5027
         GOF       0.74      0.62      0.67       581

    accuracy                           0.86     22488
   macro avg       0.80      0.74      0.77     22488
weighted avg       0.86      0.86      0.86     22488



In [11]:
# Convert data to PyTorch tensors
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32).to(device)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long).to(device)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long).to(device)

# Define the deep learning model
class ProteinClassifier(nn.Module):
    def __init__(self, input_size, hidden_sizes, num_classes, dropout_rate=0.5):
        super(ProteinClassifier, self).__init__()
        self.hidden_layers = nn.ModuleList()
        self.hidden_layers.append(nn.Linear(input_size, hidden_sizes[0]))
        for i in range(1, len(hidden_sizes)):
            self.hidden_layers.append(nn.Linear(hidden_sizes[i-1], hidden_sizes[i]))
        self.output_layer = nn.Linear(hidden_sizes[-1], num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        for layer in self.hidden_layers:
            x = self.relu(layer(x))
            x = self.dropout(x)
        x = self.output_layer(x)
        return x

# Set hyperparameters
input_size = X_train_tensor.shape[1]
hidden_sizes = [256, 128]  # Increased hidden layer sizes
num_classes = 3
num_epochs = 100  # Increased number of epochs
batch_size = 32
learning_rate = 0.001
weight_decay = 0.001  # Added weight decay regularization

# Initialize the model
model = ProteinClassifier(input_size, hidden_sizes, num_classes).to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)  # Changed optimizer to AdamW

# Train the model
for epoch in range(num_epochs):
    for i in range(0, len(X_train_tensor), batch_size):
        batch_X = X_train_tensor[i:i+batch_size]
        batch_y = y_train_tensor[i:i+batch_size]

        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

#  Evaluate the model
with torch.no_grad():
    dl_outputs = model(X_test_tensor)
    _, dl_predicted = torch.max(dl_outputs.data, 1)

print(classification_report(y_test_tensor.cpu(), dl_predicted.cpu(), target_names=label_mapping.keys()))

Epoch [10/100], Loss: 0.4874
Epoch [20/100], Loss: 0.5093
Epoch [30/100], Loss: 0.4465
Epoch [40/100], Loss: 0.6383
Epoch [50/100], Loss: 0.5556
Epoch [60/100], Loss: 0.4661
Epoch [70/100], Loss: 0.4715
Epoch [80/100], Loss: 0.4689
Epoch [90/100], Loss: 0.5170
Epoch [100/100], Loss: 0.4880
              precision    recall  f1-score   support

     NEUTRAL       0.82      0.96      0.88     16880
         LOF       0.74      0.41      0.53      5027
         GOF       0.00      0.00      0.00       581

    accuracy                           0.81     22488
   macro avg       0.52      0.46      0.47     22488
weighted avg       0.78      0.81      0.78     22488



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [6]:
def objective(params):
    lgb_model = LGBMClassifier(**params, random_state=42)
    scores = cross_val_score(lgb_model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
    return {'loss': -scores.mean(), 'status': STATUS_OK}

# Define the hyperparameter search space
space = {
    'num_leaves': pyll.scope.int(hp.quniform('num_leaves', 31, 127, 1)),
    'max_depth': pyll.scope.int(hp.quniform('max_depth', 5, 9, 1)),
    'learning_rate': hp.loguniform('learning_rate', -5, 0),
    'n_estimators': pyll.scope.int(hp.quniform('n_estimators', 50, 200, 1)),
    'subsample': hp.uniform('subsample', 0.6, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0)
}

# Perform Bayesian optimization
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=50, trials=trials)

best_params = {
  "num_leaves": int(best['num_leaves']),
  "max_depth": int(best['max_depth']),
  "n_estimators": int(best['n_estimators']),
  **{k: v for k, v in best.items() if k not in ["num_leaves", "max_depth", "n_estimators"]}
}

print(best_params)

best_lgb_model = LGBMClassifier(**best_params, random_state=42)
best_lgb_model.fit(X_train, y_train)

# Evaluate the tuned LightGBM model
best_lgb_pred = best_lgb_model.predict(X_test)
print(classification_report(y_test, best_lgb_pred, target_names=label_mapping.keys()))
joblib.dump(best_lgb_model, '../../data/lgb_esm2.pkl')

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?][LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 7.984817 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 652628
[LightGBM] [Info] Number of data points in the train set: 71960, number of used features: 2560
[LightGBM] [Info] Start training from score -0.293909
[LightGBM] [Info] Start training from score -1.486173
[LightGBM] [Info] Start training from score -3.560713
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 8.927594 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 652639
[LightGBM] [Info] Number of data points in the train set: 71959, number of used features: 2560
[LightGBM] [Info] Start training from score -0.293895
[LightGBM] [Info] Start training from score -1.486221
[LightGBM] [Info] Start training from score -3.560699
[LightGBM] [Info] Auto-choosing col-

['../../data/lgb_esm2.pkl']

In [8]:
df["PREDICTION"] = best_lgb_model.predict(X)
df["PREDICTED_LABEL"] = df["PREDICTION"].map({v: k for k, v in label_mapping.items()})

In [9]:
# Relationship between cosine similarity and label
def cosine_similarity_score(row):
    return cosine_similarity([row["REF_EMBEDDING_ESM2"]], [row["ALT_EMBEDDING_ESM2"]])[0][0]

df['COSINE_SIMILARITY'] = df.parallel_apply(cosine_similarity_score, axis=1)
df['COSINE_DISTANCE'] = 1 - df['COSINE_SIMILARITY']

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=11244), Label(value='0 / 11244')))…

In [10]:
df.head()

Unnamed: 0,VARIANTKEY,LABEL,ENSG,GENE_SYMBOL,AA_POSITION,PROTEIN_REF,PROTEIN_ALT,REF_EMBEDDING_ESM2,ALT_EMBEDDING_ESM2,LABEL_MAP,PREDICTION,PREDICTED_LABEL,COSINE_SIMILARITY,COSINE_DISTANCE
0,1-100196274-A-C,LOF,ENSG00000137992,DBT,477,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,"[-0.34903002, -1.0991553, -5.5619345, -0.78340...","[-0.35547394, -1.0680466, -5.513677, -0.804930...",1,1,LOF,0.999994,6e-06
1,1-100196286-T-C,NEUTRAL,ENSG00000137992,DBT,473,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,"[-0.34903002, -1.0991553, -5.5619345, -0.78340...","[-0.34180462, -1.1007842, -5.556662, -0.767595...",0,0,NEUTRAL,0.999997,3e-06
2,1-100196349-T-C,LOF,ENSG00000137992,DBT,452,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,"[-0.34903002, -1.0991553, -5.5619345, -0.78340...","[-0.37278727, -1.1201291, -5.4481387, -0.74900...",1,1,LOF,0.999976,2.4e-05
3,1-100206470-G-A,LOF,ENSG00000137992,DBT,395,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,"[-0.34903002, -1.0991553, -5.5619345, -0.78340...","[-0.3627783, -1.0958921, -5.580294, -0.7817215...",1,1,LOF,0.999997,3e-06
4,1-100206621-C-T,LOF,ENSG00000137992,DBT,345,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,MAAVRMLRTWSRNAGKLICVRYFQTCGNVHVLKPNYVCFFGYPSFK...,"[-0.34903002, -1.0991553, -5.5619345, -0.78340...","[-0.3289509, -1.0981828, -5.574082, -0.8473001...",1,1,LOF,0.999992,8e-06


In [11]:
df.to_pickle("../../data/esm2_embeddings_with_predictions.pkl")