### This notebook contains the analyses performed on the human dataset's embeddings

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch, json, re

from scipy.cluster.hierarchy import dendrogram, linkage

from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering

from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Use the first CUDA GPU when PyTorch reports one is available; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [None]:
# Load the DNA and protein embedding tables from CSV files in the working directory.
dna_csv_path = 'dna_embeddings_human.csv'
protein_csv_path = 'protein_embeddings_human.csv'

df_dna = pd.read_csv(dna_csv_path)
df_protein = pd.read_csv(protein_csv_path)

In [None]:
# Decode the JSON-encoded embeddings into NumPy arrays and remove redundant columns.
# Both tables get the same treatment, so they are processed in one loop (DNA first).
drop_cols = ['Unnamed: 0', 'embeddings_np', 'embeddings_tensor', 'embeddings_json']
for frame in (df_dna, df_protein):
    # Each 'embeddings_json' cell is parsed with json.loads and wrapped in an array.
    frame['embeddings'] = frame['embeddings_json'].apply(lambda raw: np.array(json.loads(raw)))
    # errors='ignore' skips any listed column that is not present in this table.
    frame.drop(columns=drop_cols, inplace=True, errors='ignore')

### Random forest classifier - predict gene group

In [None]:
def print_measures(y_test, y_pred):
    """Print classification metrics comparing true labels with predictions.

    Prints, in order: accuracy, per-class precision, per-class recall,
    per-class F1 (each computed with ``average=None``) and the full
    classification report.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, aligned with ``y_test``.
    """
    per_class = {"average": None}
    sections = (
        ("Accuracy: ", accuracy_score, {}),
        ("Precision per class:\n", precision_score, per_class),
        ("Recall per class:\n", recall_score, per_class),
        ("F1-Score per class:\n", f1_score, per_class),
        ("Classification Report:\n", classification_report, {}),
    )
    # Each metric is computed right before it is printed, one section at a time.
    for heading, metric, options in sections:
        print(f"{heading}{metric(y_test, y_pred, **options)}")

In [None]:
def random_forest_classifier_kfold(df, col_to_pred, n_splits=4):
    """Cross-validate a random forest on the embeddings and print pooled metrics.

    The ``'embeddings'`` column of ``df`` is expanded into a feature table
    (one column per embedding component) and ``df[col_to_pred]`` is used as
    the target. A fresh ``RandomForestClassifier`` is trained on every fold of
    a shuffled ``KFold`` split (plain K-fold, not stratified by class); the
    held-out predictions of all folds are pooled and reported once through
    ``print_measures``.

    Args:
        df: DataFrame with an ``'embeddings'`` column and the target column.
        col_to_pred: Name of the column to predict.
        n_splits: Number of cross-validation folds.

    Returns:
        None. The metrics are printed.
    """
    features = pd.DataFrame(df['embeddings'].tolist())
    targets = df[col_to_pred]

    # Fixed seeds keep both the fold assignment and the forests reproducible.
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    pooled_true, pooled_pred = [], []
    for train_idx, test_idx in splitter.split(features):
        model = RandomForestClassifier(random_state=42)
        model.fit(features.iloc[train_idx], targets.iloc[train_idx])

        pooled_true.extend(targets.iloc[test_idx])
        pooled_pred.extend(model.predict(features.iloc[test_idx]))

    print_measures(pooled_true, pooled_pred)

In [None]:
# Evaluate the classifier on the DNA embeddings first, then on the protein embeddings,
# predicting the 'class' column in both cases.
for frame in (df_dna, df_protein):
    random_forest_classifier_kfold(frame, 'class')

###  Hierarchical clustering (by gene)

In [None]:
# Two stacked dendrograms: protein embeddings on top (A), DNA embeddings below (B).
fig, ax = plt.subplots(2, 1, figsize=(20, 10))
ax1, ax2 = ax

plt.suptitle("Dendrogram of Hierarchical Clustering", fontsize=20)

# (axes, data frame, panel title, dendrogram colour threshold, panel tag)
panels = (
    (ax1, df_protein, "ProteinBERT", 1.5, "A."),
    (ax2, df_dna, "DNABERT-2", 0.7, "B."),
)

for panel_ax, frame, model_name, cut_height, panel_tag in panels:
    X = np.array(frame['embeddings'].tolist())
    # NOTE(review): linkage() receives the square cosine-similarity matrix. SciPy
    # treats a 2-D input as observation vectors (one row per sample), not as
    # pairwise distances, so Ward is computed on Euclidean distances between
    # similarity profiles. Confirm this is intended; the colour thresholds are
    # presumably tuned to this output.
    Z = linkage(cosine_similarity(X), 'ward')

    panel_ax.set_title(model_name, fontsize=15)
    dendrogram(
        Z,
        labels=frame['gene'].values,
        leaf_rotation=30,
        leaf_font_size=12,
        color_threshold=cut_height,
        ax=panel_ax,
    )
    panel_ax.set_ylabel('Distance')
    # Panel tag placed just outside the top-left corner, in axes coordinates.
    panel_ax.text(-0.05, 1.1, panel_tag, fontsize=18, ha='center', va='center',
                  transform=panel_ax.transAxes)

# Only the bottom panel carries the x-axis label.
ax2.set_xlabel('Gene')

fig.tight_layout()
plt.show()

In [None]:
# Elbow method for KMeans: fit k = 1..19 on each embedding matrix and plot the inertia.
X_protein = np.array(df_protein['embeddings'].tolist())
X_dna = np.array(df_dna['embeddings'].tolist())

K = range(1, 20)
# Every fit uses the same fixed seed, so each k is independent of the others.
inertia_protein = [KMeans(n_clusters=k, random_state=42).fit(X_protein).inertia_ for k in K]
inertia_dna = [KMeans(n_clusters=k, random_state=42).fit(X_dna).inertia_ for k in K]

plt.figure(figsize=(12, 6))
curves = (
    (inertia_protein, 'Elbow Method for Protein'),
    (inertia_dna, 'Elbow Method for DNA'),
)
# One subplot per curve, side by side (subplot positions are 1-based).
for position, (inertias, heading) in enumerate(curves, start=1):
    plt.subplot(1, 2, position)
    plt.plot(K, inertias, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Inertia')
    plt.title(heading)
plt.show()

In [None]:
# KMeans clustering of both embedding sets, plus 2-D t-SNE projections for plotting.
num_clusters = 5

# Cluster assignments, computed on the original embeddings (not on the projections).
kmeans_protein = KMeans(n_clusters=num_clusters, random_state=42).fit(X_protein)
protein_labels = kmeans_protein.labels_

kmeans_dna = KMeans(n_clusters=num_clusters, random_state=42).fit(X_dna)
dna_labels = kmeans_dna.labels_

# The same TSNE estimator is used for both matrices; each fit_transform call refits it.
tsne = TSNE(n_components=2, random_state=42)
X_protein_tsne = tsne.fit_transform(X_protein)
X_dna_tsne = tsne.fit_transform(X_dna)

In [None]:
# Side-by-side t-SNE scatter plots coloured by KMeans cluster, with every point
# annotated with its gene name.
fig, ax = plt.subplots(1, 2, figsize=(12, 5))
plt.suptitle("t-SNE of KMeans Clustering", fontsize=20)
ax1, ax2 = ax

# (axes, 2-D coordinates, cluster labels, gene names, panel title, marker size, panel tag)
panels = (
    (ax1, X_protein_tsne, protein_labels, df_protein['gene'], "ProteinBERT", 20, "A."),
    (ax2, X_dna_tsne, dna_labels, df_dna['gene'], "DNABERT-2", 50, "B."),
)

for panel_ax, coords, cluster_ids, genes, model_name, marker_size, panel_tag in panels:
    panel_ax.scatter(coords[:, 0], coords[:, 1], c=cluster_ids, cmap='viridis', s=marker_size)
    panel_ax.set_title(model_name)
    panel_ax.set_xlabel("t-SNE 1")
    panel_ax.set_ylabel("t-SNE 2")
    # Panel tag placed just outside the top-left corner, in axes coordinates.
    panel_ax.text(-0.05, 1.01, panel_tag, fontsize=15, ha='center', va='center',
                  transform=panel_ax.transAxes)
    # Label each point with its gene name.
    for gene_name, (x_pos, y_pos) in zip(genes, coords):
        panel_ax.annotate(gene_name, (x_pos, y_pos), fontsize=8)

plt.tight_layout()
plt.show()