## Classification Model

This file details the classification model we used to predict the likelihood of Alzheimer's in female patients.

In [4]:
# Necessary Installations
pip install numpy pandas tensorflow scikit-learn
pip install git+https://github.com/genentech/gReLU.git



We import the required libraries and files below:

In [6]:
import pandas as pd
import grelu.resources
import grelu.sequence.format
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from tqdm import tqdm  # Import tqdm for progress bar
import numpy as np

  TF_GAMMAS = torch.load(str(DIR / "precomputed"/ "tf_gammas.pt"))


In [7]:
# Mount Google Drive at /content/drive so the input CSV stored there can be read.
# Colab-specific: the google.colab package is only available in a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


The data is then loaded and labelled by row position: even-indexed rows are marked "good" and odd-indexed rows are marked "faulty":

In [8]:
# Read the raw sequences. The CSV has no header row, so every line is data.
CSV_PATH = '/content/drive/My Drive/BioInformatics Hackathon/output.csv'
data = pd.read_csv(CSV_PATH, header=None)
data.columns = ['sequence']  # give the single column a name

# Labels alternate with row position: even index -> "good", odd index -> "faulty".
row_count = len(data)
data['label'] = ['faulty' if idx % 2 else 'good' for idx in range(row_count)]

# Keep a handle on the sequence column for the steps that follow.
sequences = data['sequence']

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mvheahe-work[0m ([33mvheahe-work-university-of-toronto[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Downloading large artifact human_fold0:latest, 711.00MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:7.9


The pretrained Borzoi model is loaded from gReLU:

In [None]:
# Fetch the "human_fold0" model of the "borzoi" project via gReLU's resources API.
model = grelu.resources.load_model(project="borzoi", model_name="human_fold0")


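Processing and data manipulation are performed first: the sequences are padded to the model's input length and passed through Borzoi. A Random Forest classifier is then trained on the Borzoi outputs to model the relationship between specific genetic mutations and the probability of a patient having Alzheimer's.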

In [None]:
# Convert sequences into a format that Borzoi can understand.
# Make sure all sequences are padded to match the expected input length:
# the target length is read from the loaded model's 'train_seq_len' data parameter.
expected_length = model.data_params['train_seq_len']

def pad_sequence(seq, length):
    """Force `seq` to `length` characters.

    Sequences shorter than `length` are right-padded with 'A'; sequences that
    are longer are cut off after the first `length` characters.
    """
    clipped = seq[:length]
    return clipped.ljust(length, 'A')

# Pad/truncate every sequence to the model's input length (with a progress bar).
input_seqs = [pad_sequence(seq, expected_length) for seq in tqdm(sequences, desc="Padding sequences")]

# Use Borzoi to make predictions on the input sequences
print("Running Borzoi model predictions...")


def _summarize_prediction(pred):
    """Reduce one batch of model output to a compact per-sequence feature matrix.

    Assumes `predict_on_seqs` returns an array shaped (batch, tracks, positions)
    — TODO confirm against the gReLU documentation. In that case the positional
    axis is averaged away, leaving one value per track. Any other rank is simply
    flattened per sequence.

    Returns a 2-D float32 array with one row per sequence in the batch.
    """
    pred = np.asarray(pred, dtype=np.float32)
    if pred.ndim == 3:
        return pred.mean(axis=-1)
    return pred.reshape(pred.shape[0], -1)


# Predict one sequence at a time so the progress bar advances per sequence.
# Each prediction is summarized immediately instead of being stored whole:
# keeping the full (tracks x positions) output for every sequence and then
# flattening it would need far more memory than is realistic for thousands of
# sequences, and would hand the classifier an enormous feature vector.
# NOTE(review): this changes the features from the raw flattened tracks to
# per-track means — confirm this summary is acceptable for the analysis.
predictions = []
for seq in tqdm(input_seqs, desc="Predicting sequences"):
    pred = model.predict_on_seqs([seq], device="cpu")
    predictions.append(_summarize_prediction(pred))

# Stack the per-sequence rows into a single (n_sequences, n_features) array.
predictions = np.vstack(predictions)

# Borzoi outputs predictions (e.g., probability tracks); use these as features.
# The reshape is a no-op for 2-D input but keeps the (n_samples, n_features)
# contract that the Random Forest step relies on.
predictions_flat = predictions.reshape(predictions.shape[0], -1)

# Build the design matrix from the Borzoi-derived features and the target
# vector from the labels, encoded as 0 for 'faulty' and 1 for 'good'.
X = pd.DataFrame(predictions_flat)
label_codes = {'faulty': 0, 'good': 1}
y = data['label'].map(label_codes)

# Hold out 20% of the rows for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree Random Forest on the training split.
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out split and report plain accuracy.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

# Persist the fitted classifier so it can be reloaded later.
joblib.dump(clf, 'random_forest_alzheimer_model.pkl')

Padding sequences: 100%|██████████| 7116/7116 [00:05<00:00, 1359.17it/s]


Running Borzoi model predictions...


Predicting sequences:   0%|          | 0/7116 [00:00<?, ?it/s]