In [None]:
# Environment setup: install the analysis/plotting packages, import them, and
# make /content the working directory for the rest of the notebook.
# NOTE(review): none of the installs are version-pinned, so a re-run may
# resolve different package versions than the original run did.

# Install dependencies
!pip install matplotlib pandas seaborn scikit-learn openpyxl
!pip install biopython

# Required imports
# NOTE(review): among the cells visible in this notebook only `pd` and `os`
# are referenced; SeqIO, gzip, plt, np, sns and roc_auc_score may serve cells
# that are not shown here — confirm before pruning.
from Bio import SeqIO
import gzip
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import seaborn as sns
from sklearn.metrics import roc_auc_score

# Set root path
# Later cells refer to /content/... explicitly (e.g. /content/evo2), and the
# relative `git clone` targets land wherever the working directory is, so the
# working directory is pinned to /content here.
os.chdir('/content')

In [None]:
# Clone the Evo2 source over HTTPS into ./evo2, relative to the current
# working directory.
# NOTE(review): git refuses to clone into an existing non-empty directory, so
# re-running this cell after a successful clone prints an error and leaves the
# existing checkout untouched.
!git clone https://github.com/ArcInstitute/evo2.git

In [None]:
# Alternative clone kept for reference (disabled): SSH URL, also fetching git
# submodules. It needs SSH credentials for GitHub.
#!git clone --recurse-submodules git@github.com:ArcInstitute/evo2.git
# Install Evo2 from the checkout at /content/evo2. The `cd` only applies to
# this one shell invocation; the kernel's working directory is not changed.
!cd /content/evo2 && pip install .

In [None]:
# Install the `vtx` package (unpinned).
# NOTE(review): presumably the inference backend Evo2 depends on — confirm,
# and pin a version once a known-good one is identified.
!pip install vtx

In [None]:
# Step 1: Uninstall existing flash-attn (ignore errors if not installed)
!pip uninstall flash-attn -y

# Step 2: Clone the Flash-Attention repo
!git clone https://github.com/Dao-AILab/flash-attention.git

# Step 3: Change directory and install flash-attn with no build isolation
%cd flash-attention
!pip install flash-attn . --no-build-isolation

In [None]:
!pip install transformer_engine[pytorch]==1.13

In [None]:
import pandas as pd
from evo2.models import Evo2
import csv

# Score every variant construct against the reference sequence with Evo2 and
# stream the results to /content/variant_scores.csv, one row per variant.
#
# Output columns: variant_name, ref_seq, mut_seq, ref_score, mut_score,
# delta_score (= mut_score - ref_score).

# Load input CSV
df = pd.read_csv('/content/full_constructsb.csv')

# Define columns: every column except the reference column and 'Unnamed: 0'
# is treated as a variant construct.
# NOTE(review): 'Unnamed: 0' is presumably a saved row index — confirm.
ref_col = 'Codon'
exclude_cols = ['Unnamed: 0', ref_col]
variant_cols = [col for col in df.columns if col not in exclude_cols]

# Reconstruct reference sequence by concatenating the column's cells in row
# order. This is deliberately left outside any error handling: without a
# reference sequence nothing below can be computed.
reference_seq = ''.join(df[ref_col])

# Load Evo2 model
model = Evo2("evo2_7b_base")

# Score reference sequence once; it is reused for every delta below.
ref_score = model.score_sequences([reference_seq])[0]

# Open output CSV and write header
output_path = '/content/variant_scores.csv'
with open(output_path, 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=[
        'variant_name', 'ref_seq', 'mut_seq', 'ref_score', 'mut_score', 'delta_score'
    ])
    writer.writeheader()

    # Score variants one by one
    for variant_col in variant_cols:
        try:
            # Build the sequence inside the try: a column containing a
            # missing or non-string cell makes str.join raise TypeError, and
            # that should skip this one variant (reported below) rather than
            # abort the run and lose every remaining variant.
            variant_seq = ''.join(df[variant_col])
            mut_score = model.score_sequences([variant_seq])[0]
            writer.writerow({
                'variant_name': variant_col,
                'ref_seq': reference_seq,
                'mut_seq': variant_seq,
                'ref_score': ref_score,
                'mut_score': mut_score,
                'delta_score': mut_score - ref_score
            })
            # Push the row to disk now so completed scores survive a later
            # crash or kernel interruption.
            f.flush()
        except Exception as e:
            # Best-effort: report the failing variant and continue.
            print(f"Error scoring {variant_col}: {e}")

In [None]:
from google.colab import files

# `result_df` is not created by any earlier cell, so it is loaded here from
# the CSV the scoring cell writes. `float_precision='round_trip'` makes the
# scores re-serialize to the same values they were written with.
result_df = pd.read_csv('/content/variant_scores.csv', float_precision='round_trip')

# Save your dataframe to CSV
result_df.to_csv('/content/result_df.csv', index=False)

# Download the file to your local machine
files.download('/content/result_df.csv')