In [2]:
import warnings
warnings.filterwarnings("ignore")
import torch
import numpy as np
import pandas as pd
# Location of the pretrained checkpoint.
# NOTE(review): absolute, machine-specific path; adjust it for your environment.
model_path="/Volumes/PortableSSD/megaDNA_phage_145M.pt"
nucleotides = ['**', 'A', 'T', 'C', 'G', '#'] # vocabulary
# Base -> token id; each id equals the base's position in `nucleotides`.
mnucle = {'A':1,'T':2,'C':3,'G':4}
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# The checkpoint holds a whole pickled model object (it is used directly as a
# module below), not a state_dict. PyTorch >= 2.6 defaults to weights_only=True,
# which refuses to unpickle such objects, so the flag is passed explicitly.
# SECURITY: this runs arbitrary pickle code; only load checkpoints you trust.
megadna = torch.load(model_path, map_location=torch.device(device), weights_only=False)
# Switch to inference mode (disables dropout and similar training-only layers).
megadna.eval()

def embeddings(seq):
    """Return the model embedding of one nucleotide sequence as a single NumPy row.

    Parameters
    ----------
    seq : str
        Sequence made only of the keys of ``mnucle`` (A, T, C, G). Any other
        character raises ``KeyError``.

    Returns
    -------
    numpy.ndarray
        The first element of the model's ``return_value='embedding'`` output,
        flattened to shape ``(1, -1)``.
    """
    # Token 0 is prepended and token 5 appended around the encoded bases.
    encoded_sequence = np.array([0] + [mnucle[c] for c in seq] + [5])
    # unsqueeze(0) adds the leading batch dimension (batch of one).
    input_seq = torch.tensor(encoded_sequence).unsqueeze(0).to(device)
    # Inference only: skip building the autograd graph, as get_loss_for_sequence does.
    with torch.no_grad():
        output = megadna(input_seq, return_value='embedding')
    return output[0].reshape((1, -1)).cpu().detach().numpy()

def embedding_batch(seqs):
    """Return model embeddings for several equal-length sequences, one row each.

    Parameters
    ----------
    seqs : sequence of str
        Non-empty collection of sequences made only of the keys of ``mnucle``
        (A, T, C, G). All sequences must have the same length because they are
        stacked into one rectangular tensor without padding.

    Returns
    -------
    numpy.ndarray
        The first element of the model's ``return_value='embedding'`` output,
        reshaped to ``(len(seqs), -1)``.

    Raises
    ------
    ValueError
        If ``seqs`` is empty or the sequences differ in length.
    KeyError
        If a sequence contains a character that is not in ``mnucle``.
    """
    if len(seqs) == 0:
        raise ValueError("embedding_batch requires at least one sequence")
    # A ragged batch cannot form a 2-D tensor; fail early with a clear message
    # instead of an obscure NumPy/torch conversion error.
    lengths = {len(seq) for seq in seqs}
    if len(lengths) > 1:
        raise ValueError(
            f"embedding_batch requires equal-length sequences, got lengths {sorted(lengths)}"
        )
    # Token 0 is prepended and token 5 appended around each encoded sequence.
    encoded_sequence = np.array([[0] + [mnucle[c] for c in seq] + [5] for seq in seqs])
    input_seq = torch.tensor(encoded_sequence).to(device)
    # Inference only: skip building the autograd graph, as get_loss_for_sequence does.
    with torch.no_grad():
        output = megadna(input_seq, return_value='embedding')
    return output[0].reshape((len(seqs), -1)).cpu().detach().numpy()

def get_loss_for_sequence(sequence):
    """Compute the model's loss on a single nucleotide sequence.

    The sequence is encoded with ``mnucle``, wrapped with token 0 at the start
    and token 5 at the end, and sent through the model as a batch of one with
    ``return_value='loss'``. The result is moved to the CPU and returned as a
    NumPy value. Characters missing from ``mnucle`` raise ``KeyError``.
    """
    token_ids = [0]
    token_ids.extend(mnucle[base] for base in sequence)
    token_ids.append(5)
    batch = torch.tensor(np.array(token_ids)).unsqueeze(0).to(device)
    # No gradients are needed to evaluate the loss.
    with torch.no_grad():
        return megadna(batch, return_value='loss').cpu().numpy()


# Smoke test: run one short sequence through both helpers and display the
# embedding shape alongside the loss.
seqs = ["ATCCCGGGTGAGGCATCCCACCATCCTC"]
demo_embedding = embeddings(seqs[0])
demo_loss = get_loss_for_sequence(seqs[0])
(demo_embedding.shape, demo_loss)  # embedding shape should be (1, 1024)

((1, 1024), array(1.6882224, dtype=float32))

In [None]:
import tqdm
import pandas as pd
import scipy.stats as stats

# Load the Day-8 count table and derive the per-sequence columns in one chain:
#   seq       - copy of dna_seq
#   abs_score - rna_counts / plasmid_counts
#   score     - natural log of abs_score
#   length    - sequence length
#   ttype     - True when the sequence is exactly 118 characters long
#               (NOTE(review): the meaning of 118 is not shown here)
df = pd.read_csv("./DNA_RNA_Day8.csv").assign(
    seq=lambda frame: frame["dna_seq"],
    abs_score=lambda frame: frame["rna_counts"] / frame["plasmid_counts"],
    score=lambda frame: np.log(frame["abs_score"]),
    length=lambda frame: frame["seq"].apply(len),
    ttype=lambda frame: frame["seq"].apply(lambda s: len(s) == 118),
)

# One model forward pass per sequence; .tolist() turns each result into a plain float.
losses = []
for sequence in tqdm.tqdm(df["seq"].values):
    losses.append(get_loss_for_sequence(sequence).tolist())
df["loss"] = losses


def r2(x,y):
    """Return the squared correlation coefficient of a linear fit of ``y`` on ``x``.

    Uses ``scipy.stats.linregress`` and squares its ``rvalue``.
    """
    fit = stats.linregress(x, y)
    return fit.rvalue ** 2

s1, s2 = df["loss"],df["score"]
# score is log(rna_counts / plasmid_counts): a zero count on either side makes
# it -inf, +inf or NaN, which turns the statistics into NaN (or makes pearsonr
# raise on some SciPy versions). Only finite pairs are used for the fit.
finite = np.isfinite(s1) & np.isfinite(s2)
r = r2(s1[finite], s2[finite])
pr =  stats.pearsonr(s1[finite], s2[finite])
print('loss vs score: r-squared = ', r, "pearsonR = ", pr[0])

In [None]:
import matplotlib.pyplot as plt
tdf = df
plt.figure(figsize=(8, 8))
# "score" is already log(rna_counts / plasmid_counts). Taking np.log of it again
# yields NaN for every negative score and silently drops those points, so it is
# plotted as-is. The x axis remains the natural log of the loss.
plt.scatter(np.log(tdf["loss"]), tdf["score"], marker='o')
plt.xlabel("loss")
plt.ylabel("score")
plt.show()

In [None]:
import scipy.stats as stats
from sklearn import preprocessing

def r2(x,y):
    """Squared correlation coefficient from a least-squares line through (x, y).

    NOTE(review): an identical ``r2`` is defined in an earlier cell; this one
    replaces it when the cell runs. Consider keeping a single definition.
    """
    # Index 2 of the linregress result is the correlation coefficient.
    correlation = stats.linregress(x, y)[2]
    return correlation ** 2

datadir="./data"


def _load_day_scores(csv_path, score_name):
    """Read one count table and return a two-column frame: ``seq`` and its log score.

    ``seq`` is a copy of ``dna_seq``; the score is
    ``log(rna_counts / plasmid_counts)`` and is stored under ``score_name``.
    """
    counts = pd.read_csv(csv_path)
    counts["seq"] = counts["dna_seq"]
    counts["abs_score"] = counts["rna_counts"] / counts["plasmid_counts"]
    counts["score"] = np.log(counts["abs_score"])
    return counts[["seq", "score"]].rename(columns={"score": score_name})


day5 = _load_day_scores(f"{datadir}/DNA_RNA_Day5.csv", "score_day5")
day8 = _load_day_scores(f"{datadir}/DNA_RNA_Day8.csv", "score_day8")
# pd.merge defaults to an inner join: only sequences present on both days remain.
df = pd.merge(day5,day8,on="seq")

# Zero counts make a log score -inf, +inf or NaN, which would turn the
# statistics into NaN (or make pearsonr raise on some SciPy versions).
# Only pairs that are finite on both days are compared.
finite = np.isfinite(df["score_day5"]) & np.isfinite(df["score_day8"])
r = r2(df.loc[finite, "score_day8"], df.loc[finite, "score_day5"])
pr =  stats.pearsonr(df.loc[finite, "score_day8"], df.loc[finite, "score_day5"])
print('test day5, day8 r-squared = ', r, "pearsonR = ", pr[0])

# Compare the rna1 and rna2 columns of the Day-8 table on a log scale.
df = pd.read_csv(f"{datadir}/DNA_RNA_Day8.csv")
# Keep rows with more than 20 plasmid counts.
# NOTE(review): the rationale for the threshold 20 is not shown here.
df=df[df["plasmid_counts"]>20]
# Both columns must be strictly positive so their logs are finite.
df = df[(df["rna1"]>0)&(df["rna2"]>0)]
rna1, rna2 = np.log(df["rna1"]),np.log(df["rna2"])
r = r2(rna1, rna2)
pr =  stats.pearsonr(rna1, rna2)
# The data loaded above is the Day-8 file, so the label says day8 (it used to
# say day5). NOTE(review): if Day-5 was intended, change the file path instead.
print('test day8 rna1&2 r-squared = ', r, "pearsonR = ", pr[0])

In [None]:
import matplotlib.pyplot as plt
# Scatter of the two RNA count columns of the current `df`; both axes show
# natural-log values.
tdf = df

fig, ax = plt.subplots(figsize=(8, 8))
log_rna1 = np.log(tdf["rna1"])
log_rna2 = np.log(tdf["rna2"])
ax.scatter(log_rna1, log_rna2, marker='o')
ax.set_xlabel("rna1")
ax.set_ylabel("rna2")
plt.show()