In [1]:
# using Kernel PyTorch-1.10
# using Kernel PyTorch-1.10
# using Kernel PyTorch-1.10

from gpn.data import GenomeMSA #, Tokenizer
import gpn.model

from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder, StandardScaler
from transformers import AutoModel #, AutoModelForMaskedLM
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import csv
import warnings
warnings.filterwarnings('ignore')

%run preprocess_utility.py

device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [3]:
model_path = "songlab/gpn-msa-sapiens"
msa_path = "zip:///::/home/sunhuaikuan/ondemand/blue_gpn/examples/msa/89.zarr.zip"
genome_msa = GenomeMSA(msa_path)
model = AutoModel.from_pretrained(model_path).to(device)
model.eval();

Loading MSA...
Loading MSA... Done


Some weights of the model checkpoint at songlab/gpn-msa-sapiens were not used when initializing GPNRoFormerModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing GPNRoFormerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPNRoFormerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Main Function to get Embedding

In [3]:
comp = {'A':1, 'C':2, 'G':3, 'T':4}

max_seqlen=128

def Genosome2Embedding(chrom, pos_start, pos_end, rowid, y):

    msa = genome_msa.get_msa(str(chrom), pos_start, pos_end, strand="+", tokenize=True)
    
    msa = torch.tensor(np.expand_dims(msa, 0).astype(np.int64))
    
    # separating human from rest of species
    input_ids, aux_features = msa[:, :, 0], msa[:, :, 1:]
    
    input_ids = input_ids.to(device)
    aux_features = aux_features.to(device)


    with torch.no_grad():
        last_hidden_state = model(input_ids=input_ids, aux_features=aux_features).last_hidden_state
        
        # Mean Pooling: Compute the mean across the sequence length (dim=1)
        mean_pooled = last_hidden_state.mean(dim=1)  # Shape: (batch_size, embedding_dim)

    feature=np.append(mean_pooled.cpu().numpy(),  [rowid, y]) # chrom, pos_end-pos_start,
    
    return feature

### Output CSV File

In [5]:
def output2CSV(df, csv_Filename):

    if os.path.exists(csv_Filename):
        os.remove(csv_Filename)

    rows=[]
    for index, row in df.iterrows():
        
        chrom=row['CHROM']
        pos_start=row['START']
        pos_end=row['END']
        rowid=row['ROWID']
        y=row['y']
        try:
            embedding  =  Genosome2Embedding(chrom, pos_start, pos_end, rowid, y)
            rows.append(embedding)
    
        except Exception as e:
            print(f"exception caught: {e}"+str(row['CHROM'])+'-'+str(row['START']))
    

        if ((index % 1000) ==0):
            with open(csv_Filename, mode='a', newline='') as file:
                writer = csv.writer(file)
                for row in rows:
                    writer.writerow(row)
            rows=[]
            # progress_bar.update(1)
            print(f"complete index={index}")

    
    with open(csv_Filename, mode='a', newline='') as file:
        writer = csv.writer(file)
        for row in rows:
            writer.writerow(row)

    print(f"Create File: "+csv_Filename)


In [21]:
datafile_path = '../../datasets/task03-genomic-regions/Homo_sapiens.GRCh38.109.txt.gz'  
df = preprocess_home_sapiens_datafile(datafile_path)
df
pd.set_option('display.max_rows', None)
df[['CLUSTER']].value_counts().sort_index(level='CLUSTER').head(100)

CLUSTER              
first_exon               4926
first_five_prime_UTR     4952
first_intron             4985
first_three_prime_UTR    4971
ncRNA_gene               4975
pseudogene               4998
smallRNA                 3701
dtype: int64

In [27]:
df[df['CLUSTER']=='first_intron']

Unnamed: 0,CHROM,START,END,TYPE,CLUSTER,SIZE,ROWID,y
5000,16,57372489,57372639,first_intron,first_intron,150,5000,1
5001,18,57586553,57586703,first_intron,first_intron,150,5001,1
5002,20,63707659,63707809,first_intron,first_intron,150,5002,1
5003,X,75522888,75523038,first_intron,first_intron,150,5003,1
5004,16,84819984,84820134,first_intron,first_intron,150,5004,1
5005,19,13150138,13150288,first_intron,first_intron,150,5005,1
5006,19,29812003,29812153,first_intron,first_intron,150,5006,1
5007,19,11355423,11355573,first_intron,first_intron,150,5007,1
5008,19,18683679,18683829,first_intron,first_intron,150,5008,1
5009,12,123268168,123268318,first_intron,first_intron,150,5009,1


### Load Homo_Sapiens data

In [4]:
datafile_path = '../../datasets/task03-genomic-regions/Homo_sapiens.GRCh38.109.txt.gz'  
df = preprocess_home_sapiens_datafile(datafile_path)

df.loc[df['SIZE'] > max_seqlen, 'END'] = df['START'] + max_seqlen
df.loc[df['SIZE'] > max_seqlen, 'SIZE'] = max_seqlen

df=df.drop(columns=['TYPE','CLUSTER'])
df

Unnamed: 0,CHROM,START,END,SIZE,ROWID,y
0,7,116953541,116953669,128,0,0
1,12,54241755,54241883,128,1,0
2,20,38033948,38034076,128,2,0
3,15,82659560,82659688,128,3,0
4,9,131579497,131579625,128,4,0
...,...,...,...,...,...,...
33713,9,124692442,124692551,109,33713,6
33714,9,128244721,128244830,109,33714,6
33715,X,45746157,45746266,109,33715,6
33716,X,45747015,45747124,109,33716,6


In [7]:
%%time
output2CSV(df,'./homo_sapiens_gpn_embedding.csv')

complete index=0
complete index=1000
complete index=2000
complete index=3000
complete index=4000
complete index=5000
complete index=6000
complete index=7000
complete index=8000
complete index=9000
complete index=10000
complete index=11000
complete index=12000
complete index=13000
complete index=14000
complete index=15000
complete index=16000
complete index=17000
complete index=18000
complete index=19000
complete index=20000
complete index=21000
complete index=22000
complete index=23000
complete index=24000
complete index=25000
complete index=26000
complete index=27000
complete index=28000
complete index=29000
complete index=30000
complete index=31000
complete index=32000
complete index=33000
Create File: ./homo_sapiens_gpn_embedding.csv
CPU times: user 6min 56s, sys: 1min 25s, total: 8min 22s
Wall time: 35min 52s


### Load CSV File

In [8]:
def load_embedding_file(csv_filename):

    df=pd.read_csv(csv_filename)
    
    column_names = [f'{i}' for i in range(1, df.shape[1]-1)]
    column_names.extend(['ROWID', 'y']) 
    
    df.columns = column_names
    return df

df = load_embedding_file('./homo_sapiens_gpn_embedding.csv')
df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,761,762,763,764,765,766,767,768,ROWID,y
0,-0.444453,0.829066,0.096076,-0.693920,1.323465,-0.858230,-0.294933,-0.434608,-0.119355,-0.269585,...,-0.447579,-0.130508,0.456571,-0.614223,-0.021212,-0.397703,0.116457,-0.461298,1.0,0.0
1,0.272819,0.866934,0.527317,-0.374678,0.796706,-0.783266,-0.135167,-0.243605,0.309559,0.478524,...,-0.467666,-0.484659,0.188427,-0.111583,-0.280430,-0.540670,-0.369964,0.163323,2.0,0.0
2,0.002897,0.300540,0.363745,0.083066,0.097976,-0.705333,-0.345298,-0.162139,0.272691,0.111771,...,-0.375416,-0.182703,-0.326266,-0.158091,-0.380751,-0.772700,0.031157,0.344509,3.0,0.0
3,-0.497240,0.093710,0.335565,-0.105232,1.134319,-0.539970,-0.541573,-0.415931,-0.286506,0.133382,...,-0.464082,-0.365336,-0.058795,-0.306530,-0.313155,-0.506973,0.349965,-0.236048,4.0,0.0
4,0.285708,0.275770,0.598883,0.461520,0.081254,-1.124011,-0.944545,-0.015475,0.940282,0.396197,...,-0.210003,-0.503278,-0.084990,0.244842,-0.982839,-0.535660,-0.367225,0.574554,5.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33502,0.021807,0.796965,0.068293,-0.237650,0.083819,-1.799712,-0.451906,0.308115,-0.011921,0.049140,...,-0.217358,-0.197091,0.100965,-0.055664,-0.483156,-0.748016,-0.219737,0.361740,33713.0,6.0
33503,-0.299236,1.391922,0.650065,-0.218609,0.883092,-1.781720,-0.375446,0.310933,0.104638,0.311812,...,-0.361967,-0.592416,0.255129,-0.420886,-0.484935,-0.410109,-0.289352,0.262789,33714.0,6.0
33504,0.068219,1.491318,0.323441,-0.304896,0.715240,-2.110754,-0.510900,0.210548,0.308247,-0.155464,...,0.065997,-0.275441,-0.002474,-0.374743,-1.009526,-0.549350,-0.354674,0.503089,33715.0,6.0
33505,0.238253,1.167527,0.303864,-0.263249,0.213139,-1.957132,-0.377137,0.443044,0.325981,-0.091771,...,0.124397,-0.317400,0.065623,-0.214288,-0.857551,-0.598479,-0.507455,0.492559,33716.0,6.0
