# Transformer Testing

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from scipy.cluster.hierarchy import linkage, dendrogram, leaves_list
from scipy.spatial.distance import squareform

In [2]:
data_path = "./data/data_with_human_TE_cellline_all_plain.csv"
df = pd.read_csv(data_path, delimiter="\t")
df.head()

Unnamed: 0,SYMBOL,transcript_id,gene_id,tx_size,utr5_size,cds_size,utr3_size,tx_sequence,bio_source_108T,bio_source_12T,...,struct_max_stem_len_UTR5,struct_max_loop_len_UTR5,struct_min_dG_CDS,struct_n_hairpins_CDS,struct_n_bifurc_CDS,struct_n_bulges_CDS,struct_start_stem_CDS,struct_max_stem_len_CDS,struct_max_loop_len_CDS,fold
0,SAMD11,ENST00000342066.8,ENSG00000187634.12,2557,90,2046,421,GCAGAGCCCAGCAGATCCCTGCGGCGTTCGCGAGGGTGGGACGGGA...,-3.644472,0.98672,...,10.0,5.0,-13.9,2.0,1.0,0.0,9.0,4.0,4.0,4
1,NOC2L,ENST00000327044.7,ENSG00000188976.11,2757,16,2250,491,GCTTCGGGTTGGTGTCATGGCAGCTGCGGGGAGCCGCAAGAGGCGC...,1.06019,0.701399,...,10.0,6.0,-24.5,1.0,0.0,0.0,21.0,10.0,6.0,8
2,KLHL17,ENST00000338591.8,ENSG00000187961.14,2567,110,1929,528,GGGAGTGAGCGACACAGAGCGGGCCGCCACCGCCGAGCAGCCCTCC...,-1.198005,-1.178952,...,10.0,4.0,-23.0,1.0,1.0,1.0,3.0,9.0,5.0,9
3,HES4,ENST00000304952.11,ENSG00000188290.11,885,124,666,95,GCGGGCCTGGAGCCGGGATCCGCCCTAGGGGCTCGGATCGCCGCGC...,-1.1074,0.158079,...,16.0,3.0,-24.8,1.0,0.0,0.0,3.0,11.0,3.0,7
4,ISG15,ENST00000649529.1,ENSG00000187608.10,637,77,498,62,GGCGGCTGAGAGGCAGCGAACTCATCTTTGCCAGTACAGGAGCTTG...,0.631561,2.013887,...,8.0,5.0,-28.6,1.0,0.0,2.0,2.0,13.0,5.0,2


In [19]:
# print different lengths of the tx_sequence column
seq_lengths = df['tx_sequence'].apply(len).value_counts().sort_index()
#print the number of sequences greater than 1000
print("Number of sequences greater than 1000:", len(seq_lengths[seq_lengths.index > 1000]))
#print the number of sequences less than 5000
print("Number of sequences greater than 10000:", len(seq_lengths[seq_lengths.index < 5000]))

Number of sequences greater than 1000: 5588
Number of sequences greater than 10000: 3661


In [3]:
#number of unique cell lines -- number of columns that have "bio_source" in the name

print("Number of columns: ", len(df.columns))
print("Number of rows: ", len(df))

na_rows = df[df.isna().any(axis=1)]
print("Number of rows that have NA: ", len(na_rows))

bio_source_cols = [col for col in df.columns if 'bio_source' in col]
print(f"Number of unique human cell lines: {len(bio_source_cols)}")

Number of columns:  102
Number of rows:  11153
Number of rows that have NA:  354
Number of unique human cell lines: 78


In [4]:
#fill in the null values
df = df.dropna()
na_rows = df[df.isna().any(axis=1)]
print("Number of rows that have NA: ", len(na_rows))
print("Number of rows: ", len(df))
df.head()


Number of rows that have NA:  0
Number of rows:  10799


Unnamed: 0,SYMBOL,transcript_id,gene_id,tx_size,utr5_size,cds_size,utr3_size,tx_sequence,bio_source_108T,bio_source_12T,...,struct_max_stem_len_UTR5,struct_max_loop_len_UTR5,struct_min_dG_CDS,struct_n_hairpins_CDS,struct_n_bifurc_CDS,struct_n_bulges_CDS,struct_start_stem_CDS,struct_max_stem_len_CDS,struct_max_loop_len_CDS,fold
0,SAMD11,ENST00000342066.8,ENSG00000187634.12,2557,90,2046,421,GCAGAGCCCAGCAGATCCCTGCGGCGTTCGCGAGGGTGGGACGGGA...,-3.644472,0.98672,...,10.0,5.0,-13.9,2.0,1.0,0.0,9.0,4.0,4.0,4
1,NOC2L,ENST00000327044.7,ENSG00000188976.11,2757,16,2250,491,GCTTCGGGTTGGTGTCATGGCAGCTGCGGGGAGCCGCAAGAGGCGC...,1.06019,0.701399,...,10.0,6.0,-24.5,1.0,0.0,0.0,21.0,10.0,6.0,8
2,KLHL17,ENST00000338591.8,ENSG00000187961.14,2567,110,1929,528,GGGAGTGAGCGACACAGAGCGGGCCGCCACCGCCGAGCAGCCCTCC...,-1.198005,-1.178952,...,10.0,4.0,-23.0,1.0,1.0,1.0,3.0,9.0,5.0,9
3,HES4,ENST00000304952.11,ENSG00000188290.11,885,124,666,95,GCGGGCCTGGAGCCGGGATCCGCCCTAGGGGCTCGGATCGCCGCGC...,-1.1074,0.158079,...,16.0,3.0,-24.8,1.0,0.0,0.0,3.0,11.0,3.0,7
4,ISG15,ENST00000649529.1,ENSG00000187608.10,637,77,498,62,GGCGGCTGAGAGGCAGCGAACTCATCTTTGCCAGTACAGGAGCTTG...,0.631561,2.013887,...,8.0,5.0,-28.6,1.0,0.0,2.0,2.0,13.0,5.0,2


In [5]:
bio_source_cols = [col for col in df.columns if 'bio_source' in col]
bio_source_df = df[bio_source_cols]
bio_source_df.columns = bio_source_df.columns.str.replace('bio_source_', '')

In [6]:
bio_source_df.head()

Unnamed: 0,108T,12T,A2780,A549,BJ,BRx-142,C643,CRL-1634,Calu-3,Cybrid Cells,...,human brain tumor,iPSC-differentiated dopamine neurons,megakaryocytes,muscle tissue,neuronal precursor cells,neurons,normal brain tissue,normal prostate,primary macrophages,skeletal muscle
0,-3.644472,0.98672,-1.055178,-1.233638,-0.960594,0.841014,-0.348092,-1.720267,-1.086043,-1.592667,...,-3.885123,-0.574646,-0.692016,-3.5017,-1.746209,-1.245533,-1.707111,-2.462252,0.452824,0.25038
1,1.06019,0.701399,0.369884,0.293364,0.843029,0.208226,2.440129,0.004327,-0.557032,0.182059,...,0.258688,0.31232,-0.094364,0.97606,0.427657,0.459249,0.359501,-0.12409,-0.296476,1.300391
2,-1.198005,-1.178952,-0.258502,-0.60009,-1.093266,0.861657,-0.905916,-2.232229,-2.671289,-1.684484,...,-4.129583,0.129749,-1.898114,-2.629844,-0.089178,-0.020237,-2.522979,-1.49552,-0.819205,0.09242
3,-1.1074,0.158079,0.136836,-0.295218,0.581416,2.500717,-0.178079,-0.364804,-1.00164,-0.293381,...,0.511196,-0.397284,-2.109207,-2.325545,0.547415,0.572689,0.590703,-1.732131,-1.060533,0.949251
4,0.631561,2.013887,0.868647,0.751229,1.285464,0.141643,3.157754,1.597229,0.370445,0.718374,...,1.023844,0.654432,0.843854,-0.018885,0.913957,1.403891,0.793824,0.201445,1.237714,1.672883


## Create Embeddings for the Transformer model

### Only considering sequences of length 500-1500 for CPU embedding attempt

In [20]:
import pandas as pd

# Filter sequences between 500 bp and 1500 bp
df_filtered = df[df['tx_sequence'].apply(len).between(500, 1500)].reset_index(drop=True)

print(f"Remaining sequences: {len(df_filtered)}")

Remaining sequences: 1267


### Try making 3-mers

In [21]:
# Function to turn a sequence into a list of 3-mers
def kmers(sequence, k=3):
    return [sequence[i:i+k] for i in range(len(sequence) - k + 1)]

# Apply kmer function
df_filtered['kmers'] = df_filtered['tx_sequence'].apply(lambda x: ' '.join(kmers(x, k=3)))

### Test TF-IDF vectorizer just to establish pipeline

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF vectorizer
vectorizer = TfidfVectorizer(analyzer='word', token_pattern=r'\S+', max_features=5000)

# Fit and transform
X_embed = vectorizer.fit_transform(df_filtered['kmers'])

print(f"Embedding shape: {X_embed.shape}")

Embedding shape: (1267, 64)
