# Human Genome Language Model

This notebook trains a language model on the human genome, using data prepared in the Data Processing notebook. The language model is based on the AWD-LSTM architecture. The genomic input is split into 5-mers with a stride of 2 bases between consecutive 5-mers. The model is trained to take an input sequence of 5-mers and predict the next 5-mer. This allows us to train a model that learns the structure of genomic information in a totally unsupervised way.

The base of the language model (token embedding + LSTM layers) will then be used to initialize a classification model.

For more detail on how genomic data is processed and how these language models are trained, see the following notebooks:

[E. coli 1 Naive Model](https://github.com/tejasvi/DNAish/blob/master/Bacteria/E.%20Coli/E.%20coli%201%20Naive%20Model.ipynb)

[E. coli 2 Genomic Pretraining](https://github.com/tejasvi/DNAish/blob/master/Bacteria/E.%20Coli/E.%20coli%202%20Genomic%20Pretraining.ipynb)

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai import *
from fastai.text import *
from Bio import Seq
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import FeatureLocation, CompoundLocation
import networkx as nx

In [3]:
sys.path.append("../../..")
from utils import *

In [4]:
# Root data directory: the genome CSV and the saved vocabulary are read from / written to
# this folder, and it is also handed to every `from_df` call below.
# NOTE(review): machine-specific absolute path — adjust for your environment.
path = Path('F:/genome/human/')

# Language Model

Tokenizing and processing the genomic text is memory intensive. I had to break it into chunks.

In [5]:
def get_init_vocab(ret='data'):
    """Build a language-model data bunch from the first 100,000 rows of the genome CSV.

    The rows are split in file order: the leading ~80% become the training frame and
    the remainder the validation frame. No vocabulary is passed to `from_df`, so the
    data bunch derives its own from the training data.

    Relies on the notebook globals `path` and `tok`; the cell defining `tok` must have
    been run before this function is called.

    Parameters
    ----------
    ret : str, default 'data'
        'data' returns the whole data bunch; any other value returns only its
        `vocab` attribute.

    Returns
    -------
    The `GenomicTextLMDataBunch`, or its `.vocab` when `ret != 'data'`.
    """
    # `nrows` reads the same leading rows as pulling the first chunk from a chunked
    # reader, but does not leave an open, half-consumed reader behind.
    df = pd.read_csv(path/'human_genome_data.csv', nrows=100000)

    cut = int(len(df)*0.8) + 1
    train_df = df[:cut]
    valid_df = df[cut:]
    data = GenomicTextLMDataBunch.from_df(path, train_df, valid_df, bs=400, tokenizer=tok,
                                  chunksize=10000, text_cols=0, label_cols=1)

    return data if ret == 'data' else data.vocab

In [9]:
# Tokenizer passed as `tokenizer=tok` to every data bunch in this notebook: built on
# GenomicTokenizer, single CPU, no pre/post rules, and 'xxpad' as the only special case.
tok = Tokenizer(GenomicTokenizer, n_cpus=1, pre_rules=[], post_rules=[], special_cases=['xxpad'])

In [7]:
# NOTE: despite its name, `init_voc` holds the full data bunch (get_init_vocab defaults
# to ret='data'); the vocabulary is reached through its `.vocab` attribute.
init_voc = get_init_vocab()
# Persist the vocab's `itos` sequence so the training cells can rebuild the same
# vocabulary later with np.load + GenomicVocab.
np.save(path/'human_vocab_5mer.npy', init_voc.vocab.itos)

In [10]:
# Model hyperparameters handed to get_model_LM when the learner is created.
config = {
    'emb_sz': 400,
    'n_hid': 1150,
    'n_layers': 3,
    'pad_token': 0,
    'qrnn': False,
    'output_p': 0.25,
    'hidden_p': 0.1,
    'input_p': 0.2,
    'embed_p': 0.02,
    'weight_p': 0.15,
    'tie_weights': True,
    'out_bias': True,
}

In [16]:
# First training pass. The genome CSV is streamed in 100,000-row chunks and the learner
# is fit for one cycle per chunk; the peak learning rate is divided by 1.5 for every
# chunk processed (lr/1.5 on the first chunk, lr/1.5**2 on the second, and so on).
drop_mult = 0.2
lr = 2e-2
count = 1
df_chunks = pd.read_csv(path/'human_genome_data.csv', chunksize=100000)
# Arguments common to every data bunch built in this cell.
bunch_kwargs = dict(bs=400, tokenizer=tok, chunksize=10000, text_cols=0, label_cols=1)

for df in df_chunks:

    if count > 1:
        # Later chunks: train on the whole chunk, validate on the first chunk's hold-out.
        data = GenomicTextLMDataBunch.from_df(path, df, valid_df, vocab=model_vocab, **bunch_kwargs)

    else:
        # First chunk: restore the saved vocabulary, keep the trailing ~20% of rows as the
        # validation set reused for the rest of the pass, and create the learner.
        voc = np.load(path/'human_vocab_5mer.npy')
        model_vocab = GenomicVocab(voc)

        cut = int(len(df)*0.8) + 1
        train_df, valid_df = df[:cut], df[cut:]
        data = GenomicTextLMDataBunch.from_df(path, train_df, valid_df, vocab=model_vocab, **bunch_kwargs)
        learn = get_model_LM(data, drop_mult, config)

    # Point the learner at the current chunk before fitting.
    learn.data = data
    lr_iter = lr / (1.5 ** count)
    print(f'Learning Rate: {lr_iter}')
    learn.fit_one_cycle(1, lr_iter, moms=(0.8,0.7), pct_start=0.4)
    count += 1

# Checkpoint the full model and, separately, its encoder.
learn.save('human_genome_full')
learn.save_encoder('human_genome_full_enc')

Learning Rate: 0.013333333333333334


epoch,train_loss,valid_loss,accuracy
1,2.337749,2.394770,0.221610


Learning Rate: 0.008888888888888889


epoch,train_loss,valid_loss,accuracy
1,2.352073,2.358909,0.234007


Learning Rate: 0.005925925925925926


epoch,train_loss,valid_loss,accuracy
1,2.323168,2.340079,0.240859


Learning Rate: 0.003950617283950617


epoch,train_loss,valid_loss,accuracy
1,2.319729,2.324328,0.246606


Learning Rate: 0.0026337448559670784


epoch,train_loss,valid_loss,accuracy
1,2.274003,2.314659,0.250099


Learning Rate: 0.0017558299039780521


epoch,train_loss,valid_loss,accuracy
1,2.284473,2.307107,0.253027


Learning Rate: 0.0011705532693187014


epoch,train_loss,valid_loss,accuracy
1,2.242174,2.303578,0.254368


Learning Rate: 0.0007803688462124676


epoch,train_loss,valid_loss,accuracy
1,2.247751,2.299773,0.255692


Learning Rate: 0.0005202458974749785


epoch,train_loss,valid_loss,accuracy
1,2.275886,2.297826,0.256589


Learning Rate: 0.00034683059831665226


epoch,train_loss,valid_loss,accuracy
1,2.211897,2.295877,0.257219


Learning Rate: 0.0002312203988777682


epoch,train_loss,valid_loss,accuracy
1,2.254941,2.294839,0.257583


Learning Rate: 0.00015414693258517878


epoch,train_loss,valid_loss,accuracy
1,2.229172,2.295044,0.257711


Learning Rate: 0.00010276462172345253


epoch,train_loss,valid_loss,accuracy
1,2.123421,2.295023,0.257664


Learning Rate: 6.850974781563502e-05


epoch,train_loss,valid_loss,accuracy
1,2.220374,2.294481,0.257942


Learning Rate: 4.5673165210423346e-05


epoch,train_loss,valid_loss,accuracy
1,2.112300,2.294044,0.258017


In [23]:
# Second training pass over the same CSV, continuing with the `learn` object left by the
# previous training cell. Same chunk-by-chunk schedule, starting from a lower base
# learning rate that is divided by 1.5 for every chunk processed.
lr = 1e-2
count = 1
df_chunks = pd.read_csv(path/'human_genome_data.csv', chunksize=100000)
# Arguments common to every data bunch built in this cell.
bunch_kwargs = dict(bs=400, tokenizer=tok, chunksize=10000, text_cols=0, label_cols=1)

for df in df_chunks:

    if count > 1:
        # Later chunks: train on the whole chunk, validate on the first chunk's hold-out.
        data = GenomicTextLMDataBunch.from_df(path, df, valid_df, vocab=model_vocab, **bunch_kwargs)

    else:
        # First chunk: restore the saved vocabulary and keep the trailing ~20% of rows as
        # the validation set reused for the rest of the pass.
        voc = np.load(path/'human_vocab_5mer.npy')
        model_vocab = GenomicVocab(voc)

        cut = int(len(df)*0.8) + 1
        train_df, valid_df = df[:cut], df[cut:]
        data = GenomicTextLMDataBunch.from_df(path, train_df, valid_df, vocab=model_vocab, **bunch_kwargs)

    # Point the existing learner at the current chunk before fitting.
    learn.data = data
    lr_iter = lr / (1.5 ** count)
    print(f'Learning Rate: {lr_iter}')
    learn.fit_one_cycle(1, lr_iter, moms=(0.8,0.7), pct_start=0.4)
    count += 1

# Checkpoint the full model and, separately, its encoder.
learn.save('human_genome_full2')
learn.save_encoder('human_genome_full_enc2')

Learning Rate: 0.006666666666666667


epoch,train_loss,valid_loss,accuracy
1,2.243383,2.298292,0.256870


Learning Rate: 0.0044444444444444444


epoch,train_loss,valid_loss,accuracy
1,2.277259,2.285290,0.261625


Learning Rate: 0.002962962962962963


epoch,train_loss,valid_loss,accuracy
1,2.250492,2.276577,0.264885


Learning Rate: 0.0019753086419753087


epoch,train_loss,valid_loss,accuracy
1,2.259301,2.270150,0.267150


Learning Rate: 0.0013168724279835392


epoch,train_loss,valid_loss,accuracy
1,2.227532,2.266340,0.268569


Learning Rate: 0.0008779149519890261


epoch,train_loss,valid_loss,accuracy
1,2.262399,2.263004,0.269876


Learning Rate: 0.0005852766346593507


epoch,train_loss,valid_loss,accuracy
1,2.238797,2.261682,0.270354


Learning Rate: 0.0003901844231062338


epoch,train_loss,valid_loss,accuracy
1,2.209918,2.259972,0.270942


Learning Rate: 0.00026012294873748923


epoch,train_loss,valid_loss,accuracy
1,2.231220,2.259232,0.271323


Learning Rate: 0.00017341529915832613


epoch,train_loss,valid_loss,accuracy
1,2.173658,2.258366,0.271563


Learning Rate: 0.0001156101994388841


epoch,train_loss,valid_loss,accuracy
1,2.240291,2.258017,0.271673


Learning Rate: 7.707346629258939e-05


epoch,train_loss,valid_loss,accuracy
1,2.207231,2.258447,0.271728


Learning Rate: 5.1382310861726263e-05


epoch,train_loss,valid_loss,accuracy
1,2.107263,2.258441,0.271667


Learning Rate: 3.425487390781751e-05


epoch,train_loss,valid_loss,accuracy
1,2.201458,2.258209,0.271785


Learning Rate: 2.2836582605211673e-05


epoch,train_loss,valid_loss,accuracy
1,2.069442,2.257890,0.271840


In [24]:
# Third training pass over the same CSV, continuing with the `learn` object left by the
# previous training cell. The file is streamed in 100,000-row chunks; the learner is fit
# for one cycle per chunk with the peak learning rate divided by 1.5 for every chunk.
lr = 5e-3
count = 1
df_chunks = pd.read_csv(path/'human_genome_data.csv', chunksize=100000)
for df in df_chunks:

    if count == 1:
        # First chunk: restore the saved vocabulary and keep the trailing ~20% of rows as
        # the validation set reused for the rest of the pass.
        voc = np.load(path/'human_vocab_5mer.npy')
        model_vocab = GenomicVocab(voc)

        cut = int(len(df)*0.8) + 1
        train_df = df[:cut]
        valid_df = df[cut:]
        # Pass the saved vocabulary here as well (it was previously omitted for this call
        # only), so the first chunk is numericalised with the same token ids as every
        # other chunk and as the vocabulary the learner was created with.
        data = GenomicTextLMDataBunch.from_df(path, train_df, valid_df, bs=400, tokenizer=tok, vocab=model_vocab,
                                      chunksize=10000, text_cols=0, label_cols=1)

    else:
        # Later chunks: train on the whole chunk, validate on the first chunk's hold-out.
        data = GenomicTextLMDataBunch.from_df(path, df, valid_df, bs=400, tokenizer=tok,
                                      chunksize=10000, text_cols=0, label_cols=1, vocab=model_vocab)

    # Point the existing learner at the current chunk before fitting.
    learn.data = data
    lr_iter = lr/(1.5**count)
    print(f'Learning Rate: {lr_iter}')
    learn.fit_one_cycle(1, lr_iter, moms=(0.8,0.7), pct_start=0.4)
    count += 1

# Checkpoint the full model and, separately, its encoder.
learn.save('human_genome_full3')
learn.save_encoder('human_genome_full_enc3')

Learning Rate: 0.0033333333333333335


epoch,train_loss,valid_loss,accuracy
1,2.191198,2.260594,0.271027


Learning Rate: 0.0022222222222222222


epoch,train_loss,valid_loss,accuracy
1,2.246105,2.255090,0.272999


Learning Rate: 0.0014814814814814814


epoch,train_loss,valid_loss,accuracy
1,2.248405,2.251490,0.274273


Learning Rate: 0.0009876543209876543


epoch,train_loss,valid_loss,accuracy
1,2.204687,2.248657,0.275271


Learning Rate: 0.0006584362139917696


epoch,train_loss,valid_loss,accuracy
1,2.225902,2.247021,0.275781


Learning Rate: 0.00043895747599451303


epoch,train_loss,valid_loss,accuracy
1,2.240495,2.245379,0.276476


Learning Rate: 0.00029263831732967535


epoch,train_loss,valid_loss,accuracy
1,2.213612,2.244945,0.276673


Learning Rate: 0.0001950922115531169


epoch,train_loss,valid_loss,accuracy
1,2.200437,2.244047,0.276906


Learning Rate: 0.00013006147436874462


epoch,train_loss,valid_loss,accuracy
1,2.244316,2.243793,0.277127


Learning Rate: 8.670764957916306e-05


epoch,train_loss,valid_loss,accuracy
1,2.163404,2.243298,0.277244


Learning Rate: 5.780509971944205e-05


epoch,train_loss,valid_loss,accuracy
1,2.215231,2.243161,0.277257


Learning Rate: 3.8536733146294696e-05


epoch,train_loss,valid_loss,accuracy
1,2.200229,2.243621,0.277245


Learning Rate: 2.5691155430863132e-05


epoch,train_loss,valid_loss,accuracy
1,2.075210,2.243615,0.277204


Learning Rate: 1.7127436953908756e-05


epoch,train_loss,valid_loss,accuracy
1,2.165816,2.243500,0.277280


Learning Rate: 1.1418291302605837e-05


epoch,train_loss,valid_loss,accuracy
1,2.052325,2.243284,0.277236


In [14]:
# Reload the vocabulary saved earlier in this notebook and wrap it in GenomicVocab so it
# can be passed as `vocab=` to the data bunches built in the training loop below.
voc = np.load(path/'human_vocab_5mer.npy')
model_vocab = GenomicVocab(voc)

In [15]:
# Chunked reader over 'human_genome_data_fa.csv', yielding 250,000-row DataFrames.
# It is a one-shot iterator consumed by the training loop below: re-run this cell before
# re-running that loop, otherwise the loop body will not execute.
df_chunks = pd.read_csv(path/'human_genome_data_fa.csv', chunksize=250000)

In [20]:
# Model hyperparameters handed to get_model_LM when the learner is re-created below.
config = {
    'emb_sz': 400,
    'n_hid': 1150,
    'n_layers': 3,
    'pad_token': 0,
    'qrnn': False,
    'output_p': 0.25,
    'hidden_p': 0.1,
    'input_p': 0.2,
    'embed_p': 0.02,
    'weight_p': 0.15,
    'tie_weights': True,
    'out_bias': True,
}

In [21]:
# Tokenizer passed as `tokenizer=tok` to the data bunches below: built on GenomicTokenizer,
# single CPU, no pre/post rules, and 'xxpad' as the only special case.
tok = Tokenizer(GenomicTokenizer, n_cpus=1, pre_rules=[], post_rules=[], special_cases=['xxpad'])

In [22]:
# Fourth training pass, this time over the chunks of `df_chunks` created in an earlier
# cell. Two cycles are fit per chunk; the peak learning rate starts at `lr` and is divided
# by 1.5 for each further chunk (count starts at 0 here).
lr = 1e-3
count = 0
drop_mult = 0.3
# Arguments common to every data bunch built in this cell.
bunch_kwargs = dict(bs=400, tokenizer=tok, chunksize=10000,
                    text_cols=0, label_cols=1, vocab=model_vocab)

for df in df_chunks:
    if count > 0:
        # Later chunks: train on the whole chunk, validate on the first chunk's hold-out.
        data = GenomicTextLMDataBunch.from_df(path, df, valid_df, **bunch_kwargs)

    else:
        # First chunk: hold out rows 80000-99999 for validation and train on the rest.
        valid_df = df[80000:100000]
        held_out = df.index.isin(valid_df.index)
        df = df[~held_out]
        data = GenomicTextLMDataBunch.from_df(path, df, valid_df, **bunch_kwargs)

        # Re-create the learner and resume from the weights saved as 'human_genome_full3'.
        learn = get_model_LM(data, drop_mult, config)
        learn.load('human_genome_full3');

    # Point the learner at the current chunk before fitting.
    learn.data = data
    lr_iter = lr / (1.5 ** count)
    print(f'Learning Rate: {lr_iter}')
    learn.fit_one_cycle(2, lr_iter, moms=(0.8,0.7), pct_start=0.4)
    count += 1

# Checkpoint the full model and, separately, its encoder.
learn.save('human_genome_full4')
learn.save_encoder('human_genome_full_enc4')

Learning Rate: 0.001


epoch,train_loss,valid_loss,accuracy
1,2.216854,2.251876,0.275347
2,2.202513,2.245164,0.277677


Learning Rate: 0.0006666666666666666


epoch,train_loss,valid_loss,accuracy
1,2.213141,2.245398,0.277687
2,2.193841,2.240465,0.279377


Learning Rate: 0.00044444444444444447


epoch,train_loss,valid_loss,accuracy
1,2.184895,2.240976,0.279324
2,2.148622,2.238027,0.280425


Learning Rate: 0.0002962962962962963


epoch,train_loss,valid_loss,accuracy
1,2.096767,2.238211,0.280281
2,2.093635,2.236222,0.281058
