<a href="https://colab.research.google.com/github/vhaghani26/BST_227_Code/blob/main/SimpleLatentSequenceModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Utility function for reading in data from course Box folder

In [1]:
import pandas as pd
import io
import requests
from tqdm import tqdm
import numpy as np

from sklearn.preprocessing import OneHotEncoder

#----------------------------------------
#from Chenxi Liu
#function for downloading sequence data from Box and converting to one-hot encoding
#note this function actually returns the summary statistics over the data, which is
#just the total count of each base across all input sequences for our simple
#latent variable model.

def get_sequence(url, categories=('A', 'C', 'G', 'T')):
  """Download sequences from `url` and return the per-category counts.

  The response body is parsed with pandas as space-separated text without a
  header row, and the first column of each row is taken as one sequence.
  Every sequence is one-hot encoded against `categories`; the encodings are
  summed, giving the total number of occurrences of each category across all
  sequences (the summary statistics used by the EM code).

  Sequences no longer have to share a common length, because only the
  running totals are kept rather than a dense (sequences x length x
  categories) array.

  Args:
    url: address of a text file holding one sequence per line.
    categories: the allowed symbols; also fixes the column order of the
      result.

  Returns:
    A float array of shape (1, len(categories)) where entry [0, k] is the
    number of times categories[k] occurs over all sequences.

  Raises:
    requests.HTTPError: if the server answers with an error status.
    ValueError: if a sequence contains a symbol that is not in `categories`
      (rejected by the encoder under its default unknown-category handling).
  """
  r = requests.get(url)
  # Fail loudly on an error status instead of parsing an error page as data.
  r.raise_for_status()
  df = pd.read_csv(io.StringIO(r.text), sep=" ", header=None)

  # Convert the frame once; the sequences live in the first column.
  sequences = [str(value) for value in df.to_numpy()[:, 0]]

  # Rely on the encoder's default (sparse) output rather than passing
  # `sparse=False`: that keyword was renamed to `sparse_output` and later
  # removed, so passing either spelling breaks on some scikit-learn version.
  ohe = OneHotEncoder(categories=[np.array(categories, dtype=object)])
  ohe.fit(np.array(list(sequences[0]), dtype=object).reshape(-1, 1))

  data_ss = np.zeros((1, len(categories)))
  for seq in tqdm(sequences):
    one_hot = ohe.transform(np.array(list(seq)).reshape(-1, 1))
    # Column sums of the one-hot matrix are the per-category counts of `seq`.
    data_ss[0, :] += np.asarray(one_hot.sum(axis=0)).ravel()

  return data_ss


# Core EM code

In [None]:
#----------------------------------------
# parameter initialization
# psi: a 4xNUM_MODELS matrix of psi parameters
# lmbda: an array of length NUM_MODELS

import math

def init_EM(NUM_MODELS):
  """Draw a random, normalized starting parameter set for EM.

  Args:
    NUM_MODELS: number of mixture components.

  Returns:
    A dictionary with
      'lmbda': array of length NUM_MODELS that sums to 1, and
      'psi':   4 x NUM_MODELS array whose columns each sum to 1.
  """
  # Draw the mixing weights first and the emission table second, so that a
  # fixed random seed always yields the same starting point.
  raw_weights = np.random.uniform(0, 1, size=(NUM_MODELS,))
  raw_emissions = np.random.uniform(0, 1, size=(4, NUM_MODELS))
  return {
      'lmbda': raw_weights / raw_weights.sum(),
      'psi': raw_emissions / raw_emissions.sum(axis=0),
  }

#----------------------------------------
# compute posteriors P(C_ij | X_ij, theta)
# return a 4 x NUM_MODELS object PP, where PP[aa,bb] = P(C_ij = bb | X_ij = aa, theta)
def E_step(theta):
  """Compute the posterior over models for each possible base.

  Args:
    theta: dictionary with 'psi' (4 x NUM_MODELS) and 'lmbda' (NUM_MODELS).

  Returns:
    A 4 x NUM_MODELS array PP in which each row sums to 1 and
    PP[aa, bb] = P(C_ij = bb | X_ij = aa, theta).
  """
  # Joint P(X = aa, C = bb): lmbda broadcasts across the rows of psi.
  joint = theta['psi'] * theta['lmbda']
  # Marginal P(X = aa), one value per base.
  evidence = joint.sum(axis=1)
  # Divide in the transposed (models x bases) orientation so that `evidence`
  # lines up with the bases, then flip back to bases x models.
  return np.transpose(np.transpose(joint) / evidence)

#----------------------------------------
# compute MLE of psi, lambda
# posteriors: 4 x NUM_MODELS matrix of posteriors computed by E_step from the current parameter set
# XXss: summary statistics of # of bases across all training sequences
# return a dictionary containing psi, lambda
def M_step(XXss, posteriors):
  """Re-estimate psi and lambda from the posteriors and the base counts.

  Args:
    XXss: summary statistics, the count of each base over the training
      sequences.
    posteriors: 4 x NUM_MODELS array of posteriors.

  Returns:
    A dictionary {'lmbda': ..., 'psi': ...} where the columns of 'psi' and
    the entries of 'lmbda' are each normalized to sum to 1.
  """
  # Expected number of times each base is attributed to each model:
  # the posterior rows scaled by the observed count of the matching base.
  expected_counts = np.transpose(np.transpose(posteriors) * XXss)
  # Total expected count assigned to each model.
  per_model_totals = expected_counts.sum(axis=0)
  return {
      'lmbda': per_model_totals / per_model_totals.sum(),
      'psi': expected_counts / per_model_totals,
  }

#----------------------------------------
# compute log likelihood, given posteriors based off the current parameter set theta (so KL = 0)
def loglikelihood(XXss, theta, posteriors):
  """Compute the log likelihood of the data under `theta`.

  The value is the expected complete-data log likelihood minus the sum of
  q*log(q) over the posteriors (ELBO plus entropy). This equals the log
  likelihood when `posteriors` were computed from this same `theta`, so
  that KL = 0.

  Args:
    XXss: summary statistics, the count of each base over the training
      sequences.
    theta: dictionary with 'psi' (4 x NUM_MODELS) and 'lmbda' (NUM_MODELS).
    posteriors: 4 x NUM_MODELS array of posteriors.

  Returns:
    The log likelihood as a scalar.
  """
  # Entries with zero posterior contribute 0 by the convention 0*log(0) = 0.
  zero_posterior = posteriors == 0

  # log(0) = -inf and 0 * -inf = nan are expected intermediate values here
  # (some psi can be exactly 0); they are overwritten just below, so silence
  # the floating-point warnings they would otherwise raise.
  with np.errstate(divide='ignore', invalid='ignore'):
    # JP[aa, bb] = E[C_ij = bb] * log P(X_ij = aa, C_ij = bb | theta)
    weighted_log_joint_prob = (np.log(theta['psi']) + np.log(theta['lmbda'])) * posteriors
    qlogq = posteriors * np.log(posteriors)
  weighted_log_joint_prob[zero_posterior] = 0
  qlogq[zero_posterior] = 0

  expected_complete_LL = np.sum(weighted_log_joint_prob.T * XXss)
  # Sum of q*log(q) per base, weighted by how often each base was observed.
  weighted_qlogq = np.sum(np.sum(qlogq, axis=1) * XXss)
  # log likelihood = ELBO + entropy, when q = p
  return expected_complete_LL - weighted_qlogq

# Get data

In [None]:
# Two datasets are read from the course Box folder:
#   at_gc_sequences.txt - a single sequence:
#       ATTTAATATAAAATTTGGCCGCCATAAAAAAA
#   sequence.padded.txt - the real binding site data
AT_GC_URL = 'https://ucdavis.box.com/shared/static/s8g6zx9vwxbbfdxdj2uqzhlvslc1jhsy.txt'
SEQUENCES_PADDED_URL = 'https://ucdavis.box.com/shared/static/0cacx2xvn4ugxo9h21ci2ngesryigf43.txt'

# Each call downloads one file and returns its per-base summary statistics.
XXss_at_gc = get_sequence(AT_GC_URL)
XXss_sequences_padded = get_sequence(SEQUENCES_PADDED_URL)

100%|██████████| 1/1 [00:00<00:00, 934.77it/s]
100%|██████████| 357/357 [00:00<00:00, 3175.15it/s]


# Run EM

In [None]:
# Fix the seed so the random initialization (and the printed trace) repeats.
np.random.seed(255)

# Choose the training data; switch to the commented line for the real data.
XXss = XXss_at_gc
#XXss = XXss_sequences_padded

theta = init_EM(NUM_MODELS=2)

n_iterations = 5
for iteration in range(n_iterations):
  # E-step: posteriors under the current parameters.
  posteriors = E_step(theta)
  # Report the log likelihood of the current parameters before updating them.
  print(loglikelihood(XXss, theta, posteriors))
  # M-step: re-estimate the parameters from those posteriors.
  theta = M_step(XXss, posteriors)

-53.615564837127714
-37.92609382724488
-37.92609382724488
-37.92609382724488
-37.92609382724488


# Some exercises to do

1. Try different random initializations. How do the final log likelihoods compare? How do the psi's and lambdas compare? (e.g. how many unique solutions did you find, and how were their corresponding likelihoods?) Explain your observations. What can we conclude about the utility of such a model?

2. Train the model using the dataset "XXss_at_gc", which is just a dataset consisting of a single sequence, "ATTTAATATAAAATTTGGCCGCCATAAAAAAA". Looking carefully, this is just a GC-rich sequence, flanked by AT-rich sequences. What do you expect the two models to learn? Try training it now, and look at the psi's. Did the model learn what you expected? Why or why not?
