<a href="https://colab.research.google.com/github/vhaghani26/BST_227_Code/blob/main/Assignment_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **The EM Algorithm for a more complex sequence model**

First, we need to import the modules we plan to use to run the code.

In [7]:
# Used for sure
import requests
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Used by the sequence-loading functions below (io.StringIO, tqdm progress bar)
import io
from tqdm import tqdm

# **Download and Process Sequence Data**

Set data file URL locations

In [None]:
# URL for at_gc_sequences.txt - this is a single sequence: ATTTAATATAAAATTTGGCCGCCATAAAAAAA
at_gc_sequences_txt = 'https://ucdavis.box.com/shared/static/s8g6zx9vwxbbfdxdj2uqzhlvslc1jhsy.txt'
# Backward-compatible alias: the variable was originally introduced under this misspelled name
at_gc_seqeuences_txt = at_gc_sequences_txt
# URL for sequence.padded.txt - the real binding site data
sequence_padded_txt = 'https://ucdavis.box.com/shared/static/0cacx2xvn4ugxo9h21ci2ngesryigf43.txt'
# URL for sequence.motiflocation.padded.txt - the location of the binding sites from sequence.padded.txt
sequence_motiflocation_padded_txt = 'https://ucdavis.box.com/shared/static/gd0r12mdkhix86bo9ffbn3dy0fy0prmn.txt'

Write a function to download and one-hot encode the sequence data

In [37]:
# Function written by Chenxi Liu (slight modifications made)

def get_sequence(url, categories=['A', 'C', 'G', 'T']):
  """Download a sequence file and one-hot encode every sequence in it.

  The file is expected to hold one sequence per line, all of the same length.

  Parameters
  ----------
  url : str
      Location of the text file to download.
  categories : list of str
      Symbols of the alphabet; their order fixes the column order of the
      encoding and of the returned counts.

  Returns
  -------
  data : numpy.ndarray of shape (num_seqs, seq_len, len(categories))
      One-hot encoded sequences.
  bp_counts : numpy.ndarray of shape (1, len(categories))
      Number of occurrences of each category over all sequences.

  Raises
  ------
  requests.HTTPError
      If the download does not return a successful HTTP status.
  ValueError
      If a sequence does not have the same length as the first one.
  """
  # Send a GET request to the specified URL and fail loudly on HTTP errors,
  # so an error page is never parsed as if it were sequence data
  r = requests.get(url)
  r.raise_for_status()
  # Convert sequence data to data frame (one sequence per row, first column)
  df = pd.read_csv(io.StringIO(r.text), sep=" ", header=None)
  # Pull the sequences out once instead of re-converting the frame per iteration
  sequences = [str(seq) for seq in df.iloc[:, 0]]

  # Determine how many sequences there are in the text file
  num_seqs = len(sequences)
  # Assume all input sequences are equal length; the first one sets the length
  seq_len = len(sequences[0])
  # Allocate the 3D matrix that will hold the one-hot encoded sequences
  data = np.zeros((num_seqs, seq_len, len(categories)))

  # Encode categorical features in a one-hot numeric array
  encoder_categories = [np.array(categories, dtype=object)]
  try:
    # scikit-learn >= 1.2 spells the dense-output switch `sparse_output`
    ohe = OneHotEncoder(sparse_output=False, categories=encoder_categories)
  except TypeError:
    # Older scikit-learn releases only know the previous keyword name
    ohe = OneHotEncoder(sparse=False, categories=encoder_categories)
  # Fit the encoder on the first sequence (one base per row)
  ohe.fit(np.array(list(sequences[0]), dtype=object).reshape(-1, 1))

  # Apply OneHotEncoder to all sequences
  for ii in tqdm(range(num_seqs)):
    seq = sequences[ii]
    if len(seq) != seq_len:
      raise ValueError(
        "Sequence %d has length %d, expected %d" % (ii, len(seq), seq_len))
    data[ii, :, :] = ohe.transform(np.array(list(seq), dtype=object).reshape(-1, 1))

  # Count the number of each base in the sequence data
  bp_counts = data.sum(axis=(0, 1)).reshape(1, len(categories))

  # Return the one-hot encoded matrix (data) and the counts for each base pair (bp_counts)
  return data, bp_counts

Write a modified version of the above function that randomly splits the sequences into a training set and a test set and returns the base counts for each

In [57]:
# Function written by Chenxi Liu and Gerald Quon

def get_sequence_traintest(url, categories=['A', 'C', 'G', 'T'], FRACTION_TRAINING=0.8, seed=None):
  """Download sequences, split them randomly into train/test, and count bases.

  The file is expected to hold one sequence per line, all of the same length.
  Sequences are one-hot encoded, shuffled, and the first
  round(FRACTION_TRAINING * num_sequences) of them form the training set; the
  rest form the test set.

  Parameters
  ----------
  url : str
      Location of the text file to download.
  categories : list of str
      Symbols of the alphabet; their order fixes the column order of the counts.
  FRACTION_TRAINING : float
      Fraction of the sequences assigned to the training set, between 0 and 1.
  seed : int or None
      If given, the shuffle uses its own generator seeded with this value so
      the split is reproducible. If None, the global NumPy random state is
      used, as before.

  Returns
  -------
  bp_counts_train : numpy.ndarray of shape (1, len(categories))
      Number of occurrences of each category in the training sequences.
  bp_counts_test : numpy.ndarray of shape (1, len(categories))
      Number of occurrences of each category in the test sequences.

  Raises
  ------
  requests.HTTPError
      If the download does not return a successful HTTP status.
  ValueError
      If FRACTION_TRAINING is outside [0, 1] or a sequence does not have the
      same length as the first one.
  """
  if not 0 <= FRACTION_TRAINING <= 1:
    raise ValueError("FRACTION_TRAINING must be between 0 and 1")

  # Send a GET request to the specified URL and fail loudly on HTTP errors,
  # so an error page is never parsed as if it were sequence data
  r = requests.get(url)
  r.raise_for_status()
  # Convert sequence data to data frame (one sequence per row, first column)
  df = pd.read_csv(io.StringIO(r.text), sep=" ", header=None)
  # Pull the sequences out once instead of re-converting the frame per iteration
  sequences = [str(seq) for seq in df.iloc[:, 0]]

  # Determine how many sequences there are in the text file
  m = len(sequences)
  # Assume all input sequences are equal length; the first one sets the length
  sequence_len = len(sequences[0])
  # Allocate the 3D matrix that will hold the one-hot encoded sequences
  data = np.zeros((m, sequence_len, len(categories)))

  # Encode categorical features in a one-hot numeric array
  encoder_categories = [np.array(categories, dtype=object)]
  try:
    # scikit-learn >= 1.2 spells the dense-output switch `sparse_output`
    ohe = OneHotEncoder(sparse_output=False, categories=encoder_categories)
  except TypeError:
    # Older scikit-learn releases only know the previous keyword name
    ohe = OneHotEncoder(sparse=False, categories=encoder_categories)
  # Fit the encoder on the first sequence (one base per row)
  ohe.fit(np.array(list(sequences[0]), dtype=object).reshape(-1, 1))

  # Apply OneHotEncoder to all sequences
  for ii in tqdm(range(m)):
    seq = sequences[ii]
    if len(seq) != sequence_len:
      raise ValueError(
        "Sequence %d has length %d, expected %d" % (ii, len(seq), sequence_len))
    data[ii, :, :] = ohe.transform(np.array(list(seq), dtype=object).reshape(-1, 1))

  # Randomly permute the sequences (rows of the matrix) in place
  if seed is None:
    np.random.shuffle(data)
  else:
    np.random.default_rng(seed).shuffle(data)

  # Split data into training and test data
  num_train = round(FRACTION_TRAINING * m)
  train_data = data[:num_train]
  test_data = data[num_train:]

  # Count the number of each base in the train and test sets
  bp_counts_train = train_data.sum(axis=(0, 1)).reshape(1, len(categories))
  bp_counts_test = test_data.sum(axis=(0, 1)).reshape(1, len(categories))

  # Return the base pair counts for the train and test sets
  return bp_counts_train, bp_counts_test

# **Core EM Code**

# **Run EM**

# **Run Experiments on EM Algorithm Model**

Here, we will answer the questions to Assignment 1 Part 3 regarding our updated EM algorithm model.

## 1. Plot the log likelihood as a function of EM iteration, for 100 iterations, for 30 different random initializations of the model parameter. Does the log likelihood monotonically increase every iteration of every initialization?

## 2. Draw a sequence logo visualization of the foreground motif your model learns, $\psi^{l}_{m, k}$. You could try LogoMaker, a Python library (https://logomaker.readthedocs.io/en/latest/). Alternatively, there are a number of web servers for doing this; you could draw samples from your foreground model, and input those drawn sequences into e.g. the WebLogo server (https://weblogo.berkeley.edu).

## 3. Now run your model using multiple random initializations. How do the model parameters $\psi^{l}_{m, k}$ compare across runs? What about their log likelihoods?

## 4. Plot a figure that shows the distribution over $C_{ij}$ for a few of the input sequences, and compare that (in the visualization) to the ground truth. How close was your model to predicting the real motif location?

## 5. Train your model using 80% of the data, holding out the remaining 20%. Evaluate the log likelihood of your held out data using the model you implemented in this assignment, and compare it to the log likelihood from the simple latent model we used in class, using the same training/held out data. Which one is better?

## 6. Train your model on the at_gc_sequences.txt file (which has a GC-rich region embedded between two flanking AT-rich regions). Does the model work better?

## 7. The original training set in sequence.padded.txt has 357 sequences. Randomly sample another 357 sequences of the same length (just from a simple generator, that produces each base at equal frequency) and train the model with all data. Does it still recover the same motif? What if you add 3000 noisy sequences?

Generate 357 random sequences of the same length, drawing each base with equal probability.

In [49]:
import random


def random_dna_sequences(n_seqs, min_len, max_len, gc, rng=None):
    """Generate random DNA sequences with a given expected GC content.

    Parameters
    ----------
    n_seqs : int
        Number of sequences to generate (must be positive).
    min_len, max_len : int
        Inclusive bounds on the length of each sequence; each length is drawn
        uniformly from this range.
    gc : float
        Probability, between 0 and 1, that a base is G or C. G and C are
        equally likely given GC, as are A and T otherwise.
    rng : random.Random or None
        Source of randomness. Defaults to the global `random` module; pass
        `random.Random(seed)` for a reproducible result.

    Returns
    -------
    list of str
        The generated sequences.

    Raises
    ------
    ValueError
        If any argument is out of range.
    """
    if n_seqs <= 0:
        raise ValueError("n_seqs must be positive")
    if min_len <= 0:
        raise ValueError("min_len must be positive")
    if max_len < min_len:
        raise ValueError("max_len must be at least min_len")
    if not 0 <= gc <= 1:
        raise ValueError("gc must be between 0 and 1")

    if rng is None:
        rng = random

    # Use probabilities for random sequence generation
    # This does not guarantee that all bases are present at equal frequency
    # But this means the sequence length does not have to be divisible by 4
    sequences = []
    for _ in range(n_seqs):
        length = rng.randint(min_len, max_len)
        bases = []
        for _ in range(length):
            # First draw picks the GC or AT pair, second draw picks within the pair
            pair = 'GC' if rng.random() < gc else 'AT'
            bases.append(pair[0] if rng.random() < 0.5 else pair[1])
        sequences.append(''.join(bases))
    return sequences


seqs = 357
# Named min_len / max_len (not min / max) so the builtins stay usable in later cells
min_len = 40
max_len = 40
gc = 0.5

my_seqs = random_dna_sequences(seqs, min_len, max_len, gc)
print(my_seqs)

['CGAGTGGGGGTTGTCCCCTTCACAGCCGTTACATTTGAGT', 'CTCTCTGACACGCCGAAGCACTGTGCGCACACCCTTGTCT', 'GTGTCAATAGCTATGGCGCAAAGATTACAGTACTATTCCG', 'GTATTCAACACCGGTTGGTCACCGGAGCGATACTTAAGCT', 'TACAAACCCTCTCACACCCCTCTCTGTCCTAGATTATGAA', 'ATTAGCGACATTCGTCGCTTTACCGGCAGGTATTTAGCAC', 'ACGCATGTATCTTTCACGTGGCACGTCAGGCCTGGGCGTA', 'ATGGGTATTTCGACCATTTCTATCGGGTACAACACTCAGT', 'CATCGCCACTTGTCTGACCGTTGCCATGCGTCATGTTGTT', 'ACCGAGGAAGTGAGTACTCACATCTAGGCGCAGGACATCT', 'GCCTTGCGGCGGATTGCGGCCCCCTGTAGGCTATGAGGGC', 'AGTCGCAAAGCACTTCACTCTCCGCATCCCGCGTCCGTCA', 'GTATCATACCGAACTCCTCTCCTTTTAACGCCTAGATTTC', 'TTTGGGCTAGAGCTCCGATAGGAAATCGGACAATGAATTT', 'CGCAAGCGATGCTCGCGTTATTAGCGTAGCGATAAATACC', 'ATGTCCGTCAACCCAGCTGTTCTCCGCAATCAGTCTAGCT', 'CACCGGCAGGGATTGCACAAGCCTTGGATCCGCCCCGGAC', 'TGGACACATCGCAAGTCTGCGAGATTCCTCCGATCGCCAG', 'CTGAATGTTCATGGGCGGGACTTTACAACTTAACCAATAA', 'CGATAGGTGACTGTGAACATTCGGGGCCGGCAGTGCCTTC', 'GAGCACTGCTTCGGAGTTTAGAATCTTCCGGACCTCCTAT', 'TCTCGGTCACATCATGTCCCGCCGGCTGCACCGTACCGAA', 'GGATCGCTAATCATGACACCGCTGGCCTGT