# PSSM Matrices

In [60]:
from Bio.Seq import Seq
import numpy as np
import matplotlib.pyplot as plt
import biotite.sequence as seq
import biotite.sequence.align as align
import biotite.sequence.graphics as graphics

# Anderson promoter collection: raw strings are listed once and each is wrapped in a Seq
seqs = list(map(Seq, (
    "ttgacagctagctcagtcctaggtataatgctagc",
    "ttgacagctagctcagtcctaggtataatgctagc",
    "tttacagctagctcagtcctaggtattatgctagc",
    "ttgacagctagctcagtcctaggtactgtgctagc",
    "ctgatagctagctcagtcctagggattatgctagc",
    "ttgacagctagctcagtcctaggtattgtgctagc",
    "tttacggctagctcagtcctaggtactatgctagc",
    "tttacggctagctcagtcctaggtatagtgctagc",
    "tttacggctagctcagccctaggtattatgctagc",
    "ctgacagctagctcagtcctaggtataatgctagc",
    "tttacagctagctcagtcctagggactgtgctagc",
    "tttacggctagctcagtcctaggtacaatgctagc",
    "ttgacggctagctcagtcctaggtatagtgctagc",
    "ctgatagctagctcagtcctagggattatgctagc",
    "ctgatggctagctcagtcctagggattatgctagc",
    "tttatggctagctcagtcctaggtacaatgctagc",
    "tttatagctagctcagcccttggtacaatgctagc",
    "ttgacagctagctcagtcctagggactatgctagc",
    "ttgacagctagctcagtcctagggattgtgctagc",
    "ttgacggctagctcagtcctaggtattgtgctagc",
)))


## Task 1: 
Given the sequence collection, construct a position frequency matrix (PFM), and from the PFM construct a position probability matrix (PPM).

![correct](practice5_files/PFM.png)

In [61]:
def matrix_construction(seqs):
    """Build the position frequency matrix (PFM) and position probability matrix (PPM).

    Parameters
    ----------
    seqs : sequence of str or Seq
        Equal-length DNA sequences. Upper- and lower-case letters are
        counted alike.

    Returns
    -------
    PFM : numpy.ndarray of int, shape (4, L)
        Number of a, c, g, t (rows 0-3) observed at each of the L positions.
    PPM : numpy.ndarray of float, shape (4, L)
        ``PFM`` divided by the number of sequences.

    Raises
    ------
    ValueError
        If ``seqs`` is empty or the sequences do not all have the same length.
    KeyError
        If a sequence contains a symbol other than a, c, g or t.
    """
    if len(seqs) == 0:
        raise ValueError("seqs must contain at least one sequence")

    motif_len = len(seqs[0])
    idx_convert = {'a': 0, 'c': 1, 'g': 2, 't': 3}
    PFM = np.zeros((4, motif_len), dtype='int')

    for sequence in seqs:
        # A ragged collection would otherwise be truncated silently or fail mid-count.
        if len(sequence) != motif_len:
            raise ValueError(
                f"all sequences must have length {motif_len}, got {len(sequence)}"
            )
        # Lower-casing lets upper-case input share the same lookup table.
        for pos_idx, base in enumerate(str(sequence).lower()):
            PFM[idx_convert[base], pos_idx] += 1

    PPM = PFM / len(seqs)
    return PFM, PPM

# Build the count matrix (PFM) and probability matrix (PPM) from the promoter collection
PFM, PPM = matrix_construction(seqs)


## Task 2

You have a Lactobacillus genome in the "practice5_files" folder. Let's try to find the 10 top-scoring positions according to your PPM.

In [64]:
from Bio import SeqIO

def calculate_ppm_score(seq, ppm):
    """Score a window against a position probability matrix (PPM).

    The score is the product of the PPM entries selected by each base of
    ``seq`` at its position.

    Parameters
    ----------
    seq : str
        Window to score; must not be longer than the number of PPM columns.
        Lower-case bases are accepted.
    ppm : numpy.ndarray, shape (4, L)
        Probabilities for A, C, G, T (rows 0-3) at each position.

    Returns
    -------
    float
        Product of the per-position probabilities. Returns 0.0 if the window
        contains a symbol other than A, C, G or T (such a base cannot match
        the motif), and 1.0 for an empty window.
    """
    idx_convert = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    score = 1.0
    for i, base in enumerate(str(seq).upper()):
        row = idx_convert.get(base)
        if row is None:
            # Ambiguity codes such as N would otherwise abort a genome scan with KeyError.
            return 0.0
        score *= ppm[row, i]
        if score == 0.0:
            # Once the product is zero, further factors cannot change it.
            break
    return score


# Genome FASTA, given relative to the notebook folder instead of one user's home directory
fasta_file = 'practice5_files/Lactobacillus_bulgaris.fasta'  # Update this with your FASTA file path
motif_len = PPM.shape[1]  # window width, computed once rather than per window

# Score every window of every record; keys are (record id, 0-based window start)
scores = {}
for record in SeqIO.parse(fasta_file, "fasta"):
    # Named genome_seq (not seq) so the `biotite.sequence as seq` import alias is not overwritten
    genome_seq = str(record.seq)
    for i in range(len(genome_seq) - motif_len + 1):
        score = calculate_ppm_score(genome_seq[i: i + motif_len], PPM)
        scores[(record.id, i)] = score

# Get top 10 highest scoring sequences
top_10 = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:10]

# Output top 10 highest scoring sequences
for name, score in top_10:
    print(f"{name}: Score = {score}")

('NC_014727.1', 0): Score = 0.0
('NC_014727.1', 1): Score = 0.0
('NC_014727.1', 2): Score = 0.0
('NC_014727.1', 3): Score = 0.0
('NC_014727.1', 4): Score = 0.0
('NC_014727.1', 5): Score = 0.0
('NC_014727.1', 6): Score = 0.0
('NC_014727.1', 7): Score = 0.0
('NC_014727.1', 8): Score = 0.0
('NC_014727.1', 9): Score = 0.0


## Task 3

Since the previous step did not give any meaningful results, let's add pseudocounts to the model (simply add 0.01 probability to every cell of the PPM).

In [67]:
def calculate_ppm_score(seq, ppm):
    """Score a window against a position probability matrix (PPM).

    The score is the product of the PPM entries selected by each base of
    ``seq`` at its position.

    Parameters
    ----------
    seq : str
        Window to score; must not be longer than the number of PPM columns.
        Lower-case bases are accepted.
    ppm : numpy.ndarray, shape (4, L)
        Probabilities for A, C, G, T (rows 0-3) at each position.

    Returns
    -------
    float
        Product of the per-position probabilities. Returns 0.0 if the window
        contains a symbol other than A, C, G or T (such a base cannot match
        the motif), and 1.0 for an empty window.
    """
    idx_convert = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    score = 1.0
    for i, base in enumerate(str(seq).upper()):
        row = idx_convert.get(base)
        if row is None:
            # Ambiguity codes such as N would otherwise abort a genome scan with KeyError.
            return 0.0
        score *= ppm[row, i]
        if score == 0.0:
            # Once the product is zero, further factors cannot change it.
            break
    return score

# Add a 0.01 pseudocount to every cell so unseen bases no longer zero out the product
pseudocounts_PPM = PPM + 0.01

# Genome FASTA, given relative to the notebook folder instead of one user's home directory
fasta_file = 'practice5_files/Lactobacillus_bulgaris.fasta'  # Update this with your FASTA file path
motif_len = PPM.shape[1]  # window width, computed once rather than per window

# Score every window of every record; keys are (record id, 0-based window start)
scores = {}
for record in SeqIO.parse(fasta_file, "fasta"):
    # Named genome_seq (not seq) so the `biotite.sequence as seq` import alias is not overwritten
    genome_seq = str(record.seq)
    for i in range(len(genome_seq) - motif_len + 1):
        score = calculate_ppm_score(genome_seq[i: i + motif_len], pseudocounts_PPM)
        scores[(record.id, i)] = score

# Get top 10 highest scoring sequences
top_10 = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:10]

# Output top 10 highest scoring sequences
for name, score in top_10:
    print(f"{name}: Score = {score}")

('NC_014727.1', 1750670): Score = 1.1110471623358634e-23
('NC_014727.1', 127901): Score = 1.916035136170013e-24
('NC_014727.1', 1854462): Score = 6.27353362910947e-25
('NC_014727.1', 578770): Score = 5.31576062572989e-25
('NC_014727.1', 2037280): Score = 1.9231363382998838e-25
('NC_014727.1', 750074): Score = 8.834125403780717e-26
('NC_014727.1', 151289): Score = 3.7464568999723747e-26
('NC_014727.1', 256527): Score = 3.3146585781677367e-26
('NC_014727.1', 28676): Score = 2.9189835557286675e-26
('NC_014727.1', 95444): Score = 1.335370776227777e-26


## Task 4

OK, now we have some top-scoring positions; however, the numbers are not impressive or meaningful.

To get meaningful results let's switch to log odds Position Weight Matrix (PWM)
![correct](practice5_files/PPM.png)

$b_k$ — the frequency of nucleotide $k$ in the background model (here, the nucleotide frequencies of the scanned sequence)

In [79]:
def calculate_ppm_score(seq, ppm):
    """Score a window against a log-odds position weight matrix (PWM).

    The score is the sum of the matrix entries selected by each base of
    ``seq`` at its position. (The parameter is still called ``ppm`` so that
    existing callers keep working; in this cell it receives the PWM.)

    Parameters
    ----------
    seq : str
        Window to score; must not be longer than the number of matrix
        columns. Lower-case bases are accepted.
    ppm : numpy.ndarray, shape (4, L)
        Log-odds weights for A, C, G, T (rows 0-3) at each position.

    Returns
    -------
    float
        Sum of the per-position weights, 0.0 for an empty window, and
        ``-inf`` if the window contains a symbol other than A, C, G or T
        (such a base cannot match the motif).
    """
    # Kept local so the function does not depend on a global defined later in the cell.
    idx_convert = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    # A sum starts at 0; starting at 1.0 shifted every score by one.
    score = 0.0
    for i, base in enumerate(str(seq).upper()):
        row = idx_convert.get(base)
        if row is None:
            return float('-inf')
        # No early exit: a partial sum of exactly 0 is an ordinary value for log-odds.
        score += ppm[row, i]
    return score

def frequency_count(sequence):
    """Return the relative frequencies of A, C, G and T in ``sequence``.

    Parameters
    ----------
    sequence : str
        Nucleotide sequence. Lower-case letters are counted with their
        upper-case equivalents; any other symbol is ignored.

    Returns
    -------
    list of float
        ``[f_A, f_C, f_G, f_T]``, each count divided by the total number of
        A/C/G/T symbols. All zeros if the sequence contains none of them.
    """
    text = str(sequence).upper()
    # One count per base, divided once, avoids adding 1/len for every symbol
    # (slow in pure Python and accumulates rounding error on long inputs).
    counts = [text.count(base) for base in "ACGT"]
    total = sum(counts)
    if total == 0:
        return [0.0, 0.0, 0.0, 0.0]
    return [count / total for count in counts]

# Row index of each nucleotide in the 4 x L matrices (read by the helper functions above)
idx_convert = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
# Pseudocount added to every PPM cell so the log below never sees a zero probability
pseudocounts_PPM = PPM + 0.03

# Genome FASTA, given relative to the notebook folder instead of one user's home directory
fasta_file = 'practice5_files/Lactobacillus_bulgaris.fasta'  # Update this with your FASTA file path

# Score every window of every record; keys are (record id, 0-based window start)
scores = {}
for record in SeqIO.parse(fasta_file, "fasta"):
    # Named genome_seq (not seq) so the `biotite.sequence as seq` import alias is not overwritten
    genome_seq = str(record.seq)
    # Background model: nucleotide frequencies of this record
    frequency = frequency_count(genome_seq)
    # Log-odds PWM: each row of the PPM is divided by that nucleotide's background frequency
    PWM = np.log2(pseudocounts_PPM / np.asarray(frequency)[:, None])
    print(PWM)
    motif_len = PWM.shape[1]
    for i in range(len(genome_seq) - motif_len + 1):
        score = calculate_ppm_score(genome_seq[i: i + motif_len], PWM)
        scores[(record.id, i)] = score

# Get top 10 highest scoring sequences
top_10 = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:10]

# Output top 10 highest scoring sequences
for name, score in top_10:
    print(f"{name}: Score = {score}")

[[-3.06323813 -3.06323813 -3.06323813  2.0382999  -3.06323813  1.3290793
  -3.06323813 -3.06323813 -3.06323813  2.0382999  -3.06323813 -3.06323813
  -3.06323813 -3.06323813  2.0382999  -3.06323813 -3.06323813 -3.06323813
  -3.06323813 -3.06323813  1.96650922 -3.06323813 -3.06323813 -3.06323813
   2.0382999  -3.06323813  0.77806413  1.43926222 -3.06323813 -3.06323813
  -3.06323813 -3.06323813  2.0382999  -3.06323813 -3.06323813]
 [-0.1113235  -3.04992296 -3.04992296 -3.04992296  1.65051676 -3.04992296
  -3.04992296  2.05161507 -3.04992296 -3.04992296 -3.04992296  2.05161507
  -3.04992296  2.05161507 -3.04992296 -3.04992296 -0.93444574  2.05161507
   2.05161507 -3.04992296 -3.04992296 -3.04992296 -3.04992296 -3.04992296
  -3.04992296  0.61304205 -3.04992296 -3.04992296 -3.04992296 -3.04992296
   2.05161507 -3.04992296 -3.04992296 -3.04992296  2.05161507]
 [-3.04221834 -3.04221834  1.35009908 -3.04221834 -3.04221834  0.79908391
   2.05931969 -3.04221834 -3.04221834 -3.04221834  2.05931969