# Compute Counts and Frequencies
A cheeky Python script to recreate the counts matrix and frequency matrix from a given set of sequences using NumPy and Pandas. The script follows the steps outlined in the Gibbs sampling approach:

In [2]:
# Import our dependencies
import numpy as np
import pandas as pd

In [6]:
# Starting data: example sequences (motifs)
sequences = ["ACGTACG", 
             "ACGTTGC", 
             "GCGTACG", 
             "TCGTAGC"]

In [7]:
# Step 1: initialise nucleotide positions
nucleotides = ['A', 'C', 'G', 'T']
motif_length = len(sequences[0])  # assume all motifs have the same length

In [8]:
# Step 2: compute Counts Matrix
counts_matrix = {nuc: [0] * motif_length for nuc in nucleotides}

for seq in sequences:
    for pos, base in enumerate(seq):
        counts_matrix[base][pos] += 1

# Convert to Pandas DataFrame
counts_df = pd.DataFrame(counts_matrix, index=[f'Pos {i}' for i in range(motif_length)])
print("Counts Matrix:")
print(counts_df)

Counts Matrix:
       A  C  G  T
Pos 0  2  0  1  1
Pos 1  0  4  0  0
Pos 2  0  0  4  0
Pos 3  0  0  0  4
Pos 4  3  0  0  1
Pos 5  0  2  2  0
Pos 6  0  2  2  0


In [12]:
# Step 3: compute Frequency Matrix (applying Pseudocount of 0.5)
pseudocount = 0.5
total_counts = counts_df.sum(axis=1) + 2 * pseudocount  # Total count per column including pseudocount

frequency_matrix = {nuc: [(counts_df[nuc].iloc[i] + pseudocount) / total_counts.iloc[i] for i in range(motif_length)] for nuc in nucleotides}

# Convert to Pandas DataFrame
frequency_df = pd.DataFrame(frequency_matrix, index=[f'Pos {i}' for i in range(motif_length)])
print("\nFrequency Matrix with Pseudocounts:")
print(frequency_df)


Frequency Matrix with Pseudocounts:
         A    C    G    T
Pos 0  0.5  0.1  0.3  0.3
Pos 1  0.1  0.9  0.1  0.1
Pos 2  0.1  0.1  0.9  0.1
Pos 3  0.1  0.1  0.1  0.9
Pos 4  0.7  0.1  0.1  0.3
Pos 5  0.1  0.5  0.5  0.1
Pos 6  0.1  0.5  0.5  0.1


Now that we've established our script works, we can establish a random sequence generator for testing. 