In [1]:
from pathlib import Path
import math
from collections import Counter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Repository layout, resolved relative to the kernel's working directory.
# The notebook is expected to run from `notebooks/`, one level below the repo root.
ROOT = Path.cwd().parents[0]
DATA, SRC = ROOT / "data", ROOT / "src"

# Echo the resolved locations so a wrong working directory is obvious at a glance.
print("ROOT:", ROOT)
print("DATA:", DATA)


ROOT: C:\Users\14024\Documents\bio-projects\motif-entropy-nfkb
DATA: C:\Users\14024\Documents\bio-projects\motif-entropy-nfkb\data


In [2]:
def read_fasta(filepath):
    """Read a FASTA file and return its sequences as a list of strings.

    Header lines (those starting with ">") only delimit records; their text
    is discarded. Blank lines are skipped, and the lines of a multi-line
    record are concatenated after stripping surrounding whitespace. Records
    that contain no sequence lines are omitted from the result.
    """
    records = []
    chunks = []  # stripped lines of the record currently being read

    def flush():
        # Store the pending record, if any, and reset the buffer.
        if chunks:
            records.append("".join(chunks))
            chunks.clear()

    with open(filepath) as handle:
        for raw in handle:
            text = raw.strip()
            if not text:
                continue
            if text.startswith(">"):
                flush()
            else:
                chunks.append(text)

    # The last record has no following header to trigger a flush.
    flush()
    return records

fasta_path = DATA / "nfkb_example.fasta"
seqs = read_fasta(fasta_path)

# The per-column statistics computed later index every sequence by position,
# so the input must be a non-empty set of equal-length sites. Fail here with
# a clear message instead of an IndexError (or silent truncation) downstream.
assert seqs, f"No sequences found in {fasta_path}"
assert len({len(s) for s in seqs}) == 1, (
    f"Sequences have differing lengths: {sorted({len(s) for s in seqs})}"
)

print(f"Loaded {len(seqs)} sequences")
seqs[:3]  # preview first 3 sequences


Loaded 12 sequences


['GGGAACTTCC', 'GGGGATTTCC', 'GGGAACCTCC']

In [None]:
import pandas as pd
from collections import Counter
import math

# Motif width, taken from the first loaded sequence.
L = len(seqs[0])
# Bases reported as frequency columns, in display order.
alphabet = list("ACGT")

def column_entropy(col):
    """Return the Shannon entropy, in bits, of the symbols in ``col``.

    ``col`` is any sized iterable of hashable symbols (e.g. one alignment
    column). Each distinct symbol contributes ``-p * log2(p)``, where ``p``
    is its relative frequency. An empty ``col`` yields ``0.0``.
    """
    n_symbols = len(col)
    entropy = 0.0
    for occurrences in Counter(col).values():
        prob = occurrences / n_symbols
        entropy = entropy - prob * math.log2(prob)
    return entropy

# One record per motif position: 1-based position, column entropy H, and the
# relative frequency of each base listed in `alphabet`.
rows = []
for idx in range(L):
    column = [seq[idx] for seq in seqs]
    tally = Counter(column)
    record = {"position": idx + 1, "H": column_entropy(column)}
    for base in alphabet:
        # Symbols outside `alphabet` are not reported as their own column.
        record[base] = tally.get(base, 0) / len(seqs)
    rows.append(record)

df = pd.DataFrame(rows)
df


In [None]:
import matplotlib.pyplot as plt

# Bar chart of per-position entropy, built on an explicit Figure/Axes pair.
fig, ax = plt.subplots(figsize=(6, 3))
ax.bar(df["position"], df["H"])
ax.set(
    xlabel="Position",
    ylabel="Entropy (bits)",
    title="NF-κB motif per-column entropy",
)
plt.show()
