In [30]:
import os
import csv
from collections import defaultdict
from Bio import SeqIO

# Location of the input FASTA file. The directory defaults to the original
# hard-coded location but can be redirected by setting the CODONS_ANALYSIS_DIR
# environment variable, so the notebook can run on another machine without
# editing this cell.
folder_path = os.environ.get(
    'CODONS_ANALYSIS_DIR',
    '/Users/udz/Desktop/codons_analysis/transcripts_probabilities',
)
file_name = 'thuringiensis.fna'
file_path = os.path.join(folder_path, file_name)

In [31]:
# One entry per FASTA record: the set of distinct codons found in that record
# when it is read in non-overlapping triplets starting at position 0.
transcripts = []

for seq_record in SeqIO.parse(file_path, "fasta"):
    seq = seq_record.seq
    # The last valid start index is len(seq) - 3, hence the exclusive bound of
    # len(seq) - 2. A bound of len(seq) - 1 would let a 2-nucleotide leftover
    # (when len(seq) % 3 == 2) slip in as if it were a codon.
    codons = [seq[i:i+3] for i in range(0, len(seq) - 2, 3)]

    transcripts.append(set(codons))

In [32]:
# Union of the per-transcript codon sets: every codon that appears in at least
# one transcript.
all_codons = {codon for codon_set in transcripts for codon in codon_set}

# print(all_codons)

In [33]:
# Target quantity: res(c1, c2) = p(c1 & c2) / (p(c1) * p(c2))
#
# p_single[c]      -> fraction of transcripts that contain codon c
# p_double[c1][c2] -> fraction of transcripts that contain both c1 and c2
#                     (only initialised here; populated by the pair loop below)

p_single = defaultdict(int)
p_double = defaultdict(lambda: defaultdict(int))

for codon in all_codons:
    # Count the transcripts whose codon set includes this codon.
    n_containing = sum(1 for codon_set in transcripts if codon in codon_set)
    p_single[codon] = n_containing / len(transcripts)

# print(p_single)

# Fill p_double for every ordered pair of codons (including a codon paired with
# itself): the fraction of transcripts in which both codons are present.
n_transcripts = len(transcripts)

for first in all_codons:
    pair_row = p_double[first]
    for second in all_codons:
        n_shared = sum(
            1
            for codon_set in transcripts
            if first in codon_set and second in codon_set
        )
        pair_row[second] = n_shared / n_transcripts

# print(p_double)

In [34]:
# res[c1][c2] = p_double[c1][c2] / (p_single[c1] * p_single[c2]):
# the observed co-occurrence frequency divided by the product of the two
# individual frequencies.
res = defaultdict(lambda: defaultdict(int))

for first in all_codons:
    for second in all_codons:
        product_of_singles = p_single[first] * p_single[second]
        res[first][second] = p_double[first][second] / product_of_singles

# print(res)

In [36]:
# Export one row per unordered pair of distinct codons, next to the input file.
# The third column is headed 'probability' but holds the value stored in
# res[codon1][codon2].
csv_file_path = file_path + '_probabilities.csv'

# newline='' hands line-ending control to the csv module, as its documentation
# requires; without it, extra blank rows appear on platforms that translate
# '\n' on write (e.g. Windows).
with open(csv_file_path, 'w', newline='') as f:
    w = csv.writer(f)
    w.writerow(['codon1', 'codon2', 'probability'])

    # Freeze the set's iteration order so that "later in the order" is well
    # defined for the pairing below.
    ordered_codons = list(all_codons)

    for position, codon1 in enumerate(ordered_codons):
        # Pair codon1 only with the codons after it: this skips the codon
        # itself and every pair already written in the opposite order.
        for codon2 in ordered_codons[position + 1:]:
            w.writerow([codon1, codon2, res[codon1][codon2]])