In [6]:
import gzip

def read_genome_from_rtf(file_path):
    """Read a sequence file and return its content as one continuous string.

    Every line that starts with ``>`` (a FASTA-style header) is skipped.
    All remaining lines are stripped of leading/trailing whitespace and
    concatenated in file order. No RTF markup is interpreted: the file is
    read as plain text, so any non-header line ends up in the result.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The concatenated sequence, or ``''`` when the file has no
        non-header content.
    """
    with open(file_path, 'r') as file:
        # Stream the file and join once instead of growing a string with
        # repeated ``+=``, which re-copies the accumulated sequence.
        return ''.join(
            line.strip() for line in file if not line.startswith('>')
        )

def gzip_compress(sequence):
    """Return the size in bytes of ``sequence`` after gzip compression.

    The text is encoded with ``str.encode()``, written through ``gzip.open``
    to the scratch file ``temp.txt.gz`` in the current working directory,
    and that file is then read back to measure it. The scratch file is
    overwritten on every call and is left on disk afterwards.

    Args:
        sequence: Text to compress.

    Returns:
        Length in bytes of the complete gzip file that was written.
    """
    scratch_path = 'temp.txt.gz'
    with gzip.open(scratch_path, 'wb') as f:
        f.write(sequence.encode())
    # Measure the file inside a context manager so the read handle is closed
    # deterministically; a bare ``open(...).read()`` leaks the handle.
    with open(scratch_path, 'rb') as f:
        compressed_size = len(f.read())
    return compressed_size

def calculate_distance(genome_A, genome_B):
    """Compute a compression-based distance between two sequences.

    Each sequence, and then their concatenation ``genome_A + genome_B``,
    is passed to ``gzip_compress`` to obtain a compressed size. The result
    is ``1 - (size_A + size_B - size_AB) / max(size_A, size_B)``, which is
    algebraically ``(size_AB - min(size_A, size_B)) / max(size_A, size_B)``.

    Args:
        genome_A: First sequence as a string.
        genome_B: Second sequence as a string.

    Returns:
        The distance as a float.
    """
    size_a = gzip_compress(genome_A)
    size_b = gzip_compress(genome_B)
    size_ab = gzip_compress(genome_A + genome_B)

    # Bytes saved by compressing the two sequences together rather than
    # separately, relative to the larger individual compressed size.
    saved = size_a + size_b - size_ab
    larger = max(size_a, size_b)
    return 1 - (saved / larger)

# Input files to compare, listed in the order the pairwise comparison uses.
file_names = [
    f'{stem}.rtf'
    for stem in (
        'Canis_lupus',
        'Felis_catus',
        'Mus_Musculus',
        'Sus_scrofa',
        'Ursus_arctos',
    )
]

# Display label for each input file: extension dropped, underscores turned
# into spaces, every word capitalised (e.g. 'Canis_lupus.rtf' -> 'Canis Lupus').
readable_names = {
    name: name[:-len('.rtf')].replace('_', ' ').title()
    for name in file_names
}

# Load every input file once; genome_sequences[k] belongs to file_names[k].
genome_sequences = list(map(read_genome_from_rtf, file_names))

# Distance for every unordered pair of files, keyed by
# (earlier_file_name, later_file_name) in file_names order.
distances = {}
for position, (name_a, sequence_a) in enumerate(zip(file_names, genome_sequences)):
    later_names = file_names[position + 1:]
    later_sequences = genome_sequences[position + 1:]
    for name_b, sequence_b in zip(later_names, later_sequences):
        distances[(name_a, name_b)] = calculate_distance(sequence_a, sequence_b)

# Write a plain-text report: one header line, a blank gap, then one line
# per compared pair with the distance rounded to four decimals.
output_file = 'genome_distances.txt'
with open(output_file, 'w') as report:
    report.write("Distances between mitochondrial genomes were calculated using the gzip-based compression method and the numbers on page 26 of the Beyond Sequence Alignment presentation.:\n")

    # The blank gap is written only when at least one pair follows it.
    if distances:
        report.write("\n\n")

    for (first_file, second_file), value in distances.items():
        # Fall back to a label derived from the file name when no display
        # name is registered for it.
        labels = [
            readable_names.get(fname, fname[:-4].replace('_', ' '))
            for fname in (first_file, second_file)
        ]
        report.write(f"Distances between mitochondrial genomes of {labels[0]} and {labels[1]} is {value:.4f}\n")

print(f"Distances between mitochondrial genomes saved in '{output_file}'.")


Distances between mitochondrial genomes saved in 'genome_distances.txt'.
