In [5]:
from Bio import AlignIO
import pandas as pd
import re

# Load alignment
alignment = AlignIO.read("alignf_out.fasta", "fasta")

# Step 1: Identify reference sequence
ref = alignment[0]
ref_seq = ref.seq

# Step 2: Map ungapped positions
positions = []
for i, aa in enumerate(ref_seq):
    if aa != "-":
        pos_num = len([r for r in ref_seq[:i] if r != "-"]) + 1
        positions.append((i, aa, pos_num))

# Step 3: Build mutation -> set of sequence IDs
mut_map = {}


# Mapping from seq IDs to variants (based on second dash-separated element)
seq_to_variant = {}

for record in alignment[1:]:
    seq_id = record.id
    header = record.description
    parts = header.split("-")
    variant = parts[0] + "-" + parts[1] if len(parts) > 1 else "Unknown"
    seq_to_variant[seq_id] = variant


    seq = record.seq
    for i, ref_aa, pos in positions:
        # if pos < 319 or pos > 531:
        #     continue
        var_aa = seq[i]
        if var_aa == ref_aa:
            continue
        if var_aa == "-":
            label = f"{ref_aa}{pos}-"
        elif ref_aa == "-":
            label = f"-{pos}{var_aa}"
        elif var_aa == "X":
            continue
        else:
            label = f"{ref_aa}{pos}{var_aa}"
        mut_map.setdefault(label, set()).add(seq_id)

# Step 4: Build dataframe
seq_ids = [rec.id for rec in alignment[1:]]
mut_labels = sorted(mut_map.keys(), key=lambda x: (int(re.findall(r'\d+', x)[0]), x))

data = {'sequence': seq_ids,
        'variant': [seq_to_variant[s] for s in seq_ids]}

for label in mut_labels:
    data[label] = [1 if seq in mut_map[label] else 0 for seq in seq_ids]

df = pd.DataFrame(data)


# Save to CSV
# Generate safe variant tag for filename
variant_tag = sorted(set(seq_to_variant.values()))[0]

# Save to CSV with variants in the filename
df.to_csv(f"mutation/mutation-{variant_tag}.csv", index=False, header=True)



In [6]:
import pandas as pd
import os
mutation_folder = "mutation"
csv_files = [f for f in os.listdir(mutation_folder) if f.endswith('.csv') and "mutation" in f and "all" not in f]
data = pd.DataFrame()
for csv_file in sorted(csv_files):
    data = pd.concat([data, pd.read_csv(os.path.join(mutation_folder, csv_file))])
data.fillna(0, inplace=True)

for col in data.columns:
    if col == "sequence" or col == "variant":
        continue
    data[col] = data[col].astype(int)
data.to_csv("mutation/mutation-all.csv", index=False, header=True)

In [117]:
spike_locations = """
N487(ND2)
K417(NZ)
Q493(NE2)
Y505(OH)
Y449(OH)
T500(OG1)
N501(N)
G446(O)
Y449(OH)
Y489(OH)
N487(OD1)
G502(N)
Y505(OH)
K417(NZ)
K417(NZ)
"""
ace2_locations = """
Q24(OE1)
D30(OD2)
E35(OE2)
E37(OE2)
D38(OD2)
Y41(OH)
Y41(OH)
Q42(NE2)
Q42(NE2)
Y83(OH)
Y83(OH)
K353(O)
R393(NH2)
D30(OD1)
D30(OD2)
"""

spike_residues = set()
for line in spike_locations.split("\n"):
    loc = ""
    for c in line:
        if c in "0123456789":
            loc += c
        elif c == "(":
            break
    if loc:
        spike_residues.add(int(loc) - 319)

spike_residues


{98, 127, 130, 168, 170, 174, 181, 182, 183, 186}

In [118]:
ace2_residues = set()
for line in ace2_locations.split("\n"):
    loc = ""
    for c in line:
        if c in "0123456789":
            loc += c
        elif c == "(":
            break
    if loc:
        ace2_residues.add(int(loc))
ace2_residues


{24, 30, 35, 37, 38, 41, 42, 83, 353, 393}

In [None]:

ACE2: 24, 30, 35, 37, 38, 41, 42, 83, 353, 393

Spike: 98, 127, 130, 168, 170, 174, 181, 182, 183, 186

In [2]:
import pandas as pd
pd.read_csv("tmalign/residue_distances_by_sequence.csv")

Unnamed: 0,sequence,residue_344,residue_345,residue_346,residue_347,residue_348,residue_349,residue_350,residue_351,residue_352,...,residue_503,residue_504,residue_505,residue_506,residue_507,residue_508,residue_509,residue_510,residue_511,residue_512
0,Alpha-B.1.1.7,2.116348,1.429974,0.6778,0.281384,0.750471,1.471605,0.919854,1.289429,1.17821,...,0.926989,0.590774,0.662051,0.59006,0.520785,0.529977,0.416832,0.268366,0.55308,0.592604
1,Beta-B.1.351,0.830077,0.62891,0.534827,0.771157,1.030547,0.914569,0.645193,0.869253,0.777212,...,1.71665,1.893342,1.450881,1.058686,0.443724,0.219132,0.301809,0.68306,0.758964,0.677333
2,Beta-B.1.351-S1,1.385671,0.933451,0.783534,0.738919,1.294587,0.658058,1.232224,1.781214,1.427769,...,1.198405,1.075551,0.902345,1.107471,0.500225,0.393026,0.395608,0.266284,0.714241,0.647396
3,Beta-B.1.351-S2,1.374623,0.905769,0.868178,0.76647,1.214912,0.641602,1.254044,1.753571,1.371806,...,1.224886,1.127278,0.87259,0.936486,0.359986,0.337599,0.411721,0.328308,0.646537,0.534667
4,Beta-B.1.351-S3,1.381169,0.959615,0.728492,0.694413,1.351441,0.626024,1.233373,1.783862,1.434206,...,1.38904,1.259048,0.887023,1.020848,0.506156,0.389931,0.370023,0.285526,0.745737,0.689535
5,Beta-B.1.351-S4,1.374623,0.905769,0.868178,0.76647,1.214912,0.641602,1.254044,1.753571,1.371806,...,1.224886,1.127278,0.87259,0.936486,0.359986,0.337599,0.411721,0.328308,0.646537,0.534667
6,Beta-B.1.351-S5,1.374623,0.905769,0.868178,0.76647,1.214912,0.641602,1.254044,1.753571,1.371806,...,1.224886,1.127278,0.87259,0.936486,0.359986,0.337599,0.411721,0.328308,0.646537,0.534667
7,Beta-B.1.351-S6,1.356943,0.894475,0.879776,0.775315,1.210021,0.651826,1.259537,1.757098,1.362183,...,1.240867,1.165511,0.906085,0.94369,0.355601,0.343477,0.424741,0.317134,0.630508,0.522289
8,Beta-B.1.351-S7,1.347396,0.893488,0.872267,0.782123,1.216988,0.645998,1.28752,1.780511,1.395505,...,1.15949,1.077051,0.847165,0.962811,0.381063,0.390716,0.422843,0.273717,0.635233,0.530681
9,Beta-B.1.351-S8,1.410475,0.940179,0.808705,0.767417,1.343992,0.6878,1.263239,1.81993,1.447206,...,1.227816,1.097461,0.888166,1.047363,0.452187,0.341147,0.421848,0.303513,0.743002,0.679704
