In [2]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import numpy as np
import os

In [7]:
def process_mutation_paths(dir, strain):
    """
    Convert ``mutation-paths.txt`` into three TSV summaries per input file.

    Two folder layouts are supported:

    * ``dir`` contains sub-folders whose names consist only of digits: the
      ``mutation-paths.txt`` inside each of them is processed in numeric
      order (sub-folders without that file are skipped).
    * otherwise: the ``mutation-paths.txt`` directly inside ``dir`` is
      processed.

    For every processed file, the ``clades.txt`` located in the same folder is
    read, and three files are written into that same folder:

    * ``mutation_paths.tsv``           - every row of the input
    * ``mutation_paths_<strain>.tsv``  - rows whose clade equals ``strain``
    * ``mutation_paths_other.tsv``     - rows whose clade differs

    Each output row holds the sample name, the number of steps in its path and
    the steps joined with ``>``.

    Args:
        dir: Folder to scan. (The name shadows the builtin; it is kept so that
            existing keyword callers continue to work.)
        strain: Label compared against the third tab-separated column of
            ``clades.txt``.

    Raises:
        FileNotFoundError: ``dir`` has no all-digit sub-folder and no
            ``mutation-paths.txt`` of its own.
    """
    # Greedy on purpose: strips everything up to the LAST ':' of a token
    # (the "node_*:" label in front of the mutations).
    node_label = re.compile('.*:')

    def write_tsv(file_name, names, paths):
        # One row per sample: name, number of steps, steps joined by '>'.
        with open(file_name, 'w') as f:
            f.write("name\tlength(「>」separate)\tmutation_path\n")
            for sample, path in zip(names, paths):
                joined = '>'.join(path)
                f.write(f"{sample}\t{len(path)}\t{joined}\n")

    def handle_mutation_paths(file_path):
        # 1. Load the two-column input (sample name, space-separated path).
        print(f"Processing file: {file_path}")
        folder = os.path.dirname(file_path)

        # Read everything as text: names are written back verbatim and an
        # empty path cell stays '' instead of becoming a float NaN.
        df = pd.read_csv(file_path, sep='\t', header=None,
                         dtype=str, keep_default_na=False)
        df = df.set_axis(['name', 'mutation_path'], axis=1)
        names = df['name'].tolist()

        # The clade label is the third tab-separated field of each line.
        with open(os.path.join(folder, 'clades.txt'), 'r', encoding="shift-jis") as f:
            clades = [line.split('\t')[2].rstrip() for line in f]

        # 2./3. Split every path into its steps.
        # 4. Remove the "node_*:" label from every step.
        mutation_paths = []
        for raw_path in df['mutation_path'].tolist():
            steps = raw_path.split(' ')
            # split(' ') leaves an empty final token when the line ends with a
            # separator space. Drop only that token, so a line WITHOUT the
            # trailing space keeps its real last step.
            if steps[-1] == '':
                steps.pop()
            mutation_paths.append([node_label.sub('', step) for step in steps])

        # Partition rows by clade.
        # NOTE(review): clades.txt is matched to the input rows purely by line
        # position; confirm both files always list samples in the same order
        # and have the same number of lines.
        strain_names, strain_paths = [], []
        other_names, other_paths = [], []
        for i, clade in enumerate(clades):
            if clade == strain:
                strain_paths.append(mutation_paths[i])
                strain_names.append(names[i])
            else:
                other_paths.append(mutation_paths[i])
                other_names.append(names[i])

        # Always write next to the input file. Joining with os.path.join
        # guarantees the separator that plain string concatenation omitted
        # for the flat (non-numeric) layout.
        write_tsv(os.path.join(folder, 'mutation_paths.tsv'), names, mutation_paths)
        write_tsv(os.path.join(folder, f'mutation_paths_{strain}.tsv'), strain_names, strain_paths)
        write_tsv(os.path.join(folder, 'mutation_paths_other.tsv'), other_names, other_paths)

    # Look for all-digit sub-folders and visit them in numeric order.
    numeric_dirs = sorted(
        (d for d in os.listdir(dir)
         if d.isdigit() and os.path.isdir(os.path.join(dir, d))),
        key=int,
    )

    if numeric_dirs:
        # Numbered layout: process the file of every numbered sub-folder.
        for numeric_dir in numeric_dirs:
            file_path = os.path.join(dir, numeric_dir, 'mutation-paths.txt')
            if os.path.exists(file_path):
                handle_mutation_paths(file_path)
        return

    # Flat layout: the file sits directly inside `dir`.
    file_path = os.path.join(dir, 'mutation-paths.txt')
    if not os.path.exists(file_path):
        raise FileNotFoundError("mutation-paths.txtが見つかりません。")
    handle_mutation_paths(file_path)

In [8]:
# Label to select; it is also the name of the folder under ~/usher_output.
strain = 'BA.1.1'

# Expand "~" to the home directory before handing the path over.
# Kept as `dir` (shadowing the builtin) in case other cells read this global.
dir = os.path.expanduser(f'~/usher_output/{strain}')

process_mutation_paths(dir, strain)

Processing file: /mnt/ssd1/home3/aiba/usher_output/BA.1.1/0/mutation-paths.txt
Processing file: /mnt/ssd1/home3/aiba/usher_output/BA.1.1/1/mutation-paths.txt
Processing file: /mnt/ssd1/home3/aiba/usher_output/BA.1.1/1/mutation-paths.txt
Processing file: /mnt/ssd1/home3/aiba/usher_output/BA.1.1/2/mutation-paths.txt
Processing file: /mnt/ssd1/home3/aiba/usher_output/BA.1.1/2/mutation-paths.txt
Processing file: /mnt/ssd1/home3/aiba/usher_output/BA.1.1/3/mutation-paths.txt
Processing file: /mnt/ssd1/home3/aiba/usher_output/BA.1.1/3/mutation-paths.txt
Processing file: /mnt/ssd1/home3/aiba/usher_output/BA.1.1/4/mutation-paths.txt
Processing file: /mnt/ssd1/home3/aiba/usher_output/BA.1.1/4/mutation-paths.txt
Processing file: /mnt/ssd1/home3/aiba/usher_output/BA.1.1/5/mutation-paths.txt
Processing file: /mnt/ssd1/home3/aiba/usher_output/BA.1.1/5/mutation-paths.txt
Processing file: /mnt/ssd1/home3/aiba/usher_output/BA.1.1/6/mutation-paths.txt
Processing file: /mnt/ssd1/home3/aiba/usher_output/B