In [1]:
import pandas as pd
import re
import numpy as np
import os

In [2]:
# usher_outputディレクトリのパス
usher_output_dir = os.path.expanduser('~/usher_output')

# 方法1: osモジュールを使用
try:
    # ディレクトリが存在するかチェック
    if os.path.exists(usher_output_dir):
        # フォルダのみを取得（ファイルは除外）
        folder_names = [item for item in os.listdir(usher_output_dir) 
                       if os.path.isdir(os.path.join(usher_output_dir, item))]
        folder_names.sort()  # アルファベット順にソート
        
        print(f"usher_outputディレクトリ内のフォルダ数: {len(folder_names)}")
        print(f"パス: {usher_output_dir}")
        
        # 最初の20個のフォルダ名を表示
        print("\n最初の20個のフォルダ名:")
        for i, folder in enumerate(folder_names[:20]):
            print(f"{i+1:3d}. {folder}")
        
        if len(folder_names) > 20:
            print(f"... (残り{len(folder_names) - 20}個)")
        
        # 全フォルダ名をリストとして保存
        usher_folders = folder_names
        print(f"\n変数 'usher_folders' に{len(usher_folders)}個のフォルダ名を保存しました。")
        
    else:
        print(f"ディレクトリが見つかりません: {usher_output_dir}")
        usher_folders = []
        
except Exception as e:
    print(f"エラーが発生しました: {e}")
    usher_folders = []

usher_outputディレクトリ内のフォルダ数: 196
パス: /mnt/ssd1/home3/aiba/usher_output

最初の20個のフォルダ名:
  1. A
  2. A.1
  3. AD.2
  4. AY.1
  5. AY.10
  6. AY.100
  7. AY.107
  8. AY.109
  9. AY.110
 10. AY.111
 11. AY.112
 12. AY.113
 13. AY.114
 14. AY.116.1
 15. AY.117
 16. AY.118
 17. AY.119
 18. AY.119.2
 19. AY.120
 20. AY.120.1
... (残り176個)

変数 'usher_folders' に196個のフォルダ名を保存しました。


In [3]:
print(usher_folders)

['A', 'A.1', 'AD.2', 'AY.1', 'AY.10', 'AY.100', 'AY.107', 'AY.109', 'AY.110', 'AY.111', 'AY.112', 'AY.113', 'AY.114', 'AY.116.1', 'AY.117', 'AY.118', 'AY.119', 'AY.119.2', 'AY.120', 'AY.120.1', 'AY.120.2.1', 'AY.121', 'AY.121.1', 'AY.122', 'AY.122.3', 'AY.124', 'AY.125', 'AY.126', 'AY.127', 'AY.129', 'AY.13', 'AY.14', 'AY.16', 'AY.2', 'AY.20', 'AY.23', 'AY.25', 'AY.25.1', 'AY.25.3', 'AY.26', 'AY.29', 'AY.3', 'AY.3.1', 'AY.32', 'AY.33', 'AY.34', 'AY.34.1', 'AY.35', 'AY.36', 'AY.36.1', 'AY.37', 'AY.39', 'AY.39.1', 'AY.39.1.1', 'AY.4', 'AY.4.1', 'AY.4.10', 'AY.4.15', 'AY.4.2', 'AY.4.2.1', 'AY.4.2.2', 'AY.4.2.3', 'AY.4.3', 'AY.4.4', 'AY.4.5', 'AY.4.6', 'AY.4.7', 'AY.4.8', 'AY.4.9', 'AY.41', 'AY.42', 'AY.43', 'AY.43.4', 'AY.43.6', 'AY.43.8', 'AY.44', 'AY.45', 'AY.46', 'AY.46.1', 'AY.46.4', 'AY.46.5', 'AY.46.6', 'AY.47', 'AY.48', 'AY.5', 'AY.5.3', 'AY.5.4', 'AY.54', 'AY.57', 'AY.6', 'AY.62', 'AY.64', 'AY.65', 'AY.67', 'AY.7', 'AY.7.1', 'AY.70', 'AY.8', 'AY.80', 'AY.84', 'AY.9', 'AY.9.2', 'AY

In [4]:
def process_mutation_paths_safe(dir, strain):
    """
    mutation-paths.txtを処理し、複数のTSVファイルを生成する関数。
    エラーハンドリングとNaN値処理を追加した安全版。
    """
    def handle_mutation_paths(file_path, output_prefix):
        try:
            # 1 ファイル入力
            print(f"Processing file: {file_path}")
            df = pd.read_csv(file_path, sep='\t', header=None)
            df = df.set_axis(['name', 'mutation_path'], axis=1)
            
            # NaN値を含む行を除外
            df = df.dropna(subset=['mutation_path'])
            print(f"  Data rows after NaN removal: {len(df)}")

            clades_file = os.path.join(os.path.dirname(file_path), 'clades.txt')
            
            # clades.txtの存在チェック
            if not os.path.exists(clades_file):
                print(f"  Warning: clades.txt not found at {clades_file}")
                return
                
            with open(clades_file, 'r', encoding="shift-jis") as f:
                datalist = f.readlines()
            clades = [data.split('\t')[2].rstrip() for data in datalist if len(data.split('\t')) > 2]
            
            # データ数とclades数の整合性チェック
            if len(df) != len(clades):
                print(f"  Warning: Data length ({len(df)}) != Clades length ({len(clades)})")
                min_len = min(len(df), len(clades))
                df = df.head(min_len)
                clades = clades[:min_len]
                print(f"  Truncated to {min_len} rows")

            # 2 mutation_path抽出
            df_mutation = df.loc[:, 'mutation_path']
            name = df.loc[:, 'name']

            # 3 変異ごとに配列化（NaN値と空文字列の処理を追加）
            mutation_paths = []
            for path in df_mutation.values.tolist():
                if pd.isna(path) or path == '' or not isinstance(path, str):
                    mutation_paths.append([])  # 空のリストを追加
                else:
                    mutation_paths.append(path.split(' '))

            # 4 node_*:消す
            for path in mutation_paths:
                if path:  # 空のリストでない場合のみ処理
                    path[:] = [re.sub('.*:', '', mutation) for mutation in path]
                    if path:  # まだ要素がある場合のみpop
                        path.pop(-1)

            mutation_paths_strain = []
            name_strain = []
            mutation_paths_other = []
            name_other = []

            for i, clade in enumerate(clades):
                if i < len(mutation_paths) and i < len(name):
                    if clade == strain:
                        mutation_paths_strain.append(mutation_paths[i])
                        name_strain.append(name.iloc[i])
                    else:
                        mutation_paths_other.append(mutation_paths[i])
                        name_other.append(name.iloc[i])

            def write_tsv(file_name, names, paths):
                try:
                    with open(file_name, 'w') as f:
                        f.write("name\tlength(「>」separate)\tmutation_path\n")
                        for name, path in zip(names, paths):
                            temp = '>'.join(path) if path else ''
                            f.write(f"{name}\t{len(path)}\t{temp}\n")
                    print(f"  Saved: {file_name}")
                except Exception as e:
                    print(f"  Error writing {file_name}: {e}")

            write_tsv(output_prefix+'mutation_paths.tsv', name, mutation_paths)
            write_tsv(output_prefix+f'mutation_paths_{strain}.tsv', name_strain, mutation_paths_strain)
            write_tsv(output_prefix+'mutation_paths_other.tsv', name_other, mutation_paths_other)
            
        except Exception as e:
            print(f"  Error processing {file_path}: {e}")
            return

    try:
        print(f"\n=== Processing strain: {strain} ===")
        
        # ディレクトリの存在チェック
        if not os.path.exists(dir):
            print(f"  Directory not found: {dir}")
            return
            
        # strainフォルダ内の数字フォルダを検索
        try:
            numeric_dirs = [d for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d)) and d.isdigit()]
            numeric_dirs = sorted(set(numeric_dirs), key=int)  # 重複を削除して数字順にソート
        except Exception as e:
            print(f"  Error reading directory {dir}: {e}")
            return

        if numeric_dirs:
            print(f"  Found numeric subdirs: {numeric_dirs}")
            # 数字フォルダが存在する場合、各フォルダ内のmutation-paths.txtを処理
            for numeric_dir in numeric_dirs:
                file_path = os.path.join(dir, numeric_dir, 'mutation-paths.txt')
                if os.path.exists(file_path):
                    handle_mutation_paths(file_path, os.path.join(dir, numeric_dir)+'/')
                else:
                    print(f"  File not found: {file_path}")
        else:
            # strainフォルダ内のmutation-paths.txtを処理
            file_path = os.path.join(dir, 'mutation-paths.txt')
            if os.path.exists(file_path):
                handle_mutation_paths(file_path, dir+'/')
            else:
                print(f"  File not found: {file_path}")
                
    except Exception as e:
        print(f"Error processing strain {strain}: {e}")
        return

In [5]:
# 安全版の関数を使用してすべてのstrainを処理
# まず少数のstrainでテスト
test_strains = usher_folders[:5]  # 最初の5個でテスト

print("=== Testing with first 5 strains ===")
for strain in test_strains:
    dir = os.path.expanduser('~/usher_output/' + strain)
    process_mutation_paths_safe(dir, strain)

=== Testing with first 5 strains ===

=== Processing strain: A ===
  Found numeric subdirs: ['0']
Processing file: /mnt/ssd1/home3/aiba/usher_output/A/0/mutation-paths.txt
  Data rows after NaN removal: 1156
  Truncated to 1156 rows
  Saved: /mnt/ssd1/home3/aiba/usher_output/A/0/mutation_paths.tsv
  Saved: /mnt/ssd1/home3/aiba/usher_output/A/0/mutation_paths_A.tsv
  Saved: /mnt/ssd1/home3/aiba/usher_output/A/0/mutation_paths_other.tsv

=== Processing strain: A.1 ===
  Found numeric subdirs: ['0']
Processing file: /mnt/ssd1/home3/aiba/usher_output/A.1/0/mutation-paths.txt
  Data rows after NaN removal: 3066
  Saved: /mnt/ssd1/home3/aiba/usher_output/A.1/0/mutation_paths.tsv
  Saved: /mnt/ssd1/home3/aiba/usher_output/A.1/0/mutation_paths_A.1.tsv
  Saved: /mnt/ssd1/home3/aiba/usher_output/A.1/0/mutation_paths_other.tsv

=== Processing strain: AD.2 ===
  Found numeric subdirs: ['0']
Processing file: /mnt/ssd1/home3/aiba/usher_output/AD.2/0/mutation-paths.txt
  Data rows after NaN removal: 

In [24]:
# すべてのstrainを処理（進行状況付き）
import time

print(f"=== Processing all {len(usher_folders)} strains ===")
start_time = time.time()
processed_count = 0
success_count = 0
error_count = 0

for i, strain in enumerate(usher_folders):
    try:
        print(f"\n[{i+1}/{len(usher_folders)}] Processing: {strain}")
        dir = os.path.expanduser('~/usher_output/' + strain)
        process_mutation_paths_safe(dir, strain)
        success_count += 1
    except Exception as e:
        print(f"  Fatal error for {strain}: {e}")
        error_count += 1
    
    processed_count += 1
    
    # 進行状況を10個ごとに表示
    if (i + 1) % 10 == 0:
        elapsed = time.time() - start_time
        print(f"\n--- Progress: {i+1}/{len(usher_folders)} ({(i+1)/len(usher_folders)*100:.1f}%) ---")
        print(f"    Elapsed time: {elapsed:.1f}s")
        print(f"    Success: {success_count}, Errors: {error_count}")

# 最終サマリー
total_time = time.time() - start_time
print(f"\n=== Processing Complete ===")
print(f"Total time: {total_time:.1f}s")
print(f"Total processed: {processed_count}")
print(f"Successful: {success_count}")
print(f"Errors: {error_count}")
print(f"Success rate: {success_count/processed_count*100:.1f}%")

=== Processing all 196 strains ===

[1/196] Processing: A

=== Processing strain: A ===
  Found numeric subdirs: ['0']
Processing file: /mnt/ssd1/home3/aiba/usher_output/A/0/mutation-paths.txt
  Data rows after NaN removal: 1156
  Truncated to 1156 rows
  Saved: /mnt/ssd1/home3/aiba/usher_output/A/0/mutation_paths.tsv
  Saved: /mnt/ssd1/home3/aiba/usher_output/A/0/mutation_paths_A.tsv
  Saved: /mnt/ssd1/home3/aiba/usher_output/A/0/mutation_paths_other.tsv

[2/196] Processing: A.1

=== Processing strain: A.1 ===
  Found numeric subdirs: ['0']
Processing file: /mnt/ssd1/home3/aiba/usher_output/A.1/0/mutation-paths.txt
  Data rows after NaN removal: 3066
  Saved: /mnt/ssd1/home3/aiba/usher_output/A.1/0/mutation_paths.tsv
  Saved: /mnt/ssd1/home3/aiba/usher_output/A.1/0/mutation_paths_A.1.tsv
  Saved: /mnt/ssd1/home3/aiba/usher_output/A.1/0/mutation_paths_other.tsv

[3/196] Processing: AD.2

=== Processing strain: AD.2 ===
  Found numeric subdirs: ['0']
Processing file: /mnt/ssd1/home3/aib