C. elegans Lineage Data
- Title:    A lineage-resolved molecular atlas of C. elegans embryogenesis at single-cell resolution
- Author:   Packer et al.
- Year:     2019
- doi:      https://doi.org/10.1126/science.aax1971
- GEO:      https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE126954
- file:
  - `.csv`: Supplementary file -> `cell_annotation.csv.gz` -> Column: `lineage`;
  - ~~`.h5ad`: https://zenodo.org/records/15236812~~

Worm Atlas Complete Cell List
- url:      https://www.wormatlas.org/celllistsulston.htm
- ref:      Sulston, JE and White, JG (1988), "Parts list", in "The Nematode Caenorhabditis elegans", eds WB Wood et al., Cold Spring Harbor Laboratory Press, Cold Spring Harbor, New York, USA, pp 415 - 431.
  - Title:  The embryonic cell lineage of the nematode Caenorhabditis elegans
  - Author: J.E. Sulston, E. Schierenberg, J.G. White, J.N. Thomson
  - Year:   1983
  - doi:    https://doi.org/10.1016/0012-1606(83)90201-4

Method
1. Generate Tree.
2. a

In [1]:
import os
os.chdir('/Users/yongjunchoi/Documents/GitHub/Research/Research')

# Packer et al., (2019)

## 0. Read `.gz` -> `.nwk` Tree

In [32]:
import pandas as pd
import gzip

# Input: gzip-compressed cell annotation table.
# Output: CSV holding one unique lineage name per row.
file_path = "Polytomy/Codes/Data/GSE126954_cell_annotation.csv.gz"
output_path = "Polytomy/Codes/Data/lineages.csv"

# 1. Load the annotation table.
df = pd.read_csv(file_path, compression="gzip")

# 2. Take the lineage column and drop missing values.
lineages = df['lineage'].dropna()

# For every entry: discard everything after the first ':' and split the
# remainder on '/', collecting all resulting names in one flat list.
processed = [
    name
    for entry in lineages
    for name in entry.split(':')[0].split('/')
]

# De-duplicate and sort the names.
processed_unique_sorted = sorted(set(processed))

# 3. Store the result as a single-column CSV.
processed_df = pd.DataFrame(processed_unique_sorted, columns=['lineage'])
processed_df.to_csv(output_path, index=False)

print(f"저장 완료: {output_path}")

저장 완료: Polytomy/Codes/Data/lineages.csv


In [None]:
import pandas as pd
import gzip

file_path = "Polytomy/Codes/Data/GSE126954_cell_annotation.csv.gz"

# 0./1. Read the gzip-compressed annotation table once; pandas decompresses
# it directly. `cell_anno` and `df` are two names for the same table, so the
# file is no longer parsed twice while code using either name keeps working.
cell_anno = pd.read_csv(file_path, compression='gzip')
df = cell_anno

# Unique lineage labels with missing values dropped.
lineages = df['lineage'].dropna().unique()

# 2. Convert lineage strings into a tree (uppercase runs become one node,
#    every other character becomes its own node).
def lineage_to_newick(lineages):
    """Build a lineage tree from lineage strings.

    Each string may contain several '/'-separated paths. A path is read from
    left to right: a run of consecutive uppercase characters becomes a single
    node, and every other character becomes its own single-character node.
    Paths that share a prefix share the corresponding nodes.

    NOTE(review): text after a ':' is not stripped here, so such characters
    become nodes as well - confirm this is intended.

    Args:
        lineages: Iterable of lineage strings.

    Returns:
        The root node of the tree, named "zygote".
    """
    # Imported locally: the cell that defines this function does not import
    # ete3 itself, which made the function fail with NameError on a fresh
    # kernel.
    from ete3 import Tree

    def descend(parent, label):
        """Return the child of *parent* named *label*, creating it if absent."""
        for child in parent.children:
            if child.name == label:
                return child
        return parent.add_child(name=label)

    root = Tree(name="zygote")

    for lin in lineages:
        # A '/' separates alternative paths stored in one entry.
        for path in lin.split('/'):
            node = root
            i = 0
            while i < len(path):
                j = i + 1
                if path[i].isupper():
                    # Extend the token over the whole uppercase run.
                    while j < len(path) and path[j].isupper():
                        j += 1
                node = descend(node, path[i:j])
                i = j
    return root

# Build the tree from the unique lineage strings.
tree = lineage_to_newick(lineages)

# 3. Save the tree as a Newick file.
output_path = "Polytomy/Codes/nwk/c_elegans_lineage_tree_resolved.nwk"
tree.write(format=1, outfile=output_path)

# Report where the file was written.
print(f"트리 생성 완료. 저장 위치: {output_path}")

## 1. Read Raw data -> `.nwk` Tree

In [4]:
import os
os.getcwd()
os.chdir('/Users/yongjunchoi/Documents/GitHub/Research/Research')

In [5]:
import pandas as pd
from ete3 import Tree

# 0. Input file path
file_path = "Polytomy/Codes/Data/GSE126954_cell_annotation.csv.gz"

# 1. Read the data
df = pd.read_csv(file_path, compression="gzip")
lineages = df['lineage'].dropna().unique()  # drop missing values, keep unique lineages

# 2. Convert lineage strings into a tree (uppercase runs become one node,
#    every other character becomes its own node).
def lineage_to_newick(lineages):
    """Build a tree rooted at "zygote" from lineage strings.

    Every string is split on '/' into paths. Each path is cut into labels:
    consecutive uppercase characters form one label, any other character is
    a label of its own. The labels are then followed from the root, reusing
    a child with the same name when one exists and adding it otherwise.

    Args:
        lineages: Iterable of lineage strings.

    Returns:
        The root ``Tree`` node.
    """
    root = Tree(name="zygote")

    for lin in lineages:
        for path in lin.split('/'):
            # Step 1: cut the path into node labels.
            labels = []
            in_upper_run = False
            for ch in path:
                if ch.isupper():
                    if in_upper_run:
                        labels[-1] += ch
                    else:
                        labels.append(ch)
                    in_upper_run = True
                else:
                    labels.append(ch)
                    in_upper_run = False

            # Step 2: walk down from the root, creating missing nodes.
            cursor = root
            for label in labels:
                match = None
                for candidate in cursor.children:
                    if candidate.name == label:
                        match = candidate
                        break
                if match:
                    cursor = match
                else:
                    cursor = cursor.add_child(name=label)
    return root

# Build the tree from the unique lineage strings.
tree = lineage_to_newick(lineages)

# 3. Save the tree as a Newick file.
output_path = "Polytomy/Codes/nwk/c_elegans_lineage_tree.nwk"
tree.write(format=1, outfile=output_path)

# Report where the file was written.
print(f"트리 생성 완료. 저장 위치: {output_path}")

트리 생성 완료. 저장 위치: Polytomy/Codes/nwk/c_elegans_lineage_tree.nwk


In [6]:
from ete3 import Tree, TreeStyle

# 1. Load the saved Newick file
tree = Tree("Polytomy/Codes/nwk/c_elegans_lineage_tree.nwk", format=1)

# 2. Configure the TreeStyle
ts = TreeStyle()
ts.show_leaf_name = True       # show leaf node names
ts.show_branch_length = False  # whether to show branch lengths
ts.show_branch_support = False # whether to show branch support values
ts.mode = "r"                  # 'c' = circular, 'r' = rectangular (default)
ts.scale = 20                  # meant to adjust the spacing between nodes (per the original note)

# 3. Visualise the tree
tree.show(tree_style=ts)

## 1.2. `.nwk` Tree -> Polytomy

1. Among leaves, select `n` leaves for polytomy. 
2. Polytomy leaves의 MRCA 확인.
3. Polytomy leaves를 일단 제거.
4. Polytomy leaves MRCA에 polytomy 추가

### 전처리: Polytomy 제거

In [18]:
from ete3 import Tree

# Load the saved lineage tree.
tree = Tree("Polytomy/Codes/nwk/c_elegans_lineage_tree.nwk", format=1)

# Print the name of every node directly below the root; nodes without a
# name are shown with a placeholder.
print("Root 아래 자식 노드들:")
for top_level in tree.children:
    label = top_level.name or '(internal node)'
    print(f"- {label}")

Root 아래 자식 노드들:
- MS
- D
- AB
- E
- C
- Z
- 2


In [7]:
from ete3 import Tree

# ----------------------------
# 1. Newick 파일 불러오기
# ----------------------------
tree = Tree("Polytomy/Codes/nwk/c_elegans_lineage_tree.nwk", format=1)

# ----------------------------
# 2. Polytomy 제거 함수
# ----------------------------
def remove_polytomy(node):
    """Collapse every polytomy in the subtree rooted at *node*, in place.

    Descendants are processed before their parent (post-order). Any node
    that still has three or more children afterwards has all of its children
    detached, so it becomes a leaf.

    NOTE(review): the check also applies to *node* itself, so calling this
    on a root with three or more children leaves only the root.
    """
    # Snapshot the children first: the recursion may change child lists.
    offspring = list(node.children)
    for sub in offspring:
        remove_polytomy(sub)

    # Binary or unary nodes are left as they are.
    if len(node.children) < 3:
        return

    for sub in list(node.children):
        node.remove_child(sub)

# ----------------------------
# 3. Run the polytomy removal on the loaded tree (modifies it in place)
# ----------------------------
remove_polytomy(tree)

# ----------------------------
# 4. Save the resulting tree as a Newick file
# ----------------------------
output_file = "Polytomy/Codes/nwk/c_elegans_lineage_tree_polytomy_removed.nwk"
# format=1 matches the format the tree was loaded with and keeps internal
# node names; the writer's default format does not write them.
tree.write(format=1, outfile=output_file)
print(f"Polytomy-removed tree saved to {output_file}")

Polytomy-removed tree saved to Polytomy/Codes/nwk/c_elegans_lineage_tree_polytomy_removed.nwk


# Worm atlas
- url:  https://www.wormatlas.org/celllistsulston.htm
- Wormatlas -> Resource -> Cell Lists -> C. elegans Cell List
- ref:  Sulston, JE and White, JG (1988), "Parts list", in "The Nematode Caenorhabditis elegans", eds WB Wood et al., Cold Spring Harbor Laboratory Press, Cold Spring Harbor, New York, USA, pp 415 - 431

## 0. 데이터 전처리
- `.`와 ` ` 동일시 (code)
- `,`가 있을 경우 밑에 추가 (code)
- `/`가 있을 경우 밑에 추가 (code)
- `l/r`가 있을 경우 `*l*`, `*r*`로 분리 (manually; #: 1359 -> 1369)
  - `Complete_cell_list.csv` -> `Complete_cell_list_LR.csv`
- Cell 이 `Z`에서 안오면 제거 (code; #: 1369 -> 1264)
- 중복 제거 (code; #: 1264 -> 1229)
  - `Complete_cell_list_LR.csv` -> `Complete_cell_list_preprocessed.csv`

In [3]:
import pandas as pd

def _split_lineage_names(text):
    """Split a raw lineage cell on ',' and '/' and strip each piece."""
    pieces = []
    for comma_part in text.split(','):
        for name in comma_part.split('/'):
            pieces.append(name.strip())
    return pieces


# Load the cell list CSV.
complete_cell_data = pd.read_csv("Polytomy/Codes/Data/Complete_cell_list_LR.csv", quotechar='"')

# Treat '.' the same as a space.
complete_cell_data['Lineage Name'] = complete_cell_data['Lineage Name'].str.replace('.', ' ', regex=False)

# Turn each lineage cell into a list of names, then give every name its own
# row.
complete_cell_data_expanded = complete_cell_data.copy()
complete_cell_data_expanded['Lineage Name'] = complete_cell_data['Lineage Name'].apply(_split_lineage_names)
complete_cell_data_expanded = complete_cell_data_expanded.explode('Lineage Name').reset_index(drop=True)

# Set of cell names.
cell_names = set(complete_cell_data_expanded['Cell'].dropna())

# lineage_maps: Cell -> list of its lineage names, in row order.
from collections import defaultdict

lineage_maps = defaultdict(list)
for cell_name, lineage_name in zip(complete_cell_data_expanded['Cell'],
                                   complete_cell_data_expanded['Lineage Name']):
    lineage_maps[cell_name].append(lineage_name)

# Replace the prefix with its own lineage repeatedly and join the codes.
def resolve_lineage(lineage, maps=None):
    """Resolve a lineage name down to its ultimate prefix and full code.

    A lineage name is "<prefix> <code> <code> ...". While the prefix is
    itself a key of *maps*, it is replaced by the FIRST lineage recorded for
    it, and that lineage's codes are placed in front of the codes collected
    so far.

    Args:
        lineage: Space-separated lineage name.
        maps: Mapping from cell name to a list of lineage names. Defaults to
            the module-level ``lineage_maps``.

    Returns:
        Tuple ``(prefix, codes)``: the first prefix that is not a key of
        *maps*, and the concatenated codes from that prefix down to
        *lineage*.

    Raises:
        ValueError: If the prefixes refer back to each other. The original
            recursive version never terminated normally in that case.
    """
    if maps is None:
        maps = lineage_maps

    collected = []   # codes, innermost lineage first
    visited = set()  # prefixes already expanded, for cycle detection
    current = lineage
    while True:
        prefix, *rest = current.split(' ')
        collected.append(''.join(rest))
        if prefix not in maps:
            break
        if prefix in visited:
            raise ValueError(
                f"cyclic lineage definition while resolving {lineage!r}: "
                f"prefix {prefix!r} refers back to itself"
            )
        visited.add(prefix)
        # Only the first recorded lineage of the prefix is followed.
        current = maps[prefix][0]

    # Ancestor codes come first, so reverse the collection order.
    return prefix, ''.join(reversed(collected))

# Resolve every (cell, lineage) row into its prefix and code.
resolved_data = []
for cell, lineage in zip(complete_cell_data_expanded['Cell'],
                         complete_cell_data_expanded['Lineage Name']):
    prefix, codes = resolve_lineage(lineage)
    resolved_data.append({'Cell': cell, 'Prefix': prefix, 'Code': codes})

resolved_df = pd.DataFrame(resolved_data)

# Keep only rows whose resolved prefix is 'Z'.
has_z_prefix = resolved_df['Prefix'] == 'Z'
z_only_df = resolved_df.loc[has_z_prefix].reset_index(drop=True)

# Drop rows where 'Cell', 'Prefix' and 'Code' are all identical.
z_only_df_unique = (
    z_only_df
    .drop_duplicates(subset=['Cell', 'Prefix', 'Code'])
    .reset_index(drop=True)
)

# Join prefix and code into the full lineage name, then keep only the
# columns needed downstream.
full_names = z_only_df_unique['Prefix'] + z_only_df_unique['Code']
z_only_df_unique = z_only_df_unique.assign(**{'Lineage Name': full_names})
z_only_df_unique = z_only_df_unique[['Cell', 'Lineage Name']]

# Handling of duplicate names: how many times each name has been seen.
name_count = {}
def make_unique_name(name):
    """Return *name* on its first use and "<name>_<k>" on its k-th use.

    The number of uses per name is tracked in the module-level
    ``name_count`` dictionary, which this function updates.
    """
    occurrence = name_count.get(name, 0) + 1
    name_count[name] = occurrence
    if occurrence == 1:
        return name
    return f"{name}_{occurrence}"

# Apply the renaming to the Cell column so repeated cell names become unique
z_only_df_unique['Cell'] = z_only_df_unique['Cell'].apply(make_unique_name)

# Save as CSV
preprocessed_output_path = "Polytomy/Codes/Data/Complete_cell_list_preprocessed.csv"
z_only_df_unique.to_csv(preprocessed_output_path, index=False)

## 1. `.csv` to From-To Data

In [70]:
# csv to edge
import pandas as pd
from collections import Counter
from Bio import Phylo
from collections import defaultdict

df = pd.read_csv("Polytomy/Codes/Data/Complete_cell_list_preprocessed.csv")

# Each prefix of a lineage name (length >= 1) is the parent of the prefix
# that is one character longer, which gives one edge per character after
# the first.
edges = [
    {'from': lineage[:cut], 'to': lineage[:cut + 1]}
    for lineage in df['Lineage Name']
    for cut in range(1, len(lineage))
]

# Lineages share prefixes, so identical edges are dropped.
edges_df = pd.DataFrame(edges).drop_duplicates().reset_index(drop=True)

edges_output_path = "Polytomy/Codes/Data/edges_list_all.csv"
edges_df.to_csv(edges_output_path, index=False)


# edge to nwk
edges_df = pd.read_csv("Polytomy/Codes/Data/edges_list_all.csv")

# Adjacency map: parent name -> list of child names, in file order.
tree_dict = defaultdict(list)
for source, target in zip(edges_df['from'], edges_df['to']):
    tree_dict[source].append(target)

def build_newick(node):
    """Return the Newick text of the subtree rooted at *node*.

    Children are looked up in the module-level ``tree_dict``. A node with no
    recorded children is written as its bare name; otherwise the result is
    "(child1,child2,...)name". No trailing ';' is added.
    """
    # .get() does not insert a key for leaves into the defaultdict.
    offspring = tree_dict.get(node)
    if not offspring:
        return node

    inner = ",".join(build_newick(kid) for kid in offspring)
    return "(" + inner + ")" + node

# Serialise the whole tree starting from the node named 'Z' and terminate
# the Newick string with ';'.
root = 'Z'
newick_str = build_newick(root) + ";"

# Write the Newick string to disk.
newick_path = "Polytomy/Codes/nwk/Polytomy_Test/tree_all_mapped.nwk"
with open(newick_path, "w") as f:
    f.write(newick_str)

# Remove polytomy in nwk
from Bio import Phylo

# Re-read the Newick file that was just written, this time with Bio.Phylo.
newick_path = "Polytomy/Codes/nwk/Polytomy_Test/tree_all_mapped.nwk"
tree = Phylo.read(newick_path, "newick")

# Report the number of terminal nodes, then how many nodes have each
# number of direct children.
print(".csv to edge tree : Total daughters: ", len(tree.get_terminals()))
print(*[f"{k} daughters: {v} nodes" 
        for k, v in sorted(Counter(len(c.clades) for c in tree.find_clades()).items())], 
      sep=", ")

def num_daughters(clade):
    """Return the number of direct children of *clade*."""
    return len(clade.clades)

def remove_nodes(clade):
    """Drop every descendant of *clade* that has three or more children.

    Works top-down and in place: children of *clade* with three or more
    daughters are discarded together with their subtrees, then the same is
    done inside each remaining child. *clade* itself is never checked.
    Returns None.
    """
    survivors = []
    for child in clade.clades:
        if num_daughters(child) >= 3:
            continue
        survivors.append(child)
    clade.clades = survivors

    for child in survivors:
        remove_nodes(child)

# Remove the polytomy nodes below the root (in place).
print("\nRemoving existing polytomy nodes...\n")
remove_nodes(tree.root)

# Save the pruned tree.
output_path = "Polytomy/Codes/nwk/Polytomy_Test/tree_pruned.nwk"
Phylo.write(tree, output_path, "newick")

# Re-read the saved file and print the same statistics as before pruning.
tree = Phylo.read(output_path, "newick")
print(".csv to edge tree : Total daughters: ", len(tree.get_terminals()))
print(*[f"{k} daughters: {v} nodes" 
        for k, v in sorted(Counter(len(c.clades) for c in tree.find_clades()).items())], 
      sep=", ")

.csv to edge tree : Total daughters:  1099
0 daughters: 1099 nodes, 1 daughters: 170 nodes, 2 daughters: 1066 nodes, 3 daughters: 1 nodes, 4 daughters: 10 nodes

Removing existing polytomy nodes...

.csv to edge tree : Total daughters:  984
0 daughters: 984 nodes, 1 daughters: 164 nodes, 2 daughters: 983 nodes


## 2. Generate Artificial Polytomy

### Method 1: Pick n among leaves -> Remove leaves -> Add Polytomy

In [71]:
import random
from Bio import Phylo
from Bio.Phylo.Newick import Clade
from copy import deepcopy
from collections import Counter

# Load the pruned tree.
tree_path = "Polytomy/Codes/nwk/Polytomy_Test/tree_pruned.nwk"
tree = Phylo.read(tree_path, "newick")

# Number of terminals, then how many nodes have each number of children.
print("Before pruning: Total daughters: ", len(tree.get_terminals()))
print(*[f"{k} daughters: {v} nodes" 
        for k, v in sorted(Counter(len(c.clades) for c in tree.find_clades()).items())], 
      sep=", ")

# Names of all terminals, and how many of them will form the polytomy.
all_leaves = [leaf.name for leaf in tree.get_terminals()]
polytomy_size = 16

print(f"\nRandomly selecting {polytomy_size} leaves to prune...\n")

# Unseeded random choice, so the selection differs from run to run.
selected_leaves = random.sample(all_leaves, polytomy_size)
# Most recent common ancestor of the selected leaves.
mrca = tree.common_ancestor(selected_leaves)
# Every leaf that was not selected, plus the MRCA's own name.
allowed_leaves = [leaf for leaf in all_leaves if leaf not in selected_leaves] + [mrca.name]

def prune(clade, allowed_leaves):
    """Remove terminal descendants of *clade* whose names are not allowed.

    The subtree is processed in post-order and modified in place. An
    internal node that loses all of its children therefore counts as
    terminal and is removed too, unless its name is allowed. *clade* itself
    is never removed.

    Args:
        clade: Node whose descendants are pruned; must expose ``clades``.
        allowed_leaves: Iterable of names (hashable) that may stay terminal.

    Returns:
        None.
    """
    # Build the lookup set once; the original scanned the list again for
    # every terminal node.
    allowed = frozenset(allowed_leaves)

    def _prune(node):
        # Iterate over a copy because children are removed while looping.
        for child in list(node.clades):
            _prune(child)
            if child.is_terminal() and child.name not in allowed:
                node.clades.remove(child)

    _prune(clade)

# Remove the selected leaves from below the MRCA (in place).
prune(mrca, allowed_leaves)

# Same statistics as before, now for the pruned tree.
print("After pruning: Total daughters: ", len(tree.get_terminals()))
print(*[f"{k} daughters: {v} nodes" 
        for k, v in sorted(Counter(len(c.clades) for c in tree.find_clades()).items())], 
      sep=", ")

print(f"\nAdding polytomy...\n")

def add_polytomy(clade, new_children):
    """Attach each item of *new_children* as a direct child of *clade*.

    The new children are appended, in order, after the children *clade*
    already has. The change is made in place and nothing is returned.
    """
    # Materialise first so the input is fully read before anything is added.
    for extra in list(new_children):
        clade.clades.append(extra)

# Re-attach the selected leaves as new terminal clades directly under the
# MRCA. They are added to whatever children the MRCA still has after
# pruning.
add_polytomy(mrca, [Clade(name=leaf) for leaf in selected_leaves])

# Same statistics again, now including the newly created polytomy.
print("After polytomizing MRCA: Total daughters: ", len(tree.get_terminals()))
print(*[f"{k} daughters: {v} nodes" 
        for k, v in sorted(Counter(len(c.clades) for c in tree.find_clades()).items())], 
      sep=", ")

Before pruning: Total daughters:  984
0 daughters: 984 nodes, 1 daughters: 164 nodes, 2 daughters: 983 nodes

Randomly selecting 16 leaves to prune...

After pruning: Total daughters:  968
0 daughters: 968 nodes, 1 daughters: 179 nodes, 2 daughters: 967 nodes

Adding polytomy...

After polytomizing MRCA: Total daughters:  984
0 daughters: 984 nodes, 1 daughters: 179 nodes, 2 daughters: 966 nodes, 18 daughters: 1 nodes


- 2개씩 큰게 추가됨.
- MRCA에서 Polytomy인 트리 + Polytomy 아닌 트리가 분기되게 해야하나?
  - 그러면 트리가 모양이 너무 바뀌지 않나?

### Method 2: Pick among n-leaved subtree -> Polytomize

In [72]:
from Bio import Phylo
import random
from collections import Counter
from Bio.Phylo.Newick import Clade

# Load the pruned tree.
tree_path = "Polytomy/Codes/nwk/Polytomy_Test/tree_pruned.nwk"
tree = Phylo.read(tree_path, "newick")

# Number of terminals, then how many nodes have each number of children.
print("Before polytomizing: Total daughters: ", len(tree.get_terminals()))
print(*[f"{k} daughters: {v} nodes" 
        for k, v in sorted(Counter(len(c.clades) for c in tree.find_clades()).items())], 
      sep=", ")

def find_nodes_with_m_leaves(clade, m):
    """Return the non-terminal nodes under *clade* that have exactly *m* leaves.

    The subtree rooted at *clade* (including *clade* itself) is visited in
    post-order, so matching descendants come before matching ancestors in
    the result. Terminal nodes are never returned.

    Args:
        clade: Root of the subtree to search.
        m: Required number of terminal descendants.

    Returns:
        List of matching nodes.
    """
    matches = []

    def count_leaves(current):
        # A terminal counts as one leaf and can never match.
        if current.is_terminal():
            return 1
        total = 0
        for sub in current.clades:
            total += count_leaves(sub)
        if total == m:
            matches.append(current)
        return total

    count_leaves(clade)
    return matches

# Size of the polytomy to create, and all subtrees with exactly that many
# leaves.
polytomy_size = 4
nodes_with_m_leaves = find_nodes_with_m_leaves(tree.root, polytomy_size)

if nodes_with_m_leaves:
    print("\nPolytomizing one of the subtree among {} trees...\n".format(len(nodes_with_m_leaves)))

    def count_children(node):
        """Count, over the subtree of *node*, how many nodes have each number of children."""
        counts = Counter([len(node.clades)])
        for child in node.clades:
            counts.update(count_children(child))
        return counts

    # NOTE(review): child_counter is computed but not read again in this cell.
    child_counter = count_children(tree.root)

    # Pick one candidate subtree at random (unseeded).
    picked_node = random.choice(nodes_with_m_leaves)

    def polytomize_node(node):
        """Make every terminal below *node* a direct child of *node*."""
        node.clades = node.get_terminals()

    polytomize_node(picked_node)

    # Statistics after the change.
    print("After polytomizing: Total daughters: ", len(tree.get_terminals()))
    degree_counts = Counter(len(c.clades) for c in tree.find_clades())
    print(*[f"{k} daughters: {v} nodes" for k, v in sorted(degree_counts.items())],
          sep=", ")
else:
    # No subtree has exactly polytomy_size leaves.
    print("\nUnable to polytomize!")
    

Before polytomizing: Total daughters:  984
0 daughters: 984 nodes, 1 daughters: 164 nodes, 2 daughters: 983 nodes

Polytomizing one of the subtree among 141 trees...

After polytomizing: Total daughters:  984
0 daughters: 984 nodes, 1 daughters: 161 nodes, 2 daughters: 980 nodes, 4 daughters: 1 nodes


## 3. Randomly Generate Resolved Polytomy

### Method 1: BD Model

### Method 2: Beta Splitting Model

### Method 3: Kingman Coalescent Model

### Method 4: Yule Model