In [8]:
import duckdb

# Path to the dataset's Parquet file, relative to the notebook's working directory.
parquet_path = "datasets/acbc_mskcc_2015/data.parquet"

# No database path is passed to connect(), so nothing here names an on-disk
# database file (presumably an in-memory database -- confirm in the DuckDB docs).
conn = duckdb.connect()
# Materialise the Parquet file as table `data`; the path is bound as a query
# parameter (the `?` placeholder) instead of being formatted into the SQL text.
conn.execute("CREATE TABLE data AS SELECT * FROM read_parquet(?)", [parquet_path])
# Preview at most 12 rows. This is the cell's last expression, so its result
# is what the notebook renders as the cell output.
conn.execute("SELECT * FROM data LIMIT 12").fetch_df()

Unnamed: 0,patient_id,sample_id,cancer_type,cancer_type_detailed,tumor_size,primary_site,sample_type,metastatic_site,tumor_stage,myb_nfib_fish,...,overall_patient_histology,neoadjuvant_chemo,adjuvant_chemo,radiation_therapy,adjuvant_tx,followup_years,vital_status,local_regional_recurrence,time_to_met_months,study
0,AdCC5T,AdCC5T,Breast Cancer,Adenoid Cystic Breast Cancer,40,Breast,Primary,,II,Negative,...,Cribriform/Tubular,Not performed,Not performed,Not performed,Not performed,8.0,Alive,Positive,,ACbC (MSKCC/Breast 2015)
1,AdCC4T,AdCC4T,Invasive Breast Carcinoma,Adenoid Cystic Breast Cancer,35,Breast,Primary,,I,Positive,...,Cribriform/Tubular,"Adriamycine, Endoxan, Taxotere",Not performed,Performed,Chemotherapy/Radiation Therapy,8.0,Alive,Negative,,ACbC (MSKCC/Breast 2015)
2,AdCC2T,AdCC2T,Invasive Breast Carcinoma,Adenoid Cystic Breast Cancer,25,Breast,Primary,Lung,I,Positive,...,Cribriform/Tubular,Not performed,Not performed,Performed,Radiation Therapy,12.0,Alive,Negative,120.0,ACbC (MSKCC/Breast 2015)
3,AdCC9T,AdCC9T,Invasive Breast Carcinoma,Adenoid Cystic Breast Cancer,45,Breast,Primary,,II,Positive,...,Cribriform,Not performed,Not performed,Not performed,Not performed,4.0,Alive,Negative,,ACbC (MSKCC/Breast 2015)
4,AdCC8T,AdCC8T,Invasive Breast Carcinoma,Adenoid Cystic Breast Cancer,17,Breast,Primary,,I,Positive,...,Cribriform/Tubular,Not performed,Not performed,Not performed,Not performed,3.0,Alive,Negative,,ACbC (MSKCC/Breast 2015)
5,AdCC6T,AdCC6T,Invasive Breast Carcinoma,Adenoid Cystic Breast Cancer,22,Breast,Primary,,II,Positive,...,Solid/Trabecular,Not performed,Not performed,Performed,Radiation Therapy,3.0,Died of other causes,Negative,,ACbC (MSKCC/Breast 2015)
6,AdCC32T,AdCC32T,Invasive Breast Carcinoma,Adenoid Cystic Breast Cancer,35,Breast,Primary,,I,Positive,...,Cribriform/Tubular,Not performed,Not performed,Not performed,Not performed,3.0,Alive,Negative,,ACbC (MSKCC/Breast 2015)
7,AdCC1T,AdCC1T,Invasive Breast Carcinoma,Adenoid Cystic Breast Cancer,15,Breast,Primary,,I,Positive,...,Cribriform/Tubular,Not performed,Not performed,Not performed,Not performed,11.0,Alive,Negative,,ACbC (MSKCC/Breast 2015)
8,AdCC12T,AdCC12T,Invasive Breast Carcinoma,Adenoid Cystic Breast Cancer,20,Breast,Primary,,II,Negative,...,Cribriform/Solid,Not performed,FEC 100 + taxanes,Performed,Chemotherapy/Radiation Therapy,3.0,Alive,Negative,,ACbC (MSKCC/Breast 2015)
9,AdCC3T,AdCC3T,Invasive Breast Carcinoma,Adenoid Cystic Breast Cancer,23,Breast,Primary,,I,Positive,...,Cribriform,Not performed,Not performed,Performed,Radiation Therapy,7.0,Alive,Negative,,ACbC (MSKCC/Breast 2015)


In [8]:
import pandas as pd

# File paths: the patient-level and sample-level clinical tables of the
# fuscc_luad_lc1000_2024 dataset (absolute paths on a local backup volume).
patient_path = "/Volumes/Backup/Collections/private-datahub/cbioportal_datasets/fuscc_luad_lc1000_2024/data_clinical_patient.txt"
sample_path = "/Volumes/Backup/Collections/private-datahub/cbioportal_datasets/fuscc_luad_lc1000_2024/data_clinical_sample.txt"


# Read the leading header-annotation lines, then the table that follows them
def load_annotated_table(path, n_header=4):
    """Load a tab-separated table that starts with annotation lines.

    Parameters
    ----------
    path : str or path-like
        UTF-8 encoded, tab-separated text file.
    n_header : int, default 4
        Number of leading annotation lines that precede the column-name line.

    Returns
    -------
    (df, lines)
        ``df`` is the table below the annotation lines, with every column
        read as ``str``. ``lines`` holds one list of tab-separated fields per
        annotation line, in file order.

    Raises
    ------
    ValueError
        If the file contains fewer than ``n_header`` lines.
    """
    lines = []
    with open(path, encoding="utf-8") as f:
        for _ in range(n_header):
            raw = f.readline()
            if not raw:
                raise ValueError(
                    f"{path!r} has fewer than {n_header} header annotation lines"
                )
            # Strip only the line terminator. A blanket strip() would also eat
            # trailing tabs, i.e. drop empty trailing fields and leave the
            # annotation shorter than the table's column list.
            lines.append(raw.rstrip("\r\n").split("\t"))
    df = pd.read_csv(path, sep="\t", skiprows=n_header, dtype=str)
    return df, lines


# Load both clinical tables together with their header annotation lines
sample_df, sample_annot = load_annotated_table(sample_path)
patient_df, patient_annot = load_annotated_table(patient_path)

# Merge the data: left-join the patient table onto the sample table via
# PATIENT_ID, so every sample row is kept
merged_df = pd.merge(sample_df, patient_df, how="left", on="PATIENT_ID")
columns = list(merged_df.columns)


# Build the annotation rows: each column takes its value from sample_annot
# first, otherwise from patient_annot
def get_annotation_rows(
    sample_annot,
    patient_annot,
    sample_cols,
    patient_cols,
    final_cols,
    n_rows=4,
    suffixes=("_x", "_y"),
):
    """Assemble the header-annotation rows for a merged sample/patient table.

    Parameters
    ----------
    sample_annot, patient_annot : list of list of str
        Annotation lines of each source table; ``annot[row][i]`` annotates the
        i-th column of that table. Must contain at least ``n_rows`` lines.
    sample_cols, patient_cols : list
        Column names of each source table, in file order.
    final_cols : list
        Column names of the merged table, in output order.
    n_rows : int, default 4
        Number of annotation rows to produce.
    suffixes : tuple of (str, str), default ("_x", "_y")
        Suffixes the merge appended to overlapping non-key columns of the
        sample (left) and patient (right) table. A merged column that matches
        neither source table by its exact name is retried with the suffix
        removed, so renamed columns keep their annotation.

    Returns
    -------
    pandas.DataFrame
        ``n_rows`` rows with columns ``final_cols``. A cell is ``""`` when the
        column has no source or the annotation line is too short to cover it.
    """

    def first_positions(cols):
        # Same "first occurrence wins" rule as list.index, but computed once
        # instead of once per annotation row.
        positions = {}
        for idx, name in enumerate(cols):
            positions.setdefault(name, idx)
        return positions

    sample_pos = first_positions(sample_cols)
    patient_pos = first_positions(patient_cols)
    left_suffix, right_suffix = suffixes

    def locate(col):
        # An exact name match wins, sample table first.
        if col in sample_pos:
            return sample_annot, sample_pos[col]
        if col in patient_pos:
            return patient_annot, patient_pos[col]
        # Otherwise the column may have been renamed by the merge.
        if isinstance(col, str):
            for suffix, annot, positions in (
                (left_suffix, sample_annot, sample_pos),
                (right_suffix, patient_annot, patient_pos),
            ):
                if suffix and col.endswith(suffix):
                    base = col[: -len(suffix)]
                    if base in positions:
                        return annot, positions[base]
        return None, None

    result = [[] for _ in range(n_rows)]
    for col in final_cols:
        annot, idx = locate(col)
        for row in range(n_rows):
            if annot is not None and idx < len(annot[row]):
                result[row].append(annot[row][idx])
            else:
                result[row].append("")
    return pd.DataFrame(result, columns=final_cols)


# Build the annotation rows for the merged column order
annot_df = get_annotation_rows(
    sample_annot,
    patient_annot,
    sample_df.columns.tolist(),
    patient_df.columns.tolist(),
    columns,
)

# Stack the annotation rows on top of the real data
final_df = pd.concat([annot_df, merged_df], ignore_index=True)

# Export the combined table as TSV (column names first, then the annotation
# rows, then the data rows; the DataFrame index is not written)
final_df.to_csv(
    "/Volumes/Backup/Collections/private-datahub/cbioportal_datasets/fuscc_luad_lc1000_2024/data_clinical_patient_sample.tsv",
    sep="\t",
    index=False,
)

# Preview. Jupyter only renders a cell's *last* expression, so this has to
# come after the export; placed mid-cell it was evaluated and discarded.
final_df.head(10)

In [2]:
import pandas as pd

# Read the tab-separated mutation table; column dtypes are left to pandas' inference.
df = pd.read_csv("/Volumes/Backup/Collections/public-datahub/public/acbc_mskcc_2015/data_mutations.txt", sep="\t")

# Convert the df to a parquet file
# NOTE(review): presumably the target directory must already exist -- confirm
# that to_parquet does not create missing parent directories.
df.to_parquet("./datasets/acbc_mskcc_2015/v0.0.2/datafiles/maf.parquet")

In [10]:
import pandas as pd
import json
from collections import OrderedDict

# Load the tab-separated mutation table
df = pd.read_csv(
    "/Volumes/Backup/Collections/public-datahub/public/acbc_mskcc_2015/data_mutations.txt",
    sep="\t",
)


def _describe_column(series):
    """Classify one column and summarise its non-null values.

    Returns a ``(data_type, allowed_values)`` pair:

    * ``"STRING"``  -- every distinct value, stringified and sorted;
    * ``"NUMBER"``  -- ``[min, max]``, or ``[]`` when there is no non-null value;
    * ``"BOOLEAN"`` -- the distinct booleans, sorted;
    * ``"UNKNOWN"`` -- any other dtype, with an empty value list.
    """
    types = pd.api.types
    values = series.dropna().unique()  # nulls are already excluded from here on

    # Object columns are treated as text outright. On pandas >= 2.0,
    # is_string_dtype() inspects the *elements* of an object column, so a text
    # column with mixed (and possibly missing) values can be rejected and
    # would otherwise fall through to "UNKNOWN" with no allowed values.
    if types.is_object_dtype(series) or types.is_string_dtype(series):
        return "STRING", sorted(str(v) for v in values)
    if types.is_integer_dtype(series):
        numbers = [int(v) for v in values]
        return "NUMBER", ([min(numbers), max(numbers)] if numbers else [])
    if types.is_float_dtype(series):
        numbers = [float(v) for v in values]
        return "NUMBER", ([min(numbers), max(numbers)] if numbers else [])
    if types.is_bool_dtype(series):
        return "BOOLEAN", sorted(bool(v) for v in values)
    return "UNKNOWN", []


# Build one dictionary entry per column, numbered from 1 in file order
dictionary_entries = []
for order, col in enumerate(df.columns, start=1):
    dtype_str, sample_values = _describe_column(df[col])

    entry = OrderedDict(
        [
            ("key", col),
            ("name", col.replace("_", " ").title()),
            ("description", f"Column for {col.replace('_', ' ')}."),
            ("data_type", dtype_str),
            ("notes", ""),
            ("allowed_values", sample_values),
            ("order", order),
        ]
    )
    dictionary_entries.append(entry)

# Write the data dictionary out as a JSON file
output_path = "./datasets/acbc_mskcc_2015/v0.0.2/datafiles/maf_dictionary.json"
with open(output_path, "w") as f:
    json.dump(dictionary_entries, f, indent=2)