In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from tqdm import tqdm

import numpy as np
import pandas as pd
import re, os, sys

import seaborn as sns
sns.set_style('white')

import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.patches as mpatches
mpl.rcParams['pdf.fonttype'] = 42

import math

# Keyword-argument dict selecting Arial; presumably meant to be unpacked into
# matplotlib text calls (it is not referenced in the cells visible here).
font_name = {'fontname':'Arial'}

# Make Arial the default font family for every matplotlib figure in this notebook.
plt.rcParams["font.family"] = "Arial"

In [2]:
# arguments settings
# working directory: defaults to the original author's local checkout; set the
# GPNOTEBOOK_PDAC_WD environment variable to run the notebook from another
# location without editing this cell
wd = os.environ.get(
    "GPNOTEBOOK_PDAC_WD",
    r"/Users/yingweihu/Documents/GitHub/GPNotebook/sample/PDAC",
)
# input path (original supplementary table 1 from the PDAC publication, doi: 10.1016/j.cell.2021.08.023)
clinical_path = os.path.join(wd,"meta/pdac_sup_1.xlsx")
# output path: standardized meta table
meta_path = os.path.join(wd,"meta/info.tsv")

In [3]:
# Read the "Clinical_data" sheet of the supplementary workbook and preview
# its first two rows.
clinical_df = pd.read_excel(
    io=clinical_path,
    sheet_name="Clinical_data",
)
clinical_df.head(n=2)

Unnamed: 0,case_id,tumor_included_for_the_study,normal_included_for_the_study,histology_diagnosis,age,sex,race,participant_country,tumor_site,tumor_focality,...,Islet_fraction,Stromal_fraction,Non_neoplastic_duct,Fat_fraction,Inflammation_fraction,Muscle_fraction,follow_up_days,vital_status,is_this_patient_lost_to_follow_up,cause_of_death
0,C3L-00102,yes,yes,PDAC,42,Male,White,United States,head,Unifocal,...,2,60,1,4,7,0,249.0,Deceased,No,pancreatic carcinoma
1,C3L-00189,yes,yes,PDAC,68,Female,,Canada,head,Unifocal,...,1;3;2,75;55;53,2;1;0,0;6;0,7;30;25,0;0;0,1035.0,Deceased,No,pancreatic carcinoma


In [4]:
# Build one case_id -> value lookup dictionary per clinical column of interest.
# The generator yields the dictionaries in the same order as the names on the
# left-hand side.
vital_d, days_d, age_d, gender_d, cause_d = (
    dict(zip(clinical_df['case_id'], clinical_df[column]))
    for column in (
        'vital_status',
        'follow_up_days',
        'age',
        'sex',
        'cause_of_death',
    )
)

In [5]:
# get all tumor samples and normal samples
# Only rows whose histology diagnosis is exactly 'PDAC' are considered.
pdac_df = clinical_df[clinical_df['histology_diagnosis'] == 'PDAC']

# Case IDs flagged "yes" for having a normal / tumor sample in the study,
# kept in the row order of the clinical table.
has_normal = pdac_df['normal_included_for_the_study'] == "yes"
has_tumor = pdac_df['tumor_included_for_the_study'] == "yes"

normal_cases = pdac_df.loc[has_normal, 'case_id'].tolist()
tumor_cases = pdac_df.loc[has_tumor, 'case_id'].tolist()

# Sample IDs are the case ID followed by ".N" (normal) or ".T" (tumor).
normal_samples = [case_id + ".N" for case_id in normal_cases]
tumor_samples = [case_id + ".T" for case_id in tumor_cases]


In [10]:
# Assemble one metadata record per sample (all normal samples first, then all
# tumor samples), looking every clinical attribute up by case ID.
rows = []

for case, sample in zip(normal_cases + tumor_cases,normal_samples + tumor_samples):
    # Vital status: missing values and the literal 'na' become 'Unknown'.
    # Fix: the 'na' check must look at the value, not at the case ID.
    vital = vital_d.get(case)
    vital = 'Unknown' if pd.isna(vital) or vital == 'na' else str(vital)

    # Follow-up days and age: keep NaN for missing values, otherwise cast to int.
    days = days_d.get(case)
    days = np.nan if pd.isna(days) else int(days)
    age = age_d.get(case)
    age = np.nan if pd.isna(age) else int(age)

    gender = gender_d.get(case)
    gender = 'Unknown' if pd.isna(gender) else str(gender)

    # Cause of death: missing values, 'na' and 'unknown' become 'Unknown'.
    # Fix: the missing-value check must look at the cause, not at the case ID;
    # otherwise a missing cause is written out as the string 'nan' / 'None'.
    cause = cause_d.get(case)
    cause = 'Unknown' if pd.isna(cause) or cause == 'na' or cause == "unknown" else str(cause)

    row = [sample,case, vital, days, age, gender, cause]
    rows.append(row)

meta_df = pd.DataFrame(rows,columns=["Sample","CaseID","VitalStatus","SurvivalDays","Age","Gender","DeathCause"])

In [11]:
# Display the assembled metadata table; the rendered output elides the middle rows.
meta_df

Unnamed: 0,Sample,CaseID,VitalStatus,SurvivalDays,Age,Gender,DeathCause
0,C3L-00102.N,C3L-00102,Deceased,249.0,42,Male,pancreatic carcinoma
1,C3L-00189.N,C3L-00189,Deceased,1035.0,68,Female,pancreatic carcinoma
2,C3L-00277.N,C3L-00277,Deceased,610.0,69,Male,pancreatic carcinoma
3,C3L-00401.N,C3L-00401,Living,1228.0,62,Female,Unknown
4,C3L-00640.N,C3L-00640,Living,594.0,59,Female,Unknown
...,...,...,...,...,...,...,...
194,C3N-03754.T,C3N-03754,Living,483.0,44,Male,Unknown
195,C3N-03853.T,C3N-03853,Living,259.0,49,Female,Unknown
196,C3N-04126.T,C3N-04126,Deceased,348.0,42,Male,pancreatic carcinoma
197,C3N-04283.T,C3N-04283,Deceased,1.0,66,Female,complication due to medical/surgical care


In [12]:
# Write the metadata table to meta_path as a tab-separated file, without the
# DataFrame index column.
meta_df.to_csv(
    path_or_buf=meta_path,
    sep="\t",
    index=False,
)