In [1]:
# 导入需要使用的库
import numpy as np
import pandas as pd
import os
import json

In [2]:
# 对TCGA下载的文件进行处理

In [3]:
# 划分存储tsv文件xml文件的文件夹
def tsv_xml(dir_all='E:/cancerPrognosis/MyLUAD/LUADDownload'):
    """Classify the per-sample download folders by the single file they hold.

    Every entry directly under ``dir_all`` is inspected. A folder containing
    exactly one entry whose name ends in ``tsv`` is recorded as a TSV folder;
    one whose only entry ends in ``xml`` is recorded as an XML folder. Plain
    files, folders with zero or several entries, and folders holding any other
    file type are skipped.

    Args:
        dir_all: Root folder of the download. Defaults to the location that
            was previously hard-coded in this notebook.

    Returns:
        A tuple ``(tsv_dir, xml_dir, tsv_file_name)`` where ``tsv_dir`` and
        ``xml_dir`` are lists of folder paths (``dir_all + '/' + name``) and
        ``tsv_file_name[i]`` is the file name found inside ``tsv_dir[i]``.
        The order follows ``os.listdir``.
    """
    tsv_dir = []        # folders whose single file is a .tsv
    tsv_file_name = []  # the .tsv file name inside each folder of tsv_dir
    xml_dir = []        # folders whose single file is a .xml

    for entry in os.listdir(dir_all):
        # Paths are joined with '/' (not os.path.join) so the returned
        # strings keep the same form callers have always received.
        dir_name = dir_all + '/' + entry
        if not os.path.isdir(dir_name):
            continue
        contents = os.listdir(dir_name)
        # Only folders with exactly one entry are classified.
        if len(contents) != 1:
            continue
        only_file = contents[0]
        if only_file.endswith('tsv'):
            tsv_dir.append(dir_name)
            tsv_file_name.append(only_file)
        elif only_file.endswith('xml'):
            xml_dir.append(dir_name)
    return tsv_dir, xml_dir, tsv_file_name

In [4]:
# 读取文件
def read_file(file_name):
    """Read one tab-separated expression file into a DataFrame of strings.

    The file is split line by line on tabs; no type conversion is done, so
    every cell is a string. Lines with fewer than nine fields (such as a
    leading ``# gene-model`` comment line) leave the remaining columns empty.

    Args:
        file_name: Path of the UTF-8 encoded TSV file.

    Returns:
        A DataFrame with one row per line of the file and the nine columns
        ``gene_id, gene_name, gene_type, unstranded, stranded_first,
        stranded_second, tpm_unstranded, fpkm_unstranded, fpkm_uq_unstranded``.
    """
    # The list is wrapped between items, not inside a string literal: a
    # backslash-newline inside quotes would embed the next line's indentation
    # in the column name.
    colnames = ['gene_id', 'gene_name', 'gene_type', 'unstranded', 'stranded_first',
                'stranded_second', 'tpm_unstranded', 'fpkm_unstranded', 'fpkm_uq_unstranded']
    with open(file_name, 'r', encoding='utf-8') as f:
        rows = [line.strip('\n').split('\t') for line in f]
    return pd.DataFrame(rows, columns=colnames)

In [5]:
# 删除无用信息
def del_information(exp):
    """Drop the leading non-gene rows of a freshly read expression table.

    The first six rows are expected to be, in this order: the
    ``# gene-model: GENCODE v36`` comment, the repeated header row
    (``gene_id``) and the four summary rows ``N_unmapped``,
    ``N_multimapping``, ``N_noFeature`` and ``N_ambiguous``. A row is removed
    only when the ``gene_id`` value at that row label matches the expected
    marker; labels missing from the frame (e.g. a table shorter than six
    rows) are skipped instead of raising ``KeyError``.

    Args:
        exp: DataFrame with a ``gene_id`` column, indexed 0..n-1 as produced
            by ``read_file``.

    Returns:
        The same DataFrame object, modified in place, without the matched rows.
    """
    # Row label -> value that identifies that row as non-gene information.
    expected_markers = [
        '# gene-model: GENCODE v36',
        'gene_id',
        'N_unmapped',
        'N_multimapping',
        'N_noFeature',
        'N_ambiguous',
    ]
    gene_ids = exp['gene_id']
    del_index = [
        label
        for label, marker in enumerate(expected_markers)
        if label in gene_ids.index and gene_ids.loc[label] == marker
    ]
    # In-place drop is kept on purpose: the function has always mutated and
    # returned the frame it was given.
    exp.drop(index=del_index, inplace=True)
    return exp

In [6]:
# 整合数据
def combine_data(exp, sample_name):
    """Reduce one sample's table to its gene names and unstranded counts.

    The ``unstranded`` column is renamed to ``sample_name`` and the rows are
    indexed by ``gene_id``. The result is built from new objects rather than
    by mutating a column slice in place, which is what previously triggered
    pandas' ``SettingWithCopyWarning``. The input frame is left untouched.

    Args:
        exp: DataFrame containing at least ``gene_id``, ``gene_name`` and
            ``unstranded`` columns.
        sample_name: Column label to use for this sample's counts.

    Returns:
        A DataFrame indexed by gene id (unnamed index) with two columns:
        ``gene_name`` and ``sample_name``.
    """
    counts = exp[['gene_id', 'gene_name', 'unstranded']].rename(
        columns={'unstranded': sample_name}
    )
    # Assign a plain list so the index stays unnamed; set_index('gene_id')
    # would name it and change the header of the exported file.
    counts.index = list(counts['gene_id'])
    # Only gene_id is removed; gene_name stays as the first column.
    return counts.drop(columns=['gene_id'])

In [7]:
# 获取样本id
def sample(json_file_name='E:/cancerPrognosis/MyLUAD/metadata.json'):
    """Read the download metadata and pair each file name with its sample barcode.

    For every record, the ``entity_submitter_id`` of the first associated
    entity is taken. Records whose id is 12 characters or shorter are skipped;
    the ids kept in this notebook's output are the long
    ``TCGA-xx-xxxx-...`` form, while 12 characters is the length of the short
    ``TCGA-xx-xxxx`` case id.

    Args:
        json_file_name: Path of the metadata JSON file (a list of records).
            Defaults to the location previously hard-coded in this notebook.

    Returns:
        A tuple ``(file_names, entity_submitter_id)`` of two parallel lists:
        ``file_names[i]`` is the ``file_name`` of a record and
        ``entity_submitter_id[i]`` is the barcode belonging to it.
    """
    # A context manager closes the handle; json.load(open(...)) leaked it.
    with open(json_file_name, encoding='utf-8') as handle:
        json_data = json.load(handle)

    file_names = []
    entity_submitter_id = []
    for data in json_data:
        barcode = data['associated_entities'][0]['entity_submitter_id']
        if len(barcode) > 12:
            file_names.append(data['file_name'])
            entity_submitter_id.append(barcode)
    return file_names, entity_submitter_id

In [8]:
# 处理通过TCGA下载的RNA-seq的count文件
def deal_TCGA(output_path='E:/cancerPrognosis/MyLUAD/code/deal_download/express.txt'):
    """Merge the downloaded per-sample count files into one expression matrix.

    Every TSV folder reported by ``tsv_xml`` whose file name appears in the
    metadata returned by ``sample`` is read, cleaned with ``del_information``
    and reduced with ``combine_data``. The first matching sample contributes
    its ``gene_name`` column plus its counts; every later sample contributes
    only its counts column. All pieces are joined on their common gene ids
    and the result is written to ``output_path`` as a tab-separated file.

    Args:
        output_path: Destination of the merged table. Defaults to the
            location previously hard-coded in this notebook.

    Returns:
        The merged DataFrame: rows indexed by gene id, first column
        ``gene_name``, then one column per sample.

    Raises:
        ValueError: If no TSV file matches a file name in the metadata
            (previously this surfaced as an ``UnboundLocalError``).
    """
    tsv_dir, _xml_dir, tsv_file_name = tsv_xml()
    file_names, entity_submitter_id = sample()

    # File name -> sample barcode. setdefault keeps the first occurrence,
    # matching the list.index() lookup this replaces.
    sample_name_by_file = {}
    for fname, barcode in zip(file_names, entity_submitter_id):
        sample_name_by_file.setdefault(fname, barcode)

    frames = []
    # tsv_file_name[i] is the single file found inside tsv_dir[i].
    for file_dir, tsv_name in zip(tsv_dir, tsv_file_name):
        if tsv_name not in sample_name_by_file:
            continue
        exp = read_file(file_dir + '/' + tsv_name)
        exp = del_information(exp)
        exp = combine_data(exp, sample_name_by_file[tsv_name])
        # Keep gene_name only from the first sample; later ones add just
        # their counts column.
        frames.append(exp if not frames else exp.iloc[:, 1:2])
        # Progress report every 50 processed samples.
        if len(frames) % 50 == 0:
            print('已经处理完成', len(frames), '个样本')

    if not frames:
        raise ValueError('no downloaded TSV file matches a file_name in the metadata')

    # One concat at the end instead of re-concatenating inside the loop;
    # join='inner' keeps only genes present in every sample.
    express = pd.concat(frames, axis=1, join='inner')
    express.to_csv(output_path, sep='\t', header=True, index=True)
    return express

In [4]:
# 测试划分存储tsv文件xml文件的文件夹的函数调用
# tsv_dir, xml_dir, tsv_file_name = tsv_xml()
# 测试获取样本id的函数的使用
# file_names, entity_submitter_id = sample()

In [9]:
# Run the whole gene-expression pipeline: deal_TCGA() builds the merged
# table, writes it to disk and returns it; then show its dimensions.
express = deal_TCGA()
express.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


已经处理完成 50 个样本
已经处理完成 100 个样本
已经处理完成 150 个样本
已经处理完成 200 个样本
已经处理完成 250 个样本
已经处理完成 300 个样本
已经处理完成 350 个样本
已经处理完成 400 个样本
已经处理完成 450 个样本
已经处理完成 500 个样本


(60660, 511)

In [10]:
# Preview the first ten rows of the merged expression table.
express[0:10]

Unnamed: 0,gene_name,TCGA-35-5375-01A-01R-1628-07,TCGA-55-A4DF-01A-11R-A24H-07,TCGA-95-8039-01A-11R-2241-07,TCGA-MP-A4T4-01A-11R-A262-07,TCGA-62-A471-01A-12R-A24H-07,TCGA-L9-A5IP-01A-21R-A39D-07,TCGA-50-5936-01A-11R-1628-07,TCGA-49-AARR-01A-11R-A41B-07,TCGA-78-8662-01A-11R-2403-07,...,TCGA-55-8505-01A-11R-2403-07,TCGA-97-A4M5-01A-11R-A24X-07,TCGA-75-5125-01A-01R-1755-07,TCGA-62-A46U-01A-11R-A24H-07,TCGA-44-2665-01B-06R-A277-07,TCGA-MP-A4TJ-01A-51R-A262-07,TCGA-62-A46V-01A-11R-A24H-07,TCGA-50-5055-01A-01R-1628-07,TCGA-69-8453-01A-12R-2326-07,TCGA-91-6829-01A-21R-1858-07
ENSG00000000003.15,TSPAN6,3715,3943,5552,2165,4275,1113,2532,3859,2347,...,1501,4006,3214,1168,401,1460,2604,2078,1671,2018
ENSG00000000005.6,TNMD,0,0,4,0,0,0,0,2,2,...,0,3,18,0,73,1,1,5,1,0
ENSG00000000419.13,DPM1,2468,3162,2216,1587,1071,1180,616,1149,603,...,1160,1433,1552,1460,154,929,1848,1004,1100,1252
ENSG00000000457.14,SCYL3,249,726,1103,747,630,982,487,600,469,...,473,820,414,681,221,455,914,368,649,556
ENSG00000000460.17,C1orf112,395,603,425,370,542,642,310,123,301,...,282,258,316,363,102,226,511,117,226,316
ENSG00000000938.13,FGR,150,1578,1684,1634,372,361,517,1524,240,...,346,2105,1211,6236,79,1326,421,763,5589,420
ENSG00000000971.16,CFH,500,3114,19593,7351,10445,14898,4469,8972,5635,...,4178,8860,2961,2243,1688,3698,8014,4473,4520,5928
ENSG00000001036.14,FUCA2,2736,4174,5508,1783,6760,2909,3161,1373,1928,...,3972,1440,4278,3010,242,1457,1763,1755,2468,2513
ENSG00000001084.13,GCLC,8506,8421,982,2302,31744,3268,8592,2150,2281,...,16494,1393,22723,1231,154,781,179,852,1541,2693
ENSG00000001167.14,NFYA,1054,1571,3376,1811,2022,1531,1441,934,7250,...,1366,2119,1507,4227,343,1008,2145,942,970,2179


In [11]:
# 处理exposure文件
def deal_exposure(exposure_path='E:/cancerPrognosis/MyLUAD/clinical/exposure.tsv'):
    """Derive a per-case smoking flag from the exposure table.

    Only two columns of the file are used: ``case_submitter_id`` (the case
    barcode) and ``years_smoked``. The flag is 1 when ``years_smoked`` holds
    a value and 0 when it holds the ``'--`` placeholder the export uses for
    missing data.

    The two columns are selected by name. The previous version dropped
    columns in several passes and then renamed whatever was left by position,
    which only worked if exactly these two columns survived, and it updated
    the flag through a chained in-place ``mask`` that is not written back to
    the frame when pandas runs with Copy-on-Write.

    Args:
        exposure_path: Path of the tab-separated exposure file. Defaults to
            the location previously hard-coded in this notebook.

    Returns:
        A DataFrame indexed by ``sample`` (the case barcode) with a single
        integer column ``smoke``.
    """
    missing = "'--"
    raw = pd.read_csv(exposure_path, sep="\t")
    exposure = raw[['case_submitter_id', 'years_smoked']].copy()
    exposure.columns = ['sample', 'smoke']
    # Any recorded value counts as "smoked"; the placeholder counts as not.
    exposure['smoke'] = (exposure['smoke'] != missing).astype(int)
    return exposure.set_index('sample')

In [12]:
# 处理clinical文件
def deal_clinical(clinical_path='E:/cancerPrognosis/MyLUAD/clinical/clinical.tsv'):
    """Load the clinical table and reduce it to one cleaned survival record per case.

    Steps:
      1. Keep the nine columns needed downstream and one row per
         ``case_submitter_id`` (the first one).
      2. Compute ``time``: ``days_to_death`` when recorded, otherwise
         ``days_to_last_follow_up``. The previous version added the two
         values together when both were present.
      3. Recode ``gender`` (male -> 1, female -> 0) and ``vital_status``
         (Dead -> 1, Alive -> 0); other values are left as they are.
      4. Drop cases with ``time`` missing or below 30, and cases whose
         pathologic stage is the ``'--`` placeholder.
      5. Collapse stage / T / M sub-categories (e.g. ``Stage IA`` -> ``I``,
         ``T2b`` -> ``T2``, ``M1a`` -> ``M1``, missing M -> ``MX``).

    Args:
        clinical_path: Path of the tab-separated clinical file. Defaults to
            the location previously hard-coded in this notebook.

    Returns:
        A DataFrame indexed by ``sample`` (the case barcode) with the columns
        ``time, status, gender, stage, T, M, N``.

    Raises:
        KeyError: If one of the required columns is absent from the file.
        ValueError: If a survival-time cell is neither numeric nor ``'--``.
    """
    missing = "'--"  # placeholder the file uses for "no value"
    wanted = ['case_submitter_id', 'days_to_death', 'gender', 'vital_status',
              'ajcc_pathologic_m', 'ajcc_pathologic_n', 'ajcc_pathologic_stage',
              'ajcc_pathologic_t', 'days_to_last_follow_up']

    def _to_days(values):
        # Placeholder -> NaN, everything else must parse as a number.
        return pd.to_numeric(values.mask(values == missing)).astype(float)

    def _recode(values, mapping):
        # Replace listed values, leave every other value unchanged.
        return values.map(lambda value: mapping.get(value, value))

    raw = pd.read_csv(clinical_path, sep="\t")
    # Columns are selected by name, so no separate pass removing empty or
    # constant columns is needed. Each case appears on several rows; keeping
    # the first row per case replaces the former removal of odd row labels
    # below a hard-coded 1044.
    clinical = raw[wanted].drop_duplicates(subset='case_submitter_id', keep='first').copy()

    days_to_death = _to_days(clinical['days_to_death'])
    days_to_follow_up = _to_days(clinical['days_to_last_follow_up'])
    # Both values are alternative survival times for the same case, so
    # follow-up is only the fallback; they are never summed.
    clinical['time'] = days_to_death.fillna(days_to_follow_up)

    clinical['gender'] = _recode(clinical['gender'], {'male': 1, 'female': 0})
    clinical['vital_status'] = _recode(clinical['vital_status'], {'Dead': 1, 'Alive': 0})

    clinical = clinical.rename(columns={
        'case_submitter_id': 'sample',
        'vital_status': 'status',
        'ajcc_pathologic_m': 'M',
        'ajcc_pathologic_n': 'N',
        'ajcc_pathologic_stage': 'stage',
        'ajcc_pathologic_t': 'T',
    }).set_index('sample')
    clinical = clinical[['time', 'status', 'gender', 'stage', 'T', 'M', 'N']]

    # NaN >= 30 is False, so cases without any survival time are dropped too.
    usable = (clinical['time'] >= 30) & (clinical['stage'] != missing)
    clinical = clinical[usable].copy()

    # Target categories: stage I/II/III/IV, T1..T4/TX, M0/M1/MX, N0..N3/NX.
    # Values already in target form need no entry; N needs no recoding.
    stage_map = {
        'Stage I': 'I', 'Stage IA': 'I', 'Stage IB': 'I',
        'Stage II': 'II', 'Stage IIA': 'II', 'Stage IIB': 'II',
        'Stage III': 'III', 'Stage IIIA': 'III', 'Stage IIIB': 'III',
        'Stage IV': 'IV',
    }
    t_map = {'T1a': 'T1', 'T1b': 'T1', 'T2a': 'T2', 'T2b': 'T2'}
    m_map = {'M1a': 'M1', 'M1b': 'M1', missing: 'MX'}
    clinical['stage'] = _recode(clinical['stage'], stage_map)
    clinical['T'] = _recode(clinical['T'], t_map)
    clinical['M'] = _recode(clinical['M'], m_map)
    return clinical

In [14]:
# 整合clinical文件与exposure文件
def clinical_exposure():
    """Join the cleaned clinical and exposure tables and save the result.

    The two tables are aligned on their index and only the samples present in
    both are kept. The columns are put into a fixed order and the merged
    table is written as a tab-separated file before being returned.

    Returns:
        The merged DataFrame with the columns
        ``time, status, gender, smoke, stage, T, M, N``.
    """
    column_order = ['time', 'status', 'gender', 'smoke', 'stage', 'T', 'M', 'N']
    # Inner join on the index: keep only samples found in both tables.
    merged = pd.concat([deal_clinical(), deal_exposure()], axis=1, join='inner')
    merged = merged.loc[:, column_order]
    # Persist the cleaned clinical data.
    merged.to_csv(
        'E:/cancerPrognosis/MyLUAD/code/deal_download/clinical.txt',
        sep='\t',
        header=True,
        index=True,
    )
    return merged

In [14]:
# 函数测试
# exposure = deal_exposure()
# print(exposure.shape)

In [15]:
# Run the whole clinical-data pipeline: clinical_exposure() merges the
# clinical and exposure tables, writes the result to disk and returns it;
# then show its dimensions.
clinical = clinical_exposure()
clinical.shape

(492, 8)

In [16]:
# Preview the first rows of the merged clinical table.
clinical.head()

Unnamed: 0_level_0,time,status,gender,smoke,stage,T,M,N
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
TCGA-97-8172,545.0,0,0,1,I,T2,M0,N0
TCGA-78-8655,2360.0,0,0,1,I,T1,M0,N0
TCGA-91-6829,1258.0,1,1,1,I,T2,MX,N0
TCGA-86-8672,34.0,1,1,0,II,T3,M0,N0
TCGA-62-8398,444.0,1,1,1,III,T2,M0,N2
