# 变量设置

In [None]:
# Name of the dataset to process; it is used below to build the download URL
# and the local file names.
dataset_name = "TCGA-LUSC"

# 数据导入

## 库

In [None]:
import requests
import os
import gzip
import shutil
import pandas as pd
import numpy as np

In [10]:
# Build the cached, transposed miRNA table for `dataset_name`.
#
# The whole step is skipped when the pickle already exists. Otherwise the
# compressed table is downloaded, unpacked, transposed so that each row is one
# original column, transformed back with 2**x - 1, rounded, labelled with a
# `Status` column and finally pickled to `pkl_file_path`.
pkl_file_path = './datasets/' + dataset_name + '.mirna_transposed.pkl'
if not os.path.exists(pkl_file_path):
    # Make sure the target folder exists before anything is written into it.
    os.makedirs('./datasets', exist_ok=True)

    gz_file_path = './datasets/' + dataset_name + '.mirna.tsv.gz'
    tsv_file_path = './datasets/' + dataset_name + '.mirna.tsv'

    try:
        # Download the compressed table. Raise on an HTTP error so that an
        # error page is never saved as a ".gz" file (which would only fail
        # later, inside gzip, with a misleading message). The timeout keeps
        # the cell from hanging forever on a stalled connection.
        url = "https://gdc-hub.s3.us-east-1.amazonaws.com/download/" + dataset_name + ".mirna.tsv.gz"
        response = requests.get(url, timeout=60)
        response.raise_for_status()

        # Save the .gz file.
        with open(gz_file_path, 'wb') as f_out:
            f_out.write(response.content)

        # Decompress the .gz file to a plain .tsv file.
        with gzip.open(gz_file_path, 'rb') as f_in:
            with open(tsv_file_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

        # Read the .tsv file and transpose it. The first column holds the
        # labels that become the column names after transposing; it is moved
        # into the index *before* the transpose so the value matrix stays
        # numeric. Transposing with the label column still in place yields an
        # object-dtype frame, on which DataFrame.round() silently does nothing.
        df = pd.read_csv(tsv_file_path, sep='\t')
        df_transposed = df.set_index(df.columns[0]).transpose()

        # Undo the stored transform with 2**x - 1 (the inverse of log2(x + 1);
        # presumably the hub stores values that way -- confirm), then round to
        # four decimal places.
        df_transposed = np.power(2, df_transposed) - 1
        df_transposed = df_transposed.round(4)

        # Add the `Status` column as the left-most column: 1 when the fourth
        # '-'-separated field of the row label starts with '01', otherwise 0.
        # NOTE(review): in TCGA barcodes this field is presumably the sample
        # type ('01' = primary tumor); every other code is labelled 0 -- confirm
        # that this is intended for other tumor sample types.
        df_transposed.insert(
            0,
            'Status',
            [1 if label.split('-')[3][:2] == '01' else 0 for label in df_transposed.index],
        )

        # Save the transposed data as a .pkl file.
        df_transposed.to_pickle(pkl_file_path)
    finally:
        # Remove the intermediate .gz and .tsv files, even when a step above
        # failed, so that no partial downloads are left behind.
        for leftover_path in (gz_file_path, tsv_file_path):
            if os.path.exists(leftover_path):
                os.remove(leftover_path)
