Install the IPython libraries (if necessary)

`pip install ipython` \
`pip install ipykernel` \
`pip install ipywidgets`

Import libraries

In [1]:
import glob
import gzip
import os
import shutil
import tarfile
import urllib
import urllib.request

Set constants

In [2]:
# Folder, relative to the current working directory, that holds every
# downloaded and derived dataset file.
DATASET_INPUT_ROOT_PATH = os.path.join(
    os.getcwd(),
    "dataset/",
)

# Labels used to name the merged per-sample CSV files; they are paired by
# position with the sorted intron/exon input files.
DAY_NAMES = [
    "day0",
    "day0.5",
    "day1",
    "day1.5",
    "day2",
]

For the `AnnData` (`.h5ad`) dataset:

In [3]:
# Remote location of the gzip-compressed .h5ad file (GEO series GSE241287).
ANNDATA_URL = 'https://ftp.ncbi.nlm.nih.gov/geo/series/GSE241nnn/GSE241287/suppl/GSE241287_scRNAseq_hPGCLC_induction.h5ad.gz'
# File stem shared by the downloaded archive and the decompressed file.
ANNDATA_FILE_NAME = 'GSE241287_scRNAseq_hPGCLC_induction'
# Local path of the downloaded .h5ad.gz archive.
ANNDATA_GZ_FILE_PATH = os.path.join(DATASET_INPUT_ROOT_PATH, f"{ANNDATA_FILE_NAME}.h5ad.gz")
# Local path of the decompressed .h5ad file.
ANNDATA_DATASET_PATH = os.path.join(DATASET_INPUT_ROOT_PATH, f"{ANNDATA_FILE_NAME}.h5ad")

For the `DataFrame` (CSV) dataset:

In [4]:
# Remote location of the raw tar archive (GEO series GSE241287).
DATAFRAME_URL = 'https://ftp.ncbi.nlm.nih.gov/geo/series/GSE241nnn/GSE241287/suppl/GSE241287_RAW.tar'
# Name of the staging folder that receives the extracted, not-yet-merged files.
PRE_DATAFRAME_FOLDER_NAME = 'PRE_GSE241287_RAW'
# Name of the folder that receives the merged per-day CSV files.
DATAFRAME_FOLDER_NAME = 'GSE241287_RAW'
# Staging folder the tar archive is extracted into.
PRE_DATAFRAME_DATASET_FOLDER_PATH = os.path.join(DATASET_INPUT_ROOT_PATH, PRE_DATAFRAME_FOLDER_NAME)
# Local path of the downloaded tar archive (a file, despite "FOLDER" in the name).
PRE_DATAFRAME_TAR_FOLDER_PATH = f"{PRE_DATAFRAME_DATASET_FOLDER_PATH}.tar"
# Folder the merged per-day CSV files are written into.
DATAFRAME_DATASET_FOLDER_PATH = os.path.join(DATASET_INPUT_ROOT_PATH, DATAFRAME_FOLDER_NAME)

Download the input files from NCBI GEO (NIH) and decompress them

For the `AnnData` (`.h5ad`) dataset:

In [5]:
# Download the gzip-compressed AnnData file and decompress it next to the archive.
# urlretrieve cannot write into a missing directory, so create it first.
os.makedirs(DATASET_INPUT_ROOT_PATH, exist_ok=True)
urllib.request.urlretrieve(ANNDATA_URL, ANNDATA_GZ_FILE_PATH)

# Stream the decompressed bytes in chunks rather than loading the whole
# file into memory at once.
with gzip.open(ANNDATA_GZ_FILE_PATH, mode='rb') as gz_file, \
        open(ANNDATA_DATASET_PATH, mode='wb') as h5ad_file:
    shutil.copyfileobj(gz_file, h5ad_file)

For the `DataFrame` (CSV) dataset:

In [None]:
# Download the raw tar archive and unpack it into the staging folder.
# urlretrieve cannot write into a missing directory, so create it first.
os.makedirs(DATASET_INPUT_ROOT_PATH, exist_ok=True)
urllib.request.urlretrieve(DATAFRAME_URL, PRE_DATAFRAME_TAR_FOLDER_PATH)
with tarfile.open(PRE_DATAFRAME_TAR_FOLDER_PATH) as tar_file:
    # Use the 'data' extraction filter when this Python provides it: it blocks
    # members that would escape the destination folder and avoids the
    # DeprecationWarning raised by an unfiltered extractall on Python 3.12+.
    if hasattr(tarfile, 'data_filter'):
        tar_file.extractall(path=PRE_DATAFRAME_DATASET_FOLDER_PATH, filter='data')
    else:
        tar_file.extractall(path=PRE_DATAFRAME_DATASET_FOLDER_PATH)

# Decompress every extracted *.csv.gz file to a sibling *.csv file.
dataframe_gz_paths = glob.glob(f"{PRE_DATAFRAME_DATASET_FOLDER_PATH}/*.csv.gz")

for dataframe_gz_path in dataframe_gz_paths:
    # Dropping the final extension turns "name.csv.gz" into "name.csv".
    dataframe_path = os.path.splitext(dataframe_gz_path)[0]
    # Stream the decompressed bytes in chunks rather than loading each whole
    # file into memory at once.
    with gzip.open(dataframe_gz_path, mode='rb') as gz_file, \
            open(dataframe_path, mode='wb') as dataframe_file:
        shutil.copyfileobj(gz_file, dataframe_file)

# Merge each sample's intron-count CSV and exon-count CSV into one CSV per day.
# Files are paired by their position in the two sorted listings, and each pair
# is named after the DAY_NAMES entry at the same position.
# NOTE(review): this assumes the sorted file names follow the day order - confirm.
intron_input_file_paths = sorted(glob.glob(f"{PRE_DATAFRAME_DATASET_FOLDER_PATH}/*intron_count.csv"))
exon_input_file_paths = sorted(glob.glob(f"{PRE_DATAFRAME_DATASET_FOLDER_PATH}/*exon_count.csv"))

# The merged files are written into this folder, so it has to exist first.
os.makedirs(DATAFRAME_DATASET_FOLDER_PATH, exist_ok=True)
for intron_file, exon_file, day_name in zip(intron_input_file_paths, exon_input_file_paths, DAY_NAMES):
    merged_file = f"{DATAFRAME_DATASET_FOLDER_PATH}/{day_name}.csv"
    with open(merged_file, 'w') as f_new:
        # The intron counts are copied verbatim, header row included.
        with open(intron_file, 'r') as f:
            f_new.write(f.read())
        # The exon counts are appended without their header row. The original
        # code re-read the intron file here, so exon data was never merged.
        with open(exon_file, 'r') as f:
            f_new.writelines(f.readlines()[1:])