In [1]:
import pandas as pd
import numpy as np
import glob
import gzip
import shutil
import os
from tqdm import tqdm

# Lift pandas' display limits so frames are shown without row/column truncation
for display_option in ('display.max_rows', 'display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

## Metadata Processing

In [11]:
# Read the comma-separated run table and keep only the sample-annotation columns
metadata_columns = ['Run', 'source_name', 'cell_type', 'condition']
df = pd.read_csv('../../SraRunTable.txt', sep=',')
metadata_df = df.loc[:, metadata_columns]

## Counts Processing

In [8]:
source_dir = "../../results/"
destination_dir = "../../Tables/"

# shutil.move does not create missing parent directories, so make sure the
# destination exists before anything is moved into it.
os.makedirs(destination_dir, exist_ok=True)

# Get a list of all entries in the source directory
filenames = os.listdir(source_dir)
# For each folder in the source directory, find the .sf file, rename it to the
# folder name, and move it to the destination directory.
for filename in tqdm(filenames):
    folder = os.path.join(source_dir, filename)
    # The source directory can also contain plain files (this notebook itself
    # writes TPM.tsv and metadata.tsv there); os.listdir on those would raise
    # NotADirectoryError, so only descend into real directories.
    if not os.path.isdir(folder):
        continue
    for file in os.listdir(folder):
        if file.endswith('.sf'):
            shutil.move(
                os.path.join(folder, file),
                os.path.join(destination_dir, filename + '.sf'),
            )

100%|██████████| 24/24 [00:00<00:00, 10223.78it/s]


In [10]:
# Directory where the per-sample '.sf' files are located
sf_directory = "../../Tables/"

# List the '.sf' files in the directory. os.listdir returns entries in
# arbitrary order, so sort them to get the same column order on every run.
sf_files = sorted(f for f in os.listdir(sf_directory) if f.endswith('.sf'))

# Remains an empty DataFrame when no '.sf' files are found
result_df = pd.DataFrame()

# Build one wide table: a 'Name' column plus one TPM column per '.sf' file
for i, file in enumerate(tqdm(sf_files)):
    # Label the column with the file name minus its final extension only, so
    # names that themselves contain dots are not truncated (and cannot collide).
    sample_name = os.path.splitext(file)[0]
    # Read from sf_directory (not a second hard-coded path), keep only the
    # Name and TPM columns, and rename TPM to the sample name.
    sample_df = (
        pd.read_csv(os.path.join(sf_directory, file), sep='\t', header=0)
        .loc[:, ['Name', 'TPM']]
        .rename(columns={'TPM': sample_name})
    )
    # Decide "first file" by position rather than result_df.empty: a first file
    # with zero rows would otherwise be overwritten and its column lost.
    if i == 0:
        result_df = sample_df
    else:
        # Outer join keeps every Name seen in any sample
        result_df = pd.merge(result_df, sample_df, on='Name', how='outer')

100%|██████████| 24/24 [00:03<00:00,  7.04it/s]


### The dataframes are saved as tab-separated (.tsv) files

In [12]:
# Write the merged TPM table and the metadata table as tab-separated files,
# without the pandas index column
tables_to_save = (
    (result_df, '../../results/TPM.tsv'),
    (metadata_df, '../../results/metadata.tsv'),
)
for table, out_path in tables_to_save:
    table.to_csv(out_path, sep='\t', index=False)