In [29]:
import os
import pandas as pd
import glob
import time

# Set the path to the directory containing the subfolders
data_dir = 'C:/Users/trent/OneDrive/Desktop/gdc-client_v1.6.1_Windows_x64'

# Use glob to get a list of all the subfolders in the directory
subfolders = glob.glob(os.path.join(data_dir, '*'))

# Set the chunk size
chunk_size = 25

# Loop over the subfolders in chunks
this_rnaseq_data = pd.DataFrame()
num_subfolders = len(subfolders)
num_chunks = (num_subfolders - 1) // chunk_size + 1

for chunk_idx in range(num_chunks):
    start_idx = chunk_idx * chunk_size
    end_idx = (chunk_idx + 1) * chunk_size
    subfolders_chunk = subfolders[start_idx:end_idx]
    start_time = time.time()

    # Loop over each subfolder in the chunk and import the TSV file
    for i, subfolder in enumerate(subfolders_chunk):
        # Get the path to the TSV file in the subfolder
        file_pattern = os.path.join(subfolder, '*.tsv')
        tsv_files = glob.glob(file_pattern)

        if len(tsv_files) > 0:
            # Read the first TSV file into a pandas DataFrame
            data = pd.read_csv(tsv_files[0], delimiter='\t', skiprows=1)
            
            data = data[~data['gene_id'].str.contains('N_unmapped|N_multimapping|N_noFeature|N_ambiguous')]
            
            # Get the file name without the extension from the path
            file_name = os.path.splitext(os.path.basename(tsv_files[0]))[0].split('.')[0]

            # Add a new column to this_rnaseq_data with the file name
            data['file_name'] = file_name

            # Append the data to the all_rnaseq_data DataFrame
            this_rnaseq_data = this_rnaseq_data.append(data)

        # Print progress report
        total_idx = start_idx + i
        progress = (i + 1) / chunk_size
        elapsed_time = time.time() - start_time
        remaining_time = (elapsed_time / progress) - elapsed_time
        print(f'Chunk {chunk_idx + 1}/{num_chunks} | Progress: {progress:.1%} | Elapsed Time: {elapsed_time:.0f} sec | Remaining Time: {remaining_time:.0f} sec', end='\r')

    # Save the chunk to a CSV file
    chunk_filename = f'all_rnaseq_data_chunk{chunk_idx + 1}.csv'
    this_rnaseq_data.to_csv(data_dir + '/' + chunk_filename, index=False)
    this_rnaseq_data = pd.DataFrame()
    
    # Print a progress report for this chunk
    chunk_percent = ((end_idx + 1) / num_subfolders) * 100
    print(f'Processed Chunk {chunk_idx+1} | Folders {start_idx+1}-{end_idx} ({chunk_percent:.2f}% complete) | Elapsed Time (this_chunk): {elapsed_time:.0f} sec')

# Get a list of all the chunk CSV files
csv_files = glob.glob(os.path.join(data_dir, 'all_rnaseq_data_chunk*.csv'))

# Load each CSV file into a separate DataFrame and store them in a list
df_list = []
for i, csv_file in enumerate(csv_files):
    print(f"Loading chunk {i+1}/{len(csv_files)}: {csv_file}")
    try:
        df = pd.read_csv(csv_file)
        df_list.append(df)
    except:
        print(f"Error loading chunk {i+1}/{len(csv_files)}: {csv_file}")

# Concatenate all the DataFrames in the list into a single DataFrame
all_rnaseq_data = pd.concat(df_list, ignore_index=True)

all_rnaseq_data.to_csv("C:/Users/trent/OneDrive/Desktop/all_rnaseq_data.csv", index=False)

print(all_rnaseq_data.shape)

Processed Chunk 1 | Folders 1-25 (2.03% complete) | Elapsed Time (this_chunk): 4 sec
Processed Chunk 2 | Folders 26-50 (3.98% complete) | Elapsed Time (this_chunk): 3 sec
Processed Chunk 3 | Folders 51-75 (5.93% complete) | Elapsed Time (this_chunk): 4 sec
Processed Chunk 4 | Folders 76-100 (7.88% complete) | Elapsed Time (this_chunk): 4 sec
Processed Chunk 5 | Folders 101-125 (9.83% complete) | Elapsed Time (this_chunk): 4 sec
Processed Chunk 6 | Folders 126-150 (11.78% complete) | Elapsed Time (this_chunk): 4 sec
Processed Chunk 7 | Folders 151-175 (13.73% complete) | Elapsed Time (this_chunk): 4 sec
Processed Chunk 8 | Folders 176-200 (15.68% complete) | Elapsed Time (this_chunk): 4 sec
Processed Chunk 9 | Folders 201-225 (17.63% complete) | Elapsed Time (this_chunk): 4 sec
Processed Chunk 10 | Folders 226-250 (19.58% complete) | Elapsed Time (this_chunk): 4 sec
Processed Chunk 11 | Folders 251-275 (21.53% complete) | Elapsed Time (this_chunk): 4 sec
Processed Chunk 12 | Folders 276