In [1]:
# Print iterations progress
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
    """
    Call in a loop to draw a single-line text progress bar on stdout.

    The line is written with a leading carriage return so that each call
    overwrites the previous one. When iteration equals total, an extra
    newline is printed so later output starts on a fresh line.

    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int); 0 is treated as "already complete"
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\\r", "\\r\\n") (Str)
    @returns:
        None (the bar is printed, nothing is returned)
    """
    if total:
        fraction = iteration / float(total)
        filledLength = int(length * iteration // total)
    else:
        # An empty job has nothing left to do; dividing by total would raise
        # ZeroDivisionError, so report it as 100% with a full bar instead.
        fraction = 1.0
        filledLength = length
    # Keep the bar exactly `length` characters wide even when iteration falls
    # outside the 0..total range.
    filledLength = max(0, min(length, filledLength))
    percent = ("{0:." + str(decimals) + "f}").format(100 * fraction)
    bar = fill * filledLength + '-' * (length - filledLength)
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd)
    # Print New Line on Complete
    if iteration == total:
        print()

In [1]:
import pandas as pd
import numpy as np
import random
import os
from tqdm.auto import tqdm # for progress bar


In [3]:
# Rename Raw Sample Files    
# samplePath = "./GEO Samples/GSE106817/Samples/GSE106817_RAW"
# sampleFiles = os.listdir(samplePath)
# l = len(sampleFiles)
# for i in range(l): 
#     fileName = sampleFiles[i][:10]
#     oldName = samplePath +"/"+ sampleFiles[i]
  
#     sampleFiles[i] = sampleFiles[:10]
#     os.rename(oldName, samplePath+"/"+fileName+".txt")
       
    
#     # print progress bar
#     printProgressBar(i + 1, l, prefix = 'Progress:', suffix = 'Complete', length = 50)   


In [14]:
# Create the (empty) base Dataframe that all sample files are merged into

samplePath = "./GEO Samples/GSE211692/Samples/GSE211692_RAW files"
# os.listdir returns entries in arbitrary order; sort them so that the file
# used to define the columns, and the row order of the merged table, are the
# same on every run.
sampleFiles = sorted(os.listdir(samplePath))

# Collect [field 5, field 6] of every line after the first nine (0-8) whose
# field 5 starts with "MIMA"; only field 5 is needed for the column names here.
file_name = samplePath+"/"+sampleFiles[0]
file_arr = []
with open(file_name, 'r', encoding='ISO-8859-1') as file:
    for i, line in enumerate(file):
        if i > 8:
            a = line.strip().split('\t')
            if a[5].startswith("MIMA"):
                file_arr.append([a[5], a[6]])

# Zero-row frame whose columns are 'ID' followed by one column per collected
# identifier, in file order. The sample rows themselves are added later.
main_df = pd.DataFrame(columns=["ID"] + [row[0] for row in file_arr])

print(main_df)



Empty DataFrame
Columns: [ID, MIMAT0004501, MIMAT0002844, MIMAT0002843, MIMAT0002824, MIMAT0002823, MIMAT0002807, MIMAT0002806, MIMAT0001635, MIMAT0001631, MIMAT0005890, MIMAT0005889, MIMAT0005873, MIMAT0005872, MIMAT0004591, MIMAT0004590, MIMAT0004562, MIMAT0004518, MIMAT0004517, MIMAT0016907, MIMAT0016901, MIMAT0014994, MIMAT0016913, MIMAT0011777, MIMAT0011775, MIMAT0005924, MIMAT0005923, MIMAT0005906, MIMAT0005905, MIMAT0018939, MIMAT0019949, MIMAT0019764, MIMAT0019200, MIMAT0019837, MIMAT0019204, MIMAT0019811, MIMAT0019358, MIMAT0015068, MIMAT0015066, MIMAT0019017, MIMAT0019064, MIMAT0021125, MIMAT0021124, MIMAT0021079, MIMAT0021046, MIMAT0019037, MIMAT0019981, MIMAT0019982, MIMAT0019980, MIMAT0019825, MIMAT0019725, MIMAT0026555, MIMAT0026554, MIMAT0026472, MIMAT0022700, MIMAT0025846, MIMAT0025845, MIMAT0022282, MIMAT0022281, MIMAT0022266, MIMAT0022265, MIMAT0027560, MIMAT0027559, MIMAT0027544, MIMAT0027543, MIMAT0027528, MIMAT0027527, MIMAT0027512, MIMAT0027511, MIMAT0026617, MIMA

In [15]:
# Create a one-row df for each file and append all of them to the main Dataframe
def _read_sample_row(file_path, sample_id):
    """
    Parse one raw sample file into a single-row DataFrame.

    The first nine lines (0-8) are skipped. Every later line is stripped and
    split on tabs; when field 5 starts with "MIMA", field 5 becomes a column
    name and field 6 becomes that column's value (kept as the string read
    from the file).

    @params:
        file_path   - Required  : path of the raw sample file (Str)
        sample_id   - Required  : value stored in the leading 'ID' column (Str)
    @returns:
        DataFrame with one row: 'ID' followed by one column per matching line
    @raises:
        ValueError when the file contains no matching line
    """
    header = ["ID"]
    values = [sample_id]
    with open(file_path, 'r', encoding='ISO-8859-1') as file:
        for line_no, line in enumerate(file):
            if line_no > 8:
                fields = line.strip().split('\t')
                if fields[5].startswith("MIMA"):
                    header.append(fields[5])
                    values.append(fields[6])
    if len(header) == 1:
        # Fail loudly rather than adding a row that holds nothing but an ID.
        raise ValueError(f"no 'MIMA' rows found in {file_path}")
    return pd.DataFrame([values], columns=header)


l = len(sampleFiles)
sample_rows = []
for i in range(l):
    file_name = sampleFiles[i]
    file_path = samplePath+"/"+ file_name
    # The first 10 characters of the file name are used as the sample ID
    sample_rows.append(_read_sample_row(file_path, file_name[:10]))

    # print progress bar (i is no longer overwritten by the line counter)
    # printProgressBar(i + 1, l, prefix = 'Progress:', suffix = 'Complete', length = 70)

# DataFrame.append was removed in pandas 2.0, and growing a frame one row at a
# time copies it on every iteration. Concatenate all rows once instead; columns
# are aligned by name exactly as append did.
# NOTE: re-running this cell appends the same samples to main_df again.
main_df = pd.concat([main_df, *sample_rows], ignore_index=True)


In [16]:
# Write the merged sample table to disk as CSV, leaving out the row index
filepath = "./GEO Samples/GSE211692/Samples/GSE211692_Raw_Sample_files.csv"
main_df.to_csv(path_or_buf=filepath, index=False)

In [17]:
# Display the merged table: one row per sample, 'ID' first, then one column per identifier
main_df

ID,ID.1,MIMAT0004501,MIMAT0002844,MIMAT0002843,MIMAT0002824,MIMAT0002823,MIMAT0002807,MIMAT0002806,MIMAT0001635,MIMAT0001631,...,MIMAT0022965,MIMAT0022948,MIMAT0027678,MIMAT0027677,MIMAT0027662,MIMAT0027661,MIMAT0027646,MIMAT0027645,MIMAT0027630,MIMAT0027629
0,GSM6483171,56.847241,56.132793,54.320361,81.837742,51.304635,63.617516,59.963161,52.791166,67.156406,...,55.943314,72.092542,105.518825,55.900304,60.873470,80.447581,58.169422,74.539130,73.761820,61.043855
1,GSM6483172,52.015557,57.929098,51.365316,79.436405,53.957861,61.162818,56.852265,50.292433,60.644344,...,52.287042,76.228736,108.305469,51.869452,54.788668,84.209177,56.917724,89.000018,72.306331,60.194463
2,GSM6483173,54.023395,56.589050,55.907838,79.635037,53.452589,67.933299,55.671719,49.728168,96.881313,...,55.654730,81.982109,101.137080,52.958806,61.036920,93.662183,62.847806,80.954964,78.542758,60.894432
3,GSM6483174,52.753029,54.692745,50.822404,87.185541,52.220982,67.618257,52.066346,53.716319,113.455648,...,52.542901,76.683095,133.004830,51.972909,58.268871,96.824523,58.893035,83.048112,86.914982,56.281433
4,GSM6483175,71.207246,72.642474,61.846491,114.983294,69.372332,84.128884,83.722345,71.265394,116.347247,...,74.102734,113.836922,151.762692,72.396093,80.756157,128.491580,79.809953,107.949019,99.126774,83.886240
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16185,GSM6499356,90.481438,71.035680,45.155518,167.761672,47.965158,378.338845,43.985662,41.067644,52.538022,...,35.842256,164.189282,732.036177,36.594522,77.380353,180.757068,40.555349,180.855897,119.524566,33.472963
16186,GSM6499357,75.871836,76.577024,49.185056,150.863390,50.755624,95.928366,43.587263,39.641827,55.305641,...,35.491738,133.066598,346.146609,36.418355,49.415266,103.039884,38.358972,96.160855,61.701770,37.303723
16187,GSM6499358,124.536441,140.982392,133.259757,215.122851,140.251059,180.022427,115.810702,115.450086,266.112491,...,108.412810,175.967957,423.856265,111.687469,123.000902,189.606922,113.961433,166.188470,139.296230,112.715554
16188,GSM6499359,49.694548,50.826868,51.959200,141.718924,50.902607,111.760164,49.588414,44.592872,149.928291,...,41.581351,100.348765,263.638190,45.758748,52.583684,97.771205,49.885319,97.329637,66.674846,40.199803
