In [2]:
import pandas as pd
import os

In [13]:
# Root directory of the pipeline output; every path read below is joined onto it.
# The location is machine-specific, so it can be overridden by setting the
# UMI_OUTPUT_DIR environment variable before starting the kernel. When the
# variable is unset, the original hard-coded location is used.
input_dir = os.environ.get( "UMI_OUTPUT_DIR", "/Users/natem/storedAnalysis/output_sm/" )


### UMI_extraction Metadata

In [70]:
def _single_master_count( log, label ):
    """Return the MASTER value of the single row of `log` whose SAMPLE equals `label`.

    Raises ValueError when zero or several rows match, so a malformed log is
    reported instead of being silently misread.
    """
    matches = log.loc[log["SAMPLE"] == label, "MASTER"]
    if len( matches ) != 1:
        raise ValueError( "expected exactly one '{}' row in checkout log, found {}".format( label, len( matches ) ) )
    # Index the element explicitly: calling int() on a whole Series is
    # deprecated in recent pandas releases.
    return int( matches.iloc[0] )


# Every sub-directory of umi_extracted/ is treated as one sample.
extraction_root = os.path.join( input_dir, "umi_extracted" )
samples = [entry for entry in os.listdir( extraction_root ) if os.path.isdir( os.path.join( extraction_root, entry ) )]

extraction_list = list()

# A named loop variable is used instead of `_`, which IPython reserves for the
# previous cell output.
for sample in samples:
    checkout_log = pd.read_csv( os.path.join( extraction_root, sample, "checkout.log.txt" ), sep="\t" )
    umis_identified = _single_master_count( checkout_log, sample )
    # Total = reads assigned to this sample plus the reads listed under "undef-m".
    total_reads = umis_identified + _single_master_count( checkout_log, "undef-m" )
    extraction_list.append( [sample, total_reads, umis_identified] )

metadata_df = pd.DataFrame( extraction_list, columns=["Sample", "Total_Reads", "Barcode_Reads"] ).set_index( "Sample" )
metadata_df

Unnamed: 0_level_0,Total_Reads,Barcode_Reads
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1
C3,813060,743052
C2,520966,471461


### MIG_statistics

In [71]:
# Read the tab-separated MIG statistics estimates, rename the columns to the
# metadata table's naming, discard TOTAL_READS, and append the result
# column-wise to metadata_df, aligned on the sample index.
mig_column_names = {
    "#SAMPLE_ID" : "Sample",
    "SAMPLE_TYPE" : "Sample_Type",
    "TOTAL_MIGS" : "Total_MIGS",
    "OVERSEQ_THRESHOLD" : "Overseq_Threshold",
    "COLLISION_THRESHOLD" : "Collision_Threshold",
    "UMI_QUAL_THRESHOLD" : "UMI_Quality_Threshold",
    "UMI_LEN" : "UMI_Length",
}
ms = (
    pd.read_csv( os.path.join( input_dir, "mig_statistics", "estimates.txt" ), sep="\t" )
    .rename( columns=mig_column_names )
    .drop( columns="TOTAL_READS" )
    .set_index( "Sample" )
)
metadata_df = pd.concat( [metadata_df, ms], axis=1 )
metadata_df

Unnamed: 0_level_0,Total_Reads,Barcode_Reads,Sample_Type,Total_MIGS,Overseq_Threshold,Collision_Threshold,UMI_Quality_Threshold,UMI_Length
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
C3,813060,743052,paired,34496,16,16,15,12
C2,520966,471461,paired,91932,8,8,15,12


### UMI_Collapsed

In [72]:
# Read the tab-separated assembly log, remove the columns that are not carried
# into the metadata table, rename the remaining ones, and append them
# column-wise to metadata_df, aligned on the sample index.
collapsed_unused_columns = [
    "SAMPLE_TYPE",
    "INPUT_FASTQ1",
    "MIGS_DROPPED_OVERSEQ_2",
    "READS_DROPPED_OVERSEQ_2",
    "READS_DROPPED_COLLISION_2",
    "MIGS_DROPPED_COLLISION_2",
    "INPUT_FASTQ2",
    "OUTPUT_ASSEMBLY1",
    "OUTPUT_ASSEMBLY2",
    "MIG_COUNT_THRESHOLD",
    "MIGS_TOTAL",
]
collapsed_column_names = {
    "#SAMPLE_ID" : "Sample",
    "MIGS_GOOD_FASTQ1" : "MIGs_Good_R1",
    "MIGS_GOOD_FASTQ2" : "MIGs_Good_R2",
    "MIGS_GOOD_TOTAL" : "MIGs_Good_Total",
    "READS_GOOD_FASTQ1" : "Reads_Good_R1",
    "READS_GOOD_FASTQ2" : "Reads_Good_R2",
    "READS_GOOD_TOTAL" : "Reads_Good_Total",
    "READS_TOTAL" : "Total_Reads_Collapsed",
    "READS_DROPPED_WITHIN_MIG_1" : "Reads_Dropped_Within_MIG_1",
    "READS_DROPPED_WITHIN_MIG_2" : "Reads_Dropped_Within_MIG_2",
    "MIGS_DROPPED_OVERSEQ_1" : "MIGs_Dropped_Overseq",
    "READS_DROPPED_OVERSEQ_1" : "Reads_Dropped_Overseq",
    "MIGS_DROPPED_COLLISION_1" : "MIGs_Dropped_Collision",
    "READS_DROPPED_COLLISION_1" : "Reads_Dropped_Collision",
}
uc = (
    pd.read_csv( os.path.join( input_dir, "umi_collapsed", "assemble.log.txt" ), sep="\t" )
    .drop( columns=collapsed_unused_columns )
    .rename( columns=collapsed_column_names )
    .set_index( "Sample" )
)
metadata_df = pd.concat( [metadata_df, uc], axis=1 )
# Display only the good-MIG totals as a quick check of the merge.
metadata_df["MIGs_Good_Total"]

Sample
C3    2033
C2    4422
Name: MIGs_Good_Total, dtype: int64

### Contamination Removal

In [76]:
# Read the contamination log (parsed with read_csv's default comma separator),
# discard TOTAL_READS, rename the remaining columns, and append them
# column-wise to metadata_df with the sample index sorted.
contamination_log = os.path.join( input_dir, "cleaned/contamination.log.txt" )
contamination_column_names = {
    "SAMPLE_ID" : "Sample",
    "CONTAMINATED_READS" : "Contaminated_MIGs",
    "GOOD_READS" : "Good_MIGs",
    "GOOD_READS_PCT" : "Good_Reads_PCT",
}
cr = (
    pd.read_csv( contamination_log )
    .drop( columns=["TOTAL_READS"] )
    .rename( columns=contamination_column_names )
    .set_index( "Sample" )
)
metadata_df = pd.concat( [metadata_df, cr], axis=1, sort=True )
metadata_df

Unnamed: 0,Total_Reads,Barcode_Reads,Sample_Type,Total_MIGS,Overseq_Threshold,Collision_Threshold,UMI_Quality_Threshold,UMI_Length,MIGs_Good_R1,MIGs_Good_R2,...,Total_Reads_Collapsed,Reads_Dropped_Within_MIG_1,Reads_Dropped_Within_MIG_2,MIGs_Dropped_Overseq,Reads_Dropped_Overseq,MIGs_Dropped_Collision,Reads_Dropped_Collision,Contaminated_MIGs,Good_MIGs,Good_Reads_PCT
C2,520966,471461,paired,91932,8,8,15,12,4645,4479,...,469653,3390,11485,73010,91310,14155,17387,1166,3256,73.63%
C3,813060,743052,paired,34496,16,16,15,12,2272,2039,...,739403,6651,62900,13715,32864,18410,30439,1166,867,42.65%


### Assemble Repertoire

In [116]:
def _read_numeric_report( report_path ):
    """Collect the numeric "name: value" entries of one report file into a dict.

    The key is the text before the first comma of the name, with spaces turned
    into underscores. The value is the text between the first and second colon,
    cut at the first "(" and stripped; it is stored as float when it contains a
    ".", otherwise as int. Lines without a colon, or whose value does not parse
    as a number, are skipped.
    """
    numbers = dict()
    with open( report_path, "r" ) as report:
        for line in report:
            pieces = line.split( ":" )
            if len( pieces ) < 2:
                continue
            name = pieces[0].split( "," )[0].replace( " ", "_" )
            text = pieces[1].split( "(" )[0].strip()
            try:
                numbers[name] = float( text ) if "." in text else int( text )
            except ValueError:
                # Non-numeric entry: leave it out of the table.
                continue
    return numbers


# One single-row frame per sample, indexed by the sample name.
report_rows = [
    pd.DataFrame(
        _read_numeric_report( os.path.join( input_dir, "analyze", "{}.report".format( sample ) ) ),
        index=[sample],
    )
    for sample in samples
]

# Stack the per-sample rows, discard Total_sequencing_reads, and append the
# remaining report columns to metadata_df, aligned on the sample index.
df = pd.concat( report_rows, sort=True ).drop( columns=["Total_sequencing_reads"] )
metadata_df = pd.concat( [metadata_df, df], axis=1, sort=True )

In [117]:
# Write the combined per-sample metadata table as comma-separated text to
# test.csv, resolved against the current working directory.
metadata_df.to_csv( path_or_buf="test.csv", sep="," )