In [1]:
import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from glob import glob
from collections import defaultdict

In [2]:
# Collect the per-sample result CSVs. glob() returns paths in a
# filesystem-dependent order, and the order of this list determines the order
# in which libraries are grouped and their count columns are added later on,
# so sort it to make re-runs reproducible across machines.
result_files = sorted(glob('<path/to/your>/assembled/result*.csv'))

In [3]:
### Note: this section will need to be edited to match the file names your sequencing instrument creates ###
# Capture groups: (1) sample prefix, (2) 'library-<name>', (3) S number, (4) L number.
# Raw string with the extension dot escaped so that only a literal '.csv' matches.
id_pattern = re.compile(r"result_df_Hua-Will-(.+)-(library-.+)_S([0-9]+)_L([0-9]+)\.csv")

# Maps library name -> list of result files belonging to that library.
libraries = defaultdict(list)

for fname in result_files:
    id_info = id_pattern.findall(fname)
    if not id_info:
        # Without this check a non-matching file surfaces as an opaque
        # "IndexError: list index out of range" on the line below.
        raise ValueError(
            'File name {!r} does not match id_pattern {!r}; edit id_pattern to '
            'match the file names your sequencing instrument creates.'.format(
                fname, id_pattern.pattern))

    # Strip the 'library-' prefix and the '-reseq1' marker so that files carrying
    # that marker are grouped under the same library name as the ones without it.
    lib_name = id_info[0][1].replace('-reseq1','').replace('library-','')

    libraries[lib_name].append(fname)

In [4]:
# Libraries that are left out of the combined table.
skipped_libs = ['1', '2', 'ins']

# Seeded with a placeholder 'counts' column so that every merge below has a
# column-name clash and pandas appends the '_<library>' suffix to the incoming
# per-library 'counts' column.
all_counts = pd.DataFrame(columns=['AA','counts'])

for lib_name, lib_files in libraries.items():
    if lib_name in skipped_libs:
        continue

    # Outer-join every result file of this library on the AA sequence; clashing
    # column names coming from later files get the file's position as a suffix.
    per_lib = pd.DataFrame(columns=['AA'])
    for file_idx, csv_path in enumerate(lib_files):
        run_df = pd.read_csv(csv_path)
        per_lib = per_lib.merge(run_df, on='AA', how='outer', suffixes=('', '_' + str(file_idx)))

    # Row-wise total over every column whose name contains 'count'. nansum
    # treats NaN (sequence absent from a file) as zero.
    count_cols = [col for col in per_lib.columns if 'count' in col]
    lib_totals = np.nansum(per_lib.loc[:, count_cols].values, axis=1)
    per_lib.insert(len(per_lib.columns), 'counts', lib_totals)

    # Attach this library's totals as 'counts_<library>'.
    all_counts = all_counts.merge(per_lib.loc[:, ['AA','counts']], on='AA', how='outer', suffixes=('', '_' + lib_name))

# The placeholder column is no longer needed once all libraries are merged.
all_counts = all_counts.drop('counts', axis=1)

# Give every row a positional identifier: seq0, seq1, ...
all_counts['name'] = ['seq' + str(row_idx) for row_idx in range(len(all_counts))]
all_counts.to_csv('<desired/path/to/all_counts.csv>')

In [5]:
# Display the combined table: the AA sequence, one 'counts_<library>' column per
# merged library, and the generated 'name' identifier.
all_counts

Unnamed: 0,AA,counts_TCR-plus-pep-plus-pep2,counts_TCR-plus-pep1,counts_TCR-plus-pep2,counts_pep1,counts_TCR-minus-pep1,counts_naive,counts_pep2,counts_TCR-minus-pep2,counts_exp2,counts_TCR-plus-pep-plus-pep1,counts_exp1,name
0,TEIARRSVEELLEEAKHIDDPKRRFELLVLAQLVAEANNDPELERL...,6659.0,1321.0,2354.0,1712.0,1861.0,12.0,2699.0,2991.0,92.0,3962.0,52.0,seq0
1,SVEELLRRAEELIKEGDEKQAADLLFLAEILAEADLLFLAEILAEA...,3736.0,568.0,1000.0,1559.0,2465.0,92.0,546.0,1190.0,261.0,6059.0,195.0,seq1
2,SLEEAVRILLEKARKIEDEREREDVLILARLAAEAADDPELEKLVE...,2779.0,2155.0,4346.0,1736.0,3010.0,27.0,3248.0,3293.0,109.0,3016.0,47.0,seq2
3,SEEEEEVRELIRRAEELLERGNPKEAFEALMRAFATLMRAFAIAAQ...,2644.0,355.0,937.0,744.0,896.0,5.0,524.0,517.0,42.0,1494.0,30.0,seq3
4,SVAEEAVKHLLEKAKKIPDPKQRRDVLILAEIAARAYNNEELEELV...,2635.0,3130.0,2328.0,1284.0,1435.0,15.0,932.0,1042.0,73.0,1513.0,41.0,seq4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
339672,SEIEERVEELIREAEKLYENGDPHRAFEVLSESFAVLSESFAIALY...,,,,,,,,,,,1.0,seq339672
339673,DDEEKAKELLEEVRKLTDPEERREVLLRAAALALHAGDRELYRDIM...,,,,,,,,,,,1.0,seq339673
339674,DQEEAERLLEEARKIDDPTHARFVLFLALLLAKQTNDEELVSELQE...,,,,,,,,,,,1.0,seq339674
339675,ERERRLEELVECARNLARRGRPNALQALLVVLEALLRRLGRRELFD...,,,,,,,,,,,1.0,seq339675


In [6]:
# Write every row as a two-line FASTA record: '>' + name, then the AA sequence.
# The two columns are iterated directly instead of via DataFrame.iterrows(),
# which builds a Series object for every row and whose index value was unused.
with open('<desired/path/to/identified_seqs.fasta>','w') as f:
    f.writelines(
        '>{}\n{}\n'.format(seq_name, aa_seq)
        for seq_name, aa_seq in zip(all_counts['name'], all_counts['AA'])
    )