This script processes the FASTA files produced by PANDAseq and generates peptide lists.

In [None]:
import numpy as np
import pandas as pd
import csv
import string
import matplotlib.pyplot as plt
import seaborn as sns
import filecmp
from scipy import stats
import re
import fnmatch

In [None]:
# Load the list of DNA sequences that were ordered from Twist. The 'aa' column
# stored in the CSV is discarded right away.
merged = pd.read_csv('all_revtrans_nocut_SUBMITTED remove flanking DNA.csv').drop(columns=['aa'])

In [None]:
def translate(pep_dna):
    """Translate a DNA string into a string of one-letter amino acids.

    The sequence is read as consecutive 3-base codons starting at index 0 and
    every codon is looked up in ``codon_2_aa``; stop codons come back as ``*``.

    Raises:
        KeyError: if a codon is not in ``codon_2_aa``. This includes the short
            trailing fragment left over when the length is not a multiple of 3.
    """
    return ''.join(
        codon_2_aa[pep_dna[start:start + 3]]
        for start in range(0, len(pep_dna), 3)
    )


# Codon table (upper-case DNA codon -> one-letter amino acid, '*' = stop).
codon_2_aa = {
    "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
    "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
    "TAT": "Y", "TAC": "Y", "TAA": "*", "TAG": "*",
    "TGT": "C", "TGC": "C", "TGG": "W", "TGA": "*",
    "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
    "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
    "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
    "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
    "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
    "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
    "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
    "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
    "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
    "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
    "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
    "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
}

# Round/allele correspondence of the files: maps the two-digit plex number used
# in each FASTA file name to a "<round>-<allele>" label.
plex_dict = {
    '01': 'R0-401', '02': 'R0-402',
    '03': 'R1-401', '04': 'R1-402',
    '05': 'R2-401', '06': 'R2-402',
    '07': 'R3-401', '08': 'R3-402',
    '09': 'R4-401', '10': 'R4-402',
}

In [None]:
# Flanking sequences used to locate the insert in each read. Either the 401 or
# the 402 3' flank is accepted, as the null library is made with a single allele.
flank5_seq = 'TATTGCTAGCGTTTTGGCAGCT'
flank3_seq_401 = 'GGTGGATCCGGTGGCGGAGAACAAAAATTAATTAGTGAAGAAGATTTAGGCGGTCTAGAAGTTCTGTTCCAGGGGCCCGGTGGCGGGTCCGGCGGT'
flank3_seq_402 = 'GGTGGATCCGGTGGCGGAGAACAAAAATTAATTAGTGAGGAGGACCTTGGCGGTCTAGAAGTTCTGTTCCAGGGGCCCGGTGGCGGGTCCGGCGGT'
# Variant seen in Sanger sequence of the library for doping into; the original
# peptide + linker are spliced out.
flank_stop_splicevar = 'TATTGCTAGCGTTTTGGCAGCTGGATAAGCTGGTGTTTAGCGCTGGTTGCTGTGAGTGCCCGGTGGATCCGGTGGCGGGTCCGGCGGT'


def _extract_insert(seq):
    """Return the insert between the flanks, '*splicevar', or None if neither fits."""
    start5 = seq.find(flank5_seq)
    if start5 != -1:
        insert_start = start5 + len(flank5_seq)
        # 401 is tried before 402, matching the original priority.
        for flank3 in (flank3_seq_401, flank3_seq_402):
            # Search only downstream of the 5' flank: a 3' flank located
            # upstream would otherwise produce an empty slice that is counted
            # as a correctly flanked insert.
            insert_end = seq.find(flank3, insert_start)
            if insert_end != -1:
                return seq[insert_start:insert_end]
    if flank_stop_splicevar in seq:
        return '*splicevar'
    return None


def process(plex):
    """Extract and count the inserts found in one plex's FASTA file.

    Reads ``'twist_1480' + plex + '_ps.fasta'`` from the working directory.
    Every non-header, non-blank line is treated as one contig (multi-line
    records are not joined). For each contig the sequence between the 5' flank
    and the 401 or 402 3' flank is kept; contigs containing the splice-variant
    sequence are recorded as ``'*splicevar'``; all other contigs are ignored.
    A short summary is printed.

    Args:
        plex: two-character plex identifier used in the file name, e.g. '01'.

    Returns:
        pandas.DataFrame with columns ``'dna'`` (unique insert) and ``'count'``
        (number of contigs carrying it), in ``value_counts`` order.

    Raises:
        OSError: if the FASTA file cannot be opened.
    """
    file_name = 'twist_1480' + plex + '_ps.fasta'
    dna = []
    number_contigs = 0

    with open(file_name, 'r') as read_file:
        for line in read_file:
            if line.startswith('>'):
                continue  # FASTA header
            seq = line.strip()
            if not seq:
                continue  # blank lines are not contigs
            number_contigs += 1
            insert = _extract_insert(seq)
            if insert is not None:
                dna.append(insert)

    data = pd.DataFrame({'dna': dna})

    # Collapse to unique sequences with counts.
    data_uniq = data['dna'].value_counts().rename_axis('dna').reset_index(name='count')

    print('---')
    print('plex:', plex)
    print("number of contigs:", number_contigs)
    print("number of contigs with correct flanking sequence:", len(data))
    print("number of unique contigs:", len(data_uniq))

    return data_uniq

In [None]:
# Count the inserts of every plex and outer-join each result onto `merged` as
# its own 'count_<round>-<allele>' column, so sequences that only occur in the
# reads are kept as well.
for plex in ('01', '03', '05', '07', '09', '02', '04', '06', '08', '10'):
    count_column = 'count_' + plex_dict[plex]
    data_uniq = process(plex).rename(columns={'count': count_column})
    merged = merged.merge(data_uniq, on='dna', how='outer')

In [None]:
#identify peptides that match stop codon-containing NNN peptides from the library for doping into
## pattern: "NNNTAANNNNNNNNNTAGNNNNNNNNNNNNTGANNNNNN"

def check_doped(trimmed):
    """Tell whether a trimmed insert matches the doped-library pattern.

    A match is a 39-base string with 'TAA' at index 3, 'TAG' at index 15 and
    'TGA' at index 30. The fixed positions are compared directly, so the same
    triplets occurring earlier in the variable (N) bases do not hide a match,
    which a first-occurrence ``str.find`` test would do.

    Args:
        trimmed: insert DNA string; any non-string value (e.g. NaN) is
            treated as a non-match.

    Returns:
        'match' when the pattern fits, otherwise float('NaN').
    """
    if not isinstance(trimmed, str) or len(trimmed) != 39:
        return float('NaN')
    stop_positions = ((3, 'TAA'), (15, 'TAG'), (30, 'TGA'))
    if all(trimmed.startswith(codon, pos) for pos, codon in stop_positions):
        return 'match'
    return float('NaN')
    
# Flag every row of `merged` whose DNA matches the doped-library pattern.
dna_list = merged['dna']
n = len(dna_list)
# Iterate over the values in row order instead of looking them up by label
# (dna_list[i]); the resulting list is assigned positionally, so this stays
# correct even if the index is not 0..n-1.
doped = [check_doped(seq) for seq in dna_list]

merged['doped_match'] = doped

In [None]:
# Translate every DNA sequence; sequences whose length is not a multiple of 3
# get a placeholder string instead of a translation.
aa = [
    translate(dna) if len(dna) % 3 == 0 else '***DNA Not Mult of 3'
    for dna in merged['dna']
]
merged['aa'] = aa

merged.to_csv('twist_doped.csv')

In [None]:
# Filter out peptides that match neither the Twist order nor the doped-library
# pattern: a row is kept when 'name' or 'doped_match' is present.
# NOTE(review): 'name' presumably comes from the Twist order CSV - confirm.
print(len(merged))
has_name_or_doped = merged[['name', 'doped_match']].notna().any(axis=1)
merged_filtered = merged[has_name_or_doped]
print(len(merged_filtered))
merged_filtered.to_csv('twist_doped_filtered.csv')