In [None]:
# Make the most mutable version of a given sequence, in the two variations listed below. Then generate CSC averages (using human CSC values) for each.
#    1. Most mutable within the CDRs only
#    2. Most mutable across all FWRs/CDRs

In [3]:
# FUNCTIONS

def getCodons(Seq):
    """Split a nucleotide string into codons and pair each with its amino acid.

    Seq is read in consecutive, non-overlapping windows of three characters
    starting at index 0; a final window shorter than three characters is kept
    as-is. Every window is looked up in the module-level CodonDict, and any
    window that is not a key of CodonDict is translated as '~'.

    Returns a tuple of (codon, amino_acid) pairs in sequence order.
    """
    triplets = [Seq[start:start + 3] for start in range(0, len(Seq), 3)]
    return tuple((triplet, CodonDict.get(triplet, '~')) for triplet in triplets)

def getAA(seq):
    """Translate a nucleotide string into its amino-acid string.

    Concatenates the amino-acid element of every (codon, amino_acid) pair
    returned by getCodons(seq), in order, and returns the result as a str.
    """
    return ''.join(str(pair[1]) for pair in getCodons(seq))

def getCodonsForAA(AA):
    """Return every codon that encodes the amino acid AA.

    AA is a STRING: the one-letter amino-acid code as it appears in the values
    of the module-level CodonDict ('*' marks a stop codon there).

    Returns a list of codon strings (not tuples) in CodonDict insertion order;
    the list is empty when no codon maps to AA.
    """
    # Read CodonDict directly instead of the module-level variable named `list`:
    # that variable shadows the builtin, and depending on it made this function
    # fail whenever the cell defining it had not been run yet.
    return [codon for codon, amino_acid in CodonDict.items() if amino_acid == AA]

In [4]:
# VARIABLES - edit these for the sequence being processed.
# Note: framework regions are split into shorter segments (fwrN_M) so they are easier to work with.
cdr1 = 'GGATTCACTTTCAGTAACGCCTGG'
cdr2 = 'ATTAAAAGCAAAACTGATGGTGGGACAACA'
cdr3 = 'ACCACAGAA'

fwr1_1 = 'CAGGTGCAGCTGCAGGAGTCGGGCCCAGGA'
fwr1_2 = 'CTGGTGAAGCCTTCGGACACCCTG'
fwr1_3 = 'TCCCTCACCTGCGCTGTCTCT'

fwr2_1 = 'TGGGGCTGGATCCGGCAGCCCCCAGGG'
fwr2_2 = 'AAGGGACTGGAGTGGATTGGGTAC'

fwr3_1 = 'TACTACAACCCGTCCCTCAAGAGTCGA'
fwr3_2 = 'GTCACCATGTCAGTAGACACGTCCAAGAACCAG'
fwr3_3 = 'TTCTCCCTGAAGCTGAGCTCTGTG'
fwr3_4 = 'ACCGCCGTGGACACGGCCGTGTATTACTGT'

# Codon -> one-letter amino acid ('*' = stop). One amino acid per line.
# The insertion order is significant: it fixes the order in which synonymous
# codons are listed, so do not reorder the entries.
CodonDict = {
    'ATT': 'I', 'ATC': 'I', 'ATA': 'I',
    'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'TTA': 'L', 'TTG': 'L',
    'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
    'TTT': 'F', 'TTC': 'F',
    'ATG': 'M',
    'TGT': 'C', 'TGC': 'C',
    'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
    'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
    'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
    'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
    'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'AGT': 'S', 'AGC': 'S',
    'TAT': 'Y', 'TAC': 'Y',
    'TGG': 'W',
    'CAA': 'Q', 'CAG': 'Q',
    'AAT': 'N', 'AAC': 'N',
    'CAT': 'H', 'CAC': 'H',
    'GAA': 'E', 'GAG': 'E',
    'GAT': 'D', 'GAC': 'D',
    'AAA': 'K', 'AAG': 'K',
    'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'AGA': 'R', 'AGG': 'R',
    'TAA': '*', 'TAG': '*', 'TGA': '*',
}

# CodonDict as a list of (codon, amino acid) tuples.
# NOTE(review): this name shadows the builtin `list` for the rest of the session;
# it is kept because getCodonsForAA iterates over this variable.
list = [*CodonDict.items()]

In [276]:
# Repeat this cell and the next one for each FWR/CDR segment.

# Translate the chosen segment, then map every amino-acid position (0-based)
# to the list of codons that encode the amino acid found there.
AA_seq = getAA(fwr3_4)
d = {position: getCodonsForAA(residue) for position, residue in enumerate(AA_seq)}

# The number of distinct codon arrangements is the product of the number of
# codon choices available at each position.
total = 1
for choices in d.values():
    total *= len(choices)

total   # total number of possible codon arrangements for this region

65536

In [277]:
# Enumerate every possible combination of the per-position codon choices in d.
# permutations_dicts holds one dict per combination, in the format
# [ {pos:codon, pos:codon}, ... , {pos:codon, pos:codon} ]

import itertools

keys, values = zip(*d.items())
permutations_dicts = [dict(zip(keys, combo)) for combo in itertools.product(*values)]

# Join the codons of each dict, in position order, into one full nucleotide sequence.
all_seq = [''.join(dic.values()) for dic in permutations_dicts]

# One sequence per line, each line terminated by a newline.
file = ''.join(str(seq) + '\n' for seq in all_seq)

# Write the text to a .txt file (the file name is edited by hand for each region).
with open('4-28_fwr3-4.txt', 'w') as f:
    f.write(file)

In [None]:
# Next, use shazam in R to find the most mutable sequence in each of the files created above.
# Then concatenate the segmented framework regions and check that the AA sequence is unchanged.

In [1]:
# Nucleotide sequences for each full framework region, as (orig_*, new_*) pairs.
# Each pair is compared by amino-acid translation in the check that follows.
# presumably new_* is the most mutable synonymous version selected with shazam - confirm
# NOTE(review): orig_fwr1 does not equal fwr1_1 + fwr1_2 + fwr1_3 from the VARIABLES
# cell (it starts 'GAGGTG...' rather than 'CAGGTG...'), so these values appear to
# come from a different input sequence - confirm which run they belong to.
orig_fwr1 = 'GAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTAAAGCCTGGGGGGTCCCTTAGACTCTCCTGTGCAGCCTCT'
new_fwr1 = 'GAGGTACAGCTAGTAGAAAGCGGTGGTGGTCTAGTAAAACCTGGGGGCAGCTTACGGTTAAGCTGCGCAGCATCT'

orig_fwr2 = 'ATGAGCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTTGGCCGT'
new_fwr2 = 'ATGAGCTGGGTACGGCAAGCACCTGGAAAGGGTTTAGAATGGGTAGGTAGA'

orig_fwr3 = 'GACTACGCTGCACCCGTGAAAGGCAGATTCACCATCTCAAGAGATGATTCAAAAAACACGCTGTATCTGCAAATGAACAGCCTGAAAACCGAGGACACAGCCGTGTATTACTGT'
new_fwr3 = 'GATTATGCAGCACCGGTAAAGGGCAGATTTACTATTAGCAGGGATGATAGCAAAAATACATTGTATCTTCAAATGAATAGCTTAAAGACGGAAGATACAGCAGTATACTACTGC'

In [5]:
# Check the AA sequence of each framework region: print True when the original
# and the rewritten nucleotide sequence translate to the same amino-acid string
# (one line each for FWR1, FWR2 and FWR3, in that order).

for original, rewritten in (
    (orig_fwr1, new_fwr1),
    (orig_fwr2, new_fwr2),
    (orig_fwr3, new_fwr3),
):
    print(getAA(original) == getAA(rewritten))

True
True
True


In [6]:
# Concatenate everything together and check: `orig` is the full-length original
# nucleotide sequence and `new` is the full-length rewritten one.
# Note: getCodons reads the string in windows of three starting at index 0 and maps
# any window missing from CodonDict (including a short trailing window) to '~',
# so the comparison below covers such positions only as '~' == '~'.

orig = 'ATGGGGTCAACCGCCATCCTCGCCCTCCTCCTGGCTGTTCTCCAAGGAGTCTGTTCCGAGGTGCAGCTGGTGCAGTCTGGAGCAGAGGTGAAAAAGCCCGGGGAGTCTCTGAAGATCTCCTGTAAGGGTTCTGGATACAGCTTTACCAGCTACTGGATCGGCTGGGTGCGCCAGATGCCCGGGAAAGGCCTGGAGTGGATGGGGATCATCTATCCTGGTGACTCTGATACCAGATACAGCCCGTCCTTCCAAGGCCAGGTCACCATCTCAGCCGACAAGTCCATCAGCACCGCCTACCTGCAGTGGAGCAGCCTGAAGGCCTCGGACACCGCCATGTATTACTGTGCGAGACA'
new = 'ATGGGGTCAACCGCCATCCTCGCCCTCCTCCTGGCTGTTCTCCAAGGAGTCTGTTCCGAGGTACAGCTAGTTCAAAGCGGGGCAGAGGTAAAAAAACCTGGGGAGAGCTTAAAAATTAGCTGCAAGGGCAGTGGGTACAGCTTTACTAGCTACTGGATAGGCTGGGTACGTCAAATGCCCGGTAAAGGTTTAGAATGGATGGGTATCATATATCCTGGGGATAGCGATACACGGTATAGCCCGAGCTTTCAAGGGCAGGTTACTATATCAGCTGATAAAAGCATTAGCACAGCATATCTTCAATGGAGTAGTCTAAAAGCAAGCGATACAGCAATGTACTACTGCGCAAGGCA'

# True when both full-length sequences translate to the same amino-acid string
print(getAA(orig) == getAA(new))

True
