In [1]:
import random
import math

The `__init__` method initializes all parameters to zero by default. If any parameter needs to have a default value, you can set it here. All penalty parameters are set to a default value of 10

In [2]:
class PrimerDesign(object):
    
    def __init__ (self, name):
        """Initialises the PrimerDesign class with pre-defined variables"""
        
        # parameters for the length criterion
        self.max_length, self.min_length = 22, 18
        self.penalty_length = 10
        
        # parameters for the temperature difference criterion
        self.max_tdiff, self.min_tdiff = 1, 0
        self.penalty_tdiff = 10
        
        # parameters for the cg content criterion
        self.max_cg, self.min_cg = 0.6, 0.4
        self.penalty_cg = 10
        
        # parameters for the annealing temperature criterion
        self.max_temp, self.min_temp = 65, 52
        self.penalty_temp = 10
        
        # parameters for the run criterion
        self.run_threshold = 4
        self.penalty_runs = 10
        
        # parameters for the repeat criterion
        self.repeat_threshold = 0
        self.penalty_repeats = 10
        
        # parameters for the specificity criterion
        self.penalty_specificity = 10 
        
        # locations where the forward/revserse primer should be chosen from
        self.fp_start, self.fp_end = 100, 200
        self.rp_start, self.rp_end = 100, 200
        
        # parameters for the simulated annealing portion
        self.initial_temperature = 200
        self.stopping_temperature = 0.01
        self.drop_fraction = 0.999
        
        # self-defined
        self.dna_sequence = None

### Task 2 

In [3]:
class PrimerDesign(PrimerDesign):
    def set_dna_sequence(self, dna_sequence):
        """
        This takes in a string (raw gene sequence) and cleans it up to leave only ATCG.
        The result is stored in the PrimerDesign object's dna_sequence.
        """
        output_sequence = ""
        for character in str(dna_sequence):
            if character == "a" or character == "g" or character == "t" or character == "c":
                output_sequence += character        
        self.dna_sequence = output_sequence
        
# test code
test = PrimerDesign("DNA")
test.set_dna_sequence("gs1atgcgciuahdogciygatgcagcacgctah2872gh87xgsau")
print(test.dna_sequence)

gatgcgcagcgatgcagcacgctagga


### Task 3

In [4]:
class PrimerDesign(PrimerDesign):
    def func_select_random(self, sqtype='forward', length=20):        
        """
        Selects a forward or reverse primer at random with a specified length.
        """
        rand_primer_sequence, counter = [], []
        
        if sqtype=='forward':
            start_limit = self.fp_start 
            end_limit = self.fp_end 
            x_first = random.randint(start_limit, end_limit-length)
            x_end = x_first + length
            
            for index in range(x_first, x_end, 1):
                for character in self.dna_sequence[index-1]:
                    rand_primer_sequence.append(character)
                    counter.append(index)
            return rand_primer_sequence
        
        elif sqtype=='reverse':
            start_limit = self.rp_start 
            end_limit = self.rp_end
            y_first = random.randint(start_limit + length, end_limit)
            y_end = y_first - length
        
            for index in range(y_first, y_end, -1):
                for character in self.dna_sequence[index-1]:
                    rand_primer_sequence.append(character)
                    counter.append(index)
            return rand_primer_sequence
        
        else: return None
        
# test code
test = PrimerDesign('DNA')
dna_sequence_1 = 'gtcttctactattcgaggccgttcgttaatacttgttgcgttcctagccgctatatttgtctctttgccgactaatgtgaacaaccacaccatagcgatttatcggagcgcctcggaatacggtatgagcaggcgcctcgtgagaccattgcgaataccaggtatcgtgtaagtagcgaaggcccgtacgcgagataaac'
test.set_dna_sequence(dna_sequence_1)
print("Forward primer:", test.func_select_random('forward')) # TODO:
print("Reverse primer:", test.func_select_random('reverse')) # string or list????

Forward primer: ['c', 'a', 'g', 'g', 'c', 'g', 'c', 'c', 't', 'c', 'g', 't', 'g', 'a', 'g', 'a', 'c', 'c', 'a', 't']
Reverse primer: ['a', 'g', 'c', 'g', 't', 't', 'a', 'c', 'c', 'a', 'g', 'a', 'g', 't', 'g', 'c', 't', 'c', 'c', 'g']


### Task 4
Write methods that calculates the following properties for a given primer string:

* Its length
* The annealing temperature
* The fraction of cg bases
* The number of runs present
* The number of repeats present

In [5]:
from fractions import Fraction

sqA = 'cattaaaaatacgaaaaaagtcat'
sqB = 'atatatatattttatatataa'
sqC = ''

class PrimerDesign(PrimerDesign): 
    def func_length(self, sq):
        """Returns the length of a sequence"""
        return len(sq) 
    
    def func_cg_fraction(self, sq):
        """Returns the fraction of C/G bases in a sequence"""
        output = []
        for base in sq:
            if base == 'c' or base == 'g':
                output.append(base)
        if len(sq)!=0:        
            cg_fraction = Fraction(len(output),len(sq))
            return cg_fraction
        else:
            return 0
    
    def func_temperature(self,sq):
        """Returns the annealing temperature of a primer sequence"""
        Cs = sq.count('c')
        Gs = sq.count('g')
        As = sq.count('a')
        Ts = sq.count('t')
        Ta = 4*(Gs+Cs)+2*(As+Ts)
        return Ta

# test code
# TODO

In [6]:
class PrimerDesign(PrimerDesign):
    def func_count_runs(self,sq):
        """
        Returns the number of runs (sequence of repeated bases) present in a sequence.
        The condition for a run is defined by self.run_threshold in __init__()
        """
        counter = 0
        run = 0
        sq = list(sq)
        for index, character in enumerate(sq):
            current_char = sq[index]
            prev_char = sq[index-1]
            
            if current_char == prev_char: counter += 1
                
            elif current_char != prev_char:
                if counter >= self.run_threshold: run+=1
                counter = 0
                    
            if index==(len(sq)-1): # last letter in sequence
                if counter >= self.run_threshold: run+=1
                counter = 0
        return run
    
# test code
# TODO

In [7]:
class PrimerDesign(PrimerDesign):
    def func_count_repeats(self, sq):
        """
        Returns the number of repeats in a sequence.
        The threshold for a repeat is defined by self.repeat_threshold in __init__()
        """
        di_repeats = ['at','ac','ag',
                      'ca','ct','cg',
                      'ga','gt','gc',
                      'ta','tc','tg']
        A, B = [], []
        repeats = 0

        if len(sq) > 0:
            for index in range(len(sq)//2):
                A+=[sq[index*2:index*2+2]]
                B+=[sq[index*2-1:index*2+1]]
            B.remove('')
            for s in range(0, len(A)-1):
                if A[s] == A[s+1] and A[s] in di_repeats:
                    repeats += 1
            for s in range(0, len(B)-1):
                if B[s] == B[s+1] and B[s] in di_repeats:
                    repeats += 1
            return repeats
        elif len(sq) == 0: return 0
        
# test code
sq1 = "acacacttcgcgcgcg"
sq2 = "gtacacacttacacacag"
test = PrimerDesign('DNA')
print("Test 1:", test.func_count_repeats(sq1), ", correct: 8")
print("Test 2:", test.func_count_repeats(sq2), ", correct: 7")

Test 1: 8 , correct: 8
Test 2: 7 , correct: 7


### Task 5

In [8]:
class PrimerDesign(PrimerDesign):
    def cost_length(self, sq):
        """
        Mathematical Function: cost_primerlength(sq) [given in pg 3]
        Acceptable range is defined by self.max_length and self.min_length
        """
        sq_len = len(sq)
        if sq_len > self.max_length:
            return (sq_len-self.max_length)*self.penalty_length
        elif sq_len < self.min_length:
            return (self.min_length-sq_len)*self.penalty_length
        else: # criterion is met
            return 0 
        
    def cost_temperature(self, sq):
        """
        Mathematical Function: cost_annealtemperature(sq) [given in pg 4]
        Acceptable range is defined by self.max_temp and self.min_temp
        """
        sq_temp = self.func_temperature(sq)
        if sq_temp > self.max_temp:
            return self.penalty_temp*(sq_temp-self.max_temp)
        elif sq_temp < self.min_temp:
            return self.penalty_temp*(self.min_temp-sq_temp)
        else: # criterion is met
            return 0
        
    def cost_temperature_difference(self, fp, rp):
        """
        Mathematical Function: cost_temperaturedifference(sq) [given in pg 4]
        Acceptable range is defined by self.max_tdiff
        """
        t_forward = self.func_temperature(fp)
        t_reverse = self.func_temperature(rp)
        delta_t = abs(t_forward - t_reverse)
        if delta_t > self.max_tdiff:
            return self.penalty_temp * (delta_t-self.max_tdiff)
        else:
            return 0
        
    def cost_runs(self, sq):
        """
        Mathematical Function: cost_runs(sq) [given in pg 4]
        """
        numruns = self.func_count_runs(sq)
        return self.penalty_runs * numruns

    def cost_cgcontent(self,sq):
        """
        Mathematical Function: cost_cgcontent(sq) [given in pg 5]
        Acceptable range is defined by self.max_cg and self.min_cg
        """
        cg_fraction = self.func_cg_fraction(sq)
        if cg_fraction > self.max_cg:
            return self.penalty_cg*(cg_fraction-self.max_cg)
        elif cg_fraction < self.min_cg:
            return self.penalty_cg*(self.min_cg-cg_fraction)
        else: # criterion is met
            return 0
        
    def cost_specificity(self, sq):
        """
        Mathematical Function: cost_specificity(sq) [given in pg 5]
        Returns a function of the number of occurences of the primer in self.dna_sequence
        """
        n_pos = self.dna_sequence.count(sq)
        if n_pos > 0: return self.penalty_specificity * (n_pos - 1)
        else: return 0

    def cost_repeats(self,sq):
        """
        Mathematical Function: cost_repeats(sq) [given in pg 5]
        Returns a function of the number of repeats in the primer.
        """
        repeats = self.func_count_repeats(sq)
        return self.penalty_repeats * repeats

### Task 6

In [9]:
class PrimerDesign(PrimerDesign):
    
    def cost_objective_function(self, fp, rp):
        '''
        Complete cost function to evaluate suitability of a primer.
        This is the sum of all the individual cost functions (Task 5).
        '''
        
        cost_forward = sum([self.cost_length(fp),
                        self.cost_temperature(fp),
                        self.cost_runs(fp),
                        self.cost_cgcontent(fp),
                        self.cost_specificity(fp),
                        self.cost_repeats(fp)])
        
        cost_reverse = sum([self.cost_length(rp),
                        self.cost_temperature(rp),
                        self.cost_runs(rp),
                        self.cost_cgcontent(rp),
                        self.cost_specificity(rp),
                        self.cost_repeats(fp)])
        
        total_cost = self.cost_temperature_difference(fp, rp) + cost_forward + cost_reverse
        
        return total_cost 

### Task 7

In [10]:
class PrimerDesign(PrimerDesign):
    
    def reverse_complement(self, rp):
        output = ""
        for char in rp:
            if char=="a": output = output + "t"
            elif char=="t": output = output + "a"
            elif char=="c": output = output + "g"
            elif char=="g": output = output + "c"
            else: print("[!] Error:", char, "not a proper base, skipping")
        return output

    def cost_objective_function_info(self, fp, rp):
        print("""
`-:-.   ,-;"`-:-.   ,-;"`-:-.   ,-;"`-:-.   ,-;"
   `=`,'=/     `=`,'=/     `=`,'=/     `=`,'=/
     y==/        y==/        y==/        y==/
   ,=,-<=`.    ,=,-<=`.    ,=,-<=`.    ,=,-<=`.
,-'-'   `-=_,-'-'   `-=_,-'-'   `-=_,-'-'   `-=_""")
        
        labels = ["length:",
                  "annealing temperature:",
                  "%cg_content:",
                  "specificity:",
                  "runs:",
                  "repeats:"]
        
        cost_forward = [self.cost_length(fp),
                        self.cost_temperature(fp),
                        self.cost_cgcontent(fp),
                        self.cost_specificity(fp),
                        self.cost_runs(fp),
                        self.cost_repeats(fp)]
        
        cost_reverse = [self.cost_length(rp),
                        self.cost_temperature(rp),
                        self.cost_cgcontent(rp),
                        self.cost_specificity(rp),
                        self.cost_runs(rp),
                        self.cost_repeats(rp)]
        
        print("\n=== Forward Primer ===>", fp)
        print("\nCriterion".ljust(25), "Cost function score")
        print("----------------------------------------------")
        for n, cost in enumerate(cost_forward):
            print(labels[n].ljust(25), round(cost,3))
        print("\n=== Reverse Primer ===>", self.reverse_complement(rp))
        print("\nCriterion".ljust(25), "Cost function score")
        print("----------------------------------------------")
        for n, cost in enumerate(cost_reverse):
            print(labels[n].ljust(25), round(cost,3))
        print("\n===    Overall    ===")
        # TODO!!!!
        print("Temperature Difference:", "TODO")
        print("Total cost:", self.cost_objective_function(fp, rp))

### Task 10

In [11]:
"""
# optional task
class PrimerDesign(PrimerDesign): 
    
    def func_simulated_annealing(self):
        
        temperature = self.initial_temperature
        stopping_temperature = self.stopping_temperature
        drop = self.drop_fraction
        
        pass 
"""

'\n# optional task\nclass PrimerDesign(PrimerDesign): \n    \n    def func_simulated_annealing(self):\n        \n        temperature = self.initial_temperature\n        stopping_temperature = self.stopping_temperature\n        drop = self.drop_fraction\n        \n        pass \n'

### Store the DNA sequence given to you in the variable below 

In [12]:
# test code
fp = "catcaagcaggtctgttccaagggcc"
rp = "ctgcgtcgggcgtccgtcggggtgtg"
dna_sequence_1 = 'gtcttctactattcgaggccgttcgttaatacttgttgcgttcctagccgctatatttgtctctttgccgactaatgtgaacaaccacaccatagcgatttatcggagcgcctcggaatacggtatgagcaggcgcctcgtgagaccattgcgaataccaggtatcgtgtaagtagcgaaggcccgtacgcgagataaac'

### Instantiate your class and read in the DNA sequence

In [13]:
test = PrimerDesign('DNA')
test.set_dna_sequence(dna_sequence_1)

### If you need to adjust any parameter from their default values in the init method, do it here

### Show the outcome of your testing and the functions in the subsequent cells 

In [14]:
test.cost_objective_function_info(fp, rp)


`-:-.   ,-;"`-:-.   ,-;"`-:-.   ,-;"`-:-.   ,-;"
   `=`,'=/     `=`,'=/     `=`,'=/     `=`,'=/
     y==/        y==/        y==/        y==/
   ,=,-<=`.    ,=,-<=`.    ,=,-<=`.    ,=,-<=`.
,-'-'   `-=_,-'-'   `-=_,-'-'   `-=_,-'-'   `-=_

=== Forward Primer ===> catcaagcaggtctgttccaagggcc

Criterion                Cost function score
----------------------------------------------
length:                   40
annealing temperature:    170
%cg_content:              0
specificity:              0
runs:                     0
repeats:                  0

=== Reverse Primer ===> gacgcagcccgcaggcagccccacac

Criterion                Cost function score
----------------------------------------------
length:                   40
annealing temperature:    270
%cg_content:              1.692
specificity:              0
runs:                     0
repeats:                  20

===    Overall    ===
Temperature Difference: TODO
Total cost: 611.6923076923076
