# In[78]:
import pandas as pd
from math import sqrt, log
import sys
import time
import json



def _is_sym(seq):
    """Return True when seq is identical to its own reverse complement.

    Only the upper-case bases A, C, G and T are known to the pairing table;
    any other character raises KeyError.
    """
    pairing = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    reverse_complement = ''.join(pairing[base] for base in reversed(seq))
    return seq == reverse_complement


def _overcount(st, p):
    """Count the occurrences of p in st, overlapping matches included."""
    total = 0
    pos = st.find(p)
    while pos != -1:
        total += 1
        # Resume one character past the last hit so overlaps are counted.
        pos = st.find(p, pos + 1)
    return total


def _tercorr(st):
    """Return the (delta H, delta S) offsets contributed by the sequence ends.

    The entropy starts at -1.4 for a self-complementary sequence (see
    _is_sym) and at 0 otherwise.  Each of the two terminal bases then adds
    (0.1, -2.8) when it is G or C and (2.3, 4.1) when it is A or T.
    """
    enthalpy = 0
    entropy = -1.4 if _is_sym(st) else 0

    # The first and the last base are treated the same way, in that order.
    for terminal in (st[0], st[-1]):
        if terminal in ('G', 'C'):
            enthalpy += 0.1
            entropy -= 2.8
        elif terminal in ('A', 'T'):
            enthalpy += 2.3
            entropy += 4.1

    return enthalpy, entropy

def temp(s, DNA_c=5000.0, Na_c=10.0, Mg_c=20.0, dNTPs_c=10.0, uncorrected=False):
    '''
    Returns the DNA/DNA melting temp using nearest-neighbor thermodynamics.

    This function returns better results than EMBOSS DAN because it uses updated
    thermodynamics values and takes into account initialization parameters from
    the work of SantaLucia (1998).

    Corrects for mono- and divalent cation concentrations.

    Arguments:
    - s:           DNA sequence (upper-cased internally; A/C/G/T only)
    - DNA_c:       DNA concentration [nM]
    - Na_c:        Na+ concentration [mM]
    - Mg_c:        Mg2+ concentration [mM]
    - dNTPs_c:     dNTP concentration [mM]
    - uncorrected: if True, return the melting temperature without the
                   cation-concentration correction

    Returns the melting temperature in degrees Celsius.
    '''

    R = 1.987    # Universal gas constant (cal/(K*mol))
    s = s.upper()
    dh, ds = _tercorr(s)
    k = DNA_c * 1e-9

    # Adapted from Table 1 in Allawi and SantaLucia (1997).
    # delta H (kcal/mol)
    dh_coeffs = {"AA": -7.9, "TT": -7.9,
                 "AT": -7.2,
                 "TA": -7.2,
                 "CA": -8.5, "TG": -8.5,
                 "GT": -8.4, "AC": -8.4,
                 "CT": -7.8, "AG": -7.8,
                 "GA": -8.2, "TC": -8.2,
                 "CG": -10.6,
                 "GC": -9.8,
                 "GG": -8.0, "CC": -8.0}

    # delta S (eu)
    ds_coeffs = {"AA": -22.2, "TT": -22.2,
                 "AT": -20.4,
                 "TA": -21.3,
                 "CA": -22.7, "TG": -22.7,
                 "GT": -22.4, "AC": -22.4,
                 "CT": -21.0, "AG": -21.0,
                 "GA": -22.2, "TC": -22.2,
                 "CG": -27.2,
                 "GC": -24.4,
                 "GG": -19.9, "CC": -19.9}

    # Multiplies the number of times each nuc pair is in the sequence by the
    # appropriate coefficient, then returns the sum of all the pairs
    dh = dh + \
        sum(_overcount(s, pair) * coeff for pair, coeff in dh_coeffs.items())
    ds = ds + \
        sum(_overcount(s, pair) * coeff for pair, coeff in ds_coeffs.items())

    # Fraction of G/C bases.  Count the bases directly: wrapping filter() in
    # a list literal yields a one-element list, which made this 1/len(s).
    fgc = (s.count('G') + s.count('C')) / float(len(s))

    # Melting temperature
    tm = (1000 * dh) / (ds + (R * log(k)))

    if uncorrected:
        return tm - 273.15

    MNa = Na_c * 1e-3
    MMg = Mg_c * 1e-3
    MdNTPs = dNTPs_c * 1e-3

    # Free magnesium concentration
    Ka = 3e4  # association constant in biological buffers
    D = (Ka * MdNTPs - Ka * MMg + 1)**2 + (4 * Ka * MMg)
    Fmg = (-(Ka * MdNTPs - Ka * MMg + 1) + sqrt(D)) / (2 * Ka)

    # With no sodium at all, fall through to the magnesium-only coefficients
    # (the ratio 7.0 skips both branches that evaluate log(MNa)).
    cation_ratio = sqrt(Fmg) / MNa if MNa > 0 else 7.0

    if cation_ratio < 0.22:
        # Correction expressed in terms of the Na+ concentration only.
        tm = 1 / (
            (1 / tm) +
            ((4.29 * fgc - 3.95) * log(MNa) + 0.94 * log(MNa)**2) * 1e-5)
    else:
        a = 3.92
        d = 1.42
        g = 8.31
        # NOTE(review): this replaces the free-Mg2+ estimate computed above
        # with the total Mg2+ concentration -- confirm this is intended.
        Fmg = MMg
        if cation_ratio < 6.0:
            # Intermediate ratio: the coefficients are adjusted using Na+.
            a = a * (0.843 - 0.352 * sqrt(MNa) * log(MNa))
            d = d * \
                (1.279 - 4.03 * log(MNa) * 1e-3 - 8.03 * log(MNa)**2 * 1e-3)
            g = g * (0.486 - 0.258 * log(MNa) + 5.25 * log(MNa)**3 * 1e-3)
        tm = 1 / (
            (1 / tm) +
            (a - 0.911 * log(Fmg) + fgc * (6.26 + d * log(Fmg)) +
             1 / (2 * (len(s) - 1)) * (-48.2 + 52.5 * log(Fmg) +
                                       g * log(Fmg)**2)) * 1e-5)

    return tm - 273.15

# Algorithm for actually scoring the primer sets
def score_kmer(primer_dict, c1=0.1, c2=-5, c3=3, default_score=1):
    """Score one k-mer record as c1*default_score + c2*E + c3*R.

    R is the record's 'FgOccur' divided by its 'BgOccur'.  E is -2 when the
    record's 'InEx' entry equals True and 0 otherwise.
    """
    ratio = primer_dict['FgOccur'] / primer_dict['BgOccur']
    # The explicit comparison is deliberate: only a value equal to True
    # triggers the exclusion term, not every truthy value.
    exclusion_term = -2 if primer_dict['InEx'] == True else 0
    return c1*default_score + c2*exclusion_term + c3*ratio

def get_fg_dict(foreground, kmers, min_temp, max_temp):
    """Index the k-mers of a foreground FASTA file by sequence.

    The first line of the file is skipped as the header and the remaining
    lines are joined into a single sequence string.  Every window of length
    ``kmers`` whose melting temperature (``temp`` with default conditions)
    lies in ``[min_temp, max_temp]`` is recorded.

    Arguments:
    - foreground: path to the foreground FASTA file
    - kmers:      k-mer length
    - min_temp:   lowest accepted melting temperature
    - max_temp:   highest accepted melting temperature

    Returns a dict mapping each accepted k-mer to
    ``{'FgOccur': count, 'FgLocations': [start offsets]}``.
    """
    with open(foreground, 'r') as fg_file:
        fg_file.readline()  # header line, not part of the sequence
        fg_data = fg_file.read().replace('\n', '')

    value_dict = {}
    # "+ 1" so that the window ending on the last base is included.
    for i in range(len(fg_data) - kmers + 1):
        cur_mer = fg_data[i:i+kmers]
        entry = value_dict.get(cur_mer)
        if entry is not None:
            # Already accepted once, so its temperature is known to be in
            # range and does not need to be recomputed.
            entry['FgOccur'] += 1
            entry['FgLocations'].append(i)
        elif min_temp <= temp(cur_mer) <= max_temp:
            value_dict[cur_mer] = {'FgOccur': 1, 'FgLocations': [i]}

    return value_dict

def kmers(foregrounds, background, exclusion_set, kmers=14, pref_min=10, pref_max_temp=120, min_temp=0, max_temp=150):
    """Collect and score the k-mers shared by each foreground and the background.

    For every foreground file the background sequence is scanned with a
    window of length ``kmers``.  A window is recorded only when the same
    k-mer was accepted for that foreground by ``get_fg_dict``, so k-mers
    that never occur in the background do not appear in the result.  Each
    record holds the foreground/background counts and start offsets, the
    melting temperature, its distance from the preferred range
    (``DegreeDiff``), whether the k-mer occurs in the exclusion sequence
    (``InEx``) and the ``score_kmer`` score.

    One JSON file per foreground, named ``<foreground stem>_<k>mer.json``,
    is written to the current working directory.

    Arguments:
    - foregrounds:   iterable of foreground FASTA paths
    - background:    background FASTA path
    - exclusion_set: FASTA path of the sequence to flag k-mers against
    - kmers:         k-mer length
    - pref_min:      lower bound of the preferred melting temperature
    - pref_max_temp: upper bound of the preferred melting temperature
    - min_temp:      lowest melting temperature accepted for a foreground k-mer
    - max_temp:      highest melting temperature accepted for a foreground k-mer

    Returns the dict built for the LAST foreground, with the keys
    'Background', 'Foreground', 'Kmers' and '#Sequences' (empty when
    ``foregrounds`` is empty).
    """
    fg_bg_dict = {}
    foreground_dict = {}
    for foreground in foregrounds:
        print(foreground)
        foreground_dict[foreground] = get_fg_dict(foreground, kmers, min_temp, max_temp)

    # In both files the first line is skipped as the FASTA header.
    with open(background, 'r') as bg_file:
        bg_file.readline()
        bg_data = bg_file.read().replace('\n', '')

    with open(exclusion_set, 'r') as ex_file:
        ex_file.readline()
        ex_data = ex_file.read().replace('\n', '')

    for foreground in foregrounds:
        fg_kmers = foreground_dict[foreground]
        value_dict = {}
        # "+ 1" so that the window ending on the last base is included.
        for i in range(len(bg_data) - kmers + 1):
            cur_mer = bg_data[i:i+kmers]
            fg_entry = fg_kmers.get(cur_mer)
            if fg_entry is None:
                continue

            entry = value_dict.get(cur_mer)
            if entry is not None:
                # Offsets are visited in increasing order, so i is always new.
                entry['BgOccur'] += 1
                entry['BgLocations'].append(i)
                continue

            # First background hit: compute the per-k-mer properties once.
            cur_temp = temp(cur_mer)
            deg_diff = 0
            if pref_min > cur_temp:
                deg_diff = pref_min - cur_temp
            if pref_max_temp < cur_temp:
                deg_diff = pref_max_temp - cur_temp
            value_dict[cur_mer] = {
                'FgOccur': fg_entry['FgOccur'],
                'FgLocations': fg_entry['FgLocations'],
                'BgOccur': 1,
                'BgLocations': [i],
                'MeltingTemp': cur_temp,
                'DegreeDiff': deg_diff,
                'InEx': cur_mer in ex_data,
                'Score': 0,
            }

        # Score once, after the counts are final, instead of re-scoring
        # every record after every background hit.
        for entry in value_dict.values():
            entry['Score'] = score_kmer(entry)

        fg_bg_dict['Background'] = background
        fg_bg_dict['Foreground'] = foreground
        fg_bg_dict['Kmers'] = value_dict
        fg_bg_dict['#Sequences'] = len(value_dict)
        filename = foreground.split('/')[-1].split('.')[0] + '_' + str(kmers) + 'mer.json'
        with open(filename, "w") as json_data_file:
            json.dump(fg_bg_dict, json_data_file, indent=4, sort_keys=True)

    return fg_bg_dict

def find_kmers(foregrounds, background, exclusion_set, filepath):
    """Run ``kmers`` once per foreground file and gather the results.

    ``kmers`` takes a list of foreground paths and returns a single dict, so
    each foreground is passed as a one-element list and the k-mer table is
    read from the 'Kmers' key of the result.

    Arguments:
    - foregrounds:   iterable of foreground FASTA paths
    - background:    background FASTA path
    - exclusion_set: exclusion FASTA path
    - filepath:      prefix used to build the names stored in the lookup

    Returns ``(file_dict, file_lookup)``; both are keyed by
    'foreground_<n>'.  ``file_dict`` holds the k-mer table and
    ``file_lookup`` holds ``<filepath>fg<fg id>_bg<bg id>.json``, where the
    ids are the first space-separated token of each FASTA header line.

    NOTE(review): the names in ``file_lookup`` are only computed here; this
    function does not write those files -- confirm against the callers.
    """
    def _fasta_id(path):
        # First token of the header line, without the leading '>'.
        with open(path, 'r') as handle:
            header = handle.readline()
        return header.strip().split(' ')[0].strip('>')

    bg_id = _fasta_id(background)
    file_dict = {}
    file_lookup = {}
    for index, foreground in enumerate(foregrounds):
        result = kmers([foreground], background, exclusion_set)
        file_id = 'foreground_' + str(index)
        file_dict[file_id] = result['Kmers']
        file_lookup[file_id] = (
            filepath + 'fg' + _fasta_id(foreground) + '_' + 'bg' + bg_id + '.json')
    return file_dict, file_lookup

def select_primer(json_file):
    """Load a k-mer JSON file into a pandas DataFrame.

    Arguments:
    - json_file: path of the JSON file to read

    Returns the DataFrame produced by ``pandas.read_json``.

    NOTE(review): no primer selection is performed yet.  The earlier draft
    read a hard-coded 'sequence_data.json' instead of ``json_file`` and
    computed two unused values from names that are not defined in this
    function, which raised NameError before anything could be returned.
    """
    return pd.read_json(json_file)

# In[79]:
# Driver cell: run the k-mer search for two foreground files against one
# background file and one exclusion file, and report the CPU time used.
# The inputs are absolute paths on the author's machine.
bg = "/Users/thatcher/Documents/Graduate School/FofanovResearch/INF685/data/GCF_000007465.2_ASM746v2_genomic.fna"
ex = "/Users/thatcher/Documents/Graduate School/FofanovResearch/INF685/data/exclude.fasta"
fgs = ["/Users/thatcher/Documents/Graduate School/FofanovResearch/INF685/data/Chromosone16.fasta", "/Users/thatcher/Documents/Graduate School/FofanovResearch/INF685/data/Chromosone17.fasta"]
# time.process_time() counts CPU time of this process, not wall-clock time.
start = time.process_time()
# kmers() is called with its default k-mer length and temperature bounds;
# the value it returns describes the last foreground in fgs.
value_dict = kmers(fgs, bg, ex)
end = time.process_time()
print("Time Taken: {}".format(end - start))

# Output:
# /Users/thatcher/Documents/Graduate School/FofanovResearch/INF685/data/Chromosone16.fasta
# /Users/thatcher/Documents/Graduate School/FofanovResearch/INF685/data/Chromosone17.fasta
# Time Taken: 6.538397000000032
