In [38]:
import pandas as pd
from math import sqrt, log
import sys
import time



def _is_sym(seq):
    """Returns True if s is symmetric (same as rev. complement)"""
    comp = {
        'A': 'T',
        'T': 'A',
        'G': 'C',
        'C': 'G'
    }
    return seq == ''.join([comp[i] for i in seq][::-1])


def _overcount(st, p):
    ocu = 0
    x = 0
    while True:
        try:
            i = st.index(p, x)
        except ValueError:
            break
        ocu += 1
        x = i + 1
    return ocu


def _tercorr(st):
    """Return the (dh, ds) correction terms derived from the ends of st.

    The entropy term starts at -1.4 when st is its own reverse complement
    (see _is_sym) and at 0 otherwise.  Each terminal base (first, then last)
    then adds (0.1, -2.8) if it is G or C, or (2.3, 4.1) if it is A or T;
    any other terminal character contributes nothing.

    Raises IndexError for an empty sequence.
    """
    enthalpy = 0
    entropy = -1.4 if _is_sym(st) else 0

    # Same adjustment rule for both ends, applied first-base then last-base.
    for terminal in (st[0], st[-1]):
        if terminal in ('G', 'C'):
            enthalpy += 0.1
            entropy -= 2.8
        elif terminal in ('A', 'T'):
            enthalpy += 2.3
            entropy += 4.1

    return enthalpy, entropy

def temp(s, DNA_c=5000.0, Na_c=10.0, Mg_c=20.0, dNTPs_c=10.0, uncorrected=False):
    '''
    Returns the DNA/DNA melting temp using nearest-neighbor thermodynamics.

    This function returns better results than EMBOSS DAN because it uses updated
    thermodynamics values and takes into account initialization parameters from
    the work of SantaLucia (1998).

    Corrects for mono- and divalent cation concentrations unless
    `uncorrected` is True.

    Arguments:
    - s:           DNA sequence (upper-cased internally; only A/C/G/T are
                   supported, other characters raise KeyError)
    - DNA_c:       DNA concentration [nM]
    - Na_c:        Na+ concentration [mM]
    - Mg_c:        Mg2+ concentration [mM]
    - dNTPs_c:     dNTP concentration [mM]
    - uncorrected: if True, return the melting temperature without the
                   cation-concentration correction

    Returns the melting temperature in degrees Celsius.
    '''

    R = 1.987    # Universal gas constant (cal/(K*mol))
    s = s.upper()
    dh, ds = _tercorr(s)
    k = DNA_c * 1e-9  # nM -> M

    # Adapted from Table 1 in Allawi and SantaLucia (1997).
    # delta H (kcal/mol)
    dh_coeffs = {"AA": -7.9, "TT": -7.9,
                 "AT": -7.2,
                 "TA": -7.2,
                 "CA": -8.5, "TG": -8.5,
                 "GT": -8.4, "AC": -8.4,
                 "CT": -7.8, "AG": -7.8,
                 "GA": -8.2, "TC": -8.2,
                 "CG": -10.6,
                 "GC": -9.8,
                 "GG": -8.0, "CC": -8.0}

    # delta S (eu)
    ds_coeffs = {"AA": -22.2, "TT": -22.2,
                 "AT": -20.4,
                 "TA": -21.3,
                 "CA": -22.7, "TG": -22.7,
                 "GT": -22.4, "AC": -22.4,
                 "CT": -21.0, "AG": -21.0,
                 "GA": -22.2, "TC": -22.2,
                 "CG": -27.2,
                 "GC": -24.4,
                 "GG": -19.9, "CC": -19.9}

    # Multiplies the number of times each nuc pair is in the sequence by the
    # appropriate coefficient, then returns the sum of all the pairs
    dh = dh + \
        sum(_overcount(s, pair) * coeff for pair, coeff in dh_coeffs.items())
    ds = ds + \
        sum(_overcount(s, pair) * coeff for pair, coeff in ds_coeffs.items())

    # Fraction of G/C bases.  The previous `len([filter(...)])` wrapped the
    # filter object in a one-element list, so it always evaluated to
    # 1 / len(s) regardless of the sequence content.
    fgc = (s.count('G') + s.count('C')) / float(len(s))

    # Melting temperature (Kelvin): dh is scaled from kcal to cal to match R
    tm = (1000 * dh) / (ds + (R * log(k)))

    if uncorrected:
        return tm - 273.15

    # mM -> M
    MNa = Na_c * 1e-3
    MMg = Mg_c * 1e-3
    MdNTPs = dNTPs_c * 1e-3

    # Free magnesium concentration
    Ka = 3e4  # association constant in biological buffers
    D = (Ka * MdNTPs - Ka * MMg + 1)**2 + (4 * Ka * MMg)
    Fmg = (-(Ka * MdNTPs - Ka * MMg + 1) + sqrt(D)) / (2 * Ka)

    # With no Na+ the ratio is undefined; 7.0 selects the Mg2+ branch below
    # with its unmodified a/d/g coefficients (no log(MNa) is evaluated).
    cation_ratio = sqrt(Fmg) / MNa if MNa > 0 else 7.0

    if cation_ratio < 0.22:
        # Low ratio: correction expressed in terms of [Na+] only
        tm = 1 / (
            (1 / tm) +
            ((4.29 * fgc - 3.95) * log(MNa) + 0.94 * log(MNa)**2) * 1e-5)
    else:
        a = 3.92
        d = 1.42
        g = 8.31
        # NOTE(review): this discards the free-Mg2+ value computed above and
        # uses the total Mg2+ concentration in the correction -- confirm
        # that this is intended.
        Fmg = MMg
        if cation_ratio < 6.0:
            # Intermediate ratio: coefficients are adjusted as functions of [Na+]
            a = a * (0.843 - 0.352 * sqrt(MNa) * log(MNa))
            d = d * \
                (1.279 - 4.03 * log(MNa) * 1e-3 - 8.03 * log(MNa)**2 * 1e-3)
            g = g * (0.486 - 0.258 * log(MNa) + 5.25 * log(MNa)**3 * 1e-3)
        tm = 1 / (
            (1 / tm) +
            (a - 0.911 * log(Fmg) + fgc * (6.26 + d * log(Fmg)) +
             1 / (2 * (len(s) - 1)) * (-48.2 + 52.5 * log(Fmg) +
                                       g * log(Fmg)**2)) * 1e-5)

    return tm - 273.15

def kmers(foreground, background, kmers=16, min_temp=0, max_temp=150, scan_divisor=30):
    """Collect foreground k-mers that also occur in the background sequence.

    Both files are read as single-record FASTA-style text: the first line is
    kept as the header and the remaining lines are concatenated with newlines
    removed.  Every window of length `kmers` starting in the scanned prefix of
    the foreground sequence is looked up (case-sensitively) in the background
    sequence; matches whose melting temperature, as computed by `temp`, lies in
    [min_temp, max_temp] are recorded.

    Arguments:
    - foreground:   path to the foreground sequence file
    - background:   path to the background sequence file
    - kmers:        window (k-mer) length
    - min_temp:     lowest accepted melting temperature (inclusive)
    - max_temp:     highest accepted melting temperature (inclusive)
    - scan_divisor: start positions 0 .. int(len(sequence) / scan_divisor) - kmers - 1
                    are scanned; the default of 30 keeps the original behaviour
                    of examining only roughly the first 1/30 of the foreground.
                    Must be non-zero.

    Returns a tuple (df, fg_first_line, bg_first_line) where df is a DataFrame
    indexed by k-mer with columns "Location" (start offset in the foreground
    sequence) and "Melting Temp".  When a k-mer occurs more than once, the last
    scanned occurrence wins.
    """
    # Read each file in its own context so neither handle stays open during
    # the scan.  (The original stored these under swapped names: the
    # foreground contents were called bg_data and vice versa.)
    with open(foreground, 'r') as fg_file:
        fg_first_line = fg_file.readline()
        fg_seq = fg_file.read().replace('\n', '')

    with open(background, 'r') as bg_file:
        bg_first_line = bg_file.readline()
        bg_seq = bg_file.read().replace('\n', '')

    # NOTE(review): the scan walks the *foreground* and tests membership in the
    # *background*; confirm that direction is the intended selection rule.
    value_dict = {}
    n_starts = int(len(fg_seq) / scan_divisor) - kmers
    for i in range(n_starts):
        cur_mer = fg_seq[i:i + kmers]
        if cur_mer in bg_seq:
            cur_temp = temp(cur_mer)
            if min_temp <= cur_temp <= max_temp:
                value_dict[cur_mer] = [i, cur_temp]

    df = pd.DataFrame.from_dict(value_dict, orient='index', columns=["Location", "Melting Temp"])
    return df, fg_first_line, bg_first_line

def find_kmers(foregrounds, background, filepath):
    """Run `kmers` for each foreground file and write one CSV per foreground.

    The output name is `filepath` followed by
    "fg<fg id>_bg<bg id>_optimal_mers.csv", where each id is the first
    space-separated token of the corresponding header line with '>' characters
    stripped from its ends.  `filepath` is used as a plain string prefix, so it
    needs its own trailing separator when it names a directory.

    Always returns 0.
    """
    for fg_path in foregrounds:
        kmer_df, fg_header, bg_header = kmers(fg_path, background)
        fg_id = fg_header.split(' ')[0].strip('>')
        bg_id = bg_header.split(' ')[0].strip('>')
        out_name = filepath + 'fg' + fg_id + '_' + 'bg' + bg_id + '_optimal_mers.csv'
        kmer_df.to_csv(out_name)
    return 0
        

In [39]:
# Driver cell: run find_kmers for two foreground files against one background
# file and report the wall-clock time of the whole run.
# NOTE(review): the paths below are absolute and machine-specific; the cell only
# runs where these files exist.
start = time.time()
# Background sequence file (passed as `background`).
bg = "/Users/thatcher/Documents/Graduate School/FofanovResearch/INF685/data/GCF_000007465.2_ASM746v2_genomic.fna"
# Foreground sequence files; one CSV is written per entry.
fgs = ["/Users/thatcher/Documents/Graduate School/FofanovResearch/INF685/data/Chromosone16.fasta", "/Users/thatcher/Documents/Graduate School/FofanovResearch/INF685/data/Chromosone17.fasta"]
# The third argument is a string prefix for the output file names, hence the trailing slash.
find_kmers(fgs, bg, "/Users/thatcher/Documents/Graduate School/FofanovResearch/INF685/data/")
end = time.time()
# Elapsed time in seconds.
print("Time Taken: {}".format(end - start))

Time Taken: 32.71850323677063


In [35]:
# Prints the CSV file name that find_kmers builds from a foreground and a
# background header line.
# NOTE(review): `fg` is not assigned at top level in any visible cell, and the
# top-level `bg` in the timing cell is a file path rather than a header line, so
# this cell presumably relied on earlier kernel state -- confirm it still runs
# after Restart & Run All, or remove it.
print('fg' + fg.split(' ')[0].strip('>') + '_' + 'bg' + bg.split(' ')[0].strip('>') + '_optimal_mers.csv')


fgCM000678.2_bgNC_004350.2_optimal_mers.csv
