In [56]:
import pandas as pd
from math import sqrt, log
import sys

def temp(s, DNA_c=5000.0, Na_c=10.0, Mg_c=20.0, dNTPs_c=10.0, uncorrected=False):
    '''
    Returns the DNA/DNA melting temp using nearest-neighbor thermodynamics.

    This function returns better results than EMBOSS DAN because it uses updated
    thermodynamics values and takes into account initialization parameters from
    the work of SantaLucia (1998).

    Corrects for mono- and divalent cation concentrations.

    Arguments:
    - DNA_c:   DNA concentration [nM]
    - Na_c:    Na+ concentration [mM]
    - Mg_c:    Mg2+ concentration [mM]
    - dNTPs_c: dNTP concentration [mM]
    - correction: correct for cation concentration?
    '''

    R = 1.987    # Universal gas constant (cal/(K*mol))
    s = s.upper()
    dh, ds = _tercorr(s)
    k = DNA_c * 1e-9

    # Adapted from Table 1 in Allawi and SantaLucia (1997).
    # delta H (kcal/mol)
    dh_coeffs = {"AA": -7.9, "TT": -7.9,
                 "AT": -7.2,
                 "TA": -7.2,
                 "CA": -8.5, "TG": -8.5,
                 "GT": -8.4, "AC": -8.4,
                 "CT": -7.8, "AG": -7.8,
                 "GA": -8.2, "TC": -8.2,
                 "CG": -10.6,
                 "GC": -9.8,
                 "GG": -8.0, "CC": -8.0}

    # delta S (eu)
    ds_coeffs = {"AA": -22.2, "TT": -22.2,
                 "AT": -20.4,
                 "TA": -21.3,
                 "CA": -22.7, "TG": -22.7,
                 "GT": -22.4, "AC": -22.4,
                 "CT": -21.0, "AG": -21.0,
                 "GA": -22.2, "TC": -22.2,
                 "CG": -27.2,
                 "GC": -24.4,
                 "GG": -19.9, "CC": -19.9}

    # Multiplies the number of times each nuc pair is in the sequence by the
    # appropriate coefficient, then returns the sum of all the pairs
    dh = dh + \
        sum(_overcount(s, pair) * coeff for pair, coeff in dh_coeffs.items())
    ds = ds + \
        sum(_overcount(s, pair) * coeff for pair, coeff in ds_coeffs.items())

    fgc = len([filter(lambda x: x == 'G' or x == 'C', s)]) / float(len(s))

    # Melting temperature
    tm = (1000 * dh) / (ds + (R * log(k)))

    if uncorrected:
        return tm - 273.15

    MNa = Na_c * 1e-3
    MMg = Mg_c * 1e-3
    MdNTPs = dNTPs_c * 1e-3

    # Free magnesium concentration
    Ka = 3e4  # association constant in biological buffers
    D = (Ka * MdNTPs - Ka * MMg + 1)**2 + (4 * Ka * MMg)
    Fmg = (-(Ka * MdNTPs - Ka * MMg + 1) + sqrt(D)) / (2 * Ka)

    cation_ratio = sqrt(Fmg) / MNa if MNa > 0 else 7.0

    if cation_ratio < 0.22:
        tm = 1 / (
            (1 / tm) +
            ((4.29 * fgc - 3.95) * log(MNa) + 0.94 * log(MNa)**2) * 1e-5)
    else:
        a = 3.92
        d = 1.42
        g = 8.31
        Fmg = MMg
        if cation_ratio < 6.0:
            a = a * (0.843 - 0.352 * sqrt(MNa) * log(MNa))
            d = d * \
                (1.279 - 4.03 * log(MNa) * 1e-3 - 8.03 * log(MNa)**2 * 1e-3)
            g = g * (0.486 - 0.258 * log(MNa) + 5.25 * log(MNa)**3 * 1e-3)
        tm = 1 / (
            (1 / tm) +
            (a - 0.911 * log(Fmg) + fgc * (6.26 + d * log(Fmg)) +
             1 / (2 * (len(s) - 1)) * (-48.2 + 52.5 * log(Fmg) +
                                       g * log(Fmg)**2)) * 1e-5)

    return tm - 273.15

def kmers(datafile, kmers=16):
    value_dict = []
    with open(datafile, 'r') as myfile:
        next(myfile)
        data = myfile.read().replace('\n', '')
    for i in range(len(data) - kmers):
        value_dict.append(data[i:i+kmers])
    s = pd.Series(value_dict, name='Kmer')
    df = pd.DataFrame(s)
    return df

def find_kmers(df, background):
    with open(background, 'r') as myfile:
        next(myfile)
        data = myfile.read().replace('\n', '')
    
    def find_kmer(x):
        return data.find(x)
    
    def get_melting_temp(x):
        return temp(x)
        
            
    df['Location'] = df['Kmer'].map(find_kmer)
        
    frame = df.loc[df['Location'] != -1]
    
    frame['MeltTmp'] = frame['Kmer'].map(get_melting_temp)
    
    return frame
    
    
        
        
df = kmers("/Users/thatcher/Documents/Graduate School/FofanovResearch/data/Chromosone16.fasta")
df = find_kmers(df[0:2000], "/Users/thatcher/Documents/Graduate School/FofanovResearch/data/GCF_000007465.2_ASM746v2_genomic.fna")
df
        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Kmer,Location,MeltTmp
1541,TTTGCTGTTCCTGCAT,1923021,66.947479
1627,TTTTCTTTGACCTCTT,1626715,59.619402
1628,TTTCTTTGACCTCTTC,1626716,60.116533
