In [14]:
import math

import numdifftools
import numpy as np
import pandas as pd
import scipy
import scipy.optimize
from fisher import pvalue
from lmfit import Minimizer, Parameters, fit_report

In [15]:
# Project layout: every working directory is a sub-folder of one root folder.
root_dir = '/home/xoel/Escritorio/mastersthesis/'
data_dir = f'{root_dir}data/'        # input tables are read from here
scripts_dir = f'{root_dir}scripts/'
results_dir = f'{root_dir}results/'
plots_dir = f'{root_dir}plots/'

In [16]:
def makeSfs(x, cum=False):
    """Build the frequency-spectrum table and the divergence table.

    Parameters
    ----------
    x : pandas.DataFrame
        Must provide the columns ``daf0f`` and ``daf4f`` (strings of
        ';'-separated integer counts) and the numeric columns ``mi``,
        ``di``, ``m0`` and ``d0``.
    cum : bool, default False
        When true, the frequency table is passed through ``cumulative``
        before being returned.

    Returns
    -------
    (daf, div) : tuple of pandas.DataFrame
        ``daf`` has the columns ``daf`` (bin values 0.025, 0.075, ...),
        ``Pi`` (per-bin totals of ``daf0f``) and ``P0`` (per-bin totals of
        ``daf4f``).  ``div`` is a one-row frame with the column totals
        ``mi``, ``Di``, ``m0`` and ``D0``.
    """
    def _binned_totals(column):
        # One row per record, one column per bin; then add the records up.
        parsed = [[int(count) for count in record.split(';')]
                  for record in x[column]]
        return pd.DataFrame(parsed).sum()

    bin_values = pd.Series(np.arange(0.025, 0.980, 0.05))
    daf = pd.concat(
        [bin_values, _binned_totals('daf0f'), _binned_totals('daf4f')],
        axis='columns',
        ignore_index=True,
    )
    daf.columns = ['daf', 'Pi', 'P0']

    totals = x[['mi', 'di', 'm0', 'd0']].sum()
    div = pd.DataFrame(totals, dtype=int).transpose()
    div = div.rename(columns={'di': 'Di', 'd0': 'D0'})

    if cum:
        daf = cumulative(daf)

    return daf, div

In [17]:
def cumulative(x):
    """Return a copy of ``x`` whose ``P0``/``Pi`` columns are cumulative.

    Row ``i`` of the result holds the sum of the input counts from row ``i``
    to the last row (row 0 therefore holds the column total).  As soon as
    the remaining count of *either* column is no longer positive, both
    columns are set to 0 for that row and every following row.

    Parameters
    ----------
    x : pandas.DataFrame
        Table with at least the columns ``P0`` and ``Pi``.  Rows are
        processed in positional order, whatever the index labels are.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is left untouched so the call can be
        repeated on the same data without accumulating twice.
    """
    out = x.copy()
    n_rows = len(out)
    if n_rows == 0:
        # Nothing to accumulate.
        return out

    # Plain lists give positional access independent of the frame's index.
    p0_counts = out['P0'].tolist()
    pi_counts = out['Pi'].tolist()

    psyn = [out['P0'].sum()] + [0] * (n_rows - 1)
    pnsyn = [out['Pi'].sum()] + [0] * (n_rows - 1)
    for i in range(1, n_rows):
        remaining_syn = psyn[i - 1] - p0_counts[i - 1]
        remaining_nsyn = pnsyn[i - 1] - pi_counts[i - 1]
        # Both classes must still have counts left; otherwise the row stays 0.
        if remaining_syn > 0 and remaining_nsyn > 0:
            psyn[i] = remaining_syn
            pnsyn[i] = remaining_nsyn

    out['P0'] = psyn
    out['Pi'] = pnsyn

    return out

In [18]:
def eMKT(daf, div, cutoff=0.15):
    """Compute alpha and related fractions after splitting ``Pi`` at ``cutoff``.

    Parameters
    ----------
    daf : pandas.DataFrame
        Table with the columns ``daf`` (frequency of each bin), ``Pi`` and
        ``P0`` (polymorphism counts per bin).
    div : mapping or one-row pandas.DataFrame
        Provides ``D0``, ``Di``, ``m0`` and ``mi``; each entry must be a
        scalar or hold exactly one element.
    cutoff : float, default 0.15
        Bins with ``daf <= cutoff`` form the low-frequency class.

    Returns
    -------
    dict
        Keys ``alpha``, ``neg_b``, ``neg_f``, ``neg_d``, ``omega``,
        ``omegaA``, ``omegaD`` and ``pvalue`` (two-tailed value returned by
        ``pvalue`` for the table ``P0, D0, Pi - deleterious, Di``).
    """
    def _count(key):
        # ``div[key]`` is a one-element Series when ``div`` is a one-row
        # frame; ``int(Series)`` is deprecated in recent pandas, so extract
        # the single element explicitly (also accepts plain scalars).
        return int(np.asarray(div[key]).item())

    P0 = daf['P0'].sum()
    Pi = daf['Pi'].sum()
    D0 = _count('D0')
    Di = _count('Di')
    m0 = _count('m0')
    mi = _count('mi')

    ### Divergence metrics
    Ka = Di/mi
    Ks = D0/m0
    omega = Ka/Ks

    ### Estimating alpha with Pi/P0 ratio
    low = daf['daf'] <= cutoff
    high = daf['daf'] > cutoff
    PiMinus = daf.loc[low, 'Pi'].sum()
    PiGreater = daf.loc[high, 'Pi'].sum()
    P0Minus = daf.loc[low, 'P0'].sum()
    P0Greater = daf.loc[high, 'P0'].sum()

    # Low-frequency Pi in excess of what the P0 low/high ratio predicts.
    ratioP0 = P0Minus/P0Greater
    deleterious = PiMinus - (PiGreater * ratioP0)
    PiNeutral = Pi - deleterious

    alphaC = 1 - ((PiNeutral/P0)*(D0/Di))

    ## Estimation of b: weakly deleterious
    b = (deleterious/P0)*(m0/mi)

    ## Estimation of f: neutral sites
    f = (m0*PiNeutral)/(mi*P0)

    ## Estimation of d, strongly deleterious sites
    d = 1 - (f+b)

    pv = pvalue(P0, D0, PiNeutral, Di).two_tail

    ## Omega A and Omega D
    omegaA = omega * alphaC
    omegaD = omega - omegaA

    return {'alpha': alphaC,
            'neg_b':b,
            'neg_f':f,
            'neg_d':d,
            'omega': omega,
            'omegaA':omegaA,
            'omegaD':omegaD,
            'pvalue': pv}

In [19]:
def aMKT(daf, div, xlow=0, xhigh=1):
    """Summarise the asymptotic fit returned by ``asymptoticMKExp``.

    Parameters
    ----------
    daf : pandas.DataFrame
        Table with the columns ``daf``, ``Pi`` and ``P0``.
    div : mapping or one-row pandas.DataFrame
        Provides ``D0``, ``Di``, ``m0`` and ``mi``; each entry must be a
        scalar or hold exactly one element.
    xlow, xhigh : float, default 0 and 1
        Forwarded unchanged to ``asymptoticMKExp``.

    Returns
    -------
    dict
        Keys ``alpha``, ``neg_b``, ``neg_f``, ``neg_d``, ``omega``,
        ``omegaA``, ``omegaD``, ``ciLow`` and ``ciHigh``.  ``omegaD`` is the
        scalar ``omega - omegaA``, the same definition ``eMKT`` uses.
    """
    def _count(key):
        # ``div[key]`` is a one-element Series when ``div`` is a one-row
        # frame; ``int(Series)`` is deprecated in recent pandas, so extract
        # the single element explicitly (also accepts plain scalars).
        return int(np.asarray(div[key]).item())

    P0 = daf['P0'].sum()
    Pi = daf['Pi'].sum()
    D0 = _count('D0')
    Di = _count('Di')
    m0 = _count('m0')
    mi = _count('mi')

    ### Divergence metrics
    Ka = Di/mi
    Ks = D0/m0
    omega = Ka/Ks

    ## Run asymptotic MKT and retrieve alphas
    asymptoticMkTable = asymptoticMKExp(daf, div, xlow, xhigh)
    alphaAsymptotic   = asymptoticMkTable['alphaAsymptotic']
    alphaCiLow        = asymptoticMkTable['ciLow']
    alphaCiHigh       = asymptoticMkTable['ciHigh']

    ## Estimate the synonymous and non-synonymous ratio
    synonymousRatio = P0/m0
    nonSynonymousRatio = Pi/mi

    ## Estimate the fraction of neutral sites including weakly deleterious variants
    fb = nonSynonymousRatio/synonymousRatio

    ## Estimate the fraction of strongly deleterious sites (d)
    d = 1 - fb

    # Estimate the fraction of slightly deleterious sites (b) from the
    # per-bin excess of Pi over the level implied by the asymptotic alpha.
    excessPi = daf['Pi'] - (((1 - alphaAsymptotic) * Di * daf['P0'])/D0)
    b = (excessPi.sum()/daf['P0'].sum())*(m0/mi)

    # Re-estimate the fraction of neutral sites, removing the slightly deleterious
    f = fb - b

    ## Omega A and Omega D
    omegaA = omega * alphaAsymptotic
    # Previously the per-bin Series above was returned under 'omegaD'.
    omegaD = omega - omegaA

    return {'alpha': alphaAsymptotic,
            'neg_b':b,
            'neg_f':f,
            'neg_d':d,
            'omega': omega,
            'omegaA':omegaA,
            'omegaD':omegaD,
            'ciLow':alphaCiLow,
            'ciHigh': alphaCiHigh}

In [20]:
 def asymptoticMKExp(daf, div, xlow, xhigh, seed=None):
     """Fit ``exp_model`` to per-bin alpha values and evaluate it at 1.

     Parameters
     ----------
     daf : pandas.DataFrame
         Columns ``daf`` (derived allele frequencies), ``Pi``
         (non-synonymous polymorphism) and ``P0`` (synonymous polymorphism).
     div : mapping or one-row pandas.DataFrame
         Provides ``Di`` (non-synonymous divergence) and ``D0`` (synonymous
         divergence); each must be a scalar or hold exactly one element.
     xlow, xhigh : float
         Only bins with ``xlow <= daf <= xhigh`` enter the fit.
     seed : int or None, default None
         Seed for the random draws behind the confidence interval.  With
         ``None`` the global ``numpy.random`` state is used.

     Returns
     -------
     dict
         ``a``, ``b``, ``c`` (fitted parameters), ``alphaAsymptotic``
         (``exp_model`` evaluated at 1) and ``ciLow``/``ciHigh`` (2.5 % and
         97.5 % quantiles of the simulated values).

     Raises
     ------
     ValueError
         If ``Di`` is zero, since the per-bin alpha is then undefined.
     """
     if (daf['P0']==0).any():
         print('Input daf file contains P0 values = 0.\nThis can bias the function fitting and the estimation of alpha.')

     f  = daf['daf']
     p  = daf['Pi']
     p0 = daf['P0']

     # ``div[...]`` is a one-element Series for a one-row frame; float(Series)
     # is deprecated in recent pandas, so pull the element out explicitly.
     d  = float(np.asarray(div['Di']).item())
     d0 = float(np.asarray(div['D0']).item())
     if d == 0:
         raise ValueError('Di must be non-zero to compute alpha.')
     d_ratio = d0/d

     ## Compute alpha values and trim
     alpha = 1 - d_ratio * (p/p0)
     # Bins with P0 == 0 give NaN/inf, which curve_fit rejects; leave them
     # out of the fit instead of failing after the warning above.
     keep = (f >= xlow) & (f <= xhigh) & np.isfinite(alpha)
     f_trimmed     = f[keep]
     alpha_trimmed = alpha[keep]

     ## Least-squares fit of the exponential model
     popt, pcov = scipy.optimize.curve_fit(exp_model, f_trimmed, alpha_trimmed)
     a, b, c = popt

     ## alpha for predicted model
     alpha_asymp = exp_model(1,a,b,c)

     ## Confidence interval from simulated parameter sets.
     # The first variable is the evaluation point (1.0) with zero variance,
     # so only a, b and c vary between draws.
     vcov = np.zeros((4, 4))
     vcov[1:, 1:] = pcov
     rng = np.random if seed is None else np.random.RandomState(seed)
     simpars = rng.multivariate_normal(mean=[1.0, a, b, c], cov=vcov, size=10000)
     simulated = exp_model(simpars[:, 0], simpars[:, 1], simpars[:, 2], simpars[:, 3])
     ciLow, ciHigh = np.quantile(simulated, [0.025,0.975])

     return {'a':a,
             'b':b,
             'c':c,
             'alphaAsymptotic': alpha_asymp,
             'ciLow':ciLow,
             'ciHigh':ciHigh}

In [21]:
def exp_model(f_trimmed, a, b, c):
    """Evaluate ``a + b * exp(-c * f_trimmed)``.

    ``f_trimmed`` may be a scalar or an array-like; the result has the
    shape produced by NumPy broadcasting of the four arguments.
    """
    decay = np.exp(-c * f_trimmed)
    return a + b * decay

In [22]:
# Table read with a two-row column header; its first column becomes the index.
genes = pd.read_csv(
    data_dir + 'aa_genes.csv',
    header=[0, 1],
    index_col=0,
)
# Tab-separated table; later cells filter it by its 'id' and 'pop' columns.
data = pd.read_csv(
    data_dir + 'metaPopsori.tsv',
    sep='\t',
)

In [23]:
# Index labels of the rows whose ('W1', 'OFC') column equals 1.
gene_list = genes.loc[genes['W1', 'OFC'] == 1].index.values
# Keep the records of those ids that belong to the 'EUR' population.
df = data[data['id'].isin(gene_list) & (data['pop'] == 'EUR')]
# Frequency table (cumulative form) and divergence table for that subset.
daf, div = makeSfs(df, cum=True)

In [24]:
%%timeit
# Time one eMKT call on the module-level daf/div tables (default cutoff).
eMKT(daf, div)

46.6 ms ± 1.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [25]:
%%timeit
# Time one aMKT call on the module-level daf/div tables (default xlow/xhigh).
aMKT(daf, div)

60.5 ms ± 1.15 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
# For every column of `genes`, print the index labels of the rows equal to 0.
for col in genes.columns:
    print(genes[genes[col] == 0].index)

In [29]:
# For every column of `genes`, print the index labels of the rows equal to 0.
for col in genes.columns:
    print(genes[genes[col] == 0].index)

Index(['ENSG00000001626', 'ENSG00000002726', 'ENSG00000004468',
       'ENSG00000004846', 'ENSG00000004948', 'ENSG00000005073',
       'ENSG00000005187', 'ENSG00000005421', 'ENSG00000005981',
       'ENSG00000006059',
       ...
       'ENSG00000279169', 'ENSG00000279408', 'ENSG00000279486',
       'ENSG00000279493', 'ENSG00000279765', 'ENSG00000279956',
       'ENSG00000279983', 'ENSG00000280090', 'ENSG00000280109',
       'ENSG00000280148'],
      dtype='object', length=3805)
Index(['ENSG00000001626', 'ENSG00000002726', 'ENSG00000004468',
       'ENSG00000004809', 'ENSG00000004846', 'ENSG00000004948',
       'ENSG00000005001', 'ENSG00000005020', 'ENSG00000005073',
       'ENSG00000005102',
       ...
       'ENSG00000279408', 'ENSG00000279486', 'ENSG00000279493',
       'ENSG00000279765', 'ENSG00000279956', 'ENSG00000279983',
       'ENSG00000280090', 'ENSG00000280109', 'ENSG00000280148',
       'ENSG00000280165'],
      dtype='object', length=4473)
Index(['ENSG00000001626', 'ENSG000

Index(['ENSG00000001626', 'ENSG00000002726', 'ENSG00000004809',
       'ENSG00000004846', 'ENSG00000004948', 'ENSG00000005001',
       'ENSG00000005073', 'ENSG00000005102', 'ENSG00000005187',
       'ENSG00000005421',
       ...
       'ENSG00000279073', 'ENSG00000279169', 'ENSG00000279408',
       'ENSG00000279486', 'ENSG00000279493', 'ENSG00000279765',
       'ENSG00000279956', 'ENSG00000279983', 'ENSG00000280090',
       'ENSG00000280148'],
      dtype='object', length=4045)
Index(['ENSG00000002726', 'ENSG00000004809', 'ENSG00000004846',
       'ENSG00000004948', 'ENSG00000005001', 'ENSG00000005187',
       'ENSG00000005421', 'ENSG00000005981', 'ENSG00000006453',
       'ENSG00000006606',
       ...
       'ENSG00000279073', 'ENSG00000279169', 'ENSG00000279408',
       'ENSG00000279486', 'ENSG00000279493', 'ENSG00000279765',
       'ENSG00000279956', 'ENSG00000279983', 'ENSG00000280090',
       'ENSG00000280148'],
      dtype='object', length=3553)
Index(['ENSG00000002726', 'ENSG000

Index(['ENSG00000000460', 'ENSG00000001626', 'ENSG00000002726',
       'ENSG00000004809', 'ENSG00000004846', 'ENSG00000004939',
       'ENSG00000004948', 'ENSG00000005001', 'ENSG00000005073',
       'ENSG00000005102',
       ...
       'ENSG00000279408', 'ENSG00000279486', 'ENSG00000279493',
       'ENSG00000279765', 'ENSG00000279956', 'ENSG00000279968',
       'ENSG00000279983', 'ENSG00000280071', 'ENSG00000280090',
       'ENSG00000280148'],
      dtype='object', length=3377)
Index(['ENSG00000000460', 'ENSG00000001626', 'ENSG00000004809',
       'ENSG00000004846', 'ENSG00000004948', 'ENSG00000005187',
       'ENSG00000006788', 'ENSG00000007038', 'ENSG00000007216',
       'ENSG00000007306',
       ...
       'ENSG00000279073', 'ENSG00000279169', 'ENSG00000279408',
       'ENSG00000279486', 'ENSG00000279493', 'ENSG00000279765',
       'ENSG00000279956', 'ENSG00000279968', 'ENSG00000279983',
       'ENSG00000280148'],
      dtype='object', length=2031)
Index(['ENSG00000000460', 'ENSG000

Index(['ENSG00000002726', 'ENSG00000004809', 'ENSG00000004846',
       'ENSG00000004948', 'ENSG00000005073', 'ENSG00000005102',
       'ENSG00000005187', 'ENSG00000006788', 'ENSG00000007216',
       'ENSG00000007306',
       ...
       'ENSG00000279073', 'ENSG00000279169', 'ENSG00000279408',
       'ENSG00000279486', 'ENSG00000279493', 'ENSG00000279765',
       'ENSG00000279956', 'ENSG00000279983', 'ENSG00000280090',
       'ENSG00000280148'],
      dtype='object', length=2115)
Index(['ENSG00000001626', 'ENSG00000002726', 'ENSG00000004809',
       'ENSG00000004846', 'ENSG00000005073', 'ENSG00000005102',
       'ENSG00000005187', 'ENSG00000006788', 'ENSG00000007306',
       'ENSG00000009765',
       ...
       'ENSG00000279073', 'ENSG00000279169', 'ENSG00000279408',
       'ENSG00000279486', 'ENSG00000279493', 'ENSG00000279765',
       'ENSG00000279956', 'ENSG00000279983', 'ENSG00000280090',
       'ENSG00000280148'],
      dtype='object', length=1914)
Index(['ENSG00000001626', 'ENSG000