In [1]:
#!/usr/bin/env python

import pandas as pd
import numpy as np
from fisher import pvalue
from scipy import optimize
import multiprocessing
import multiprocessing.pool
import copy

In [2]:
class NoDaemonProcess(multiprocessing.Process):
    """Process subclass whose ``daemon`` attribute is pinned to ``False``.

    Reading ``daemon`` always yields ``False`` and assigning to it is silently
    ignored, so nothing can flag an instance as daemonic. (The multiprocessing
    documentation states that daemonic processes may not create children.)
    """

    def _read_daemon(self):
        # Always report a non-daemonic process.
        return False

    def _discard_daemon(self, value):
        # Deliberately drop whatever value the caller tries to set.
        pass

    daemon = property(_read_daemon, _discard_daemon)


class NoDaemonContext(type(multiprocessing.get_context())):
    """Multiprocessing context, derived from the class of the default context,
    that uses ``NoDaemonProcess`` as its process class."""
    # Processes created through this context are NoDaemonProcess instances.
    Process = NoDaemonProcess

class MyPool(multiprocessing.pool.Pool):
    """Worker pool whose processes are created through ``NoDaemonContext``.

    ``multiprocessing.pool.Pool`` is sub-classed instead of
    ``multiprocessing.Pool`` because the latter is only a wrapper function,
    not a proper class. Any ``context`` keyword passed by the caller is
    replaced by a fresh ``NoDaemonContext``.
    """

    def __init__(self, *args, **kwargs):
        kwargs.update(context=NoDaemonContext())
        super().__init__(*args, **kwargs)

In [3]:
def makeSfs(x, cum=False):
    """Aggregate per-row records into a site-frequency table and divergence totals.

    Parameters
    ----------
    x : pandas.DataFrame
        Must provide the columns ``daf0f`` and ``daf4f`` (each cell a
        ';'-separated string of integer counts, one per frequency bin) and the
        numeric columns ``mi``, ``di``, ``m0`` and ``d0``.
    cum : bool, default False
        When True the frequency table is passed through ``cumulative`` before
        being returned.

    Returns
    -------
    (daf, div)
        ``daf`` has columns ``daf`` (bin centres 0.025, 0.075, ... produced by
        ``np.arange(0.025, 0.980, 0.05)``), ``Pi`` (bin-wise sum of ``daf0f``)
        and ``P0`` (bin-wise sum of ``daf4f``). ``div`` is a one-row frame with
        the integer column totals ``mi``, ``Di``, ``m0`` and ``D0``.
    """
    def _bin_totals(column):
        # Parse every ';'-separated cell into a list of ints, then add the rows
        # together bin by bin.
        parsed = [[int(token) for token in cell.split(';')] for cell in x[column]]
        return pd.DataFrame(parsed).sum()

    bin_centres = pd.Series(np.arange(0.025, 0.980, 0.05))
    nonsyn_counts = _bin_totals('daf0f')
    syn_counts = _bin_totals('daf4f')

    daf = pd.concat([bin_centres, nonsyn_counts, syn_counts], axis=1, ignore_index=True)
    daf.columns = ['daf', 'Pi', 'P0']

    site_totals = x[['mi', 'di', 'm0', 'd0']].sum()
    div = pd.DataFrame(site_totals, dtype=int).transpose()
    div = div.rename(columns={'di': 'Di', 'd0': 'D0'})

    return (cumulative(daf) if cum else daf), div


def cumulative(x):
    """Convert per-bin ``P0``/``Pi`` counts into "at or above this bin" counts.

    The first row receives the column totals; every following row receives the
    previous cumulative value minus the previous bin's original count. As soon
    as either the ``P0`` or the ``Pi`` remainder is not strictly positive, both
    are set to 0 for that row.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame with numeric columns ``P0`` and ``Pi``. Rows are processed in
        their positional order, so any index is accepted.

    Returns
    -------
    pandas.DataFrame
        The same object ``x``: its ``P0`` and ``Pi`` columns are overwritten in
        place and the frame is returned for convenience.
    """
    n_bins = len(x)
    if n_bins == 0:
        # Nothing to accumulate; writing a one-element list into an empty
        # frame would fail.
        return x

    # Positional copies of the original counts. Label-based lookups such as
    # x['P0'][i - 1] raise KeyError when the index is not 0..n-1.
    syn_counts = x['P0'].tolist()
    nonsyn_counts = x['Pi'].tolist()

    psyn = [x['P0'].sum()]
    pnsyn = [x['Pi'].sum()]
    for i in range(1, n_bins):
        remaining_syn = psyn[-1] - syn_counts[i - 1]
        remaining_nonsyn = pnsyn[-1] - nonsyn_counts[i - 1]
        if remaining_syn > 0 and remaining_nonsyn > 0:
            psyn.append(remaining_syn)
            pnsyn.append(remaining_nonsyn)
        else:
            # One class is exhausted: zero both so the pair stays consistent.
            psyn.append(0)
            pnsyn.append(0)

    x['P0'] = psyn
    x['Pi'] = pnsyn

    return x


def eMKT(daf, div, cutoff=0.15):
    """Compute the cutoff-based MKT statistics from a frequency table.

    Parameters
    ----------
    daf : pandas.DataFrame
        Columns ``daf`` (frequency of each bin), ``Pi`` and ``P0`` (counts).
    div : mapping or pandas.DataFrame
        Provides ``D0``, ``Di``, ``m0`` and ``mi``; each entry must hold exactly
        one value (a scalar or a one-row column).
    cutoff : float, default 0.15
        Bins with ``daf <= cutoff`` form the low-frequency class, bins with
        ``daf > cutoff`` the high-frequency class.

    Returns
    -------
    dict
        Keys ``Ka``, ``Ks``, ``omega``, ``alpha``, ``neg_b``, ``neg_f``,
        ``neg_d``, ``pvalue``, ``omegaA`` and ``omegaD``.
    """
    def _single_int(key):
        # ``int(series)`` on a one-element Series is deprecated in recent
        # pandas; going through a NumPy array accepts plain scalars as well as
        # one-element columns and still raises when several values are present.
        return int(np.asarray(div[key]).item())

    res = {}

    P0 = daf['P0'].sum()
    Pi = daf['Pi'].sum()
    D0 = _single_int('D0')
    Di = _single_int('Di')
    m0 = _single_int('m0')
    mi = _single_int('mi')

    # Divergence metrics
    res['Ka'] = Di / mi
    res['Ks'] = D0 / m0
    res['omega'] = res['Ka'] / res['Ks']

    # Split the counts at the frequency cutoff (masks are built once and reused).
    low_freq = daf['daf'] <= cutoff
    high_freq = daf['daf'] > cutoff
    PiMinus = daf.loc[low_freq, 'Pi'].sum()
    PiGreater = daf.loc[high_freq, 'Pi'].sum()
    P0Minus = daf.loc[low_freq, 'P0'].sum()
    P0Greater = daf.loc[high_freq, 'P0'].sum()

    # Pi counts below the cutoff in excess of what the P0 low/high ratio
    # predicts are attributed to (weakly) deleterious variants.
    ratioP0 = P0Minus / P0Greater
    deleterious = PiMinus - (PiGreater * ratioP0)
    PiNeutral = Pi - deleterious

    # Alpha from the corrected Pi/P0 ratio
    res['alpha'] = 1 - ((PiNeutral / P0) * (D0 / Di))

    # Estimation of b: weakly deleterious
    res['neg_b'] = (deleterious / P0) * (m0 / mi)

    # Estimation of f: neutral sites
    res['neg_f'] = (m0 * PiNeutral) / (mi * P0)

    # Estimation of d: strongly deleterious sites
    res['neg_d'] = 1 - (res['neg_f'] + res['neg_b'])

    # Two-tailed p-value from ``pvalue`` on (P0, D0, corrected Pi, Di).
    res['pvalue'] = pvalue(P0, D0, PiNeutral, Di).two_tail

    # Omega A and Omega D
    res['omegaA'] = res['omega'] * res['alpha']
    res['omegaD'] = res['omega'] - res['omegaA']

    return res


def aMKT(daf, div, xlow=0, xhigh=1):
    """Compute the asymptotic MKT statistics from a frequency table.

    The exponential fit is delegated to ``amkt_fit``. If that fit raises
    ``RuntimeError``, the table is re-binned by merging consecutive pairs of
    rows and the fit is attempted once more on the coarser table; success of
    this second attempt is flagged with ``res['daf10'] = True``.

    Parameters
    ----------
    daf : pandas.DataFrame
        Columns ``daf``, ``Pi`` and ``P0``. The re-binning fallback assigns 20
        new bin labels, so it requires exactly 20 rows.
    div : mapping or pandas.DataFrame
        Provides ``D0``, ``Di``, ``m0`` and ``mi``; each entry must hold exactly
        one value (a scalar or a one-row column).
    xlow, xhigh : float
        Frequency range forwarded to ``amkt_fit``.

    Returns
    -------
    dict
        Always contains ``Ka``, ``Ks``, ``omega`` and ``neg_d``. When a fit
        succeeds it also contains everything returned by ``amkt_fit`` plus
        ``neg_b``, ``neg_f``, ``omegaA`` and ``omegaD``. When both fits fail the
        partial dictionary is returned as is.
    """
    def _single_int(key):
        # ``int(series)`` on a one-element Series is deprecated in recent
        # pandas; going through a NumPy array accepts plain scalars as well as
        # one-element columns and still raises when several values are present.
        return int(np.asarray(div[key]).item())

    res = {}

    P0 = daf['P0'].sum()
    Pi = daf['Pi'].sum()
    D0 = _single_int('D0')
    Di = _single_int('Di')
    m0 = _single_int('m0')
    mi = _single_int('mi')

    # Divergence metrics
    res['Ka'] = Di / mi
    res['Ks'] = D0 / m0
    res['omega'] = res['Ka'] / res['Ks']

    # Estimate the synonymous and non-synonymous ratio
    synonymousRatio = P0 / m0
    nonSynonymousRatio = Pi / mi

    # Estimate the fraction of neutral sites including weakly deleterious variants
    fb = nonSynonymousRatio / synonymousRatio

    # Estimate the fraction of strongly deleterious sites (d)
    res['neg_d'] = 1 - fb

    try:
        # Run the asymptotic fit on the table as given and collect its outputs.
        res.update(amkt_fit(daf, div, xlow, xhigh))
    except RuntimeError:
        # Fallback: relabel the rows pairwise (0.05, 0.05, 0.15, 0.15, ...) and
        # sum each pair, giving a 10-row table. The re-binned table, not the
        # original one, must be grouped and refitted; otherwise this branch
        # merely repeats the fit that has just failed.
        daf10 = daf.copy(deep=True)
        daf10['daf'] = np.repeat([x / 100 for x in range(5, 100, 10)], 2)
        daf10 = daf10.groupby('daf', as_index=False).sum()

        try:
            res.update(amkt_fit(daf10, div, xlow, xhigh))
            res['daf10'] = True
        except RuntimeError:
            # No fit available: return the divergence-only statistics.
            return res

    # Estimate the fraction of slightly deleterious sites in each daf category (b).
    # Only column totals are used, so the original binning can be kept here.
    omegaD = daf['Pi'] - (((1 - res['alpha']) * Di * daf['P0']) / D0)
    res['neg_b'] = (omegaD.sum() / daf['P0'].sum()) * (m0 / mi)

    # Re-estimate the fraction of truly neutral sites, removing the slightly deleterious ones
    res['neg_f'] = fb - res['neg_b']

    # Omega A and Omega D
    res['omegaA'] = res['omega'] * res['alpha']
    res['omegaD'] = res['omega'] - res['omegaA']

    return res


def amkt_fit(daf, div, xlow, xhigh):
    """Fit ``exp_model`` to per-bin alpha values and evaluate it at frequency 1.

    Per-bin alpha is ``1 - (D0 / Di) * (Pi / P0)``. Bins with ``daf`` inside
    ``[xlow, xhigh]`` and a finite alpha are fitted with
    ``scipy.optimize.curve_fit``, trying the methods ``'lm'``, ``'trf'`` and
    ``'dogbox'`` in that order until one succeeds.

    Parameters
    ----------
    daf : pandas.DataFrame
        Columns ``daf``, ``Pi`` and ``P0``.
    div : mapping or pandas.DataFrame
        Provides ``D0`` and ``Di``; the ratio must hold exactly one value.
    xlow, xhigh : float
        Inclusive frequency bounds of the bins used in the fit.

    Returns
    -------
    dict
        ``a``, ``b``, ``c`` (fitted parameters), ``alpha`` (model value at
        frequency 1.0) and ``ciLow``/``ciHigh`` (2.5% and 97.5% quantiles of the
        model value over 10000 parameter draws). The draws use NumPy's global
        random state, which is not seeded here, so the interval varies slightly
        between runs.

    Raises
    ------
    RuntimeError
        If none of the three fitting methods succeeds.
    """
    if (daf['P0'] == 0).any():
        print('Input daf file contains P0 values = 0.\nThis can bias the function fitting and the estimation of alpha.')

    res = {}

    # ``float(series)`` on a one-element Series is deprecated in recent pandas;
    # going through a NumPy array accepts scalars and one-element columns alike.
    d_ratio = float(np.asarray(div['D0'] / div['Di']).item())

    # Compute alpha values and trim. Bins where alpha is NaN/inf (P0 == 0) are
    # left out as well: curve_fit rejects non-finite input with ValueError,
    # which callers catching RuntimeError would not handle.
    alpha = 1 - d_ratio * (daf['Pi'] / daf['P0'])
    trim = (daf['daf'] >= xlow) & (daf['daf'] <= xhigh) & np.isfinite(alpha)
    freqs = daf['daf'][trim]
    alphas = alpha[trim]

    # Try the available least-squares methods in turn; the first success wins.
    for method in ('lm', 'trf', 'dogbox'):
        try:
            model = optimize.curve_fit(exp_model, freqs, alphas, method=method)
            break
        except RuntimeError as err:
            last_error = err
    else:
        raise RuntimeError("Couldn't fit any method") from last_error

    popt, pcov = model[0], model[1]
    res['a'] = popt[0]
    res['b'] = popt[1]
    res['c'] = popt[2]

    # alpha for predicted model
    res['alpha'] = exp_model(1.0, res['a'], res['b'], res['c'])

    # Confidence interval from simulated parameters: the frequency (fixed at
    # 1.0) gets zero variance, the fitted parameters use curve_fit's covariance.
    vcov = np.zeros((4, 4))
    vcov[1:, 1:] = pcov

    simpars = np.random.multivariate_normal(mean=[1.0, res['a'], res['b'], res['c']], cov=vcov, size=10000, check_valid='ignore')

    res['ciLow'], res['ciHigh'] = np.quantile([exp_model(x[0], x[1], x[2], x[3]) for x in simpars], [0.025, 0.975])

    return res


def exp_model(f_trimmed, a, b, c):
    """Exponential model ``a + b * exp(-c * f_trimmed)``.

    ``f_trimmed`` may be a scalar or an array; the result has the matching shape.
    """
    decay = np.exp(-c * f_trimmed)
    return a + b * decay

In [4]:
def mkt_on_df(gene_df, data_df, approach=None, pops=['AFR','EUR'], tests=['eMKT','aMKT'], cutoffs=[0.05,0.15], do_trims=[True,False]):
    """Run ``mkt_on_col`` on every column of ``gene_df`` in parallel.

    Parameters
    ----------
    gene_df : pandas.DataFrame
        One column per gene set; each column is handed to ``mkt_on_col``.
    data_df : pandas.DataFrame
        Polymorphism/divergence table forwarded unchanged to ``mkt_on_col``.
    approach : optional
        When not None, stored in an ``approach`` column of the result.
    pops, tests, cutoffs, do_trims : list
        Forwarded unchanged to ``mkt_on_col``. The default lists are shared
        between calls and must not be mutated.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all per-column results, re-indexed from 0.
    """
    pars = [(gene_df.iloc[:, i], data_df, pops, tests, cutoffs, do_trims)
            for i in range(gene_df.shape[1])]

    # Never start more workers than there are tasks: every worker may open
    # further pools of its own, so idle processes multiply quickly.
    n_workers = max(1, min(multiprocessing.cpu_count(), len(pars)))

    # The context manager terminates the pool even when a task raises, so
    # worker processes are not left behind after a failure.
    with MyPool(processes=n_workers) as pool:
        results_list = pool.starmap(mkt_on_col, pars)

    results = pd.concat(results_list, axis=0, ignore_index=True)

    if approach is not None:
        results['approach'] = approach

    return results


def mkt_on_col(col, data_df, pops=['AFR','EUR'], tests=['eMKT','aMKT'], cutoffs=[0.05,0.15], do_trims=[True,False]):
    """Run ``mkt_on_list`` on the two gene groups defined by one indicator column.

    Parameters
    ----------
    col : pandas.Series
        Indicator column indexed by gene id. Index entries where the value is 1
        form the ``'+'`` group, entries where it is 0 form the ``'-'`` group.
        When ``col.name`` is not None its first two items are stored as
        ``stage`` and ``region`` (the name is expected to be a 2-tuple such as
        ``('W1', 'OFC')``).
    data_df : pandas.DataFrame
        Forwarded unchanged to ``mkt_on_list``.
    pops, tests, cutoffs, do_trims : list
        Forwarded unchanged to ``mkt_on_list``. The default lists are shared
        between calls and must not be mutated.

    Returns
    -------
    pandas.DataFrame
        Results for the ``'+'`` group followed by the ``'-'`` group.
    """
    glists = {'+': col[col == 1].index.values, '-': col[col == 0].index.values}
    pars = [(glist, data_df, gtype, pops, tests, cutoffs, do_trims)
            for gtype, glist in glists.items()]

    # Only two tasks exist here, so cap the worker count accordingly.
    n_workers = max(1, min(multiprocessing.cpu_count(), len(pars)))

    # The context manager terminates the pool even when a task raises. The
    # values returned by starmap are already independent copies received from
    # the workers, so no additional deepcopy is needed.
    with MyPool(processes=n_workers) as pool:
        results_list = pool.starmap(mkt_on_list, pars)

    results = pd.concat(results_list, axis=0, ignore_index=True)

    if col.name is not None:
        results['stage'] = col.name[0]
        results['region'] = col.name[1]
        print(col.name,'done')
    return results

def mkt_on_list(glist, data_df, gtype=None, pops=['AFR','EUR'], tests=['eMKT','aMKT'], cutoffs=[0.05,0.15], do_trims=[True,False]):
    """Run every requested test/parameter combination on one list of genes.

    Parameters
    ----------
    glist : sequence
        Gene ids; rows of ``data_df`` whose ``id`` is in this list are used.
    data_df : pandas.DataFrame
        Must provide ``id`` and ``pop`` columns plus whatever ``makeSfs`` reads.
    gtype : optional
        When not None, stored in a ``gtype`` column of the result.
    pops : list
        Values of the ``pop`` column to analyse separately.
    tests : list
        ``'eMKT'`` is run once per entry of ``cutoffs``; ``'aMKT'`` once per
        entry of ``do_trims``. Other names are ignored.
    cutoffs, do_trims : list
        Parameter values for the respective test. The default lists are shared
        between calls and must not be mutated.

    Returns
    -------
    pandas.DataFrame
        One row per (population, test, parameter) combination.
    """
    df = data_df[data_df['id'].isin(glist)]

    dafs = {}
    divs = {}
    dafs_cum = {}
    nogenes = {}

    for pop in pops:
        pop_df = df[df['pop'] == pop]
        nogenes[pop] = len(pop_df)

        # aMKT works on the cumulative table, eMKT on the plain one; the
        # divergence totals are the same in both cases.
        if 'aMKT' in tests:
            dafs_cum[pop], divs[pop] = makeSfs(pop_df, cum=True)
        if 'eMKT' in tests:
            dafs[pop], divs[pop] = makeSfs(pop_df, cum=False)

    pars = []
    for pop in pops:
        for test in tests:
            if test == 'eMKT':
                for cutoff in cutoffs:
                    pars.append((dafs[pop], divs[pop], pop, nogenes[pop], test, cutoff))
            elif test == 'aMKT':
                for do_trim in do_trims:
                    pars.append((dafs_cum[pop], divs[pop], pop, nogenes[pop], test, do_trim))

    # Never start more workers than there are tasks.
    n_workers = max(1, min(multiprocessing.cpu_count(), len(pars)))

    # The context manager terminates the pool even when a task raises. The
    # values returned by starmap are already independent copies received from
    # the workers, so no additional deepcopy is needed.
    with MyPool(processes=n_workers) as pool:
        results_list = pool.starmap(mkt_on_daf, pars)

    results = pd.concat(results_list, axis=0, ignore_index=True)

    if gtype is not None:
        results['gtype'] = gtype

    return results

def mkt_on_daf(daf, div, pop, nogenes, test, par):
    """Run a single test on one frequency table and label the resulting row.

    Parameters
    ----------
    daf, div
        Passed straight to ``eMKT`` or ``aMKT``.
    pop, nogenes : optional
        Stored in the ``pop`` / ``nogenes`` columns when not None.
    test : str
        ``'eMKT'`` or ``'aMKT'``; any other value makes the function return None.
    par
        For ``'eMKT'`` the cutoff (stored in column ``cutoff``). For ``'aMKT'``
        a truthy value selects the trimmed range (0.1, 0.9) and a falsy value
        the full range (0, 1); the value is stored in column ``trim``.

    Returns
    -------
    pandas.DataFrame or None
        A one-row frame with the test's statistics and the labels above.
    """
    if test not in ('eMKT', 'aMKT'):
        return None

    if test == 'eMKT':
        label_col = 'cutoff'
        estimates = copy.deepcopy(eMKT(daf, div, par))
    else:
        label_col = 'trim'
        xlow, xhigh = (0.1, 0.9) if par else (0, 1)
        estimates = copy.deepcopy(aMKT(daf, div, xlow, xhigh))

    results = pd.DataFrame(estimates, index=[0])

    if pop is not None:
        results['pop'] = pop
    if nogenes is not None:
        results['nogenes'] = nogenes
    results['test'] = test
    results[label_col] = par

    return results

In [5]:
# Project layout, relative to the user's home directory.
root_dir = '~/Escritorio/mastersthesis/'
data_dir = root_dir + 'data/'
scripts_dir = root_dir + 'scripts/'
results_dir = root_dir + 'results/'

# Gene-set indicator table (two header rows) and the polymorphism/divergence table.
genes = pd.read_csv(data_dir + 'aa_genes.csv', header=[0, 1], index_col=0)
data = pd.read_csv(data_dir + 'metaPopsori.tsv', sep='\t')

# Genes flagged with 1 in the ('W1', 'OFC') column.
gene_list = genes.index[genes['W1', 'OFC'] == 1].values

# Rows of those genes for the EUR population only.
df = data[data['id'].isin(gene_list) & (data['pop'] == 'EUR')]

# Cumulative frequency table and divergence totals for manual inspection.
daf, div = makeSfs(df, cum=True)

# print(mkt_on_daf(daf,div,None, None,'aMKT',False).to_string())
results = mkt_on_list(
    gene_list,
    data,
    gtype=None,
    pops=['AFR', 'EUR'],
    tests=['aMKT', 'eMKT'],
    cutoffs=[0.05, 0.15],
    do_trims=[True, False],
)
results

Unnamed: 0,Ka,Ks,omega,neg_d,a,b,c,alpha,ciLow,ciHigh,neg_b,neg_f,omegaA,omegaD,pop,nogenes,test,trim,pvalue,cutoff
0,0.00433,0.019808,0.218581,0.741135,2.494139,-2.388038,0.036792,0.192365,-432.843976,0.1439117,0.082331,0.176534,0.042047,0.176534,AFR,14861,aMKT,True,,
1,0.00433,0.019808,0.218581,0.741135,0.154833,-1.758204,37.932417,0.154833,0.145065,0.1647892,0.074127,0.184738,0.033844,0.184738,AFR,14861,aMKT,False,,
2,0.00433,0.019808,0.218581,0.666164,,,,0.065038,,,0.12947,0.204365,0.014216,0.204365,AFR,14861,eMKT,,7.4e-05,0.05
3,0.00433,0.019808,0.218581,0.666164,,,,0.123442,,,0.142236,0.191599,0.026982,0.191599,AFR,14861,eMKT,,7.4e-05,0.15
4,0.004424,0.020389,0.216999,0.736538,-55.777344,55.872339,0.000276,0.079573,1164.491622,73527980.0,0.06373,0.199732,0.017267,0.199732,EUR,14861,aMKT,True,,
5,0.004424,0.020389,0.216999,0.736538,0.087627,-2.478782,46.121764,0.087627,0.078765,0.09662383,0.065478,0.197984,0.019015,0.197984,EUR,14861,aMKT,False,,
6,0.004424,0.020389,0.216999,0.632143,,,,0.016305,,,0.154396,0.213461,0.003538,0.213461,EUR,14861,eMKT,,0.02283,0.05
7,0.004424,0.020389,0.216999,0.632143,,,,0.07419,,,0.166957,0.2009,0.016099,0.2009,EUR,14861,eMKT,,6.5e-05,0.15


In [6]:
# Trial run of mkt_on_col on a single column of `genes` (positional column 5)
# before launching it on the whole table.
try_col = genes.iloc[:,5]
results = mkt_on_col(try_col,data, pops=['AFR','EUR'], tests=['aMKT', 'eMKT'], cutoffs=[0.05,0.15], do_trims=[True,False])
results

('W1', 'S1C') done


Unnamed: 0,Ka,Ks,omega,neg_d,a,b,c,alpha,ciLow,ciHigh,...,omegaD,pop,nogenes,test,trim,pvalue,cutoff,gtype,stage,region
0,0.004228,0.019639,0.215271,0.743604,0.71738,-0.617339,0.167844,0.195429,-19.28409,0.1929308,...,0.173201,AFR,14225,aMKT,True,,,+,W1,S1C
1,0.004228,0.019639,0.215271,0.743604,0.15571,-1.757326,37.239342,0.15571,0.1445393,0.1670996,...,0.181751,AFR,14225,aMKT,False,,,+,W1,S1C
2,0.004228,0.019639,0.215271,0.668876,,,,0.062474,,,...,0.201822,AFR,14225,eMKT,,7.2e-05,0.05,+,W1,S1C
3,0.004228,0.019639,0.215271,0.668876,,,,0.119298,,,...,0.18959,AFR,14225,eMKT,,7.1e-05,0.15,+,W1,S1C
4,0.004319,0.020216,0.213667,0.738379,-31.530752,31.617964,0.000227,0.080042,5324.766,149640300000.0,...,0.196565,EUR,14225,aMKT,True,,,+,W1,S1C
5,0.004319,0.020216,0.213667,0.738379,0.084998,-2.514567,45.988028,0.084998,0.0770001,0.09311966,...,0.195506,EUR,14225,aMKT,False,,,+,W1,S1C
6,0.004319,0.020216,0.213667,0.634243,,,,0.012116,,,...,0.211079,EUR,14225,eMKT,,0.099698,0.05,+,W1,S1C
7,0.004319,0.020216,0.213667,0.634243,,,,0.069611,,,...,0.198794,EUR,14225,eMKT,,6.4e-05,0.15,+,W1,S1C
8,0.008112,0.026713,0.303689,0.652986,0.056992,-0.12304,10.421579,0.056988,0.05352545,0.06054783,...,0.286382,AFR,4441,aMKT,True,,,-,W1,S1C
9,0.008112,0.026713,0.303689,0.652986,-556.74481,556.685481,-0.000313,0.115131,-7857409.0,-1036.99,...,0.268725,AFR,4441,aMKT,False,,,-,W1,S1C


In [7]:
%%time
# Full run: every column of `genes`, labelled with approach 'aa'. The cell is
# timed with %%time, which has to remain the first line of the cell.
results = mkt_on_df(genes, data, 'aa', pops=['AFR','EUR'], tests=['aMKT', 'eMKT'], cutoffs=[0.05,0.15], do_trims=[True,False])
results

('W1', 'OFC') done
('W1', 'IPC') done
('W1', 'DFC') done
('W2', 'A1C') done
('W2', 'STR') done
('W2', 'DFC') done
('W1', 'AMY') done
('W3', 'STC') done
('W3', 'VFC') done
('W1', 'A1C') done
('W1', 'VFC') done
('W2', 'VFC') done
('W2', 'STC') done
('W1', 'STR') done
('W3', 'ITC') done
('W2', 'MD') done
('W3', 'MFC') done
('W1', 'STC') done
('W2', 'CBC') done
('W1', 'MFC') done
('W2', 'ALL') done
('W2', 'MFC') done
('W1', 'MD') done
('W2', 'ITC') done
('W1', 'M1C') done
('W3', 'V1C') done
('W3', 'M1C') done
('W3', 'HIP') done
('W1', 'S1C') done
('W1', 'ITC') done
('W3', 'OFC') done
('W2', 'M1C') done
('W2', 'V1C') done
('W1', 'CBC') done
('W3', 'DFC') done
('W3', 'S1C') done
('W2', 'HIP') done
('W1', 'ALL') done
('W1', 'V1C') done
('W3', 'AMY') done
('W3', 'MD') done
('W2', 'AMY') done
('W3', 'STR') done
('W2', 'OFC') done
('W1', 'HIP') done
('W3', 'CBC') done
('W2', 'S1C') done
('W4', 'CBC') done
('W5', 'M1C') done
('W5', 'V1C') done
('W4', 'MFC') done
('W3', 'IPC') done
('W5', 'S1C') d

Unnamed: 0,Ka,Ks,omega,neg_d,a,b,c,alpha,ciLow,ciHigh,...,pop,nogenes,test,trim,pvalue,cutoff,gtype,stage,region,approach
0,0.004330,0.019808,0.218581,0.741135,2.494139,-2.388038,0.036792,0.192365,-4.328440e+02,1.439117e-01,...,AFR,14861,aMKT,True,,,+,W1,OFC,aa
1,0.004330,0.019808,0.218581,0.741135,0.154833,-1.758204,37.932417,0.154833,1.450654e-01,1.647892e-01,...,AFR,14861,aMKT,False,,,+,W1,OFC,aa
2,0.004330,0.019808,0.218581,0.666164,,,,0.065038,,,...,AFR,14861,eMKT,,0.000074,0.05,+,W1,OFC,aa
3,0.004330,0.019808,0.218581,0.666164,,,,0.123442,,,...,AFR,14861,eMKT,,0.000074,0.15,+,W1,OFC,aa
4,0.004424,0.020389,0.216999,0.736538,-55.777344,55.872339,0.000276,0.079573,1.164492e+03,7.352798e+07,...,EUR,14861,aMKT,True,,,+,W1,OFC,aa
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2715,0.008563,0.029221,0.293032,0.600507,,,,0.138271,,,...,AFR,770,eMKT,,0.000016,0.15,-,ALL,ALL,aa
2716,0.008716,0.030201,0.288594,0.677202,-44.724979,44.763511,-0.000385,0.055768,-6.711163e+10,-3.986421e+03,...,EUR,770,aMKT,True,,,-,ALL,ALL,aa
2717,0.008716,0.030201,0.288594,0.677202,0.047008,-1.067876,31.096791,0.047008,3.008062e-02,6.415675e-02,...,EUR,770,aMKT,False,,,-,ALL,ALL,aa
2718,0.008716,0.030201,0.288594,0.582799,,,,-0.040774,,,...,EUR,770,eMKT,,0.247504,0.05,-,ALL,ALL,aa


In [8]:
import sys
def sizeof_fmt(num, suffix='B'):
    ''' by Fred Cirera,  https://stackoverflow.com/a/1094933/1870254, modified

    Format ``num`` as a human-readable size using binary (1024-based) prefixes,
    e.g. ``1536`` -> ``"1.5 KiB"``. ``suffix`` is appended after the prefix.
    '''
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    scaled = num
    position = 0
    while position < len(prefixes):
        if abs(scaled) < 1024.0:
            return "%3.1f %s%s" % (scaled, prefixes[position], suffix)
        scaled = scaled / 1024.0
        position += 1
    # Larger than every listed prefix can express.
    return "%.1f %s%s" % (scaled, 'Yi', suffix)

# Print the ten largest objects of the current namespace, biggest first,
# using sys.getsizeof for the size of each value.
for name, size in sorted(
        ((var_name, sys.getsizeof(obj)) for var_name, obj in locals().items()),
        key=lambda entry: entry[1],
        reverse=True)[:10]:
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))

                          data: 28.0 MiB
                         genes: 26.1 MiB
                            df:  5.5 MiB
                       try_col:  2.0 MiB
                             _:  1.4 MiB
                       results:  1.4 MiB
                            _7:  1.4 MiB
                            __:  7.5 KiB
                            _6:  7.5 KiB
                           _i3:  5.8 KiB


In [9]:
# Show the column labels of the combined results table.
results.columns


Index(['Ka', 'Ks', 'omega', 'neg_d', 'a', 'b', 'c', 'alpha', 'ciLow', 'ciHigh',
       'neg_b', 'neg_f', 'omegaA', 'omegaD', 'pop', 'nogenes', 'test', 'trim',
       'pvalue', 'cutoff', 'gtype', 'stage', 'region', 'approach'],
      dtype='object')