In [1]:
from pandas import DataFrame, read_csv
import pandas as pd
import random
import numpy as np
import json
from collections import defaultdict, Counter
import math
import sys
import itertools
import time
import scipy.stats 
from sklearn.neighbors import KernelDensity
import glob
import matplotlib.pyplot as plt
import seaborn as sns
import re
from scipy.optimize import curve_fit
import copy
import dill



# Length of each chromosome, keyed by chromosome number 1..14.
# NOTE(review): units are presumably base pairs -- confirm against the source
# of these figures.
chr_lengths = dict(enumerate(
    (
        643000,
        947000,
        1100000,
        1200000,
        1350000,
        1420000,
        1450000,
        1500000,
        1550000,
        1700000,
        2049999,
        2300000,
        2950000,
        3300000,
    ),
    start=1,
))

class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to built-in types.

    Pass it as ``cls`` to ``json.dump`` / ``json.dumps``; anything that is not
    a NumPy integer, NumPy float or ``ndarray`` falls through to the standard
    encoder (which raises ``TypeError`` for unsupported objects).
    """

    def default(self, obj):
        """Return a JSON-serialisable stand-in for ``obj``.

        Args:
            obj: the object the stock encoder could not serialise.

        Returns:
            ``int`` for NumPy integer scalars, ``float`` for NumPy floating
            scalars, a (nested) ``list`` for ``ndarray``.

        Raises:
            TypeError: from the base class, for any other type.
        """
        # The abstract bases cover every sized integer/float scalar type.
        # An explicit tuple of concrete names is fragile: aliases such as
        # ``np.float_`` no longer exist in NumPy 2.0 and merely mentioning
        # them raises AttributeError.
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

# Three-colour reversed palettes, one colour family per group of codes below.
cpalette1 = sns.color_palette('Reds_r', 3)
cpalette2 = sns.color_palette('Blues_r', 3)
cpalette3 = sns.color_palette('Greys_r', 3)

# Relationship code -> colour. Codes are paired with palette entries in order,
# so the first code of each group gets that palette's first colour.
color_map_dict = {
    code: shade
    for palette, codes in (
        (cpalette1, ('PC', 'FS', 'MS')),
        (cpalette2, ('GC', 'HS', 'FAV')),
        (cpalette3, ('GGC', 'HAV', 'FCS')),
    )
    for code, shade in zip(codes, palette)
}

In [2]:
# Pre-computed likelihood parameters. Sim.likelihood reads the 'r_beta',
# 'p_max_segment' and 'segment_count' entries of this object.
# SECURITY: dill (like pickle) can execute arbitrary code while loading;
# only open parameter files that come from a trusted source.
with open('ll_parameters.pkl', 'rb') as pdf_file:  # close the handle deterministically
    pdfs = dill.load(pdf_file)


In [3]:
def evaluate_max_segment_piecewise_pdf(x, relationship, chrom, pdf_dict, bandwidth =0.02):
    """Evaluate a piecewise density: a KDE in the middle with a spike at each end.

    Args:
        x: value to evaluate; compared after ``float(x)``.
        relationship: first-level key into ``pdf_dict``.
        chrom: second-level key; converted with ``int(chrom)`` before lookup.
        pdf_dict: ``pdf_dict[relationship][int(chrom)]`` must unpack to
            ``(p0, p1, kde, tstep)`` where ``kde`` exposes ``score_samples``.
        bandwidth: width of the two edge regions ``x <= bandwidth`` and
            ``x >= 1 - bandwidth``.

    Returns:
        ``p0 / tstep`` in the lower edge region, ``p1 / tstep`` in the upper
        edge region, otherwise ``exp(kde.score_samples(x)) * (1 - p0 - p1)``.
    """
    spike_low, spike_high, kde, tstep = pdf_dict[relationship][int(chrom)]
    value = float(x)

    # Edge regions: spread each spike's mass uniformly over a width of tstep.
    if value <= 0 + bandwidth:
        return spike_low / tstep
    if value >= 1 - bandwidth:
        return spike_high / tstep

    # Interior: the KDE density, rescaled to the mass not held by the spikes.
    sample = np.asarray([x]).reshape(-1, 1)
    density = np.exp(kde.score_samples(sample))[0]
    return density * (1 - spike_low - spike_high)
    
def fit_beta(r_dict):
    """Fit a beta distribution on [0, 1] to each relationship's values.

    The location and scale are pinned to 0 and 1, so only the two shape
    parameters are estimated. With both pinned, scipy rejects any observation
    that is not strictly inside (0, 1); exact 0s and exact 1s are therefore
    nudged inward by 1e-9 before fitting.

    Args:
        r_dict: mapping of relationship -> iterable of values.

    Returns:
        dict mapping relationship -> ``(alpha, beta, loc, scale)`` as returned
        by ``scipy.stats.beta.fit`` (``loc`` is 0 and ``scale`` is 1).
    """
    eps = 1e-9
    beta_variables = {}
    for relationship, r_values in r_dict.items():
        # Keep boundary observations inside the open support of the fit.
        interior = [eps if x == 0 else (1 - eps if x == 1 else x) for x in r_values]
        alpha, beta, loc, scale = scipy.stats.beta.fit(interior, floc=0, fscale=1)
        beta_variables[relationship] = (alpha, beta, loc, scale)

    return beta_variables

def create_pdfs(p_max_segment_dict, bandwidth= 0.02):
    """Build the per-relationship, per-chromosome piecewise density parameters.

    For every ``p_max_segment_dict[relationship][chromosome]`` sample the
    values are split into a lower edge (``<= bandwidth``), an upper edge
    (``>= 1 - bandwidth``) and the interior. The edge masses are estimated
    with add-one smoothing over three categories, and a Gaussian KDE with the
    given bandwidth is fitted to the interior values.

    Returns:
        ``defaultdict(dict)`` where ``result[relationship][int(chromosome)]``
        is ``(p0, p1, kde, bandwidth)``.
    """
    max_segment_pdf_dict = defaultdict(dict)
    for relationship, per_chromosome in p_max_segment_dict.items():
        for chromosome, observed in per_chromosome.items():
            values = np.asarray(observed)
            at_zero = values <= 0 + bandwidth
            at_one = values >= 1 - bandwidth
            interior = values[~(at_zero | at_one)]

            # Smoothed probability mass of each edge region.
            p0 = (np.sum(at_zero) + 1) / (len(at_zero) + 3)
            p1 = (np.sum(at_one) + 1) / (len(at_one) + 3)

            kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth)
            kde = kde.fit(interior.reshape(-1, 1))

            max_segment_pdf_dict[relationship][int(chromosome)] = (p0, p1, kde, bandwidth)
    return max_segment_pdf_dict

def calc_seg_count_pmf(segment_counts_dict):
    '''Empirical segment-count pmf with add-one smoothing.

    theta_i = (x_i + alpha) / (N + alpha * d)
    where x_i is the count of category i, alpha is 1, d is the number of
    categories (0 .. max observed count + 1) and N is the sample size.

    Observed counts are stored under their integer value; every unobserved
    category shares the single key 'misc' with probability 1 / (N + d).
    The result is indexed as result[relationship][chrom][count].
    '''
    seg_count_pmf = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
    for relationship, per_chrom in segment_counts_dict.items():
        for chrom, observed in per_chrom.items():
            counts = Counter(observed)
            total = np.sum(list(counts.values()))
            # One category beyond the observed maximum stands for "max + 1".
            bins = range(max(counts) + 2)
            denominator = total + len(bins)
            for x in bins:
                # Counter returns 0 for a missing key, so the numerator is 1
                # for every unobserved category.
                key = x if x in counts else 'misc'
                seg_count_pmf[relationship][chrom][key] = (counts[x] + 1) / denominator
    return seg_count_pmf

# NOTE(review): this re-declares a function already defined earlier in this
# cell; being the later definition, it is the one in effect afterwards.
def evaluate_max_segment_piecewise_pdf(x, relationship, chrom, pdf_dict, bandwidth =0.02):
    """Evaluate a piecewise density: a KDE in the middle with a spike at each end.

    ``pdf_dict[relationship][int(chrom)]`` must unpack to
    ``(p0, p1, kde, tstep)``. Values with ``x <= bandwidth`` return
    ``p0 / tstep``, values with ``x >= 1 - bandwidth`` return ``p1 / tstep``,
    and everything else returns the KDE density scaled by ``1 - p0 - p1``.
    """
    spike_low, spike_high, kde, tstep = pdf_dict[relationship][int(chrom)]
    value = float(x)

    if value <= 0 + bandwidth:
        return spike_low / tstep
    if value >= 1 - bandwidth:
        return spike_high / tstep

    # score_samples expects a 2-D (n_samples, n_features) array.
    sample = np.asarray([x]).reshape(-1, 1)
    density = np.exp(kde.score_samples(sample))[0]
    return density * (1 - spike_low - spike_high)
    
class Sim:
    """Score one pairwise comparison against every relationship model.

    On construction the instance stores the observed summary statistics and
    immediately computes log-likelihoods for each relationship code, using the
    module-level ``pdfs`` table and ``evaluate_max_segment_piecewise_pdf``.

    Attributes:
        comparison, s1, s2: the raw ``"a:b"`` label and its two halves.
        r_total: value scored against the per-relationship beta distribution.
        max_ibd_segment, n_segment_count: sequences indexed 0..13, one entry
            per chromosome 1..14.
        complete_likelihoods, r_likelihoods: log-likelihood per code in
            ``BASE_RELATIONSHIPS`` (all enabled terms / ``r_total`` term only).
        complete_likelihoods_ms, r_likelihoods_ms: the same two tables over
            ``MS_RELATIONSHIPS``.
        max_ll: ``'unrelated'``, ``'clone'`` or the best-scoring base code.
        max_ll_categorization: best-scoring base code, without the
            ``'unrelated'`` / ``'clone'`` override.
    """

    # Codes that compete for the final classification (order matters: ties in
    # ``max`` resolve to the earliest code).
    BASE_RELATIONSHIPS = ('PC', 'GC', 'GGC', 'FS', 'HS', 'FAV', 'HAV', 'FCS')
    # Base codes plus the additional codes reported only in the ``*_ms`` tables.
    MS_RELATIONSHIPS = BASE_RELATIONSHIPS + ('MS.MS', 'FAV.MS', 'FCS.MS')

    def __init__(self, comparison, r_total, max_ibd_segment, n_segment_count):
        self.comparison = comparison
        labels = comparison.split(':')
        self.s1 = labels[0]
        self.s2 = labels[1]

        self.r_total = r_total

        self.max_ibd_segment = max_ibd_segment
        self.n_segment_count = n_segment_count
        self.calc_all_likelihoods()
        self.max_ll_categorization = max(self.complete_likelihoods,
                                         key=self.complete_likelihoods.get)

    def _log_r_total(self, G):
        """Log beta density of ``r_total`` under relationship ``G``."""
        return scipy.stats.beta.logpdf(self.r_total, *pdfs['r_beta'][G])

    def _log_ibdmax(self, G):
        """Sum over chromosomes 1..14 of the log max-segment density under ``G``."""
        total = 0
        for chrom in range(1, 15):
            total += np.log(evaluate_max_segment_piecewise_pdf(
                self.max_ibd_segment[chrom - 1], G, chrom, pdfs['p_max_segment']))
        return total

    def _log_seg_count(self, G):
        """Sum over chromosomes 1..14 of the log segment-count probability under ``G``."""
        total = 0
        for chrom in range(1, 15):
            pmf = pdfs['segment_count'][G][str(chrom)]
            n_segments = self.n_segment_count[chrom - 1]
            # Counts never seen for this chromosome share the 'misc' bucket.
            total += np.log(pmf[n_segments] if n_segments in pmf else pmf['misc'])
        return total

    def likelihood(self, G, r_flag = 1, ibdmax_flag = 1, count_flag = 1):
        """Return the weighted log-likelihood of this comparison under ``G``.

        Args:
            G: relationship code; must be a key of the ``pdfs`` tables.
            r_flag, ibdmax_flag, count_flag: multipliers applied to the
                ``r_total``, max-segment and segment-count terms.

        A term whose flag is 0 is skipped rather than multiplied by zero:
        ``0 * -inf`` is ``nan``, which would otherwise poison a partial
        likelihood whenever a disabled term has zero density.
        """
        logL = 0.0
        if r_flag != 0:
            logL += self._log_r_total(G) * r_flag
        if ibdmax_flag != 0:
            logL += self._log_ibdmax(G) * ibdmax_flag
        if count_flag != 0:
            logL += self._log_seg_count(G) * count_flag
        return logL

    def calc_all_likelihoods(self, r_flag = 1, ibdmax_flag = 1, count_flag = 1):
        """Fill the four likelihood tables and set ``max_ll``.

        The flags are forwarded to ``likelihood`` for the ``complete_*``
        tables; the ``r_*`` tables always use the ``r_total`` term alone.
        """
        self.complete_likelihoods = {}
        self.r_likelihoods = {}

        self.complete_likelihoods_ms = {}
        self.r_likelihoods_ms = {}

        # Each code is scored once and shared between the base and *_ms tables.
        for G in self.MS_RELATIONSHIPS:
            complete = self.likelihood(G, r_flag, ibdmax_flag, count_flag)
            r_only = self.likelihood(G, 1, 0, 0)
            self.complete_likelihoods_ms[G] = complete
            self.r_likelihoods_ms[G] = r_only
            if G in self.BASE_RELATIONSHIPS:
                self.complete_likelihoods[G] = complete
                self.r_likelihoods[G] = r_only

        # Extreme r_total values bypass the likelihood comparison entirely.
        if self.r_total <= 0.05:
            self.max_ll = 'unrelated'
        elif self.r_total >= 0.95:
            self.max_ll = 'clone'
        else:
            self.max_ll = max(self.complete_likelihoods, key=self.complete_likelihoods.get)
        
            
def format_ingest_data(file):
    """Read a tab-separated comparison table and build one ``Sim`` per row.

    Each row is read positionally: column 0 is the comparison label, column 1
    the relatedness value, columns 2-15 the per-chromosome segment counts and
    columns 16 onward the per-chromosome max segment values. The comparison
    label is printed as each row is processed.

    Returns:
        ``defaultdict(list)`` whose ``'unknown'`` entry holds the ``Sim``
        objects in file order (the key is absent when the table has no rows).
    """
    table = DataFrame(read_csv(file, sep = '\t'))
    data_dict = defaultdict(list)
    for record in table.to_numpy():
        comparison, relatedness = record[0], record[1]
        print(comparison)
        scored = Sim(comparison, relatedness, record[16:], record[2:16])
        data_dict['unknown'].append(scored)
    return data_dict

In [4]:
# Score every comparison in the example file; results live under 'input'.
data = dict()
data.update(input=format_ingest_data('example_input.txt'))

s1:s2
s2:s3
s4:s5
s6:s7
