In [None]:
from pandas import DataFrame, read_csv, read_excel
import pandas as pd
import random
import numpy as np
import json
from collections import defaultdict, Counter
import sys
import itertools
import time
import scipy.stats 
from sklearn.neighbors import KernelDensity
import glob
import matplotlib.pyplot as plt
import seaborn as sns
import re
import copy
import dill
import matplotlib.patches as mpatches
import copy


# Chromosome number (1-14) -> chromosome length.
# NOTE(review): units are presumably base pairs — confirm against the simulator.
chr_lengths = {
    chrom: length
    for chrom, length in enumerate(
        (643000, 947000, 1100000, 1200000, 1350000, 1420000, 1450000,
         1500000, 1550000, 1700000, 2049999, 2300000, 2950000, 3300000),
        start=1,
    )
}

class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to built-in types."""

    def default(self, obj):
        """Return a JSON-serializable equivalent of ``obj``.

        NumPy integer scalars become ``int``, NumPy floating scalars become
        ``float`` and ``ndarray`` objects become (nested) lists. Anything else
        is delegated to ``json.JSONEncoder.default``, which raises ``TypeError``.
        """
        # The abstract base classes cover every concrete width that used to be
        # listed by hand, and avoid aliases such as ``np.float_`` that no
        # longer exist in NumPy 2.0.
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)

# Three reversed seaborn palettes; each one colours a group of relationship
# codes in color_map_dict below.
cpalette1 = sns.color_palette('Reds_r', 2)
cpalette2 = sns.color_palette('Blues_r', 3)
cpalette3 = sns.color_palette('Greys_r', 3)

# Relationship code -> plot colour ('FS' and 'MS' share the same red).
color_map_dict = dict(
    PC=cpalette1[0], FS=cpalette1[1], MS=cpalette1[1],
    GC=cpalette2[0], HS=cpalette2[1], FAV=cpalette2[2],
    GGC=cpalette3[0], HAV=cpalette3[1], FCS=cpalette3[2],
)

In [None]:
def combine_results(pattern, metric):
    """Merge every JSON file matching ``pattern`` into one dictionary.

    Parameters
    ----------
    pattern : str
        Glob pattern selecting the per-run JSON files.
    metric : str
        ``'r_totals'`` means each file maps relationship -> list of values.
        Any other value means each file maps relationship -> chromosome ->
        list of values.

    Returns
    -------
    defaultdict
        relationship -> list (``'r_totals'``) or relationship -> chromosome ->
        list (other metrics), with the values of all files concatenated.
    """
    per_chromosome = metric != 'r_totals'
    if per_chromosome:
        combined_dict = defaultdict(lambda: defaultdict(list))
    else:
        combined_dict = defaultdict(list)
    # glob returns files in arbitrary order; sorting makes the concatenation
    # order reproducible from run to run.
    for file in sorted(glob.glob(pattern)):
        # Context manager so the handle is closed as soon as the file is parsed.
        with open(file) as handle:
            file_dict = json.load(handle)
        for relationship in file_dict:
            if per_chromosome:
                for chrom in file_dict[relationship]:
                    for segment_lengths in file_dict[relationship][chrom]:
                        combined_dict[relationship][chrom].append(segment_lengths)
            else:
                combined_dict[relationship] += file_dict[relationship]
    return combined_dict


def combine_sims(directory,basename):
    """Combine the per-run JSON files in ``directory`` and write one file per metric.

    For each of the three file patterns below, the metric name is taken from
    the pattern, the matching files are merged with ``combine_results`` and the
    result is written to ``<directory><basename>_<metric>_combined.json``.

    Parameters
    ----------
    directory : str
        Directory prefix; it is concatenated directly with the file names, so
        it must end with a path separator.
    basename : str
        Prefix of the combined output file names.
    """
    fs_file_patterns = ['*_ibd_segment_numbers.json',
                        '*_ibd_segment_max.json',
                        '*_r_totals.json']
    combined_dicts = {}
    for pattern in fs_file_patterns:
        # Metric name = everything between the leading "*_" and ".json".
        match = re.search(r'\*_(.+)\.json$', pattern)
        metric = match.group(1)
        print(metric)
        combined_dicts[metric] = combine_results(directory + pattern, metric)
        out_path = directory + '{b}_{m}_combined.json'.format(m=metric,b=basename)
        # Context manager guarantees the output is flushed and closed.
        with open(out_path, 'w') as handle:
            json.dump(combined_dicts[metric], handle, cls = NumpyEncoder)
    
# Merge the per-run JSON files of each simulation directory; every call writes
# '<basename>_<metric>_combined.json' files into that same directory.
combine_sims('sims/outcross_tx_chain/', basename = 'outcross_chain')
combine_sims('sims/serial_cotx_chain/', basename = 'serial_cotx_chain')


In [None]:

# sim_dict[sim_type][metric] holds the combined simulation output per metric,
# plus 'p_ibd_max': the per-chromosome maximum segment values divided by the
# chromosome length.
sim_dict = defaultdict(dict)
for sim_type in ['serial_cotx_chain']:
    for metric in ('r_totals', 'ibd_segment_max', 'ibd_segment_numbers'):
        # The file-name prefix is the basename passed to combine_sims for this
        # directory; it is hard-coded because it need not equal the directory name.
        combined_path = 'sims/{p}/serial_cotx_chain_{m}_combined.json'.format(p=sim_type, m=metric)
        # Context manager so each handle is closed right after parsing.
        with open(combined_path) as handle:
            sim_dict[sim_type][metric] = json.load(handle)
    max_segment_p_dict = defaultdict(dict)
    for relationship in sim_dict[sim_type]['ibd_segment_max']:
        for chrom in sim_dict[sim_type]['ibd_segment_max'][relationship]:
            # JSON keys are strings, chr_lengths is keyed by int.
            max_segment_p_dict[relationship][chrom] = (
                np.asarray(sim_dict[sim_type]['ibd_segment_max'][relationship][chrom])
                / chr_lengths[int(chrom)]
            )
    sim_dict[sim_type]['p_ibd_max'] = max_segment_p_dict
    

In [None]:
#focus_nodes = ['P1', 'F1', 'F2', 'F3', 'F4']
# Node pair -> relationship code, grouped by relationship.
focus_comparisons = {
    pair: relationship
    for relationship, pairs in (
        ('PC', ('P1.F1', 'P1.F12', 'P2.F1', 'P2.F12',
                'F1.F2', 'F1.F22', 'A1.F2', 'A1.F22',
                'F2.F3', 'F2.F32', 'A3.F3', 'A3.F32')),
        ('FS', ('F1.F12', 'F2.F22', 'F3.F32')),
        ('GC', ('P1.F2', 'P1.F22', 'P2.F2', 'P2.F22',
                'A1.F3', 'A1.F32', 'F1.F3', 'F1.F32')),
        ('FAV', ('F12.F2', 'F12.F22', 'F12.F3', 'F22.F32')),
        ('GGC', ('P1.F3', 'P2.F3', 'P1.F32', 'P2.F32')),
    )
    for pair in pairs
}

# comparison -> 2-D float array with one row per simulation:
#   column 0      running row index
#   column 1      value from 'r_totals'
#   columns 2-15  'ibd_segment_numbers' for chromosomes 1-14
#   columns 16-29 'p_ibd_max' for chromosomes 1-14
simulated_M_dict = defaultdict(list)

serial_sims = sim_dict['serial_cotx_chain']
chrom_keys = [str(key) for key in range(1,15)]

for comparison in serial_sims['p_ibd_max'].keys():
    # Transposed so that rows are samples and columns are chromosomes.
    ibd_segment_count_M = np.asarray([serial_sims['ibd_segment_numbers'][comparison][key] for key in chrom_keys]).T
    p_ibd_segment_M = np.asarray([serial_sims['p_ibd_max'][comparison][key] for key in chrom_keys]).T

    rows = []
    for idx, (relatedness, ibd_segment_count_v, p_ibd_segment_v) in enumerate(zip(serial_sims['r_totals'][comparison],
                                                                                  ibd_segment_count_M,
                                                                                  p_ibd_segment_M)):
        # Keep the index numeric: mixing a str into the row would make NumPy
        # build a string array that then has to be parsed back into floats.
        rows.append([float(idx), relatedness] + list(ibd_segment_count_v) + list(p_ibd_segment_v))
    simulated_M_dict[comparison] = np.asarray(rows, dtype=float)


In [None]:
def evaluate_max_segment_piecewise_pdf(x, relationship, chrom, pdf_dict, bandwidth =0.02):
    """Evaluate the piecewise density (a KDE with spikes at both ends) at ``x``.

    ``pdf_dict[relationship][int(chrom)]`` must unpack to
    ``(p0, p1, kde, tstep)``.

    * ``x <= bandwidth``      -> ``p0 / tstep``
    * ``x >= 1 - bandwidth``  -> ``p1 / tstep``
    * otherwise               -> ``exp(kde.score_samples(x)) * (1 - p0 - p1)``
    """
    p0, p1, kde, tstep = pdf_dict[relationship][int(chrom)]

    position = float(x)
    if position <= 0 + bandwidth:
        return p0/tstep
    if position >= 1 - bandwidth:
        return p1/tstep

    # score_samples expects a 2-D column and returns log-densities.
    sample = np.asarray([x]).reshape(-1,1)
    density = np.exp(kde.score_samples(sample))[0]
    return density * (1-p0-p1)
    
def fit_beta(relationships, r_dict=None):
    """Fit a beta distribution on [0, 1] to the values of each relationship.

    Parameters
    ----------
    relationships : iterable of str
        Keys of ``r_dict`` to fit.
    r_dict : mapping, optional
        relationship -> list of values in [0, 1]. When omitted, the
        notebook-level ``raw_sim_data['fs']['r_totals']`` is used, which must
        then already be defined.

    Returns
    -------
    dict
        relationship -> ``(alpha, beta, loc, scale)`` with ``loc`` fixed to 0
        and ``scale`` fixed to 1.
    """
    if r_dict is None:
        r_dict = raw_sim_data['fs']['r_totals']
    # With loc and scale fixed, the fit needs data strictly inside (0, 1), so
    # values sitting exactly on either boundary are nudged inwards.
    eps = 1e-9
    beta_variables = {}
    for relationship in relationships:
        print(relationship)
        r_list = [eps if x == 0 else (1 - eps if x == 1 else x)
                  for x in r_dict[relationship]]
        alpha, beta, loc, scale = scipy.stats.beta.fit(r_list, floc=0,fscale=1)
        beta_variables[relationship] = (alpha, beta, loc, scale)

    return beta_variables


def plot_r_totals(beta_vals_dict, relationships, color_map_dict):
    """Plot one beta density curve per relationship on the current axes.

    Parameters
    ----------
    beta_vals_dict : mapping
        relationship -> ``(alpha, beta, loc, scale)``.
    relationships : iterable of str
        Relationships to draw, in plotting order.
    color_map_dict : mapping
        relationship -> line colour.

    Nothing is drawn or changed when ``relationships`` is empty.
    """
    x = np.linspace(0, 1, 100)
    drew_curve = False
    for relationship in relationships:
        alpha, beta, loc, scale = beta_vals_dict[relationship]
        pdf = scipy.stats.beta.pdf(x, alpha, beta, loc, scale)
        plt.plot(x, pdf, color = color_map_dict[relationship], label = relationship)
        drew_curve = True

    if not drew_curve:
        return

    # Axis decoration does not depend on the curve, so apply it once after all
    # curves exist instead of once per curve.
    plt.xlabel('Genome-wide Total Relatedness', fontsize = 15)
    plt.ylabel('Density', fontsize = 15)
    plt.xlim(-0.05,1.05)
    plt.ylim(-0.05, 10)
    plt.legend(ncols = 3, fontsize = 12)
    plt.tick_params(axis='both', which='major', labelsize=12)

In [None]:
# NOTE: dill, like pickle, can execute arbitrary code while loading; only load
# parameter files from a trusted source.
with open('ll_parameters_2.0_11.3_v2.pkl', 'rb') as parameter_file:
    pdfs = dill.load(parameter_file)
# Relationship codes grouped by degree (1, 2, 3), plus 'all' listing every code.
degree_relationships_ms = defaultdict(list, {
    1: ['PC', 'FS', 'MS.MS'],
    2: ['GC', 'HS', 'FAV', 'FAV.MS'],
    3: ['GGC', 'HAV', 'FCS', 'FCS.MS'],
    'all': ['PC', 'GC', 'MS.MS',
            'GGC', 'FS', 'HS', 'FAV', 'FAV.MS',
            'HAV', 'FCS', 'FCS.MS'],
})





class Sim:
    """One simulated comparison and its log-likelihood under every relationship.

    Relies on the notebook-level ``pdfs``, ``degree_relationships_ms`` and
    ``evaluate_max_segment_piecewise_pdf``. All likelihoods are computed when
    the object is created.
    """
    parental_strains = ['P1', 'P2']

    def __init__(self, comparison, r_total, max_ibd_segment, n_segment_count):
        """Store the observations and immediately score every hypothesis.

        ``max_ibd_segment`` and ``n_segment_count`` are indexed 0-13 for
        chromosomes 1-14.
        """
        self.comparison = comparison
        self.r_total = r_total
        self.max_ibd_segment = max_ibd_segment
        self.n_segment_count = n_segment_count
        self.calc_all_likelihoods()

    def likelihood(self, G, r_flag = 1, ibdmax_flag = 1, count_flag = 1):
        """Return the log-likelihood of the observations under relationship ``G``.

        The result is the sum of three components, each multiplied by its
        flag: the beta log-density of ``r_total``, the summed log of the
        per-chromosome max-segment density, and the summed log of the
        per-chromosome segment-count probability.
        """
        log_p_rtotal = scipy.stats.beta.logpdf(self.r_total, *pdfs['r_beta'][G])

        log_p_ibdmax = 0
        log_p_seg_count = 0
        for idx, chrom in enumerate(range(1,15)):
            max_segment_density = evaluate_max_segment_piecewise_pdf(
                self.max_ibd_segment[idx], G, chrom, pdfs['p_max_segment'])
            log_p_ibdmax += np.log(max_segment_density)

            n_segments = self.n_segment_count[idx]
            count_table = pdfs['segment_count'][G][str(chrom)]
            # Counts without their own entry fall back to the 'misc' bucket.
            count_key = n_segments if n_segments in count_table.keys() else 'misc'
            log_p_seg_count += np.log(count_table[count_key])

        return log_p_rtotal * r_flag +  log_p_ibdmax * ibdmax_flag + log_p_seg_count * count_flag

    def calc_all_likelihoods(self, r_flag = 1, ibdmax_flag = 1, count_flag = 1):
        """Score every relationship and derive the per-degree classification.

        Sets ``complete_likelihoods_ms`` (relationship -> log-likelihood),
        ``max_complete_likelihood_ms`` (best-scoring relationship), ``llr``
        (degree -> -2 * (best in-degree - best out-of-degree log-likelihood))
        and ``llr_degree_classification`` (degree with the smallest ``llr``).
        """
        hypotheses = ('PC', 'GC', 'GGC', 'FS', 'HS', 'FAV', 'HAV', 'FCS','MS.MS', 'FAV.MS', 'FCS.MS')
        self.complete_likelihoods_ms = {
            G: self.likelihood(G, r_flag, ibdmax_flag, count_flag) for G in hypotheses
        }
        self.max_complete_likelihood_ms = max(self.complete_likelihoods_ms,
                                              key = self.complete_likelihoods_ms.get)

        self.llr = {}
        for degree in (1, 2, 3):
            in_degree = degree_relationships_ms[degree]
            out_of_degree = [G for G in degree_relationships_ms['all'] if G not in in_degree]
            best_in = max(self.complete_likelihoods_ms[G] for G in in_degree)
            best_out = max(self.complete_likelihoods_ms[G] for G in out_of_degree)
            self.llr[degree] = -2 * (best_in - best_out)

        self.llr_degree_classification = min(self.llr, key = self.llr.get)
        
        
            
def format_ingest_data(comparison):
    """Build a ``Sim`` for every row of ``simulated_M_dict[comparison]``.

    Each row is read as: index, relatedness, 14 segment counts (columns 2-15)
    and the remaining columns as max-segment values. The row index is printed
    for every row whose index is a multiple of 500.

    Returns a ``defaultdict`` mapping row index -> ``Sim``; note that the
    ``Sim`` is created with the row index, not the comparison name, as its
    ``comparison`` argument.
    """
    data_dict = defaultdict(list)
    for row in simulated_M_dict[comparison]:
        sim_idx = row[0]
        if sim_idx % 500 == 0:
            print(sim_idx)
        data_dict[sim_idx] = Sim(sim_idx, row[1], row[16:], row[2:16])
    return data_dict

In [None]:
# Relationship codes grouped by degree (1, 2, 3), plus 'all' listing every code.
# This rebuilds the same mapping that the parameter-loading cell already defines.
degree_relationships_ms = defaultdict(list, {
    1: ['PC', 'FS', 'MS.MS'],
    2: ['GC', 'HS', 'FAV', 'FAV.MS'],
    3: ['GGC', 'HAV', 'FCS', 'FCS.MS'],
    'all': ['PC', 'GC', 'MS.MS',
            'GGC', 'FS', 'HS', 'FAV', 'FAV.MS',
            'HAV', 'FCS', 'FCS.MS'],
})

In [None]:
# comparison -> result of format_ingest_data, i.e. a mapping from each simulated
# row's index to the Sim built from that row. The comparison name is printed as
# a progress marker.
likelihood_M_dict = {}
for comparison in simulated_M_dict:
    print(comparison)
    likelihood_M_dict[comparison] = format_ingest_data(comparison)

In [None]:
# Node pairs grouped by their expected relationship.
pc_nodes = ['P1.F11', 'P2.F11', 'P1.F12', 'P2.F12',
            'F11.F21', 'F11.F22', 'F12.F21', 'F12.F22',
            'F21.F31', 'F21.F32', 'F22.F31', 'F22.F32']
fs_nodes = ['F11.F12', 'F21.F22', 'F31.F32']

# First-degree pairs are the PC pairs followed by the FS pairs.
first_degree_nodes = pc_nodes + fs_nodes
second_degree_nodes = ['P1.F21', 'P1.F22', 'P2.F21', 'P2.F22',
                       'F11.F31', 'F12.F31', 'F11.F32', 'F12.F32']
third_degree_nodes = ['P1.F31', 'P1.F32', 'P2.F31', 'P2.F32']

nodes_dict = {1: first_degree_nodes,
              'PC': pc_nodes,
              'FS': fs_nodes,
              2: second_degree_nodes,
              3: third_degree_nodes}

# answers: node pair -> expected degree (1, 2 or 3).
answers = {}
for degree_label, degree_nodes in ((1, first_degree_nodes),
                                   (2, second_degree_nodes),
                                   (3, third_degree_nodes)):
    for node in degree_nodes:
        answers[node] = degree_label
focus_comparisons = first_degree_nodes + second_degree_nodes + third_degree_nodes

# answers2: first-degree pairs keep their 'PC'/'FS' label; the other degrees
# are labelled with the strings '2' and '3'.
answers2 = {}
for category_label, category_nodes in (('PC', pc_nodes),
                                       ('FS', fs_nodes),
                                       ('2', second_degree_nodes),
                                       ('3', third_degree_nodes)):
    for node in category_nodes:
        answers2[node] = category_label

In [None]:
# Relationship code -> reporting category: second- and third-degree codes
# collapse to '2' / '3', first-degree codes map to themselves.
node_swap_dict = {}
for degree_key, collapsed_label in ((2, '2'), (3, '3')):
    for G in degree_relationships_ms[degree_key]:
        node_swap_dict[G] = collapsed_label
for G in degree_relationships_ms[1]:
    node_swap_dict[G] = G

In [None]:
# Relationship codes grouped by degree (1, 2, 3), plus 'all' listing every code.
# This rebuilds the same mapping that earlier cells already define.
degree_relationships_ms = defaultdict(list, {
    1: ['PC', 'FS', 'MS.MS'],
    2: ['GC', 'HS', 'FAV', 'FAV.MS'],
    3: ['GGC', 'HAV', 'FCS', 'FCS.MS'],
    'all': ['PC', 'GC', 'MS.MS',
            'GGC', 'FS', 'HS', 'FAV', 'FAV.MS',
            'HAV', 'FCS', 'FCS.MS'],
})

In [None]:
# Number of simulation rows tabulated per comparison; every comparison in
# likelihood_M_dict must contain indices 0 .. N_SIMULATIONS - 1.
N_SIMULATIONS = 2500
comparison_nodes = list(likelihood_M_dict.keys())

# Rows = simulation index, columns = comparison. A named loop variable is used
# instead of `_`, which IPython reserves for the last cell output.

# Table 1: degree chosen by the log-likelihood-ratio classification.
reorganized_likelihood_M = np.asarray([
    [likelihood_M_dict[node][sim_idx].llr_degree_classification for node in comparison_nodes]
    for sim_idx in range(N_SIMULATIONS)
])
reorganized_likelihood_df = DataFrame(reorganized_likelihood_M, columns = comparison_nodes)

# Table 2: best-scoring relationship, collapsed through node_swap_dict.
reorganized_likelihood_M2 = np.asarray([
    [node_swap_dict[likelihood_M_dict[node][sim_idx].max_complete_likelihood_ms] for node in comparison_nodes]
    for sim_idx in range(N_SIMULATIONS)
])
reorganized_likelihood_df2 = DataFrame(reorganized_likelihood_M2, columns = comparison_nodes)
reorganized_likelihood_df2.to_excel('serial_inbreeding_likelihood_categories.xlsx', index = False)

In [None]:
# Node pairs excluded from the confusion-matrix columns.
# Every entry needs its own comma: adjacent string literals are silently
# concatenated ('A1.A2' 'A1.A3' becomes 'A1.A2A1.A3'). Each pair is listed once.
unrelated = ['P1.P2', 'P1.A1', 'P1.A2', 'P1.A3',
             'P2.A1', 'P2.A2', 'P2.A3',
             'A1.A2', 'A1.A3', 'A1.F1', 'A1.F12', 'A2.A3',
             'A2.F1', 'A3.F12', 'A2.F2', 'A2.F22', 'A2.F4',
             'A2.F42', 'A2.F12', 'A3.F1',
             'A3.F2', 'A3.F22', 'A3.F3', 'A3.F32']

# Category -> labels counted as a hit for that category. Degrees 2 and 3 also
# accept the collapsed labels in both string and int form.
mod_degree_relationships_ms = defaultdict(list, {
    1: ['PC', 'FS', 'MS.MS'],
    2: ['GC', 'HS', 'FAV', 'FAV.MS', '2', 2],
    3: ['GGC', 'HAV', 'FCS', 'FCS.MS', '3', 3],
    'all': ['PC', 'GC', 'MS.MS',
            'GGC', 'FS', 'HS', 'FAV', 'FAV.MS',
            'HAV', 'FCS', 'FCS.MS', '2', '3', '1', 1, 2, 3],
    'PC': ['PC'],
    'FS': ['FS'],
})

class Diagnostic_Stats:
    """Per-simulation classification statistics for one category.

    Relies on the notebook-level ``nodes_dict``, ``mod_degree_relationships_ms``,
    ``reorganized_likelihood_df2`` and ``unrelated``. Every statistic is a
    Series with one value per row of ``reorganized_likelihood_df2``.
    """

    def __init__(self, degree):
        """Store ``degree`` (a key of ``nodes_dict``) and compute all statistics."""
        self.degree= degree
        self.calculate_confusion_M()

    @classmethod
    def calc_standard_error(cls,p, total):
        """Return the binomial standard error ``sqrt(p * (1 - p) / total)``."""
        return np.sqrt((p*(1-p) / total))

    def calculate_confusion_M(self):
        """Count TP/FP/FN/TN per row and derive the summary statistics.

        Positive columns are the node pairs listed for ``self.degree``;
        negative columns are all other columns that are not in ``unrelated``.
        A cell counts as a positive call when its label is one of
        ``mod_degree_relationships_ms[self.degree]``. Ratios whose denominator
        is zero are left as NaN/inf.
        """
        degree_nodes = list(nodes_dict[self.degree])
        relationship_nodes = mod_degree_relationships_ms[self.degree]
        converse_columns = [column for column in reorganized_likelihood_df2.columns
                            if column not in degree_nodes and column not in unrelated]

        TP_M = reorganized_likelihood_df2[degree_nodes].isin(relationship_nodes)
        FN_M = ~TP_M

        FP_M = reorganized_likelihood_df2[converse_columns].isin(relationship_nodes)
        TN_M = ~FP_M

        # Counts are taken across columns, i.e. once per simulation iteration.
        self.TPC = np.sum(TP_M, axis = 1)
        self.FPC = np.sum(FP_M, axis = 1)
        self.FNC = np.sum(FN_M, axis = 1)
        self.TNC = np.sum(TN_M, axis = 1)

        self.sensitivity = self.TPC / (self.TPC + self.FNC) # recall
        self.specificity = self.TNC / (self.TNC + self.FPC)
        self.ppv = self.TPC / (self.TPC + self.FPC) # precision
        self.npv = self.TNC / (self.FNC + self.TNC)
        self.f1 = 2*self.TPC / (2*self.TPC + self.FPC + self.FNC)

        self.youdens_J = self.sensitivity + self.specificity - 1
        # Matthews correlation coefficient: the denominator is the product of
        # the four marginal totals (TP+FP)(TP+FN)(TN+FP)(TN+FN).
        self.MCC = (self.TPC * self.TNC - self.FPC * self.FNC) / np.sqrt(
            (self.TPC + self.FPC) * (self.TPC + self.FNC)
            * (self.TNC + self.FPC) * (self.TNC + self.FNC))

        
        

In [None]:
# One Diagnostic_Stats per category shown in the bar chart.
categories = ['PC', 'FS', 1,2,3]
diagnostic_stats = {}
for degree in categories:
    diagnostic_stats[degree] = Diagnostic_Stats(degree)

plt.figure(figsize=(8,4))

# Bar height = mean per-iteration F1. Error bars span the 2.5th-97.5th
# percentiles, expressed as distances below/above the mean and clamped at 0
# because matplotlib rejects negative yerr values.
averages = [np.mean(diagnostic_stats[degree].f1) for degree in categories]
ci_lows = [max(average - np.percentile(diagnostic_stats[degree].f1, 2.5), 0)
           for degree, average in zip(categories, averages)]
ci_highs = [max(np.percentile(diagnostic_stats[degree].f1, 97.5) - average, 0)
            for degree, average in zip(categories, averages)]

plt.bar(range(len(categories)), averages,
        capsize = 5, yerr = (ci_lows, ci_highs), color = [color_map_dict['PC'],
                                                          color_map_dict['FS'], 'crimson',
                                                          'blue', '#E2E0E0'])

plt.xticks(range(len(categories)), ['PC', 'FS', '1°', '2°', '3°+'])

plt.ylabel('F1-score', fontsize = 15)
plt.xlabel('Genealogical Relationship', fontsize = 15)
plt.tick_params(axis='both', labelsize = 12)
plt.ylim(0,1)
# NOTE(review): absolute, machine-specific output path — consider a configurable figure directory.
plt.savefig('/Users/weswong/Wirth Lab Dropbox/wes wong/MalKinID/Figures/inbred_pointimportation.svg')


