In [1]:
import random

import numpy as np

In [205]:
from collections import Counter

Davidoff (2015) is "Cone opsin gene variants in color blindness and other vision disorders".

Stockman (1998) is "Red, Green, and Red-Green Hybrid Pigments in the Human Retina".

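I didn't end up using the data from the Stockman paper in the simulation because there are fewer than 50 samples (its counts are still recorded under `dichromat` in `JOINT_STATS` below, but only the `trichromat` entries are read). Even though the Davidoff paper only covers male trichromats, it has examples of nearly every combination of SNPs, and is more comprehensive than any other available dataset.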

In [None]:


# Observed counts of opsin SNP combinations.
#
# Layout: JOINT_STATS[gene][group] -> {'data': counts, 'source': citation},
# where gene is 'M_opsin' or 'L_opsin' and group is 'trichromat' or
# 'dichromat'. Every key of a 'data' dict is a tuple of booleans (two flags
# for M opsin, three for L opsin) and its value is the number of samples
# recorded for that combination.
# NOTE(review): which SNP each tuple position stands for, and whether True
# means "variant present", is not recorded here -- confirm against the
# cited papers.
JOINT_STATS = dict(
    M_opsin=dict(
        trichromat=dict(
            data={
                (True, True): 0,
                (False, True): 2,
                (True, False): 60,
                (False, False): 872,
            },
            source='Davidoff 2015',
        ),
        dichromat=dict(
            data={
                (True, True): 1,
                (True, False): 5,
                (False, True): 1,
                (False, False): 6,
            },
            source='Stockman 1998',
        ),
    ),
    L_opsin=dict(
        trichromat=dict(
            data={
                (True, True, True): 1,
                (False, True, True): 8,
                (True, False, True): 0,
                (True, True, False): 3,
                (False, False, True): 15,
                (False, True, False): 308,
                (True, False, False): 13,
                (False, False, False): 674,
            },
            source='Davidoff 2015',
        ),
        dichromat=dict(
            data={
                (True, True, True): 0,
                (False, True, True): 0,
                (True, False, True): 0,
                (True, True, False): 1,
                (False, False, True): 0,
                (False, True, False): 6,
                (True, False, False): 1,
                (False, False, False): 20,
            },
            source='Stockman 1998',
        ),
    ),
)

In [None]:
def build_joint_pdf(measurements, alpha=0.5):
    """Convert raw combination counts into a smoothed probability table.

    Every count is increased by ``alpha`` (additive smoothing), so a
    combination that was observed zero times still gets a non-zero
    probability whenever ``alpha`` is positive. The smoothed counts are then
    divided by their sum.

    Parameters
    ----------
    measurements : dict
        Maps each combination to its observed count.
    alpha : float, optional
        Pseudo-count added to every combination. Defaults to 0.5.

    Returns
    -------
    dict
        Maps each combination, in the iteration order of ``measurements``,
        to its smoothed relative frequency. Empty input yields an empty dict.
    """
    smoothed = {combo: count + alpha for combo, count in measurements.items()}

    # Accumulate with plain left-to-right additions starting from 0.
    normaliser = 0
    for weight in smoothed.values():
        normaliser += weight

    return {combo: weight / normaliser for combo, weight in smoothed.items()}

In [134]:
# Smoothed joint distribution over the three-flag L-opsin SNP combinations,
# built from the trichromat counts with the default alpha.
l_joint_pdf = build_joint_pdf(JOINT_STATS['L_opsin']['trichromat']['data'])
# Bare expression: display the resulting table as the cell output.
l_joint_pdf

{(True, True, True): 0.0014619883040935672,
 (False, True, True): 0.008284600389863547,
 (True, False, True): 0.0004873294346978557,
 (True, True, False): 0.00341130604288499,
 (False, False, True): 0.015107212475633527,
 (False, True, False): 0.300682261208577,
 (True, False, False): 0.013157894736842105,
 (False, False, False): 0.6574074074074074}

In [155]:
# Smoothed joint distribution over the two-flag M-opsin SNP combinations,
# built from the trichromat counts with the default alpha.
m_joint_pdf = build_joint_pdf(JOINT_STATS['M_opsin']['trichromat']['data'])
# Bare expression: display the resulting table as the cell output.
m_joint_pdf

{(True, True): 0.0005341880341880342,
 (False, True): 0.002670940170940171,
 (True, False): 0.06463675213675214,
 (False, False): 0.9321581196581197}

In [159]:
def sample(joint_pdf):
    """Draw one outcome from a discrete probability table.

    A uniform number in [0, 1) is drawn and the outcomes are walked in
    dictionary order while their probabilities are summed; the first outcome
    whose running total exceeds the draw is returned.

    Parameters
    ----------
    joint_pdf : dict
        Maps each outcome to its probability. The probabilities are expected
        to sum to 1, as the tables returned by ``build_joint_pdf`` do.

    Returns
    -------
    object
        One of the keys of ``joint_pdf``.

    Raises
    ------
    ValueError
        If ``joint_pdf`` is empty, since there is nothing to draw.
    """
    if not joint_pdf:
        # Without this guard the fallback return below would hit an unbound
        # loop variable and raise a confusing UnboundLocalError.
        raise ValueError("joint_pdf must contain at least one outcome")

    r = random.random()
    cumulative = 0.0
    for combo, prob in joint_pdf.items():
        cumulative += prob
        if r < cumulative:
            return combo

    # Floating-point rounding can leave the running total marginally below 1,
    # so a draw very close to 1 may fall through; use the last outcome then.
    return combo

In [160]:
# Quick check: draw a single L-opsin SNP combination and display it.
sample(l_joint_pdf)

(False, False, False)

In [166]:
# Peak value assigned to each SNP combination. The two-flag keys match the
# keys of the M-opsin table and the three-flag keys match the L-opsin table,
# so both gene types can share this single lookup.
# NOTE(review): values are presumably peak wavelengths in nm, and their
# source is not recorded here -- confirm.
PEAKS = {
    # M-opsin peaks, keyed by two-flag SNP combination
    (True, True): 536,
    (True, False): 533,
    (False, True): 533,
    (False, False): 530,

    # L-opsin peaks, keyed by three-flag SNP combination
    (True, True, True): 547,
    (True, True, False): 552,
    (True, False, True): 553,
    (True, False, False): 556.5,
    (False, True, True): 551,
    (False, True, False): 555,
    (False, False, True): 556,
    (False, False, False): 559
}

In [175]:
def sample_peaks(case):
    """Sample one peak per opsin gene named in a genotype label.

    Parameters
    ----------
    case : str
        One of "ML", "M", "MM", "L" or "LL". Each letter stands for one gene
        whose SNP combination is drawn from the matching table
        (``m_joint_pdf`` for "M", ``l_joint_pdf`` for "L").

    Returns
    -------
    list
        The ``PEAKS`` entry for each sampled combination, in the order the
        genes appear in ``case``. Any other ``case`` gives an empty list.
    """
    recognised_cases = ("ML", "M", "MM", "L", "LL")
    if case not in recognised_cases:
        return []

    # Every recognised label is just its genes spelled out left to right.
    peaks = []
    for gene in case:
        gene_pdf = m_joint_pdf if gene == "M" else l_joint_pdf
        peaks.append(PEAKS[sample(gene_pdf)])
    return peaks

In [183]:
def get_random_x(n):
    """Simulate ``n`` independent opsin gene arrangements.

    Each draw picks one genotype label ("ML", "M", "MM", "L" or "LL") with
    the relative weights below, then samples a peak for every gene in that
    label through ``sample_peaks``.

    Parameters
    ----------
    n : int
        Number of arrangements to simulate.

    Returns
    -------
    list of list
        ``n`` lists of peak values, one list per simulated arrangement.
    """
    # Relative weight of each genotype label; the weights add up to 100.
    # NOTE(review): the source of these frequencies is not recorded -- confirm.
    weight_by_genotype = {
        "ML": 92,
        "M": 0.21,
        "MM": 1.89,
        "L": 0.86,
        "LL": 5.04,
    }
    labels = list(weight_by_genotype)
    label_weights = list(weight_by_genotype.values())
    drawn_labels = random.choices(labels, label_weights, k=n)

    simulated = []
    for label in drawn_labels:
        simulated.append(sample_peaks(label))
    return simulated

In [163]:
def simulate_males(n):
    """Simulate ``n`` males, each described by a single ``get_random_x`` draw.

    Returns the list of ``n`` peak lists exactly as ``get_random_x`` produced
    them.
    """
    per_male_peaks = get_random_x(n)
    return per_male_peaks

def simulate_females(n):
    """Simulate ``n`` females, each combining two ``get_random_x`` draws.

    Two independent batches of ``n`` draws are generated and paired up
    position by position; each female's peak list is the first draw's list
    followed by the second's.
    """
    first_batch = get_random_x(n)
    second_batch = get_random_x(n)

    combined = []
    for first, second in zip(first_batch, second_batch):
        combined.append(first + second)
    return combined

In [164]:
def functional_genotype(peaks):
    """Collapse a list of peaks into its distinct values, sorted ascending.

    Duplicate peaks are merged, so the result depends only on which peak
    values occur, not on how many genes carry each one. Returned as a tuple
    so it can be used as a dictionary / ``Counter`` key.
    """
    distinct_peaks = set(peaks)
    ordered_peaks = sorted(distinct_peaks)
    return tuple(ordered_peaks)

In [206]:
# Number of simulated individuals per sex.
N = 100000

# Seed the global RNG so the tallies below are reproducible when the notebook
# is re-run from a fresh kernel.
SEED = 0
random.seed(SEED)

# Tally how often each functional genotype (sorted tuple of distinct peaks)
# occurs among the simulated males and females.
male_simulations = Counter(
    functional_genotype(peaks) for peaks in simulate_males(N)
)
female_simulations = Counter(
    functional_genotype(peaks) for peaks in simulate_females(N)
)

In [217]:
import pandas as pd

In [226]:
def results_to_pandas(simulations):
    """Tabulate simulation tallies as a DataFrame, most frequent first.

    Parameters
    ----------
    simulations : collections.Counter
        Maps each tuple of peaks to the number of times it occurred.

    Returns
    -------
    pandas.DataFrame
        One row per key, ordered by descending count, with the columns
        'Peaks' (the tuple rendered as a comma-separated string), 'Count',
        'Dimension' (number of entries in the tuple) and 'Percentage'
        (the row's share of all counts, in percent, rounded to 4 decimals).
    """
    # Use the tally's own total as the denominator instead of a notebook-level
    # global, so the percentages are right for any tally passed in.
    total = sum(simulations.values())

    results = pd.DataFrame(
        [(peaks, count, len(peaks)) for peaks, count in simulations.most_common()],
        columns=['Peaks', 'Count', 'Dimension']
    )

    results['Percentage'] = (100 * results['Count'] / total).round(4)
    results['Peaks'] = results['Peaks'].apply(lambda x: ', '.join(map(str, x)))

    return results

In [236]:
# Tabulate the male tallies (one row per functional genotype, most common first).
male_table = results_to_pandas(male_simulations)
# Copy the table, without its index column, to the system clipboard.
# Note: this needs a clipboard to be available to the kernel.
male_table.to_clipboard(index=False)
# Display the first five rows, i.e. the five most frequent genotypes.
male_table.head()

Unnamed: 0,Peaks,Count,Dimension,Percentage
0,"530, 559",56529,2,56.529
1,"530, 555",25783,2,25.783
2,"533, 559",4032,2,4.032
3,559,2654,1,2.654
4,"555, 559",1945,2,1.945


In [235]:
# Tabulate the female tallies (one row per functional genotype, most common first).
female_table = results_to_pandas(female_simulations)
# Copy the table, without its index column, to the system clipboard.
# Note: this needs a clipboard to be available to the kernel.
female_table.to_clipboard(index=False)
# Display the first five rows, i.e. the five most frequent genotypes.
female_table.head()

Unnamed: 0,Peaks,Count,Dimension,Percentage
0,"530, 559",37123,2,37.123
1,"530, 555, 559",34689,3,34.689
2,"530, 555",8008,2,8.008
3,"530, 533, 559",5004,3,5.004
4,"530, 533, 555, 559",4217,4,4.217
