# Experience 3

**Name**:  Tina Yung-Fang Tu
**Time Spent**: 30 hr

In [496]:
import numpy as np
import pandas as pd
import itertools as it

In [497]:
# please set the path to your data directory here
path = './EPI511 3/'

# please use the following function (or something like it) to read files
def pname(name):
    '''Prepend the path to the filename'''
    return path + '/' + name

def popen(name):
    '''Open file in the path'''
    return open(pname(name))

In [498]:
#################### functions to read in data ##################
def read_snp(file):
    '''Read a snp file into a pandas dataframe'''
    return(pd.read_table(
        file,
        sep='\s+', # columns are separated by whitespace
        # names of the columns
        names=[None, 'chromosome', 'morgans', 'position', 'ref', 'alt'],
        index_col=0))

SNPs = read_snp(path + 'HapMap3.snp') 

def get_chr_range(chromosome):
    '''Returns the range of positions where SNPs for a chromosome are kept'''
    filt = SNPs.query('chromosome=={}'.format(chromosome))
    start = SNPs.index.get_loc(filt.iloc[0].name)
    stop  = SNPs.index.get_loc(filt.iloc[-1].name) + 1
    return(start, stop)

def read_geno(file):
    '''Reads a geno file into a masked numpy matrix'''
    return(np.genfromtxt(
        file,               # the file
        dtype='uint8',      # read the data in as 1-byte integers
        delimiter=1,        # 1-byte width data
        missing_values=9,   # 9 indicates missing data
        usemask=True        # return a masked array
    ))

def read_geno_pop_chr(pop, chromosome):
    '''Reads a slice of a geno file into a masked numpy matrix'''
    f = open(path + pop + '.geno')      # open the file
    (start, stop) = get_chr_range(chromosome)
    s = it.islice(f, start, stop) # slice the file only keeping SNPs of chr
    return read_geno(s)

## (1) $\lambda_{GC}$ Correction

Consider a hypothetical case-control association study involving the first 100 CEU samples and the first 100 YRI samples.  Label the first 60 CEU samples and 40 YRI samples as Cases, and the remaining samples as Controls.  Compute case-control association statistics for every SNP on chromosome 20 using the Armitage Trend Test.  Are association statistics inflated? What is λGC?  How does this compare to what would be expected given the FST between Case and Control populations and the #samples?  Apply Genomic Control to correct for stratification.  Are corrected association statistics inflated? What is λGC for the corrected statistics?  Repeat the above computations using only the first 50 CEU samples (first 30 are Cases, rest are Controls) and the first 50 YRI samples (first 20 are Cases, rest are Controls).  How do the results change?    

### Solution

In [499]:
# Read CEU and YRI geno
CEU_geno = read_geno_pop_chr('CEU', 20)
YRI_geno = read_geno_pop_chr('YRI', 20)
CEU_geno.shape

(18359, 112)

In [500]:
# Define cases and controls
CEU_geno_subset = CEU_geno[:, :100]  
YRI_geno_subset = YRI_geno[:, :100]  

geno_data = np.ma.hstack((CEU_geno_subset, YRI_geno_subset))
labels = np.array([1] * 60 + [0] * 40 + [1] * 40 + [0] * 60)  

def armitage_trend_test(genotype, phenotype):
    # Remove missing data
    valid = ~genotype.mask 
    valid_genotype = genotype[valid]
    valid_phenotype = phenotype[valid] 

    if len(valid_genotype) == 0:
        return np.nan
    # Compute correlation coefficient
    r = np.corrcoef(valid_genotype, valid_phenotype)[0, 1]
    if np.isnan(r):
        return np.nan
    
    # Compute chi-square statistic
    N = len(valid_genotype)
    return N * r**2


# Calculate chi-square statistics
results = np.array([armitage_trend_test(geno_data[i, :], labels) for i in range(geno_data.shape[0])])

# Calculate median of observed chi-square statistics
observed_median_chi_sq = np.median(results)

# Compute λGC
lambda_gc = observed_median_chi_sq / 0.455
print("λGC:", lambda_gc)

λGC: 2.014114917340724


In [501]:
def fst_est(pop1, pop2, chr): 
    pop1_geno = read_geno_pop_chr(pop1, chr)
    pop2_geno = read_geno_pop_chr(pop2, chr)

    # Get allele frequencies (returns '-1' if all values at a locus are missing)
    pop1_af = pop1_geno.mean(axis=1).filled(-1) / 2
    pop2_af = pop2_geno.mean(axis=1).filled(-1) / 2

    # Select valid SNPs present in both populations
    valid_snps = (pop1_af >= 0) & (pop2_af >= 0)

    # Filter valid allele frequencies and genotypes
    af1_filtered = pop1_af[valid_snps]
    af2_filtered = pop2_af[valid_snps]
    pop1_geno_valid = pop1_geno[valid_snps, :]
    pop2_geno_valid = pop2_geno[valid_snps, :]

    # Compute mean allele frequency of two populations
    p = (af1_filtered + af2_filtered) / 2

    # Compute sample sizes
    n1 = np.sum(~pop1_geno_valid.mask, axis=1)
    n2 = np.sum(~pop2_geno_valid.mask, axis=1)

    # Compute Fst
    fst = np.sum(((af1_filtered - af2_filtered) ** 2 - (1 / (2 * n1) + 1 / (2 * n2)) * p * (1 - p)) / np.sum(2 * p * (1 - p)))

    return fst

fst_total = fst_est('CEU', 'YRI', 20)
print("FST between CEU and YRI:", fst_total)
print("FST between cases and controls:", fst_total * 0.2 * 0.2)

FST between CEU and YRI: 0.14985831303644273
FST between cases and controls: 0.00599433252145771


In [502]:
# Apply Genomic Control correction
chi_sq_corrected = results / lambda_gc

# Calculate the median of the corrected chi-square statistics
median_corrected = np.median(chi_sq_corrected)
lambda_gc_corrected = median_corrected / 0.455
print("Corrected λGC:", lambda_gc_corrected)

Corrected λGC: 1.0


In [503]:
# Define cases and controls
CEU_geno_subset2 = CEU_geno[:, :50]  
YRI_geno_subset2 = YRI_geno[:, :50]  

geno_data_2 = np.ma.hstack((CEU_geno_subset2, YRI_geno_subset2))  
labels_2 = np.array([1] * 30 + [0] * 20 + [1] * 20 + [0] * 30)  

# Calculate chi-square statistics
results_2 = np.array([armitage_trend_test(geno_data_2[i, :], labels_2) for i in range(geno_data_2.shape[0])])

# Calculate median of observed chi-square statistics
observed_chi_sq_2 = np.median(results_2)

# Compute λGC
lambda_gc_2 = observed_chi_sq_2 / 0.455
print("λGC:", lambda_gc_2)

λGC: 1.4996375875830021


*comments*

The obtained inflation factor (λGC) of about 2.014, meaning that the chi‐square statistic is inflated. The FST between Case and Control populations given the samples are calculated through FST from the whole CEU and YRI population and the other directly from the samples. Given the FST (0.00599), we would expect λGC to be 1 + 200 * FST, which is around 2.199. Therefore, the λGC observed is still higher than expected. After applying Genomic Control to the chi-square statistics, the λGC would be exactly 1. 

For the reduced sample (50 CEU + 50 YRI), the computed inflation factor (λGC) is about 1.4996. This is lower than the previously observed λGC with the full set of samples because the inflation is proportional to the total sample size. 

## (2) Eigenvector Correction

Consider a hypothetical eigenvector for the set of 200 samples from (1), which has value
-1/sqrt(200) for each CEU sample and +1/sqrt(200) for each YRI sample.  Recompute chr20 association statistics from (1) by correcting for this eigenvector, instead of applying Genomic Control.  Are corrected association statistics inflated? What is λGC for the corrected statistics?

### Solution

In [504]:
# Number of samples
n_samples = 200

# Define eigenvector
eigenvector = np.concatenate((-np.ones(100), np.ones(100))) / np.sqrt(n_samples)

def correct_for_eigenvector(genotypes, eigenvector):
    eigenvector = eigenvector[:, np.newaxis]  # Reshape to (200, 1)

    projection_coeff = np.ma.dot(genotypes, eigenvector) / np.dot(eigenvector.T, eigenvector)
    projection_coeff = np.ma.masked_array(projection_coeff, mask=np.ma.getmaskarray(genotypes).any(axis=1))

    projection_coeff_filled = projection_coeff.filled(0)  # Replace masked values with 0
    corrected_genotypes = genotypes - np.dot(projection_coeff_filled, eigenvector.T)

    return corrected_genotypes

corrected_geno_data = correct_for_eigenvector(geno_data, eigenvector)

# Apply the test to each SNP in corrected_geno_data
corrected_chi_sq_results = np.array([armitage_trend_test(corrected_geno_data[i, :], labels) for i in range(corrected_geno_data.shape[0])])

# Calculate median of observed chi-square statistics
adjusted_median_chi_sq = np.median(corrected_chi_sq_results[~np.isnan(corrected_chi_sq_results)])  # Exclude NaN values
print("Adjusted median chi-square:", adjusted_median_chi_sq)

# Compute λGC
lambda_gc_adjusted = adjusted_median_chi_sq / 0.455
print("Adjusted λGC:", lambda_gc_adjusted)

Adjusted median chi-square: 0.502933780385583
Adjusted λGC: 1.1053489678804023


*comments*

After eigenvector correction, the adjusted λGC 1.1053489678804023, which is very close to 1. This means that the population stratification is corrected by the eigenvector. 

## (3) Lactase Persistence

Consider a hypothetical case-control association study of the lactase persistence phenotype involving all 112 CEU samples and 88 TSI samples.  Although this phenotype was not reported, it can be approximated by the genotype at SNP rs13404551, which is strongly correlated to the SNP rs4988235 that is known to perfectly predict lactase phenotype.  Specifically, define CEU or TSI individuals with genotype=0 or 1 at rs13404551 to be Cases, and remaining CEU or TSI individuals to be Controls.  Compute case-control association statistics for every SNP on chromosome 20 using the Armitage Trend Test.  Are association statistics inflated? What is λGC?  How does this compare to what would be expected given the FST between Case and Control populations and the #samples?  Does the chr20 SNP with highest association statistic have higher LD score than the chr20 SNP with lowest association statistic, and is this expected?  (Note: ok to use CEU data only and restrict to SNPs within ±50kb when computing LD scores.) Apply PCA and Structured Association strategies to correct for stratification.  Report the λGC for the corrected statistics in each case.

### Solution

In [505]:
CEU_SNPs = read_snp(path + 'CEU.snp') 
TSI_SNPs = read_snp(path + 'TSI.snp') 

# Get the index of rs13404551 in the CEU genotype data for chromosome 2
chromosome_2_snps_CEU = CEU_SNPs[CEU_SNPs['chromosome'] == 2]

# Find the index of rs13404551 within chromosome 2
rs13404551_index = chromosome_2_snps_CEU.index.get_loc('rs13404551')

# Load the genotype data for chromosome 2
CEU_geno_chr2 = read_geno_pop_chr('CEU', 2)
TSI_geno_chr2 = read_geno_pop_chr('TSI', 2)

CEU_rs13404551 = CEU_geno_chr2[rs13404551_index, :]
TSI_rs13404551 = TSI_geno_chr2[rs13404551_index, :]

combined_genotypes_rs13404551 = np.ma.hstack([CEU_rs13404551, TSI_rs13404551])

# Cases: genotype 0 or 1, Controls: genotype 2
labels_combined = np.where(combined_genotypes_rs13404551 <= 1, 1, 0)

# Count CEU cases and controls
CEU_labels = np.where(CEU_rs13404551 <= 1, 1, 0)
num_CEU_cases = np.sum(CEU_labels == 1)
num_CEU_controls = np.sum(CEU_labels == 0)

# Count TSI cases and controls
TSI_labels = np.where(TSI_rs13404551 <= 1, 1, 0)
num_TSI_cases = np.sum(TSI_labels == 1)
num_TSI_controls = np.sum(TSI_labels == 0)

print(f"CEU cases: {num_CEU_cases}, CEU controls: {num_CEU_controls}")
print(f"TSI cases: {num_TSI_cases}, TSI controls: {num_TSI_controls}")

CEU cases: 105, CEU controls: 7
TSI cases: 38, TSI controls: 50


In [506]:
TSI_geno = read_geno_pop_chr('TSI', 20)

combined_geno_chr20 = np.ma.concatenate([CEU_geno, TSI_geno], axis=1)

# Calculate chi-square statistics
lactose_results = np.array([armitage_trend_test(combined_geno_chr20[i, :], labels_combined) for i in range(combined_geno_chr20.shape[0])])

# Calculate median of observed chi-square statistics
lactose_median_chi_sq = np.median(lactose_results)

# Compute λGC
lambda_gc_lactose = lactose_median_chi_sq / 0.455
print('Median chi-square statistics: ', lactose_median_chi_sq)
print("λGC:", lambda_gc_lactose)

Median chi-square statistics:  0.586452847483451
λGC: 1.2889073571064855


In [507]:
def calculate_fst(case_geno, control_geno):
    # Get allele frequencies for each SNP 
    case_af = case_geno.mean(axis=1).filled(-1) / 2
    control_af = control_geno.mean(axis=1).filled(-1) / 2

    # Select valid SNPs present in both groups (allele frequency is valid if not -1)
    valid_snps = (case_af >= 0) & (control_af >= 0)

    # Filter valid allele frequencies and genotype data
    af_case = case_af[valid_snps]
    af_control = control_af[valid_snps]
    case_geno_valid = case_geno[valid_snps, :]
    control_geno_valid = control_geno[valid_snps, :]

    # Compute the average allele frequency of the two groups for each SNP
    p = (af_case + af_control) / 2

    mask_case = np.ma.getmaskarray(case_geno_valid)
    mask_control = np.ma.getmaskarray(control_geno_valid)
    n_case = np.sum(~mask_case, axis=1)
    n_control = np.sum(~mask_control, axis=1)

    # Compute Fst
    fst = np.sum(((af_case - af_control) ** 2 - (1 / (2 * n_case) + 1 / (2 * n_control)) * p * (1 - p)) / np.sum(2 * p * (1 - p)))

    return fst

In [508]:
# Ensure labels_combined correspond to these individuals (200 samples)
case_lactose_geno = combined_geno_chr20[:, labels_combined == 1]  # SNPs for cases
control_lactose_geno = combined_geno_chr20[:, labels_combined == 0]  # SNPs for controls

fst_case_control_lactose = calculate_fst(case_lactose_geno, control_lactose_geno)
print("FST between Cases and Controls for lactase persistence:", fst_case_control_lactose)
print('Expected λGC based on Fst and N: ', 1 + fst_case_control_lactose * 200)

FST between Cases and Controls for lactase persistence: 0.0015179402950575125
Expected λGC based on Fst and N:  1.3035880590115025


In [509]:
# Identify the SNP with highest and lowest association statistic
max_association_snp_idx = np.argmax(lactose_results)
min_association_snp_idx = np.argmin(lactose_results)

max_association_stat = lactose_results[max_association_snp_idx]
min_association_stat = lactose_results[min_association_snp_idx]

print("Max association SNP index:", max_association_snp_idx)
print("Min association SNP index:", min_association_snp_idx)
print("Max association statistic:", max_association_stat)
print("Min association statistic:", min_association_stat)

Max association SNP index: 17503
Min association SNP index: 4093
Max association statistic: 19.018736104806216
Min association statistic: 2.2688772364345377e-33


In [510]:
def compute_r2(snp1, snp2):
    # Use masks to filter valid data
    valid = ~snp1.mask & ~snp2.mask
    if np.sum(valid) < 2:
        return np.nan

    gA = snp1[valid].astype(float)
    gB = snp2[valid].astype(float)
    
    if np.std(gA) == 0 or np.std(gB) == 0:
        return np.nan
    
    # Genotype count for allele
    gA_mean = np.mean(gA) 
    gB_mean = np.mean(gB) 

    Na = len(gA)
    Nb = len(gB)

    var_a = ((gA - gA_mean)**2).sum()/Na
    var_b = ((gB - gB_mean)**2).sum()/Nb

    # Compute r^2
    numerator = ((gA *gB).mean() - (gA_mean * gB_mean))**2
    denominator = var_a * var_b
    if denominator == 0:
        return np.nan
    return numerator / denominator


def compute_ld_score(geno_data, snp_idx, window_size=50000, snp_positions=None):
    
    # Get target SNP genotype
    target_snp = geno_data[snp_idx, :]
    
    # If SNP positions are provided, find SNPs within ±window_size
    if snp_positions is not None:
        distance_mask = np.abs(snp_positions - snp_positions[snp_idx]) <= window_size
        valid_indices = np.where(distance_mask)[0]  # Get indices of SNPs in range
    else:
        valid_indices = np.arange(geno_data.shape[0])  # Use all SNPs if no positions provided

    num_snps_in_window = len(valid_indices) 
    # Compute r^2 for SNPs within the valid window
    r_squared_values = np.array([compute_r2(target_snp, geno_data[i, :]) for i in valid_indices])

    # Ensure SNP contributes its own LD score
    r_squared_values[valid_indices == snp_idx] = 1.0

    # Sum valid r^2 values to compute LD score
    ld_score = np.nansum(r_squared_values)
    
    return ld_score

In [511]:
# Step 1: Identify max and min association SNPs
max_association_snp_idx = np.argmax(lactose_results)
min_association_snp_idx = np.argmin(lactose_results)

# Step 2: Get SNP positions for chromosome 20
chromosome = 20
start, stop = get_chr_range(chromosome)
snp_positions = SNPs.iloc[start:stop]['position'].values

max_ld_score = compute_ld_score(CEU_geno, max_association_snp_idx, window_size=50000, snp_positions=snp_positions)
min_ld_score = compute_ld_score(CEU_geno, min_association_snp_idx, window_size=50000, snp_positions=snp_positions)

# Step 4: Compare and interpret results
print("LD score near max association SNP:", max_ld_score)
print("LD score near min association SNP:", min_ld_score)

LD score near max association SNP: 6.546252768517539
LD score near min association SNP: 1.9741579567250183


In [512]:
# Define eigenvector
eigenvector = np.concatenate([-np.ones(112) / 112, np.ones(88) / 88])
eigenvector_sq_sum = np.sum(eigenvector ** 2)

copied_geno_chr20 = combined_geno_chr20.astype(float).copy()
eigen_corrected_chr20 = correct_for_eigenvector(copied_geno_chr20, eigenvector)

# Adjust the phenotype vector similarly
eigen_pheno_proj = np.sum(eigenvector * labels_combined) / eigenvector_sq_sum
eigen_pheno_adjusted = labels_combined - eigen_pheno_proj * eigenvector

# Apply the test to each SNP in corrected_geno_data
eigen_corrected_chr20_results = np.array([armitage_trend_test(eigen_corrected_chr20[i, :], eigen_pheno_adjusted) for i in range(eigen_corrected_chr20.shape[0])])

# Calculate median of observed chi-square statistics
eigen_corrected_chi_sq = np.median(eigen_corrected_chr20_results[~np.isnan(eigen_corrected_chr20_results)])  # Exclude NaN values
print("Median chi-square after EIGEN correction:", eigen_corrected_chi_sq)

# Compute λGC
eigen_corrected_lambda_gc = eigen_corrected_chi_sq/ 0.455
print("Adjusted λGC:", eigen_corrected_lambda_gc)

Median chi-square after EIGEN correction: 0.4533902114239909
Adjusted λGC: 0.9964620031296503


In [545]:
def armitage_trend_test_with_sign(genotypes, phenotype): 
    if np.ma.is_masked(genotypes):
        valid_mask = ~genotypes.mask
        g = genotypes.data[valid_mask]
        y = phenotype[valid_mask]
    else:
        g = genotypes
        y = phenotype
    N = len(g)
    r = np.corrcoef(g, y)[0, 1] 
    chi2 = N * (r ** 2)
    return chi2, np.sign(r)

num_snps = combined_geno_chr20.shape[0]

chi2_ceu = np.zeros(num_snps)
Z_ceu = np.zeros(num_snps)
chi2_tsi = np.zeros(num_snps)
Z_tsi = np.zeros(num_snps)

for i in range(num_snps):
    chi2_val, sign_val = armitage_trend_test_with_sign(CEU_geno[i, :], CEU_labels)
    chi2_ceu[i] = chi2_val
    Z_ceu[i] = np.sqrt(chi2_val) * sign_val
    chi2_val, sign_val = armitage_trend_test_with_sign(TSI_geno[i, :], TSI_labels)
    chi2_tsi[i] = chi2_val
    Z_tsi[i] = np.sqrt(chi2_val) * sign_val

w_ceu = np.sqrt(112)
w_tsi = np.sqrt(88)
Z_meta = (w_ceu * Z_ceu + w_tsi * Z_tsi) / np.sqrt(112 + 88)
chi2_meta = Z_meta ** 2
observed_median_meta = np.median(chi2_meta[~np.isnan(chi2_meta)])
lambda_gc_meta = observed_median_meta / 0.455
print("GC after Structured Association:", lambda_gc_meta)

GC after Structured Association: 1.06383269155541


*comments*

For this hypothetical study, the λGC is 1.2889073571064855, which is slightly inflated, but is close to what we expected based on Fst and N (1.3035880590115025). SNPs with higher association statistic also have higher LD scores, and this is as expected. If a SNP is causal or near a causal variant, it will have a high association statistic because it is directly associated with the trait. Nearby SNPs in high LD with the causal variant will also show association. After EIGEN vector correction, the adjusted λGC is 0.9964620031296503, which is very close to 1. The structure association approach also corrected the λGC to approximately 1. 

## (4) 

Consider the SNP rs3131972 on chromosome 1.  Suppose that this is a causal SNP, and define 0 (of 2) CEU individuals with genotype 0 at this SNP + the first 10 (of 33) CEU individuals with genotype 1 at this SNP + the first 50 (of 77) CEU individuals with genotype 2 at this SNP to be Cases, and the other 52 CEU individuals to be Controls.  Define the first 28 (of 69) YRI individuals with genotype 0 at this SNP + the first 28 (of 39) YRI individuals with genotype 1 at this SNP + all 5 (of 5) YRI individuals with genotype 2 at this SNP to be Cases, and the remaining 52 YRI individuals to be Controls.  Now pretend that you don’t know which SNP is the causal SNP, but assume that there is exactly 1 causal SNP in this data.  (a) Using CEU data only, conduct a fine-mapping study at the locus.  What is the posterior probability of each nearby SNP (e.g. within 50kb of rs3131972, including rs3131972 itself) being causal?  (b) Using YRI data only, conduct a fine-mapping study at the locus.  What is the posterior probability of each nearby SNP being causal?  Compare to (a), and discuss.  (c) Using CEU + YRI data, conduct a fine-mapping study at the locus (you can just multiply the Bayes factors).  What is the posterior probability of each nearby SNP being causal?  Compare to (a) and (b), and discuss.   

### Solution

In [525]:
# Extract the SNP and index
YRI_SNPs = read_snp(path + 'YRI.snp') 

# Get the index of rs13404551 in the CEU genotype data for chromosome 2
chromosome_1_snps_YRI = YRI_SNPs[YRI_SNPs['chromosome'] == 1]
rs3131972_index = chromosome_1_snps_YRI.index.get_loc('rs3131972')

In [526]:
# Extract genotypes for rs3131972
CEU_rs3131972 = CEU_geno[rs3131972_index, :]
YRI_rs3131972 = YRI_geno[rs3131972_index, :]

# Initialize labels (0 = Control, 1 = Case)
CEU_labels = np.zeros(CEU_rs3131972.shape, dtype=int)
YRI_labels = np.zeros(YRI_rs3131972.shape, dtype=int)

# CEU case definition
CEU_cases = []
CEU_cases.extend(np.where(CEU_rs3131972 == 1)[0][:10])  # First 10 individuals with genotype 1
CEU_cases.extend(np.where(CEU_rs3131972 == 2)[0][:50])  # First 50 individuals with genotype 2

# YRI case definition
YRI_cases = []
YRI_cases.extend(np.where(YRI_rs3131972 == 0)[0][:28])  # First 28 individuals with genotype 0
YRI_cases.extend(np.where(YRI_rs3131972 == 1)[0][:28])  # First 28 individuals with genotype 1
YRI_cases.extend(np.where(YRI_rs3131972 == 2)[0][:5])   # All 5 individuals with genotype 2

# Assign labels to cases
CEU_labels[CEU_cases] = 1  # Cases in CEU
YRI_labels[YRI_cases] = 1  # Cases in YRI

# Define Controls for CEU and YRI (remaining individuals)
CEU_controls = np.where(CEU_labels == 0)[0]
YRI_controls = np.where(YRI_labels == 0)[0]

# Combine CEU and YRI labels
total_labels = np.concatenate([CEU_labels, YRI_labels])

# Print case-control counts
print("CEU Cases:", np.sum(CEU_labels))
print("CEU Controls:", len(CEU_labels) - np.sum(CEU_labels))
print("YRI Cases:", np.sum(YRI_labels))
print("YRI Controls:", len(YRI_labels) - np.sum(YRI_labels))

CEU Cases: 60
CEU Controls: 52
YRI Cases: 61
YRI Controls: 52


In [527]:
CEU_geno_chr1 = read_geno_pop_chr('CEU', 1)
YRI_geno_chr1 = read_geno_pop_chr('YRI', 1)

total_geno_chr1 = np.hstack((CEU_geno_chr1, YRI_geno_chr1))

# Get SNP positions for chromosome 1
chromosome = 1  
start, stop = get_chr_range(chromosome)
snp_positions = SNPs.iloc[start:stop]['position'].values

# Get the physical position of rs3131972
rs3131972_position = snp_positions[rs3131972_index]

# Create a mask for SNPs within ±50kb of rs3131972
window_mask = (snp_positions >= (rs3131972_position - 50000)) & (snp_positions <= (rs3131972_position + 50000))
window_indices = np.where(window_mask)[0]
print("Number of SNPs within 50kb:", len(window_indices))

Number of SNPs within 50kb: 6


In [547]:
# Define region geno
CEU_region_geno = CEU_geno[0:6, ]  
YRI_region_geno = YRI_geno[0:6, ]  

CEU_region_labels = CEU_labels  
YRI_region_labels = YRI_labels  

def compute_bayes_factors(geno_matrix, labels):
    n_snps, n_samples = geno_matrix.shape
    BF = np.zeros(n_snps)
    
    for i in range(n_snps):
        # Get genotype vector for SNP i
        g = np.ma.asarray(geno_matrix[i, :])
        
        # Identify valid (non-masked) indices
        valid_idx = np.where(~np.ma.getmaskarray(g))[0]
        g_valid = g.data[valid_idx]
        labels_valid = labels[valid_idx]
        
        # Identify case and control indices
        case_idx = np.where(labels_valid == 1)[0]
        ctrl_idx = np.where(labels_valid == 0)[0]
        
        n_case = len(case_idx)
        n_ctrl = len(ctrl_idx)
        if n_case == 0 or n_ctrl == 0:
            BF[i] = np.nan
            continue
        
        # Compute allele counts
        var_case = np.sum(g_valid[case_idx])
        ref_case = 2 * n_case - var_case
        var_ctrl = np.sum(g_valid[ctrl_idx])
        ref_ctrl = 2 * n_ctrl - var_ctrl
        
        var_all = var_case + var_ctrl
        ref_all = ref_case + ref_ctrl
        
        total_case_alleles = var_case + ref_case
        total_ctrl_alleles = var_ctrl + ref_ctrl
        total_all_alleles = var_all + ref_all
        
        # Compute allele frequencies (avoiding log(0) issues)
        p_case = var_case / total_case_alleles
        p_ctrl = var_ctrl / total_ctrl_alleles
        p_all = var_all / total_all_alleles
        
        # Compute log likelihoods
        logL_M1 = (var_case * np.log(p_case) + ref_case * np.log(1 - p_case) +
                   var_ctrl * np.log(p_ctrl) + ref_ctrl * np.log(1 - p_ctrl))
        logL_M0 = (var_all * np.log(p_all) + ref_all * np.log(1 - p_all))
        
        logBF = logL_M1 - logL_M0
        BF[i] = np.exp(logBF)
    return BF

BF_CEU = compute_bayes_factors(CEU_region_geno, CEU_region_labels)
BF_YRI = compute_bayes_factors(YRI_region_geno, YRI_region_labels)

def posterior_from_bf(bf_array):
    return bf_array / bf_array.sum()

post_CEU = posterior_from_bf(BF_CEU)
post_YRI = posterior_from_bf(BF_YRI)

print("=== CEU only ===")
for SNP in range(6):
    print(f'Posterior probability for SNP {SNP+1}, {post_CEU[SNP]}')

print("=== YRI only ===")
for SNP in range(6):
    print(f'Posterior probability for SNP {SNP+1}, {post_YRI[SNP]}')

=== CEU only ===
Posterior probability for SNP 1, 0.2597280237210705
Posterior probability for SNP 2, 0.7402719761538669
Posterior probability for SNP 3, 3.413349227346625e-13
Posterior probability for SNP 4, 4.45840590492611e-11
Posterior probability for SNP 5, 7.36822496941617e-11
Posterior probability for SNP 6, 6.45491950830921e-12
=== YRI only ===
Posterior probability for SNP 1, 0.9683957895514098
Posterior probability for SNP 2, 0.03136714921893613
Posterior probability for SNP 3, 9.301151656611182e-06
Posterior probability for SNP 4, 0.00021242531315166345
Posterior probability for SNP 5, 3.3239606328132306e-06
Posterior probability for SNP 6, 1.2010804212850844e-05


In [548]:
# Multiply BFs for a combined single-causal SNP analysis
BF_combined = BF_CEU * BF_YRI
post_combined = posterior_from_bf(BF_combined)

print("=== Combined Population ===")
for SNP in range(6):
    print(f'Posterior probability for SNP {SNP+1}, {post_combined[SNP]}')

=== Combined Population ===
Posterior probability for SNP 1, 0.9154828456203449
Posterior probability for SNP 2, 0.08451715437961949
Posterior probability for SNP 3, 1.1555691983681623e-17
Posterior probability for SNP 4, 3.4471833210230594e-14
Posterior probability for SNP 5, 8.914505482469829e-16
Posterior probability for SNP 6, 2.821898742852324e-16


*comments*

Looking at CEU data only, we can see that SNP1 and SNP2 have the highest posterior probabilites, which could be due to linkage disequilibrium in the CEU population. Looking at YRI population, we observe similar results but SNP1 now has very high posterior probability, which is also expected since African populations generally have lower LD, allowing finer resolution in mapping the causal SNP. The combined analysis still correctly identifies SNP1 as the most probable causal variant, showing that integrating populations improves inference. The probability might not be as high as what we see in YRI because the Bayes factor was influenced by the CEU population. 

## (5) 

For each SNP analyzed in (4) (e.g. within 50kb of rs3131972, including rs3131972 itself) compute the odds ratio in CEU and the odds ratio in YRI.  Do SNPs with large effect sizes in CEU have large effect sizes in YRI?  Regress log(odds ratio in YRI) vs. log(odds ratio in CEU) (without affine term) to provide a quantitative answer to this question.  Discuss.

In [549]:
# Compute odds ratios for a given window of SNPs
def compute_odds_ratio_for_snp_window(geno, SNP_index, labels, window):
    region_geno = geno[SNP_index:SNP_index + window, :]  # SNPs within the window
    odds_ratios = np.ones(region_geno.shape[0])  # Initialize for storing ORs

    for i in range(region_geno.shape[0]):
        # Get the genotype data for SNP i in the window
        genotypes = region_geno[i, :]
        
        valid_mask = ~genotypes.mask  # Masked individuals will be ignored
        genotypes = genotypes[valid_mask]
        curr_labels = labels[valid_mask]

        if len(genotypes) == 0:
            odds_ratios[i] = np.nan
            continue  

        # Split into case genotypes vs. control genotypes
        case_geno = genotypes[curr_labels == 1]
        ctrl_geno = genotypes[curr_labels == 0]

        # Compute odds ratio for the current SNP
        odds_ratios[i] = compute_odds_ratio(case_geno, ctrl_geno)
    return odds_ratios

# Function to compute odds ratio
def compute_odds_ratio(genotypes_case, genotypes_control):

    # Compute allele counts
    var_case = np.sum(genotypes_case == 2) * 2 + np.sum(genotypes_case == 1)
    ref_case = np.sum(genotypes_case == 0) * 2 + np.sum(genotypes_case == 1)

    var_ctrl = np.sum(genotypes_control == 2) * 2 + np.sum(genotypes_control == 1) 
    ref_ctrl = np.sum(genotypes_control == 0) * 2 + np.sum(genotypes_control == 1) 

    odds_ratio = (var_case / var_ctrl) / (ref_case / ref_ctrl)
    return odds_ratio

In [550]:
window = len(window_indices)

odds_ratio_ceu = compute_odds_ratio_for_snp_window(CEU_geno_chr1, rs3131972_index, CEU_labels, window)
odds_ratio_yri = compute_odds_ratio_for_snp_window(YRI_geno_chr1, rs3131972_index, YRI_labels, window)
# Print results
print("Odds Ratios in CEU for SNPs in window:", odds_ratio_ceu)
print("Odds Ratios in YRI for SNPs in window:", odds_ratio_yri)

# Compute odds ratio regression
def regress_log_odds_ratios(odds_ratios_ceu, odds_ratios_yri):
    valid_indices = ~np.isnan(odds_ratios_ceu) & ~np.isnan(odds_ratios_yri)
    log_or_ceu = np.log(odds_ratios_ceu[valid_indices])
    log_or_yri = np.log(odds_ratios_yri[valid_indices])

    beta = np.dot(log_or_ceu, log_or_yri) / np.dot(log_or_ceu, log_or_ceu)
    return beta

# Perform regression
slope = regress_log_odds_ratios(odds_ratio_ceu, odds_ratio_yri)
print("Slope of log(OR_YRI) vs. log(OR_CEU):", slope)

False False
False False
False False
False False
False False
False False
False False
False False
False False
False False
False False
False False
Odds Ratios in CEU for SNPs in window: [1.1126071  0.86554622 0.70652174 1.13909774 0.46907216 0.79028133]
Odds Ratios in YRI for SNPs in window: [1.05933014 0.95764706 0.93176471 0.89348371 0.45714286 0.94024276]
Slope of log(OR_YRI) vs. log(OR_CEU): 0.7883134143099462


*comments*

SNP1 (the causal SNP) has OR > 1 in both populations. The slope of log(OR_YRI) vs. log(OR_CEU) = 0.79 suggests that the odds ratios (ORs) in the YRI population are generally correlated with those in the CEU population but not perfectly. This could be due to that SNP is in weaker LD with the causal variant in YRI population compared to CEU, so the observed effect size in YRI was be attenuated. But in YRI population, SNP 1 is the only SNP that has OR > 1, further proving its greater abilitiy in inferring causal SNP. 