# Experience 1

**Name**:  Tina Yung-Fang Tu
**Time Spent**: 15 - 20 hr

In [94]:
import numpy as np
import pandas as pd
import itertools as it

In [95]:
# please set the path to your data directory here
path = "./EPI511/"

# please use the following function (or something like it) to read files
def pname(filename):
    '''Prepend the path to the filename'''
    return path + '/' + filename

def popen(filename):
    '''Open file in the path'''
    return open(pname(filename))

In [96]:
################### functions to read in data ##################
def read_snp(file):
    '''Read a snp file into a pandas dataframe'''
    return(pd.read_table(
        file,
        sep='\s+', # columns are separated by whitespace
        # names of the columns
        names=[None, 'chromosome', 'morgans', 'position', 'ref', 'alt'],
        index_col=0
    ))

SNPs = read_snp(path + 'HapMap3.snp') 

def get_chr_range(chromosome):
    '''Returns the range of positions where SNPs for a chromosome are kept'''
    filt = SNPs.query('chromosome=={}'.format(chromosome))
    start = SNPs.index.get_loc(filt.iloc[0].name)
    stop  = SNPs.index.get_loc(filt.iloc[-1].name) + 1
    return(start, stop)

def read_geno(file):
    '''Reads a geno file into a masked numpy matrix'''
    return(np.genfromtxt(
        file,               # the file
        dtype='uint8',      # read the data in as 1-byte integers
        delimiter=1,        # 1-byte width data
        missing_values=9,   # 9 indicates missing data
        usemask=True        # return a masked array
    ))


def read_geno_pop_chr(pop, chromosome):
    '''Reads a slice of a geno file into a masked numpy matrix'''
    f = open(path + pop + '.geno')      # open the file
    (start, stop) = get_chr_range(chromosome)
    s = it.islice(f, start, stop) # slice the file only keeping SNPs of chr
    return read_geno(s)

In [97]:
######################## solving question 0 #########################

# read genotypes from YRI ind. from chr 3 into masked numpy array
geno = read_geno_pop_chr('YRI', 3)

# get allele freq. (returns '-1' if all values at a locus are missing)
af = geno.mean(axis=1).filled(-1) / 2

# get minor allele frequencies for first 10 SNPs
maf = af[0:10]; maf[maf > 0.5] = 1 - maf[maf > 0.5]

# print out results
for i, j in enumerate (maf):
    print("SNP " + str(i) + ":    " + str(np.round(j,3)))


SNP 0:    0.42
SNP 1:    0.088
SNP 2:    0.158
SNP 3:    0.19
SNP 4:    0.27
SNP 5:    0.129
SNP 6:    0.088
SNP 7:    0.128
SNP 8:    0.08
SNP 9:    0.295


## (1) $F_{ST}$

Using common HapMap3 SNPs from chromosome 1, estimate $F_{ST}(CEU,YRI)$ using each of the two estimators described in Week 1 lecture notes.  Similarly, estimate $F_{ST}(CEU,CHB)$ and $F_{ST}(CHB,YRI)$ using each of the two estimators.  Note: for each SNP, only samples with valid (non-missing) data for that SNP should be included in the computation.

### Solution

In [98]:
def fst_est1(pop1, pop2, chr): 
    pop1_geno = read_geno_pop_chr(pop1, chr)
    pop2_geno = read_geno_pop_chr(pop2, chr)

    # Get allele frequencies (returns '-1' if all values at a locus are missing)
    pop1_af = pop1_geno.mean(axis=1).filled(-1) / 2
    pop2_af = pop2_geno.mean(axis=1).filled(-1) / 2

    # Select valid SNPs present in both populations
    valid_snps = (pop1_af >= 0) & (pop2_af >= 0)

    # Filter valid allele frequencies and genotypes
    af1_filtered = pop1_af[valid_snps]
    af2_filtered = pop2_af[valid_snps]
    pop1_geno_valid = pop1_geno[valid_snps, :]
    pop2_geno_valid = pop2_geno[valid_snps, :]

    # Compute mean allele frequency of two populations
    p = (af1_filtered + af2_filtered) / 2

    # Compute sample sizes
    n1 = np.sum(~pop1_geno_valid.mask, axis=1)
    n2 = np.sum(~pop2_geno_valid.mask, axis=1)

    # Compute Fst
    fst = np.mean(((af1_filtered - af2_filtered) ** 2 - (1 / (2 * n1) + 1 / (2 * n2)) * p * (1 - p)) / (2 * p * (1 - p)))

    return fst

In [99]:
# Approach 1
print('FST(CEU, YRI) using approach 1 (mean)', fst_est1('CEU', 'YRI', 1))
print('FST(CEU, CHB) using approach 1 (mean)', fst_est1('CEU', 'CHB', 1))
print('FST(CHB, YRI) using approach 1 (mean)', fst_est1('CHB', 'YRI', 1))

FST(CEU, YRI) using approach 1 (mean) 0.13419004319009276
FST(CEU, CHB) using approach 1 (mean) 0.0981403003155575
FST(CHB, YRI) using approach 1 (mean) 0.15053565636515728


In [100]:
# Approach 2
def fst_est2(pop1, pop2, chr): 
    pop1_geno = read_geno_pop_chr(pop1, chr)
    pop2_geno = read_geno_pop_chr(pop2, chr)

    # Get allele frequencies (returns '-1' if all values at a locus are missing)
    pop1_af = pop1_geno.mean(axis=1).filled(-1) / 2
    pop2_af = pop2_geno.mean(axis=1).filled(-1) / 2

    # Select valid SNPs present in both populations
    valid_snps = (pop1_af >= 0) & (pop2_af >= 0)

    # Filter valid allele frequencies and genotypes
    af1_filtered = pop1_af[valid_snps]
    af2_filtered = pop2_af[valid_snps]
    pop1_geno_valid = pop1_geno[valid_snps, :]
    pop2_geno_valid = pop2_geno[valid_snps, :]

    # Compute mean allele frequency of two populations
    p = (af1_filtered + af2_filtered) / 2

    # Compute sample sizes
    n1 = np.sum(~pop1_geno_valid.mask, axis=1)
    n2 = np.sum(~pop2_geno_valid.mask, axis=1)

    # Compute Fst
    fst = np.sum(((af1_filtered - af2_filtered) ** 2 - (1 / (2 * n1) + 1 / (2 * n2)) * p * (1 - p)) / np.sum(2 * p * (1 - p)))

    return fst

In [101]:
# Approach 2 
print('FST(CEU, YRI) using approach 2 (sum)', fst_est2('CEU', 'YRI', 1))
print('FST(CEU, CHB) using approach 2 (sum)', fst_est2('CEU', 'CHB', 1))
print('FST(CHB, YRI) using approach 2 (sum)', fst_est2('CHB', 'YRI', 1))

FST(CEU, YRI) using approach 2 (sum) 0.14618376166601865
FST(CEU, CHB) using approach 2 (sum) 0.1062894870506411
FST(CHB, YRI) using approach 2 (sum) 0.1656903964385152


*comments*
Based on the result, CHB and YRI have the largest genetic distance, which is consistent with what we learned in class. Also, the second approach using the summation is more stable because we are calculating FST based on individual SNPs, rather than averaging the allele frequencies across all SNPs. 

## (2) $r^2$

Using CEU data at common HapMap3 SNPs from chromosome 1, compute the average r2 between each pair of consecutive SNPs.  Repeat the analogous computation for CHB and YRI.
Comment on the relative results for CEU, CHB and YRI populations.  Note: when computing the r2 between two SNPs, only samples that have valid (non-missing) data for both SNPs should be included in the computation of r2. 

### Solution

In [102]:
def compute_r2(snp1, snp2):
    # Use masks to filter valid data
    valid = ~snp1.mask & ~snp2.mask
    if np.sum(valid) < 2:
        return np.nan

    gA = snp1[valid].astype(float)
    gB = snp2[valid].astype(float)
    
    if np.std(gA) == 0 or np.std(gB) == 0:
        return np.nan
    
    # Genotype count for allele
    gA_mean = np.mean(gA) 
    gB_mean = np.mean(gB) 

    Na = len(gA)
    Nb = len(gB)

    var_a = ((gA - gA_mean)**2).sum()/Na
    var_b = ((gB - gB_mean)**2).sum()/Nb

    # Compute r^2
    numerator = ((gA *gB).mean() - (gA_mean * gB_mean))**2
    denominator = var_a * var_b
    if denominator == 0:
        return np.nan
    return numerator / denominator

def average_r2(pop, chr):
    geno_data = read_geno_pop_chr(pop, chr)
    r2_values = []

    for i in range(geno_data.shape[0] - 1):
        snp1 = geno_data[i, :]
        snp2 = geno_data[i + 1, :]

        r2 = compute_r2(snp1, snp2)
        if not np.isnan(r2):
            r2_values.append(r2)

    avg_r2 = np.mean(r2_values) if r2_values else np.nan
    return avg_r2

In [103]:
print("Average r² between consecutive SNPs (CEU):", average_r2("CEU", 1))
print("Average r² between consecutive SNPs (CHB):", average_r2("CHB", 1))
print("Average r² between consecutive SNPs (YRI):", average_r2("YRI", 1))

Average r² between consecutive SNPs (CEU): 0.5395113735023291
Average r² between consecutive SNPs (CHB): 0.559498364747758
Average r² between consecutive SNPs (YRI): 0.377933491199528


*comments*
Using the HapMap3 SNPs, we found the r² values for CEU and CHB populations are relatively higher compared to r² for YRI, which is consistent with the fact that non-African descent have higher LD due to out-of Africa population bottleneck. In other words, the lower r² value in YRI population shows greater genetic diversity (more recombination events) and weaker correlations (lower LD) between neighboring SNPs. 

## (3) $r^2_{15}$

Define CEU15 to be the set of the first 15 CEU individuals.  Repeat the computation of (2) using CEU15 instead of CEU, continuing to use data from chromosome 1.  Repeat the analogous computation for CHB15 and for YRI15.  Comment on the differences vs. (2).
Note: any pair of SNPs in which either SNP is monomorphic (minor allele frequency = 0%) in the specific set of samples being analyzed should be omitted from the computation.

### Solution

In [104]:
# Subsetting
def subset_geno(pop, chr, n):
    full_geno = read_geno_pop_chr(pop, chr)
    sub_geno = full_geno[:, :n]
    return sub_geno

# r2 for subset genome
def avg_r2_subset(sub_geno):
    r2_values = []

    for i in range(sub_geno.shape[0] - 1):
        snp1 = sub_geno[i, :]
        snp2 = sub_geno[i + 1, :]
        
        r2 = compute_r2(snp1, snp2)
        if not np.isnan(r2):
            r2_values.append(r2)
    
    avg_r2 = np.mean(r2_values) if r2_values else np.nan
    return avg_r2

In [105]:
ceu15 = subset_geno('CEU', 1, 15)
chb15 = subset_geno('CHB', 1, 15)
yri15 = subset_geno('YRI', 1, 15)

print("Average r² for CEU15 at chromosome 1: ", avg_r2_subset(ceu15))
print("Average r² for CHB15 at chromosome 1: ", avg_r2_subset(chb15))
print("Average r² for YRI15 at chromosome 1: ", avg_r2_subset(yri15))

Average r² for CEU15 at chromosome 1:  0.5621711130349134
Average r² for CHB15 at chromosome 1:  0.5798237624455671
Average r² for YRI15 at chromosome 1:  0.40815801596949286


*comments*
Here, we can see that the general trend of r² is the same when we use a subset of population. YRI still has the lowest r² (lowest LD, greatest diveristy) due to its ancestral role. The slightly higher values that we observe could be due to the fact that these individuals we sampled are more related or homogeneous. With smaller sample sizes, the estimates could be more prone to such bias. 

## (4) Armitage Trend Test

Consider the first SNP (rs3131972) on chromosome 1.  Define the 2 CEU individuals with genotype 0 at this SNP + all 33 CEU individuals with genotype 1 at this SNP + the first 10 CEU individuals with genotype 2 at this SNP to be Cases, and the other 67 CEU individuals to be Controls.  Compute the Armitage Trend Test chisq statistic for CEU individuals at this SNP.   For each of the next 10 SNPs (“tag SNPs”) compute the r2 with rs3131972 in CEU and the Armitage Trend Test chisq statistic for CEU individuals at this SNP.  How do these compare?

### Solution

In [106]:
CEU_SNPs = read_snp(path + 'CEU.snp') 
CEU_SNPs.head


<bound method NDFrame.head of             chromosome  morgans   position ref alt
rs3131972            1      0.0     742584   G   A
rs3131969            1      0.0     744045   G   A
rs3131967            1      0.0     744197   C   T
rs1048488            1      0.0     750775   T   C
rs12562034           1      0.0     758311   G   A
...                ...      ...        ...  ..  ..
rs672932            23      0.0  154569169   G   A
rs669237            23      0.0  154570039   G   T
rs509981            23      0.0  154578239   C   T
rs644138            23      0.0  154580775   A   G
rs557132            23      0.0  154582606   C   T

[718848 rows x 5 columns]>

In [107]:
CEU_geno = read_geno_pop_chr('CEU', 1)
genotypes_rs3131972 = CEU_geno[0, :]
# print(genotypes_rs3131972)

# Define cases and controls
case_0 = np.where(genotypes_rs3131972 == 0)[0][:2]
case_1 = np.where(genotypes_rs3131972 == 1)[0]
case_2 = np.where(genotypes_rs3131972 == 2)[0][:10]

cases = np.concatenate((case_0, case_1, case_2))

# Define controls
all_ind = np.arange(len(genotypes_rs3131972))
controls = np.setdiff1d(all_ind, cases)

In [108]:
# Define phenotype vector
phenotype = np.zeros(len(genotypes_rs3131972))
phenotype[cases] = 1

In [109]:
def armitage_trend_stat(genotype, phenotype):
    # Use masks to filter valid data
    valid = ~genotype.mask
    if not np.sum(valid):
        return np.nan  
    genotype = genotype[valid]
    phenotype = phenotype[valid]
    
    if len(genotype) == 0:
        return np.nan

    r = np.corrcoef(genotype, phenotype)[0, 1]  
    if np.isnan(r):
        return np.nan

    N = len(genotype)
    return N * r ** 2

In [110]:
# Compute for rs3131972
chi2_rs3131972 = armitage_trend_stat(genotypes_rs3131972, phenotype)
print("rs3131972 Armitage Trend Test chi-square =", chi2_rs3131972)

rs3131972 Armitage Trend Test chi-square = 70.83101320370945


In [111]:
# Compute Armitage Trend Test for next 10 SNPs
next_10_snps = CEU_geno[1: 11, :]
r2_values = []

for tag_snp_geno in next_10_snps:
    # Calculate r^2 with rs3131972
    r2 = compute_r2(CEU_geno[0, :], tag_snp_geno)
    r2_values.append(r2)

In [118]:
# Armitage Trend Test for next 10 snps

for i in range(1, 11):
    tag_snp = CEU_geno[i, :].astype(float)

    # Compute r^2 between tag SNP and rs3131972
    r2_tag_rs = compute_r2(genotypes_rs3131972, tag_snp)

    # Compute Armitage Trend Test for the tag SNP
    chi2_tag = armitage_trend_stat(tag_snp, phenotype)

    # Extract SNP name from CEU_SNPs index
    snp_name = CEU_SNPs.index[i]

    print(f"SNP {snp_name}: r^2 =", r2_tag_rs,
          "; Amritage Trend Test chi-square =", chi2_tag)

SNP rs3131969: r^2 = 0.776656860657217 ; Amritage Trend Test chi-square = 51.60935350756532
SNP rs3131967: r^2 = 0.6868546396912072 ; Amritage Trend Test chi-square = 43.051757210385695
SNP rs1048488: r^2 = 0.9999999999999983 ; Amritage Trend Test chi-square = 70.04186046511624
SNP rs12562034: r^2 = 0.0362357302812944 ; Amritage Trend Test chi-square = 3.6906324096528955
SNP rs4040617: r^2 = 0.7477047903830414 ; Amritage Trend Test chi-square = 49.121338781864175
SNP rs4970383: r^2 = 0.033781901882544804 ; Amritage Trend Test chi-square = 2.529664854024452
SNP rs4475691: r^2 = 0.0002376535671140297 ; Amritage Trend Test chi-square = 0.20233583853780723
SNP rs1806509: r^2 = 0.09094990701309916 ; Amritage Trend Test chi-square = 7.310671026766338
SNP rs7537756: r^2 = 0.001498338730574989 ; Amritage Trend Test chi-square = 0.04200105002625064
SNP rs2340587: r^2 = 0.009075395594167007 ; Amritage Trend Test chi-square = 0.5207807118254876


*comments*
Here, we defined the first SNP as the 'casual' SNP, which has a high correlation with the phenotype. As we move away from the causal SNP, we find that the r2 values show a decreasing trend. As r2 values decrease, we also observe that the Amritage Trend Test statistics decrease rapidly, meaning that for SNPs that are not that correlated with the causal SNP, it will be harder to discover strong association between the tag SNP and the phenotype. 

## (5) Unlinked SNPs

Using CEU data at common HapMap3 SNPs from chromosome 1, compute the proportion of SNPs that do NOT have r2>0.8 with either the preceding or following HapMap3 SNP.  Repeat the computation for CHB.  Repeat the computation for YRI.  Compute the proportion of SNPs satisfying the property that in at least one of CEU and CHB they do NOT have r2>0.8 with either the preceding or following HapMap3 SNP.  Finally, compute the proportion of SNPs satisfying the property that in at least one of CEU, CHB and YRI they do NOT have r2>0.8 with either the preceding or following HapMap3 SNP.  Comment on the implications for fine-mapping.  

### Solution

In [113]:
CHB_geno = read_geno_pop_chr('CHB', 1)
YRI_geno = read_geno_pop_chr('YRI', 1)

# Compute proportion

def compute_proportion(geno_data):
    lower_r2 = 0
    total_snps = geno_data.shape[0]

    for i in range(1, total_snps - 1):
        snp = geno_data[i, :]
        prev = geno_data[i - 1, :]
        next = geno_data[i + 1, :]

        r2_prev = compute_r2(prev, snp)
        r2_next = compute_r2(next, snp)
        # Check if neither r2 is greater than 0.8
        if r2_prev <= 0.8 and r2_next <= 0.8:
            lower_r2 += 1
    return lower_r2 / (total_snps - 2) if total_snps > 2 else np.nan


In [114]:
# Compute proportions for each population
ceu_proportion = compute_proportion(CEU_geno)

print("CEU Proportion:", ceu_proportion)

chb_proportion = compute_proportion(CHB_geno)
yri_proportion = compute_proportion(YRI_geno)

print("CHB Proportion:", chb_proportion)
print("YRI Proportion:", yri_proportion)

CEU Proportion: 0.46471124131401725
CHB Proportion: 0.4324681588396063
YRI Proportion: 0.6725317518443757


*comments* 
The higher proportion of unlinked SNPs in YRI population reflects the greater genetic diversity in African populations. In other words, SNPs in YRI population have weaker coorelation with nearby tags. The correlations between SNPs are similar among CEU and CHB populations in this dataset. 

In [115]:
def compute_joint_proportion(ceu_geno, chb_geno):
    lower_r2_joint = 0
    total_snps = ceu_geno.shape[0] # Both population have 55983 SNPs

    for i in range(1, total_snps - 1):
        # Extract SNPs
        ceu_snp = ceu_geno[i, :]
        chb_snp = chb_geno[i, :]
        
        # Get adjacent SNPs for CEU and CHB
        ceu_prev_snp, ceu_next_snp = ceu_geno[i - 1, :], ceu_geno[i + 1, :]
        chb_prev_snp, chb_next_snp = chb_geno[i - 1, :], chb_geno[i + 1, :]

        # Compute r^2 for CEU
        r2_ceu_prev = compute_r2(ceu_prev_snp, ceu_snp)
        r2_ceu_next = compute_r2(ceu_next_snp, ceu_snp)

        # Compute r^2 for CHB
        r2_chb_prev = compute_r2(chb_prev_snp, chb_snp)
        r2_chb_next = compute_r2(chb_next_snp, chb_snp)

        # Check the condition: at least one of CEU or CHB does not satisfy r^2 > 0.8
        ceu_lowr2 = (r2_ceu_prev <= 0.8) and (r2_ceu_next <= 0.8)
        chb_lowr2 = (r2_chb_prev <= 0.8) and (r2_chb_next <= 0.8)
        
        if ceu_lowr2 or chb_lowr2:
            lower_r2_joint += 1

    return lower_r2_joint / (total_snps - 2) 

# Compute the proportion
proportion = compute_joint_proportion(CEU_geno, CHB_geno)
print("Proportion of SNPs satisfying the property:", proportion)


Proportion of SNPs satisfying the property: 0.5246780157553456


*comments* 
When we consider 2 populations together, more that half of the SNPs are isolated or unlinked. This gives us a greater chance of localizing the true casual gene as compared to relying solely on one population. 

In [116]:
def compute_all_proportion(ceu_geno, chb_geno, yri_geno):
    lower_r2_joint = 0
    total_snps = ceu_geno.shape[0] 

    for i in range(1, total_snps - 1):
        # Extract SNPs
        ceu_snp = ceu_geno[i, :]
        chb_snp = chb_geno[i, :]
        yri_snp = yri_geno[i, :]
        
        # Get adjacent SNPs for CEU and CHB
        ceu_prev_snp, ceu_next_snp = ceu_geno[i - 1, :], ceu_geno[i + 1, :]
        chb_prev_snp, chb_next_snp = chb_geno[i - 1, :], chb_geno[i + 1, :]
        yri_prev_snp, yri_next_snp = yri_geno[i - 1, :], yri_geno[i + 1, :]

        # Compute r^2 for CEU
        r2_ceu_prev = compute_r2(ceu_prev_snp, ceu_snp)
        r2_ceu_next = compute_r2(ceu_next_snp, ceu_snp)

        # Compute r^2 for CHB
        r2_chb_prev = compute_r2(chb_prev_snp, chb_snp)
        r2_chb_next = compute_r2(chb_next_snp, chb_snp)

        # Compute r^2 for YRI
        r2_yri_prev = compute_r2(yri_prev_snp, yri_snp)
        r2_yri_next = compute_r2(yri_next_snp, yri_snp)

        # Check the condition: at least one of CEU or CHB does not satisfy r^2 > 0.8
        ceu_lowr2 = (r2_ceu_prev <= 0.8) and (r2_ceu_next <= 0.8)
        chb_lowr2 = (r2_chb_prev <= 0.8) and (r2_chb_next <= 0.8)
        yri_lowr2 = (r2_yri_prev <= 0.8) and (r2_yri_next <= 0.8)
        

        if ceu_lowr2 or chb_lowr2 or yri_lowr2:
            lower_r2_joint += 1

    return lower_r2_joint / (total_snps - 2) # Ignoring the first and last snp for 3 populations

# Compute the proportion
proportion = compute_all_proportion(CEU_geno, CHB_geno, YRI_geno)
print("Proportion of SNPs satisfying the property:", proportion)

Proportion of SNPs satisfying the property: 0.7108661867419303


*comments* 
When we pull together all three populations, especially when joinging the YRI population, the proportion of unlinked genes increases substantially. This means that combining information accross populations will improve fine-mapping resolution, because we would not end up with a large number of highly correlated candidate genes. 