# Experience 2

**Name**:  Tina Yung-Fang Tu
**Time Spent**: 15-20 hrs

In [1]:
import numpy as np
import pandas as pd
import itertools as it

In [2]:
# please set the path to your data directory here
path = './EPI511 2/'

# please use the following function (or something like it) to read files
def pname(name):
    '''Prepend the path to the filename'''
    return path + '/' + name

def popen(name):
    '''Open file in the path'''
    return open(pname(name))

In [3]:
#################### functions to read in data ##################
def read_snp(file):
    '''Read a snp file into a pandas dataframe'''
    return(pd.read_table(
        file,
        sep='\s+', # columns are separated by whitespace
        # names of the columns
        names=[None, 'chromosome', 'morgans', 'position', 'ref', 'alt'],
        index_col=0))

SNPs = read_snp(path + 'HapMap3.snp') 

def get_chr_range(chromosome):
    '''Returns the range of positions where SNPs for a chromosome are kept'''
    filt = SNPs.query('chromosome=={}'.format(chromosome))
    start = SNPs.index.get_loc(filt.iloc[0].name)
    stop  = SNPs.index.get_loc(filt.iloc[-1].name) + 1
    return(start, stop)

def read_geno(file):
    '''Reads a geno file into a masked numpy matrix'''
    return(np.genfromtxt(
        file,               # the file
        dtype='uint8',      # read the data in as 1-byte integers
        delimiter=1,        # 1-byte width data
        missing_values=9,   # 9 indicates missing data
        usemask=True        # return a masked array
    ))

def read_geno_pop_chr(pop, chromosome):
    '''Reads a slice of a geno file into a masked numpy matrix'''
    f = open(path + pop + '.geno')      # open the file
    (start, stop) = get_chr_range(chromosome)
    s = it.islice(f, start, stop) # slice the file only keeping SNPs of chr
    return read_geno(s)

## (1) Unsupervised clustering without fractional ancestry

(Unsupervised clustering without fractional ancestry) Using common HapMap3 SNPs from chromosome 21, apply the EM algorithm to cluster the set of individuals from TSI, JPT and LWK populations into 3 clusters (0, 1, 2), without allowing fractional ancestry and without using allele frequencies from ancestral populations.  (Initialization step: randomly assign ancestry 0 or 1 or 2 to each individual.  M-step: update allele frequencies for populations 0 or 1 or 2 based on cluster assignments and genotypes.  E-step: compute likelihoods for each individual for membership in each cluster and update cluster membership based on highest likelihood.  Note: all genotypes with missing data should be omitted from the computations.).  Report the cluster memberships for each TSI, JPT and LWK individual after each E-step, for the first few E-steps.  How many E-steps does it take for the EM algorithm to reach convergence?  Comment on the correspondence between membership in inferred clusters and membership in the actual populations TSI, JPT and LWK.

### Solution

In [4]:
TSI_geno = read_geno_pop_chr('TSI', 21)
JPT_geno = read_geno_pop_chr('JPT', 21)
LWK_geno = read_geno_pop_chr('LWK', 21)

In [5]:
np.random.seed(42)
# Combine the three populations horizontally (SNPs x Individuals)
geno_data = np.ma.hstack((TSI_geno, JPT_geno, LWK_geno))

# Remove SNPs with any missing values 
valid_snp_mask = ~np.any(geno_data.mask, axis=1)
geno_data = geno_data[valid_snp_mask, :]

num_clusters = 3
num_individuals = geno_data.shape[1]

In [6]:
# Initialize cluster assignments randomly
num_clusters = 3
num_individuals = geno_data.shape[1]
cluster_assignments = np.random.randint(0, num_clusters, num_individuals)

# Compute allele frequency
def compute_allele_frequencies(geno, clusters, num_clusters):
    """
    Compute allele frequencies for each SNP and each cluster.
    Returns an array of shape (num_clusters, num_SNPs, 3) containing the frequency 
    of genotype 0, 1, and 2 for each SNP in each cluster.
    """
    freqs = np.zeros((num_clusters, geno.shape[0], 3))
    for k in range(num_clusters):
        cluster_indices = np.where(clusters == k)[0]
        if len(cluster_indices) > 0:
            # Extract the genotypes for individuals in cluster k
            cluster_geno = geno[:, cluster_indices]  # Already a masked array, but no missing values remain
            for g in range(3):
                # np.ma.mean ignores masked values automatically
                freq = np.ma.mean(cluster_geno == g, axis=1)
                freqs[k, :, g] = freq.filled(0)
    return freqs

In [7]:
max_iter = 10
n_TSI = TSI_geno.shape[1]
n_JPT = JPT_geno.shape[1]
n_LWK = LWK_geno.shape[1]

# EM algorithm 
for iteration in range(max_iter):

    allele_freqs = compute_allele_frequencies(geno_data, cluster_assignments, num_clusters)
    
    new_assignments = np.zeros(num_individuals, dtype=int)
    snps = np.arange(geno_data.shape[0])
    for i in range(num_individuals):
        individual_geno = geno_data[:, i]
        likelihoods = np.zeros(num_clusters)
        for k in range(num_clusters):
            # For each SNP, pick the probability corresponding to the observed genotype
            geno_probs = allele_freqs[k, snps, individual_geno.data]
            likelihoods[k] = np.sum(np.log(geno_probs + 1e-10))
        new_assignments[i] = np.argmax(likelihoods)
    
    # Report cluster memberships for each population after the E-step
    tsi_assignments = new_assignments[:n_TSI]
    jpt_assignments = new_assignments[n_TSI : n_TSI + n_JPT]
    lwk_assignments = new_assignments[n_TSI + n_JPT : n_TSI + n_JPT + n_LWK]
    
    print(f"Iteration {iteration + 1}:")
    print(f"  TSI cluster assignments: {tsi_assignments}")
    print(f"  JPT cluster assignments: {jpt_assignments}")
    print(f"  LWK cluster assignments: {lwk_assignments}")

    # Check convergence
    if np.all(new_assignments == cluster_assignments):
        print(f"Algorithm converged at iteration {iteration + 1}")
        break
    
    # Update assignments for next iteration
    cluster_assignments = new_assignments.copy()

else:
    print("Reached maximum iterations without full convergence.")


Iteration 1:
  TSI cluster assignments: [2 0 2 2 0 0 2 1 2 2 2 2 0 2 1 0 1 1 1 1 1 0 1 1 0 0 1 2 2 2 1 2 1 1 2 1 2
 2 0 2 0 2 2 0 0 2 1 0 1 1 1 1 1 0 1 2 2 0 2 2 1 0 1 1 1 1 1 1 1 0 2 1 1 1
 1 1 1 2 2 1 2 0 1 0 0 1 2 0]
  JPT cluster assignments: [1 0 0 0 0 2 0 0 0 2 0 0 2 2 2 0 2 2 0 2 0 1 2 1 0 2 0 1 0 2 2 0 0 2 2 2 2
 0 2 0 2 1 2 0 0 1 2 2 1 2 2 0 2 2 1 1 0 2 2 2 0 0 1 0 2 2 0 2 2 0 0 2 2 2
 1 1 1 0 1 0 0 2 1 2 2 0]
  LWK cluster assignments: [2 0 2 1 0 0 0 2 1 0 0 0 2 1 1 2 0 1 0 0 2 1 2 2 1 0 0 1 0 1 1 2 1 2 0 0 0
 0 2 0 1 1 1 2 0 0 0 2 1 1 0 1 1 2 2 2 2 0 2 1 0 1 1 1 2 2 0 0 2 1 0 2 2 2
 1 2 2 2 2 0 0 2 1 0 2 0 0 1 2 2]
Iteration 2:
  TSI cluster assignments: [1 1 2 2 0 0 1 1 2 2 1 2 1 2 1 0 1 1 1 1 1 1 1 1 1 0 1 2 2 1 1 2 1 1 2 1 2
 2 0 2 0 2 2 0 0 2 1 0 1 1 1 1 1 1 1 2 2 0 2 2 1 1 1 1 1 1 1 1 1 0 2 1 1 1
 1 1 1 2 2 1 2 1 1 1 0 1 2 1]
  JPT cluster assignments: [0 0 0 0 0 2 0 0 0 2 0 0 2 2 2 0 2 2 0 2 0 2 2 2 0 2 0 0 0 2 2 0 0 2 2 2 2
 0 2 0 2 0 2 0 0 2 2 2 2 2 2 0 2 2 1 2 0 2 2

*comments*
The algorithm took 7 steps to converge, and it successfully separated 3 populations into 3 clusters, suggesting that the individuals within a population are relatively homogeneous as compared to individuals from another population. The algorithm is effective in assigning individuals to cluster based on allele frequencies. 

## (2) Supervised clustering with fractional ancestry

(Supervised clustering with fractional ancestry) For each of the first 8 ASW samples, estimate their % European ancestry on chromosome 1 by maximizing the log likelihood of generating the ASW genotypes as a linear combination of CEU and YRI ancestral populations.  (Ignore sampling error in CEU and YRI allele frequencies, and ignore the fact that markers are not independent due to LD between markers.  Once again, all genotypes with missing data should be omitted from the computations)  For each of the first 8 ASW samples, repeat the computation without allowing fractional ancestry and indicate which of CEU or YRI ancestry attains the higher log likelihood.

### Solution

In [8]:
# Extract data
ASW_geno = read_geno_pop_chr('ASW', 1)
CEU_geno = read_geno_pop_chr('CEU', 1)
YRI_geno = read_geno_pop_chr('YRI', 1)

In [9]:
valid = ~(CEU_geno.mask.any(axis=1) | YRI_geno.mask.any(axis=1) | ASW_geno.mask.any(axis=1))

# Apply mask to filter SNPs
valid_ASW = ASW_geno[valid]
valid_CEU = CEU_geno[valid]
valid_YRI = YRI_geno[valid]

In [10]:
# Compute the allele frequency for ancestrol population 1 (CEU) and populatiton 2 (YRI)
def compute_allele_frequencies(geno_matrix): 
    return np.mean(geno_matrix, axis=1)/2

p_CEU = compute_allele_frequencies(valid_CEU)
p_YRI = compute_allele_frequencies(valid_YRI)

In [11]:
def compute_log_likelihood(geno, pop1, pop2, alpha):
    
    p_mixed = alpha * pop1 + (1 - alpha) * pop2  

    # Compute probabilities for each genotype
    prob_2 = p_mixed ** 2  
    prob_1 = 2 * p_mixed * (1 - p_mixed)  
    prob_0 = (1 - p_mixed) ** 2  

    # Compute log-likelihood
    log_likelihood = np.sum(
        (geno == 2) * np.log(prob_2) +  
        (geno == 1) * np.log(prob_1) +  
        (geno == 0) * np.log(prob_0)
    )
    
    return log_likelihood

In [12]:
def estimate_fractional_ancestry(geno, pop1, pop2):
    """Finds the alpha that maximizes log-likelihood."""
    alphas = np.linspace(0, 1, 101)  
    best_alpha = max(alphas, key=lambda a: compute_log_likelihood(geno, pop1, pop2, a))
    return best_alpha

In [13]:
# Analyze first 8 ASW samples
for i in range(8):
    fractional_ancestry = estimate_fractional_ancestry(valid_ASW[:, i], p_CEU, p_YRI)
    print(f"ASW Sample {i+1}: Fractional European Ancestry = {fractional_ancestry}")

ASW Sample 1: Fractional European Ancestry = 0.12
ASW Sample 2: Fractional European Ancestry = 0.19
ASW Sample 3: Fractional European Ancestry = 0.43
ASW Sample 4: Fractional European Ancestry = 0.2
ASW Sample 5: Fractional European Ancestry = 0.11
ASW Sample 6: Fractional European Ancestry = 0.07
ASW Sample 7: Fractional European Ancestry = 0.35000000000000003
ASW Sample 8: Fractional European Ancestry = 0.08


In [14]:
# Compute log-likelihoods for each sample without fractional ancestry 
log_likelihoods = []
for i in range(8):
    ll_ceu = compute_log_likelihood(valid_ASW[:, i], p_CEU, p_YRI, alpha=1)  # CEU ancestry
    ll_yri = compute_log_likelihood(valid_ASW[:, i], p_CEU, p_YRI, alpha=0)  # YRI ancestry
    log_likelihoods.append(("CEU" if ll_ceu > ll_yri else "YRI"))
    print(f"ASW Sample {i+1}: Ancestry = {log_likelihoods[i]}")

ASW Sample 1: Ancestry = YRI
ASW Sample 2: Ancestry = YRI
ASW Sample 3: Ancestry = YRI
ASW Sample 4: Ancestry = YRI
ASW Sample 5: Ancestry = YRI
ASW Sample 6: Ancestry = YRI
ASW Sample 7: Ancestry = YRI
ASW Sample 8: Ancestry = YRI


*comments* 
For the ASW population, the fractional European ancestry varies from 0.07 to 0.43, showing an European and African admixture. Yet if we do not allow fractional ancestry, the YRI ancestry attains the higher log likelihood among all the 8 ASW individuals, meaning that the African ancestry contribute the majority of ancestral component in the ASW population. 

## (3) Covariances

Using common HapMap3 SNPs from chromosome 1, compute and print the 8 x 8 matrix of covariances between normalized genotypes of the first  ASW samples on chromosome 1. Starting with a normalized random vector, use the method of power iteration (normalizing the resulting vector at each iteration) to approximate the top eigenvector of this matrix.  Check that the definition of eigenvector is satisfied.  What is the correlation between this eigenvector and the vector of % European ancestry from Problem 2?     

### Solution

In [15]:
# Normalize the matrix and extract the first 8 individuals
ASW_sample = ASW_geno[:, :8]
normalized_ASW = (ASW_sample - np.mean(ASW_sample, axis=1, keepdims=True)) / np.std(ASW_sample, axis=1, keepdims=True)
print(normalized_ASW.shape)

(55983, 8)


In [16]:
# Compute covariance matrix manually (8 × 8)
num_SNPs = normalized_ASW.shape[0]
num_individuals = normalized_ASW.shape[1]

cov_matrix = np.zeros((num_individuals, num_individuals))

for i in range(num_individuals):
    for j in range(num_individuals):
        cov_ij = np.sum(normalized_ASW[:, i] * normalized_ASW[:, j]) / (num_SNPs - 1)  
        cov_matrix[i, j] = cov_ij  

print(cov_matrix)

[[ 0.97396317 -0.12961659 -0.15345579 -0.13348891 -0.12397316 -0.13924061
  -0.14362328 -0.15056483]
 [-0.12961659  0.93341261 -0.13667759 -0.11511032 -0.13709511 -0.15352397
  -0.13494362 -0.1264454 ]
 [-0.15345579 -0.13667759  1.03763682 -0.15355446 -0.1718135  -0.14956779
  -0.12483786 -0.14772983]
 [-0.13348891 -0.11511032 -0.15355446  0.99407126 -0.12829303 -0.13499524
  -0.17323414 -0.15539516]
 [-0.12397316 -0.13709511 -0.1718135  -0.12829303  0.96079132 -0.12963823
  -0.14583642 -0.12414187]
 [-0.13924061 -0.15352397 -0.14956779 -0.13499524 -0.12963823  0.95244681
  -0.13598058 -0.10950039]
 [-0.14362328 -0.13494362 -0.12483786 -0.17323414 -0.14583642 -0.13598058
   0.99572629 -0.13727039]
 [-0.15056483 -0.1264454  -0.14772983 -0.15539516 -0.12414187 -0.10950039
  -0.13727039  0.95104786]]


In [17]:
# Start with a random normalized vector
vec = np.random.rand(8)
vec /= np.linalg.norm(vec)
num_iterations = 1000

# Iteration
for k in range(num_iterations):
    new_vec = np.dot(cov_matrix, vec)  
    new_vec /= np.linalg.norm(new_vec)  

    vec = new_vec

print("\nTop Eigenvector Approximation:\n", vec)


Top Eigenvector Approximation:
 [ 0.20050247  0.05309918 -0.76177738  0.36888356  0.32840843  0.10370944
 -0.34530753  0.05248183]


In [18]:
# Check that the definition of eigenvector is satisfied
c_v = np.dot(cov_matrix, vec)
lambda_est = np.linalg.norm(c_v) / np.linalg.norm(vec)
lambda_v = lambda_est * vec

print("C * v = ", c_v)
print("lambda * v = ", lambda_v)

if np.allclose(c_v, lambda_v):
    print('The definition of eigenvector is satisfied. ')

C * v =  [ 0.24259466  0.06424648 -0.92169998  0.44632458  0.39735237  0.12548153
 -0.41779916  0.06349952]
lambda * v =  [ 0.24259466  0.06424648 -0.92169998  0.44632458  0.39735237  0.12548153
 -0.41779916  0.06349952]
The definition of eigenvector is satisfied. 


In [19]:

# Fractional European ancestry 
european_ancestry = np.array([0.12, 0.19, 0.43, 0.20, 0.11, 0.07, 0.35, 0.08])

# Compute the Pearson correlation between the eigenvector and ancestry vector
correlation = np.corrcoef(vec, european_ancestry)[0, 1]

print("Correlation between the top eigenvector and % European ancestry:", correlation)

Correlation between the top eigenvector and % European ancestry: -0.8421089448622563


*comments*
Since the definition of eigenvector is satisfied, we conclude that the computed vector is indeed an eigenvector of the covariance matrix.
The high correlation of 0.8421 suggests that the top eigenvector from the covariance matrix is strongly associated with the proportion of European ancestry. In other words, the main axis of genetic variation (captured by the top eigenvector) is reflecting the differences in European admixture among the individuals in this sample population.

## (4) Local ancestry inference with ancestral populations

(Local ancestry inference with ancestral populations) For each of the first 8 ASW samples, estimate their local ancestry (0, 1 or 2 European copies) at each location on chromosome 1 by splitting chromosome 1 into 10Mb windows and then, for each 10Mb window, applying supervised clustering (using CEU and YRI ancestral populations) within that window in a mode that allows 0%, 50% or 100% ancestry within that window.  Average together the local ancestry estimates to estimate the % European ancestry of each sample on chromosome 1.  How correlated are these estimates to the estimates of the same quantity produced in Problem 2?
(Note: any 10Mb windows containing 0 SNPs should be ignored.)


### Solution

In [20]:
# Get SNP positions for a specific chromosome (e.g., chromosome 1)
chromosome = 1  
start, stop = get_chr_range(chromosome)
snp_positions = SNPs.iloc[start:stop]['position'].values

# Calculate the average distance between consecutive SNPs
snp_differences = np.diff(snp_positions)
avg_distance = np.mean(snp_differences)  # average distance between SNPs in base pairs (bp)

snp_per_mb = 1_000_000 / avg_distance  # SNPs per 1Mb

snp_per_10mb = snp_per_mb * 10  # SNPs per 10Mb
print(f"SNPs in 10Mb: {snp_per_10mb}")


SNPs in 10Mb: 2272.0451632999484


In [21]:
# Splitting chromosome 1 into 10 Mb windows
window_size = int(snp_per_10mb) 

# Estimate local ancestry 
def estimate_local_ancestry(geno, pop1_geno, pop2_geno, window_size):
    num_samples = geno.shape[1]
    num_snps = geno.shape[0]
    windows = np.arange(0, num_snps, window_size)  # SNP-based windows
    local_ancestry = np.zeros((num_samples, len(windows)))  # Store ancestry per window

    for w, start in enumerate(windows):
        end = min(start + window_size, num_snps)

        geno_window = geno[start:end, :]
        pop1_window = pop1_geno[start:end, :]
        pop2_window = pop2_geno[start:end, :]

        valid_snps = ~np.isnan(geno_window).all(axis=1)
        if np.sum(valid_snps) == 0:
            continue  # Skip empty windows

        geno_window = geno_window[valid_snps, :]
        pop1_window = pop1_window[valid_snps, :]
        pop2_window = pop2_window[valid_snps, :]

        # Compute allele frequencies
        p1 = compute_allele_frequencies(pop1_window)  # CEU allele frequency
        p2 = compute_allele_frequencies(pop2_window)  # YRI allele frequency

        for i in range(num_samples):
            sample_geno = geno_window[:, i]

            # Compute log-likelihoods 
            ll_0 = compute_log_likelihood(sample_geno, p1, p2, alpha=0.0)  # 0% European
            ll_1 = compute_log_likelihood(sample_geno, p1, p2, alpha=0.5)  # 50% European
            ll_2 = compute_log_likelihood(sample_geno, p1, p2, alpha=1.0)  # 100% European

            # Assign ancestry based on maximum likelihood
            ancestry_state = np.argmax([ll_0, ll_1, ll_2])  
            local_ancestry[i, w] = ancestry_state

    # Compute final % European ancestry per sample
    return np.mean(local_ancestry, axis=1)/2  

In [64]:
local_ancestry = estimate_local_ancestry(ASW_sample, CEU_geno, YRI_geno, window_size)
for i in range(8):
    print(f'ASW Sample {i+1}: {local_ancestry[i]*100} % European Ancestry')

# Compute the correlation
european_ancestry = np.array([0.12, 0.19, 0.43, 0.20, 0.11, 0.07, 0.35, 0.08])
fractional_local_correlation = np.corrcoef(local_ancestry, european_ancestry)[0, 1]

print("Correlation between European ancestry based on local ancestry and fractional ancestry:", fractional_local_correlation)

ASW Sample 1: 0.14 % European Ancestry
ASW Sample 2: 0.22 % European Ancestry
ASW Sample 3: 0.4 % European Ancestry
ASW Sample 4: 0.24 % European Ancestry
ASW Sample 5: 0.12 % European Ancestry
ASW Sample 6: 0.04 % European Ancestry
ASW Sample 7: 0.38 % European Ancestry
ASW Sample 8: 0.08 % European Ancestry
Correlation between European ancestry based on local ancestry and fractional ancestry: 0.9793552469889495


*comments*
The high correlation between the calculated European ancestry based on local ancestry and fractional ancestry indicates that supervised clustering for estimating local ancestry is very accurate in reflecting the true European ancestry at each location on chromosome 1. 

## (5) Local ancestry inference with ancestral populations

For each of the first 8 ASW samples, estimate their local ancestry at each location on chromosome 1 (analogous to Problem 4) by either (a) restricting to chromosome 1 SNPs with allele frequency difference |pCEU – pYRI| ≥ 0.70, or (b) restricting to a random subset of chromosome 1 SNPs based on the fraction of SNPs retained in (a).  Discuss how results, including correlations to results of Problem 4, change in each case.



### Solution

In [129]:
def local_ancestry_restricted(geno, pop1_geno, pop2_geno):
    # Determine the number of ASW samples (columns in the genotype matrix)
    num_samples = geno.shape[1]
    
    # Compute allele frequencies for the reference populations (CEU and YRI)
    p_CEU_all = compute_allele_frequencies(pop1_geno)
    p_YRI_all = compute_allele_frequencies(pop2_geno)
    
    valid_mask = ~np.isnan(p_CEU_all) & ~np.isnan(p_YRI_all) 
    combined_mask = valid_mask & (np.abs(p_CEU_all - p_YRI_all) >= 0.70) 
    
    # Apply the mask to restrict the genotype matrices to only these SNPs
    geno_filtered = geno[combined_mask, :]
    pop1_geno_filtered = pop1_geno[combined_mask, :]
    pop2_geno_filtered = pop2_geno[combined_mask, :]
    
    # Count the number of SNPs remaining after filtering
    num_significant = geno_filtered.shape[0]
    print("Number of significant SNPs:", num_significant)
    
    # Initialize an array to store the ancestry state (0, 1, or 2 European copies) for each sample at each SNP
    ancestry_states = np.zeros((num_samples, num_significant))
    
    # Loop over each filtered SNP
    for s in range(num_significant):
        p1 = np.mean(pop1_geno_filtered[s, :]) / 2  # CEU allele frequency at SNP s
        p2 = np.mean(pop2_geno_filtered[s, :]) / 2  # YRI allele frequency at SNP s
        
        # Loop over each ASW sample to estimate ancestry at SNP s
        for i in range(num_samples):
            geno_val = np.ma.filled(geno_filtered[s, i], -1)
            p1_arr = np.array([p1])
            p2_arr = np.array([p2])
            
            # Compute log-likelihoods for three ancestry states:
            ll_0 = compute_log_likelihood(geno_val, p1_arr, p2_arr, alpha=0.0)
            ll_1 = compute_log_likelihood(geno_val, p1_arr, p2_arr, alpha=0.5)
            ll_2 = compute_log_likelihood(geno_val, p1_arr, p2_arr, alpha=1.0)
            
            # Choose the state with the highest likelihood.
            state = np.argmax([ll_0, ll_1, ll_2])
            ancestry_states[i, s] = state
    
    # Average the ancestry state across all significant SNPs for each sample.
    final_ancestry = np.mean(ancestry_states, axis=1) / 2    
    return final_ancestry


In [130]:
new_local_ancestry_values = local_ancestry_restricted(ASW_sample, CEU_geno, YRI_geno)
for i in range(8):
    print(f'ASW Sample {i+1}: {new_local_ancestry_values[i]} % European Ancestry based on SNPs with allele frequency difference ≥ 0.70')

# Compute the correlation
fractional_correlation = np.corrcoef(local_ancestry, new_local_ancestry_values)[0, 1]
print("Correlation between calculated local ancestry using all SNPs and using significant SNPs:", fractional_correlation)

Number of significant SNPs: 217
ASW Sample 1: 0.16589861751152074 % European Ancestry based on SNPs with allele frequency difference ≥ 0.70
ASW Sample 2: 0.23271889400921658 % European Ancestry based on SNPs with allele frequency difference ≥ 0.70
ASW Sample 3: 0.44930875576036866 % European Ancestry based on SNPs with allele frequency difference ≥ 0.70
ASW Sample 4: 0.27419354838709675 % European Ancestry based on SNPs with allele frequency difference ≥ 0.70
ASW Sample 5: 0.2465437788018433 % European Ancestry based on SNPs with allele frequency difference ≥ 0.70
ASW Sample 6: 0.2119815668202765 % European Ancestry based on SNPs with allele frequency difference ≥ 0.70
ASW Sample 7: 0.39631336405529954 % European Ancestry based on SNPs with allele frequency difference ≥ 0.70
ASW Sample 8: 0.15207373271889402 % European Ancestry based on SNPs with allele frequency difference ≥ 0.70
Correlation between calculated local ancestry using all SNPs and using significant SNPs: 0.910018301983057

In [120]:
def local_ancestry_random(geno, pop1_geno, pop2_geno, num_significant):
    # Determine the number of ASW samples (columns in the genotype matrix)
    num_samples = geno.shape[1]
    
    # Randomly select 217 SNPs from chromosome 1 
    snp_indices = np.random.choice(geno.shape[0], num_significant, replace=False)
    
    # Filter the genotype matrices based on the randomly selected SNPs
    geno_filtered = geno[snp_indices, :]
    pop1_geno_filtered = pop1_geno[snp_indices, :]
    pop2_geno_filtered = pop2_geno[snp_indices, :]
    
    # Initialize an array to store the ancestry state 
    ancestry_states = np.zeros((num_samples, num_significant))
    
    # Loop over each selected SNP
    for s in range(num_significant):
        p1 = np.mean(pop1_geno_filtered[s, :]) / 2  
        p2 = np.mean(pop2_geno_filtered[s, :]) / 2  
        
        # Loop over each ASW sample 
        for i in range(num_samples):
            geno_val = np.ma.filled(geno_filtered[s, i], -1)
            p1_arr = np.array([p1])
            p2_arr = np.array([p2])
            
            # Compute log-likelihoods for three ancestry states:
            ll_0 = compute_log_likelihood(geno_val, p1_arr, p2_arr, alpha=0.0)
            ll_1 = compute_log_likelihood(geno_val, p1_arr, p2_arr, alpha=0.5)
            ll_2 = compute_log_likelihood(geno_val, p1_arr, p2_arr, alpha=1.0)
            
            # Choose the state with the highest likelihood
            state = np.argmax([ll_0, ll_1, ll_2])
            ancestry_states[i, s] = state
    
    # Average the ancestry state across all randomly selected SNPs for each sample
    local_ancestry_estimates = np.mean(ancestry_states, axis=1) / 2  
    return local_ancestry_estimates

In [126]:
# Number of significant SNPs
num_significant = 217
np.random.seed(11)

random_local_ancestry_values = local_ancestry_random(ASW_sample, CEU_geno, YRI_geno, num_significant=217)
for i in range(8):
    print(f'ASW Sample {i+1}: {random_local_ancestry_values[i]} % European Ancestry based on random 217 SNPs')

# Compute the correlation
random_correlation = np.corrcoef(local_ancestry, random_local_ancestry_values)[0, 1]
print("Correlation between calculated local ancestry using all SNPs and using random SNPs:", random_correlation)

ASW Sample 1: 0.47465437788018433 % European Ancestry based on random 217 SNPs
ASW Sample 2: 0.4608294930875576 % European Ancestry based on random 217 SNPs
ASW Sample 3: 0.45852534562211983 % European Ancestry based on random 217 SNPs
ASW Sample 4: 0.49078341013824883 % European Ancestry based on random 217 SNPs
ASW Sample 5: 0.4447004608294931 % European Ancestry based on random 217 SNPs
ASW Sample 6: 0.4078341013824885 % European Ancestry based on random 217 SNPs
ASW Sample 7: 0.4470046082949309 % European Ancestry based on random 217 SNPs
ASW Sample 8: 0.4423963133640553 % European Ancestry based on random 217 SNPs
Correlation between calculated local ancestry using all SNPs and using random SNPs: 0.4157118515847857


*comments*
When using the SNPs with allele frequency difference |pCEU – pYRI| ≥ 0.70, we found that the estimated European ancestrty is very accurate. Also, the correlation of 0.91 between the calculated local ancestry using all SNPs and the significant SNPs is very high, indicating that the significant SNPs capture much of the information we need to infer local ancestry. However, because fewer markers are used, there are still slight deviations from the full SNP estimates. 
When randomly selecting 217 SNPs from chromosome 1, the estimated European ancestry is relatively unstable based on the SNPs we choose. The correlation between the calculated ancestry using all SNPs and the random SNPs is also much lower (0.42), suggesting that the random SNPs do not correlate as strongly with the full set of SNPs, therefore would not be a good proxy for local ancestry inference. 