In [1]:
import os
import gzip
import pandas as pd
from scipy.stats import spearmanr

In [2]:
def get_fp_file(pop_csv_file, fp_folder):
    """Group fingerprint file paths by population.

        Arguments
            pop_csv_file {string} -- path to a CSV file with a 'Sample' column;
                the second column (by position) is used as the population label
            fp_folder {string} -- folder that is scanned (non-recursively) for
                files ending in '.outn.gz'; a trailing path separator is optional

        Returns
            fp_file_dict {dict} -- population label -> list of fingerprint paths

        Raises
            IndexError -- a fingerprint file's sample name has no row in the CSV
    """
    # sample names and their population as a dataframe
    pop_csv = pd.read_csv(pop_csv_file)

    fp_file_dict = {}
    for fp_file in os.listdir(fp_folder):
        if not fp_file.endswith(".outn.gz"):
            continue

        # the sample name is everything before the first '.' of the file name
        sample = fp_file.split('.')[0]
        row = pop_csv.loc[pop_csv['Sample'] == sample]
        if row.empty:
            # same exception type as the bare positional lookup would raise,
            # but with a message that names the offending sample
            raise IndexError(
                "sample {!r} (from file {!r}) not found in {!r}".format(
                    sample, fp_file, pop_csv_file))
        pop = row.iloc[0, 1]

        # os.path.join works whether or not fp_folder ends with a separator
        fp_file_dict.setdefault(pop, []).append(os.path.join(fp_folder, fp_file))

    return fp_file_dict

In [3]:
## parse for fingerprints of L=120
def read_fp(fp_file,L=120):
    """Read the fingerprints of length L from a gzipped, tab-separated file.

    Every line is stripped and split on tabs. Lines whose first field equals
    str(L) are kept: the second field becomes the dictionary key and all
    remaining fields are converted to floats. Other lines are ignored.

    Returns a dictionary mapping each key to its list of float values.
    """
    wanted_length = str(L)
    fingerprints = {}

    with gzip.open(fp_file, 'rt') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split('\t')
            if fields[0] != wanted_length:
                continue
            fingerprints[fields[1]] = list(map(float, fields[2:]))

    return fingerprints

In [8]:
def read_pop_fp(fp_file_dict, pop, L=120):
    """Reads all fingerprints from a population

        Arguments
            fp_file_dict {dict} -- population -> list of fingerprint file paths
            pop {string} -- population whose fingerprint files are read
            L {int} -- fingerprint length forwarded to read_fp (default: 120)

        Returns
            pop_fp {dict} -- sample name -> fingerprint dictionary from read_fp

        Raises
            KeyError -- pop is not a key of fp_file_dict
    """
    pop_fp = {}
    for fp_path in fp_file_dict[pop]:
        # the sample name is the file name up to its first '.';
        # basename handles the platform's path separator
        sample = os.path.basename(fp_path).split('.')[0]
        pop_fp[sample] = read_fp(fp_path, L=L)

    return pop_fp

In [5]:
def compute_loo_population(pop_fp, loo_sample):
    """Leave-one-out population fingerprint (not implemented yet).

        Arguments
            pop_fp {dict} -- dictionary of all fingerprints from a population
            loo_sample {string} -- sample to leave out of the population fingerprint

        Returns
            None -- placeholder; the intended result is a dictionary
            representing the population fingerprint, but the computation has
            not been written yet

        Raises
            KeyError -- loo_sample is not a key of pop_fp
    """
    # TODO: build the population fingerprint from every sample except loo_sample
    left_out_fp = pop_fp[loo_sample]
    fp_keys = list(left_out_fp.keys())
    population_fp = {}

    return None

In [16]:
# inputs: CSV with sample names and populations, and the folder of fingerprint files
pop_csv_file = '/Users/victor/github/genome-fingerprints/data/sample_names_6_pop.csv'
fp_folder = '/Users/victor/Documents/results_chr21/k2/mean/'

# group the fingerprint file paths by population
fp_file_dict = get_fp_file(pop_csv_file, fp_folder)

# take the first population in the dictionary and load all of its fingerprints
pop = list(fp_file_dict)[0]
pop_fp = read_pop_fp(fp_file_dict, pop)

# take the first loaded sample as the one to leave out, and display its name
loo_sample = list(pop_fp)[0]
loo_sample

'HG02088'

In [13]:
# show the sample names whose fingerprints were loaded for the chosen population
pop_fp.keys()

dict_keys(['HG02088', 'HG01865', 'HG02512', 'HG02031', 'HG01598', 'HG02122', 'HG02081', 'HG02028', 'HG02140', 'HG02075', 'HG02017', 'HG01843', 'HG01853', 'HG02133', 'HG02141', 'HG02029', 'HG02064', 'HG01852', 'HG01842', 'HG02016', 'HG02513', 'HG02020', 'HG01874', 'HG01864', 'HG01599', 'HG02082', 'HG01600', 'HG02131', 'HG02049', 'HG02121', 'HG01840', 'HG01850', 'HG02076', 'HG02128', 'HG02050', 'HG02138', 'HG02040', 'HG02032', 'HG01866', 'HG01859', 'HG01849', 'HG02139', 'HG01867', 'HG02023', 'HG02116', 'HG01848', 'HG01858', 'HG02142', 'HG02058', 'HG02048', 'HG02130', 'HG01851', 'HG01841', 'HG02067', 'HG01844', 'HG01596', 'HG02072', 'HG02086', 'HG02113', 'HG02019', 'HG01862', 'HG01872', 'HG02026', 'HG01873', 'HG01863', 'HG01855', 'HG01845', 'HG01597', 'HG02522', 'HG02073', 'HG02087', 'HG02134', 'HG02078', 'HG02035', 'HG02025', 'HG01861', 'HG01871', 'HG02057', 'HG02047', 'HG02061', 'HG01595', 'HG01847', 'HG01857', 'HG02136', 'HG01878', 'HG01868', 'HG02085', 'HG02060', 'HG02070', 'HG02521',

In [15]:
# show the list of float values stored under key 'ACAC' for sample 'HG02088'
pop_fp['HG02088']['ACAC']

[-0.908,
 2.726,
 0.198,
 -0.437,
 -0.763,
 -1.006,
 0.329,
 0.529,
 1.521,
 -0.828,
 -0.578,
 -0.65,
 1.444,
 0.565,
 0.586,
 1.747,
 -0.551,
 -0.618,
 -0.755,
 -0.883,
 -0.783,
 0.113,
 -0.979,
 1.661,
 -0.868,
 0.821,
 -0.052,
 -0.916,
 -0.68,
 0.137,
 -0.79,
 -0.639,
 -0.859,
 -0.878,
 0.04,
 0.102,
 0.865,
 -0.03,
 0.879,
 -0.606,
 -0.95,
 1.076,
 -0.568,
 -0.872,
 -0.775,
 0.139,
 0.233,
 -0.009,
 -0.849,
 -0.653,
 -0.097,
 -0.793,
 -0.759,
 0.181,
 0.143,
 2.927,
 0.091,
 -0.849,
 1.004,
 -1.081,
 -0.921,
 1.153,
 -0.673,
 -0.666,
 1.231,
 1.044,
 -0.915,
 0.283,
 0.051,
 -0.62,
 0.044,
 -1.004,
 0.136,
 -0.701,
 -0.701,
 -0.683,
 0.271,
 1.089,
 -0.689,
 4.345,
 -0.776,
 -0.68,
 1.231,
 -0.931,
 1.242,
 0.21,
 -0.745,
 -0.866,
 -0.692,
 0.397,
 -0.763,
 1.063,
 -0.655,
 -0.582,
 -0.57,
 1.233,
 -0.894,
 -0.719,
 2.421,
 1.556,
 0.232,
 0.254,
 1.43,
 1.437,
 -0.976,
 0.257,
 0.478,
 0.473,
 0.336,
 -0.795,
 0.302,
 -0.798,
 0.064,
 -0.812,
 -0.532,
 2.459,
 -0.764,
 0.451,
 -0.