In [1]:
import pandas as pd
import time
from tqdm import tqdm
from collections import defaultdict
import matplotlib.pyplot as plt
from IPython.display import clear_output, display
from ipywidgets import IntProgress, IntText, Text

data_path = 'HGDP/hgdp/HGDP_FinalReport_Forward.txt'
pops_path = 'HGDP/hgdp/HGDP-CEPH-ID_populations.csv'
n, m = 1043, 660918

pops = pd.read_csv(pops_path)

In [2]:
print(pops.columns)

pops['Geographic origin'].unique()

Index(['CEPH ID', 'population', 'Geographic origin', 'Region', 'Pop7Groups',
       'Sex', 'All LCLs (H1063)', 'Unrelated (1st and 2nd degree) (H951)'],
      dtype='object')


array(['Pakistan', 'Central African Republic',
       'Democratic Republic of Congo', 'Bougainville', 'France',
       'New Guinea', 'Israel (Carmel)', 'Israel (Negev)', 'Italy',
       'Israel (Central)', 'Colombia', 'Cambodia', 'Japan', 'China',
       'Orkney Islands', 'Brazil', 'Mexico', 'Russia', 'Senegal',
       'Nigeria', 'Siberia', 'Namibia', 'South Africa', 'Italy (Bergamo)',
       'Algeria (Mzab)', 'Russia Caucasus', 'Kenya'], dtype=object)

# Read and pre-process data SNP by SNP

In [3]:
dsizebox = IntText(value=1000, description='dump_size:')
display(dsizebox)

IntText(value=1000, description='dump_size:')

In [9]:
chunk_size = 1
dump_size = dsizebox.value

data = np.zeros((n, 0))
log = []

part = 0
log_num = 0
snp_names = []

progress = IntProgress(value=0, max=m, description='0.00%')
num_used = IntText(value=0)
text_box = Text(value='dump_size=%d' % dump_size)

display(progress)
display(num_used)
display(text_box)

last_processed = (-1, None)
try:
    for i, chunk in enumerate(
        pd.read_csv(data_path, sep='\t', index_col=0, chunksize=chunk_size)
    ):

        # ====================================================================
        # FIRST, verify whether we want to use this SNP in our experiments
        bases = defaultdict(int)

        skip = False
        skip_reason = None
        # (s_list is length-one; todo: change to chunk.values[0] w/ no loop)
        for s_list in chunk.values:
            for s in s_list:
                assert len(s) == 2, 's = %s' % s

                # don't include SNPs with missing data
                if '-' in s:
                    skip = True
                    skip_reason = "contains '-'"
                    break
                else:
                    bases[s[0]] += 1
                    bases[s[1]] += 1

                # only include SNPs that have exactly 
                # 2 observed base pairs across the population
                if len(bases.keys()) > 2:
                    skip = True
                    skip_reason = "contains 3+ bases"
                    break

            if skip:
                break

        # don't use alleles which only take one base pair
        if len(bases.keys()) == 1 or skip:
            if skip_reason is None:
                skip_reason = 'num bases = 1'

            progress.value += 1
            progress.description = '%.2f%%' % (progress.value / m)
            log.append('Skipping %s - %s' % (chunk.index[0], skip_reason))
            last_processed = (i, chunk.index[0])
            continue

        snp_names.append(chunk.index[0])
        # ====================================================================
        # SECOND, encode the SNP values in {0, 1, 2} and add to design matrix
        bk, bv = list(bases.keys()), list(bases.values())
        # use major allele as reference allele
        maj_all = bk[0] if bv[0] >= bv[1] else bk[1]
#         print(maj_all)

        # use 8-bit integer to save memory (since we only need to encode {0, 1, 2})
        new_col = np.zeros((n, 1), dtype=np.int8) 

        for r_num, s in enumerate(chunk.values[0]):
            new_col[r_num, 0] = len([b for b in s if b == maj_all])

        data = np.hstack([data, new_col])

        num_used.value += 1
        progress.value += 1
        progress.description = '%.2f%%' % (progress.value / m)
        log.append('Keeping %s' % chunk.index[0])
        last_processed = (i, chunk.index[0])

        # periodically write log to disk, clear from memory
        if len(log) == 10000:
            log_path = 'data/logs/logpart=%d_dsize=%d.txt' % (log_num, dump_size)

            f = open(log_path, 'w')
            for line in log:
                f.write(line+'\n')
            f.close()

            log = []

            text_box.value = 'Log part %d written (i=%d; ds=%d)...' \
                                % (log_num, i, dump_size)
            log_num += 1

        # periodically write processed data to disk, clear from memory
        if data.shape[1] == dump_size: 

            to_save = pd.DataFrame(
                data=data, index=chunk.columns, columns=snp_names
            )
            snp_names = []

            filepath = 'data/part=%d_dsize=%d.csv' % (part, dump_size)

            to_save.to_csv(filepath)
            data = np.zeros((n, 0))

            text_box.value = 'Part %d of the data is saved (i=%d; ds=%d).' \
                                    % (part, i, dump_size)
            part += 1


    if data.shape[1] > 0:
        print('Saving part %d of the data (last part).' % part)
        to_save = pd.DataFrame(
            data=data, index=chunk.columns, columns=snp_names
        )
        snp_names = []

        filepath = 'data/part=%d_dsize=%d.csv' % (part, dump_size)

        to_save.to_csv(filepath)

    if len(log) > 0:
        log_path = 'data/logs/logpart=%d_dsize=%d.txt' % (part, dump_size)
        f = open(log_path, 'w')
        for line in log:
            f.write(line+'\n')
        f.close()
except KeyboardInterrupt:
    if len(log) > 0:
        log_path = 'data/logs/logpart=%d_dsize=%d.txt' % (part, dump_size)
        f = open(log_path, 'w')
        for line in log:
            f.write(line+'\n')
        f.close()
    
    f = open('data/logs/LASTPROCESSED_dsize=%d.txt' % dump_size, 'w')
    f.write('i=%d; name=%s' % (last_processed[0], last_processed[1]))
    f.close()
    
progress.close()
num_used.close()
text_box.close()


        

IntProgress(value=0, description='0.00%', max=660918)

IntText(value=0)

Text(value='dump_size=1000')

TypeError: unorderable types: list() > int()

In [10]:
log_path = 'data/logs/logpart=%d_dsize=%d.txt' % (part, dump_size)
f = open(log_path, 'w')
for line in log:
    f.write(line+'\n')
f.close()

IndentationError: unexpected indent (<ipython-input-10-3abc36eef79c>, line 2)