# Format ABO groups as derived genotypes

Written in the `.gen` file format, as it appears to be one of the simplest PLINK-compatible formats.

In [1]:
import pandas as pd

In [2]:
# Load the per-sample blood-group table (tab-separated). Later cells read the
# columns IID, A, AB, B and O from it, and rename FID, SEX and infection.
blood_groups_df = pd.read_csv(
    'blood_groups.tsv',
    delimiter='\t',
)

## 1. Make `.gen` file

https://www.cog-genomics.org/plink/1.9/formats#gen

In [3]:
# Build the .gen table: one row per blood group (treated as a pseudo-variant)
# and three probability columns per sample.
#
# Each of the A / AB / B / O columns is turned into a "variant" whose ID is the
# group name. For every sample the value in that column is used as the
# A1-homozygous probability, the heterozygous probability is fixed at 0, and
# the A2-homozygous probability is its complement.
# NOTE(review): this presumes the A/AB/B/O columns hold 0/1 indicators, so that
# the three probabilities sum to 1 -- confirm against blood_groups.tsv.
gen_df = (
    blood_groups_df
    .filter(items=['IID', 'A', 'AB', 'B', 'O'])
    # Long form: one row per (IID, blood group) pair.
    .set_index('IID')
    .stack()
    .reset_index()
    .rename(columns={'level_1': 'RSID', 0: 'A1_homo'})
    .assign(
        # Constant variant metadata shared by all four pseudo-variants.
        CHR=9,
        BP=1,
        A1='BG',
        A2='NBG',
        A2_hetero=0,
        A2_homo=lambda df: 1 - df['A1_homo'],
    )
    # Wide form: variants as rows, (value, IID) as columns. pivot_table
    # aggregates with the mean by default, so duplicated IIDs would be
    # averaged rather than rejected.
    .pivot_table(index=['CHR', 'RSID', 'BP', 'A1', 'A2'], columns=['IID'],
                 values=['A1_homo', 'A2_hetero', 'A2_homo'])
    # Put IID on the outer column level and sort, so each sample's three
    # columns are adjacent and ordered A1_homo, A2_hetero, A2_homo.
    .swaplevel(0, 1, axis=1)
    .sort_index(axis=1)
)

# Get IIDs in the correct order.
# Read them from the column index *before* it is flattened: this keeps the
# IIDs in their original dtype and avoids re-parsing them out of the joined
# column names (which breaks for non-integer IIDs or IIDs containing '_').
iid_order = gen_df.columns.get_level_values(0).unique().tolist()

# Flatten the (IID, value) column pairs into single "<IID>_<value>" names.
gen_df.columns = ['_'.join(map(str, col)).strip() for col in gen_df.columns.values]

# Move CHR / RSID / BP / A1 / A2 from the row index into leading columns.
gen_df = gen_df.reset_index()

In [4]:
# Write the .gen file. The format has no header line and exactly 3N+5 fields
# per variant, so neither the column names nor the pandas row index may be
# written; fields are separated by single spaces.
gen_df.to_csv(
    '../data/ukb_cov_tested.gen',
    sep=' ',
    header=False,
    index=False,
)

## 2. Make `.sample` file

https://www.cog-genomics.org/plink/1.9/formats#sample

In [5]:
# Second header line of the .sample file: one entry per column. Per the format
# description linked above, the first three entries are 0, 'D' marks a discrete
# covariate and 'B' a binary phenotype.
extra_header = pd.DataFrame(
    dict(
        ID_1=[0],
        ID_2=[0],
        missing=[0],
        sex=['D'],
        phenotype=['B'],
    )
)

In [6]:
# Assemble the .sample table: the type-code row from `extra_header` followed by
# one row per sample, in the same sample order as the .gen columns.
# `extra_header` goes first in the concat, so its column order
# (ID_1, ID_2, missing, sex, phenotype) determines the leading output columns.
sample_df = pd.concat([
    extra_header,
    (
        blood_groups_df
        # The blood-group indicators live in the .gen file, not here.
        .drop(columns=['A', 'AB', 'B', 'O'])
        # Reorder rows to follow the .gen file; raises KeyError if an IID
        # from `iid_order` is absent.
        .set_index('IID')
        .loc[iid_order, :]
        .reset_index()
        # Rename to the .sample column names and add the constant
        # `missing` column.
        .rename(columns={'FID': 'ID_1', 'IID': 'ID_2', 'SEX': 'sex', 'infection': 'phenotype'})
        .assign(missing=0)
    ),
])

In [7]:
# Write the .sample file. The format is space-delimited and its first header
# line must start with "ID_1 ID_2 missing", so the pandas row index must not be
# written as an extra leading column. Missing values are written as "NA"
# because an empty field cannot be represented in a space-delimited file.
sample_df.to_csv(
    '../data/ukb_cov_tested.sample',
    sep=' ',
    index=False,
    na_rep='NA',
)