In [3]:
import pyspark
import dxpy
import dxdata

In [4]:
# Reuse the already-running SparkContext when this cell is executed again.
# Calling pyspark.SparkContext() a second time in the same kernel raises
# "Cannot run multiple SparkContexts at once"; getOrCreate() returns the
# existing context instead, so the cell is safe to re-run.
sc = pyspark.SparkContext.getOrCreate()
spark = pyspark.sql.SparkSession(sc)

In [5]:
import dxpy
import dxdata

# Locate the dispensed dataset: the data object of type 'Dataset' in the
# root folder whose name matches the glob 'app*.dataset', then load it by ID.
dispensed_dataset_id = dxpy.find_one_data_object(
    typename='Dataset',
    name='app*.dataset',
    folder='/',
    name_mode='glob',
)['id']
dataset = dxdata.load_dataset(id=dispensed_dataset_id)

In [6]:
# Look up the 'participant' entry of the loaded dataset; later cells call
# find_fields() and retrieve_fields() on this object.
participant = dataset['participant']

In [7]:
def field_names_for_ids(field_ids, entity=None):
    """Return the column names that belong to the given field IDs.

    Each ID is prefixed with 'p' and matched against the entity's fields,
    allowing optional ``_i<N>`` and ``_a<N>`` suffixes (e.g. ``p23100_i0``,
    ``p22009_a1``).

    The names are returned in natural order, i.e. digit runs are compared as
    numbers, so ``p22009_a2`` sorts before ``p22009_a10``. This replaces the
    former ``distutils.version.LooseVersion`` sort key: ``distutils`` is
    deprecated (PEP 632) and no longer exists in Python 3.12+.

    Args:
        field_ids: Iterable of field IDs (strings or ints), without the 'p'
            prefix.
        entity: Object exposing ``find_fields(name_regex=...)`` whose results
            have a ``.name`` attribute. Defaults to the notebook-level
            ``participant`` entity.

    Returns:
        list[str]: Matching field names in natural sort order. Empty when
        ``field_ids`` is empty or nothing matches.
    """
    import re  # local import: `re` is only imported by a later cell

    if entity is None:
        entity = participant  # notebook global set in an earlier cell

    def natural_key(name):
        # re.split with a capturing group yields alternating text / digit
        # runs; odd positions are always the digit runs, so converting them
        # to int keeps every position type-consistent across names.
        return [int(part) if idx % 2 else part
                for idx, part in enumerate(re.split(r'(\d+)', name))]

    fields = []
    for field_id in field_ids:
        prefixed_id = 'p' + str(field_id)
        fields.extend(
            entity.find_fields(name_regex=r'^{}(_i\d+)?(_a\d+)?$'.format(prefixed_id))
        )
    return sorted((field.name for field in fields), key=natural_key)

# Field IDs to retrieve. Trailing notes say how the later cells in this
# notebook use each field; IDs without a note are retrieved but not
# referenced by the cells shown here.
field_ids = [
    '23100',   # fat mass (phenotype)
    '23101',   # fat-free mass (phenotype)
    '12144',
    '50',
    '31',      # sex
    '22000',   # genotype measurement batch
    '22001',   # genetic sex
    '22003',
    '22004',
    '22006',   # white British ancestry subset flag
    '22009',   # array columns renamed to pc<N> later on
    '21022',   # age
    '22010',
    '22011',
    '22012',
    '22013',
    '22018',
    '22019',   # sex chromosome aneuploidy
    '22021',   # kinship to other participants
    '22027',   # heterozygosity / missingness outliers
]
# 'eid' is always retrieved first; it becomes the IID column further down.
field_names = ['eid', *field_names_for_ids(field_ids)]

In [8]:
# Fetch the selected columns through a dxdata connection, then convert the
# result with toPandas() so the following cells can filter it as `pdf`.
df = participant.retrieve_fields(
    names=field_names,
    engine=dxdata.connect(),
)
pdf = df.toPandas()

In [9]:
# Number of rows before any QC filtering.
pdf.shape[0]

502387

In [10]:
# QC step 1: keep rows where both phenotype columns are present.
qc_mask = (
    pdf['p23100_i0'].notna()      # fat mass recorded
    & pdf['p23101_i0'].notna()    # fat-free mass recorded
)
pdf_qced = pdf.loc[qc_mask]

# Rows remaining after this step.
len(pdf_qced)

491369

In [11]:
# QC step 2: additionally restrict to the white British ancestry subset.
qc_mask = (
    pdf['p22006'].eq(1)           # in white British ancestry subset
    & pdf['p23100_i0'].notna()    # fat mass recorded
    & pdf['p23101_i0'].notna()    # fat-free mass recorded
)
pdf_qced = pdf.loc[qc_mask]

# Rows remaining after this step.
len(pdf_qced)

401714

In [12]:
# QC step 3: additionally require concordant sex and no sex chromosome
# aneuploidy flag.
qc_mask = (
    pdf['p31'].eq(pdf['p22001'])  # reported sex equals genetic sex
    & pdf['p22006'].eq(1)         # in white British ancestry subset
    & pdf['p22019'].isna()        # no sex chromosome aneuploidy flag
    & pdf['p23100_i0'].notna()    # fat mass recorded
    & pdf['p23101_i0'].notna()    # fat-free mass recorded
)
pdf_qced = pdf.loc[qc_mask]

# Rows remaining after this step.
len(pdf_qced)

401026

In [13]:
# QC step 4: additionally drop participants flagged with excess relatives.
qc_mask = (
    pdf['p31'].eq(pdf['p22001'])  # reported sex equals genetic sex
    & pdf['p22006'].eq(1)         # in white British ancestry subset
    & pdf['p22019'].isna()        # no sex chromosome aneuploidy flag
    & pdf['p22021'].ne(10)        # code 10 = ten or more third-degree relatives ('excess_relatives')
    & pdf['p23100_i0'].notna()    # fat mass recorded
    & pdf['p23101_i0'].notna()    # fat-free mass recorded
)
pdf_qced = pdf.loc[qc_mask]

# Rows remaining after this step.
len(pdf_qced)

400865

In [14]:
# QC step 5: additionally drop heterozygosity / missingness outliers.
qc_mask = (
    pdf['p31'].eq(pdf['p22001'])  # reported sex equals genetic sex
    & pdf['p22006'].eq(1)         # in white British ancestry subset
    & pdf['p22019'].isna()        # no sex chromosome aneuploidy flag
    & pdf['p22021'].ne(10)        # code 10 = ten or more third-degree relatives ('excess_relatives')
    & pdf['p22027'].isna()        # not flagged as het_missing_outliers
    & pdf['p23100_i0'].notna()    # fat mass recorded
    & pdf['p23101_i0'].notna()    # fat-free mass recorded
)
pdf_qced = pdf.loc[qc_mask]

# Rows remaining after this step.
len(pdf_qced)

400154

In [26]:
# Final QC selection, rebuilt from the unfiltered `pdf` with every criterion
# applied at once; this is the frame the remaining cells work on.
qc_mask = (
    pdf['p31'].eq(pdf['p22001'])  # reported sex equals genetic sex
    & pdf['p22006'].eq(1)         # in white British ancestry subset
    & pdf['p22019'].isna()        # no sex chromosome aneuploidy flag
    & pdf['p22021'].ne(10)        # code 10 = ten or more third-degree relatives ('excess_relatives')
    & pdf['p22027'].isna()        # not flagged as het_missing_outliers
    & pdf['p23100_i0'].notna()    # fat mass recorded
    & pdf['p23101_i0'].notna()    # fat-free mass recorded
)
pdf_qced = pdf.loc[qc_mask]

In [27]:
# Number of rows that pass the full QC filter.
len(pdf_qced)

400154

In [29]:
import re

# Pass 1: rewrite the 'p22009_a' prefix to 'pc' in every column name, so the
# p22009_a<N> array columns become pc<N>.
pdf_qced = pdf_qced.rename(columns=lambda col: re.sub('p22009_a', 'pc', col))

# Pass 2: give the remaining columns used downstream readable names.
readable_names = {
    'eid': 'IID',
    'p31': 'sex',
    'p22006': 'ethnic_group',
    'p22019': 'sex_chromosome_aneuploidy',
    'p21022': 'age',
    'p22000': 'genotype_measurement_batch',
    'p22021': 'kinship_to_other_participants',
    'p22027': 'outliers_for_heterozygosity_or_missing',
    'p23100_i0': 'fat_mass',
    'p23101_i0': 'fat_free_mass',
}
pdf_qced = pdf_qced.rename(columns=readable_names)

In [30]:
# regenie's input format needs an FID column next to IID; reuse the
# participant ID for both.
pdf_qced = pdf_qced.assign(FID=pdf_qced['IID'])

In [31]:
# Build the phenotype table from the QCed frame: IDs first, then the two
# phenotypes, then sex, age, batch and pc1..pc10.
id_columns = ['FID', 'IID']
trait_columns = ['fat_mass', 'fat_free_mass']
covariate_columns = ['sex', 'age', 'genotype_measurement_batch']
pc_columns = ['pc{}'.format(i) for i in range(1, 11)]

pdf_phenotype = pdf_qced[id_columns + trait_columns + covariate_columns + pc_columns]


In [53]:
# Write the phenotype table as a tab-separated file for regenie.
pdf_phenotype.to_csv(
    'fm_ffm_4regenie.phe',
    sep='\t',
    na_rep='NA',    # missing values are written as the literal NA
    index=False,    # do not write the pandas index as a column
    quoting=3,      # 3 is csv.QUOTE_NONE: never wrap values in quotes
)


In [54]:
%%bash
# Upload the phenotype file written by the previous cell to the project.
# With --brief the command prints only the ID of the new file.
# NOTE(review): -p is expected to create missing parent folders and --path to
# set the destination folder; confirm with `dx upload --help`.
# NOTE(review): /path_to_phenofiles/ looks like a placeholder; replace it with
# the real destination folder before running.
dx upload fm_ffm_4regenie.phe -p --path /path_to_phenofiles/ --brief

file-GQ10zXjJxk60qYqgkgvjyPvv
