# Fine-Mapping Preprocessing

Jupyter Noteboook Tutorial

In [1]:
import os, glob
import pandas as pd

## Setup OpenAI API endpoint for ChatGPT integration

### Run the following commands in your terminal:


`$ export OPENAI_ORGANIZATION='<KEY>'`        # https://platform.openai.com/settings/organization/general

`$ export OPENAI_PROJECT='<KEY>'`             # https://platform.openai.com/settings/ -> project

`$ export OPENAI_API_KEY='<KEY>'`             # https://platform.openai.com/settings/profile?tab=api-keys


In [2]:
from openai import OpenAI

# organization = os.getenv('OPENAI_ORGANIZATION')
# project = os.getenv('OPENAI_PROJECT')
# api_key = os.getenv('OPENAI_API_KEY')

# these are Sam's personal keys -- you can use until it runs out (caps out at $5 usage total)
organization = 'org-7Tz4ejgn3Rw1nC3LMSjTcdVN'
project = 'proj_JwOi33NfPxndlxwTCQL9UIiK'
api_key = 'sk-None-DvEi4YfrZBudVnrZK1nTT3BlbkFJQsggOLv29hV8bq6Sm8Mf'

openai_client = OpenAI(
    organization = organization,
    project = project,
    api_key = api_key
)

## Use path to folder containing your various sumstats files

Verify these paths are correct

In [3]:
directory_of_sumstats = '/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/raw/European' # YOUR INPUT PATH
my_input_folder = glob.glob(directory_of_sumstats + '/*')

for path in my_input_folder:
    print(path)

output_directory = '' # YOUR OUTPUT PATH

/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/raw/European/GCST90000583_buildGRCh37.tsv
/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/raw/European/GCST007228_buildGRCh37.tsv
/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/raw/European/GCST90092856_buildGRCh37.tsv.gz
/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/raw/European/GCST90086092_buildGRCh37.tsv
/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/raw/European/GCST007545_buildGRCh37.txt
/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/raw/European/GCST006906_buildGRCh37.tsv
/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/raw/European/GCST90014122_buildGRCh37.tsv
/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/raw/European/GCST90092916_buildGRCh37.tsv.gz
/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/raw/European/GCST90092844_buildGRCh37.tsv.gz
/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/raw/European/GCST010774_buildGRCh37.tsv
/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/raw/Eur

# Instantiate Preprocess class with your 
    - OpenAI client
    - Directory where you want output files located

In [4]:
from preprocessing import Preprocess

ft = Preprocess(
    client=openai_client, 
    out_dir=output_directory
)

# optional parameters
ft.significance_threshold = 5e-8 # default value is already 5e-8
ft.ancestry = 'EUR' # default value is 'EUR' but can be statically
# changed here or dynamically changed in the loop below.
# Ancestry may play a role in GPT's selection of a column, e.g., say there
# are two pval columns (1) pval_afr (2) pval_eur, then GPT will select
# the column matching the ancestry variable set by user.

## Iterate over each file

In [5]:
len(my_input_folder)

68

## Run one example

In [6]:
fp = '/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/raw/European/GCST90104541_buildGRCh37.tsv.gz'

In [7]:
res = ft.loadmap_sumstats_table(
    fp,
    verbose=True)

found GCST90104541_buildGRCh37.tsv.gz

Sumstat cols:	['chromosome', 'base_pair_location', 'effect_allele_frequency', 'beta', 'standard_error', 'p_value', 'odds_ratio', 'ci_lower', 'ci_upper', 'effect_allele', 'other_allele'] ->
GPT mapping:	['chromosome', 'base_pair_location', 'effect_allele', 'other_allele', 'beta', 'standard_error', 'p_value', 'odds_ratio']

Saved chunk to 
/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/preprocessed/European/GCST90104541_buildGRCh37_preprocessed.tsv
Saved chunk to 
/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/preprocessed/European/GCST90104541_buildGRCh37_preprocessed.tsv
Saved chunk to 
/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/preprocessed/European/GCST90104541_buildGRCh37_preprocessed.tsv
Saved chunk to 
/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/preprocessed/European/GCST90104541_buildGRCh37_preprocessed.tsv
Saved reformatted sumstats file to: GCST90104541_buildGRCh37_preprocessed.tsv



In [8]:
ft.create_leadsnp_table(verbose=True)

expects pre-filtered sumstats
  CHR         BP         locus
0   1  170193825   1.170193825
1   4  111688752   4.111688752
2   4  187213883   4.187213883
3   9  136150466   9.136150466
4  10  105324774  10.105324774

Saved lead SNP file to 
/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/preprocessed/European/GCST90104541_buildGRCh37_preprocessed_leadSNPs.tsv



Unnamed: 0,CHR,BP,locus
0,1,170193825,1.170193825
1,4,111688752,4.111688752
2,4,187213883,4.187213883
3,9,136150466,9.136150466
4,10,105324774,10.105324774
5,16,73048367,16.73048367


## Run a batch

In [9]:
ct=0
for path in my_input_folder[::]:

    print(f'==> {path} {ct}')

    try:

        res = ft.loadmap_sumstats_table(
            path,
            verbose=False)
        
        ct += res

        if res == 1:
            ft.create_leadsnp_table(verbose=False)
    except:
        print("SKIPPING due to error")
     
print(ct)

==> /gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/raw/European/GCST90000583_buildGRCh37.tsv 0
Sumstat cols:	['chromosome', 'base_pair_location', 'p_value', 'variant_id', 'effect_allele', 'other_allele'] ->
GPT mapping:	['chromosome', 'base_pair_location', 'effect_allele', 'other_allele', 'NA', 'NA', 'p_value', 'NA']
GPT couldn't make out all the columns.
==> /gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/raw/European/GCST007228_buildGRCh37.tsv 0
Sumstat cols:	['chromosome', 'base_pair_location', 'p_value', 'chr:pos(hg19)', 'effect_allele', 'other_allele', 'eaf', 'OR', 'OR_se', 'OR_95L', 'OR_95U', 'z', 'p.value', 'X_.log10_p.value', 'q_statistic', 'q_p.value', 'i2', 'n_studies', 'n_samples', 'effects'] ->
GPT mapping:	['chromosome', 'base_pair_location', 'effect_allele', 'other_allele', 'NA', 'OR_se', 'p_value', 'OR']
GPT couldn't make out all the columns.
==> /gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/raw/European/GCST90092856_buildGRCh37.tsv.gz 0
Sumstat cols:	['va

  for chunk in pd.read_table(
  sumstats_df = pd.concat(chunks)


==> /gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/raw/European/GCST90043954_buildGRCh37.tsv.gz 4
Sumstat cols:	['chromosome', 'variant_id', 'base_pair_location', 'effect_allele', 'other_allele', 'N', 'effect_allele_frequency', 'T', 'SE_T', 'P_noSPA', 'beta', 'standard_error', 'p_value', 'CONVERGE'] ->
GPT mapping:	['chromosome', 'base_pair_location', 'effect_allele', 'other_allele', 'beta', 'standard_error', 'p_value', 'NA']
Saved reformatted sumstats file to: GCST90043954_buildGRCh37_preprocessed.tsv
==> /gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/raw/European/GCST90086091_buildGRCh37.tsv 5
Sumstat cols:	['chromosome', 'base_pair_location', 'rs_id_all', 'info_all', 'other_allele', 'effect_allele', 'all_maf', 'all_total', 'freq_effect_allele', 'standard_error', 'beta', 'p_value'] ->
GPT mapping:	['chromosome', 'base_pair_location', 'effect_allele', 'other_allele', 'beta', 'standard_error', 'p_value', 'NA']
Saved reformatted sumstats file to: GCST90086091_buildGRCh37_prepr

  for chunk in pd.read_table(


Saved reformatted sumstats file to: GCST006414_buildGRCh37_preprocessed.tsv


  for chunk in pd.read_table(


==> /gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/raw/European/GCST90132314_buildGRCh37.tsv 34
Sumstat cols:	['p_value', 'chromosome', 'base_pair_location', 'effect_allele', 'other_allele', 'effect_allele_frequency', 'odds_ratio', 'beta', 'standard_error', 'markername', 'freqse', 'minfreq', 'maxfreq', 'direction', 'hetisq', 'hetchisq', 'hetdf', 'hetpval', 'cases', 'effective_cases', 'n', 'meta_analysis'] ->
GPT mapping:	['chromosome', 'base_pair_location', 'effect_allele', 'other_allele', 'beta', 'standard_error', 'p_value', 'odds_ratio']
Saved reformatted sumstats file to: GCST90132314_buildGRCh37_preprocessed.tsv
35


## Verify files were outputted to the specified directory

In [10]:
fps = glob.glob(f'{output_directory}/*')
fps

['/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/preprocessed/European/GCST90026612_buildGRCh37_preprocessed.tsv',
 '/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/preprocessed/European/GCST90104543_buildGRCh37_preprocessed.tsv',
 '/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/preprocessed/European/GCST90104542_buildGRCh37_preprocessed.tsv',
 '/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/preprocessed/European/GCST90245878_buildGRCh37_preprocessed.tsv',
 '/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/preprocessed/European/GCST90027266_buildGRCh37_preprocessed.tsv',
 '/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/preprocessed/European/GCST90043954_buildGRCh37_preprocessed.tsv',
 '/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/preprocessed/European/GCST90239676_buildGRCh37_preprocessed_leadSNPs.tsv',
 '/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/preprocessed/European/GCST90271714_buildGRCh37_preprocessed.tsv',
 '/gpfs/commons/groups/sanjana_