# Fine-Mapping Preprocessing

Jupyter Notebook Tutorial

https://jupyter.org/install
```
$ jupyter lab
```
If Jupyter is running on a remote server, set up port forwarding from your local machine:
```
$ ssh username@xx.xx.xx.xx -NL 8888:localhost:8888
```

In [None]:
!pip3 install -r requirements.txt  # if needed

In [4]:
import os, glob
import pandas as pd
import importlib

import preprocessing
import cols
importlib.reload(cols)
importlib.reload(preprocessing)

from cols import Cols
from preprocessing import Preprocess

## Setup API endpoint for GPT integration


#### OpenAI 
Note: the OpenAI API is a paid service (not free).

Run the following commands in your terminal:

`$ export OPENAI_ORGANIZATION='<KEY>'`        # https://platform.openai.com/settings/organization/general

`$ export OPENAI_PROJECT='<KEY>'`             # https://platform.openai.com/settings/ -> project

`$ export OPENAI_API_KEY='<KEY>'`             # https://platform.openai.com/settings/profile?tab=api-keys

In [None]:
from openai import OpenAI

# Credentials are read from the environment variables exported in the
# terminal; os.getenv yields None for any variable that is not set.
organization, project, api_key = (
    os.getenv(env_name)
    for env_name in ('OPENAI_ORGANIZATION', 'OPENAI_PROJECT', 'OPENAI_API_KEY')
)

# OpenAI client that can be handed to Preprocess as its GPT client.
openai_client = OpenAI(organization=organization, project=project, api_key=api_key)

Alternatively:

#### Google Gemini

`$ export GEMINI_API_KEY='<KEY>'` # Gemini API https://ai.google.dev/gemini-api/docs/api-key: FREE @ 15 RPM

In [9]:
import google.generativeai as genai

# Register the Gemini key, taken from the GEMINI_API_KEY environment variable.
genai.configure(api_key=os.getenv('GEMINI_API_KEY'))

# Handle to the 'gemini-pro' model; `client` is what gets passed to Preprocess.
client = genai.GenerativeModel(model_name='gemini-pro')

## To gather all of your files

Use the path to the folder containing your sumstats files.

Verify these paths are correct

In [38]:
# Folder holding the raw summary-statistics files -- YOUR INPUT PATH
directory_of_sumstats = '/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/raw/European'

# Every entry directly inside that folder (glob returns them in arbitrary order).
my_input_folder = glob.glob(f'{directory_of_sumstats}/*')

# Echo each path so it can be checked by eye.
for path in my_input_folder:
    print(path)

# Last expression of the cell: displays how many files were found.
len(my_input_folder)

/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/raw/European/GCST90000583_buildGRCh37.tsv
/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/raw/European/GCST007228_buildGRCh37.tsv
/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/raw/European/GCST90092930_buildGRCh37.tsv
/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/raw/European/GCST90092856_buildGRCh37.tsv.gz
/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/raw/European/GCST90086092_buildGRCh37.tsv
/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/raw/European/GCST007545_buildGRCh37.txt
/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/raw/European/GCST006906_buildGRCh37.tsv
/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/raw/European/GCST90014122_buildGRCh37.tsv
/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/raw/European/GCST90092916_buildGRCh37.tsv.gz
/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/raw/European/GCST90092844_buildGRCh37.tsv.gz
/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/raw/E

69

## Run a MANUAL example or batch

In [41]:
# (Re)create the Gemini client for the manual example from the
# GEMINI_API_KEY environment variable.
gemini_api_key = os.environ.get('GEMINI_API_KEY')
genai.configure(api_key=gemini_api_key)
client = genai.GenerativeModel(model_name='gemini-pro')

In [40]:
# YOUR OUTPUT PATH - Logs and outputs go here.
# Python does not expand '~' on its own, so resolve it to the absolute home
# directory up front; an absolute path is returned unchanged.
output_directory = os.path.expanduser('~')

In [47]:
# Build the Preprocess object used by the cells below. It takes:
#   - client:  the GPT client created earlier
#   - out_dir: the directory where the output files should be located
ft = Preprocess(client=client, out_dir=output_directory)

In [42]:
# Iterator over the globbed input paths, so each `next(files)` call steps to
# the following file.
files = iter(my_input_folder) # ['/path/to/file1', '/path/to/file2', ...]

In [43]:
# Advance to the next input file; re-run this cell to step through the folder.
# Raises StopIteration once every path has been consumed.
fp = next(files) # '/path/to/file1'
fp

'/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/raw/European/GCST90000583_buildGRCh37.tsv'

In [48]:
# Show the column names reported for the file at `fp` (see the cell output).
ft.get_columns(fp)

['chromosome',
 'base_pair_location',
 'p_value',
 'variant_id',
 'effect_allele',
 'other_allele']

In [None]:
# Optional: ask the GPT client given to Preprocess for a column suggestion.
# NOTE(review): presumably sends a request to the configured API -- confirm in preprocessing.py.
ft.suggest_columns(fp) # uses GPT to suggest columns

In [50]:
# Hand-picked columns to use for the manual example.
# NOTE(review): this name shadows the `cols` module imported in the first cell,
# so re-running that cell's `importlib.reload(cols)` afterwards will fail until
# the kernel is restarted or `import cols` is executed again.
cols = [
    'chromosome',
    'base_pair_location',
    'effect_allele',
    'other_allele',
    'beta',
    'standard_error',
    'p_value',
]

print(cols)

['chromosome', 'base_pair_location', 'effect_allele', 'other_allele', 'beta', 'standard_error', 'p_value']


In [None]:
# Load the file selected above (`fp`) using the hand-picked column list.
# `fp` is used rather than `path`: `path` is only the leftover loop variable
# from the directory-listing cell and refers to the last file printed there,
# not the file whose columns were just inspected.
ft.loadmap_sumstats_table(fp, manual_columns = cols, verbose = True)

In [None]:
# Create the lead-SNP table with verbose output.
# NOTE(review): presumably works on the table loaded in the previous cell -- confirm in preprocessing.py.
ft.create_leadsnp_table(verbose = True)

## Run an AUTOMATIC example

In [39]:
# YOUR OUTPUT PATH.
# Python does not expand '~' on its own, so resolve it to the absolute home
# directory up front; an absolute path is returned unchanged.
output_directory = os.path.expanduser('~')

In [None]:
from preprocessing import Preprocess

# `openai_client` comes from the OpenAI setup cell; if you configured Gemini
# instead, pass that `client` object here.
ft = Preprocess(client=openai_client, out_dir=output_directory)

# Optional parameters
# -------------------
# Significance threshold; 5e-8 is already the default value.
ft.significance_threshold = 5e-8

# Ancestry; 'EUR' is the default. It can be set once here or changed per file
# inside the batch loop. Ancestry may influence which column GPT selects:
# e.g. if a file has two p-value columns, (1) pval_afr and (2) pval_eur,
# GPT will select the one matching the ancestry set by the user.
ft.ancestry = 'EUR'

In [None]:
# YOUR INPUT FILE - the single summary-statistics file used for the automatic example.
fp = '/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/raw/European/GCST90104541_buildGRCh37.tsv.gz'

In [None]:
# Load `fp` without a manual column list; keep the return value in `res`
# (the batch loop treats a non-zero value as "skip this file").
res = ft.loadmap_sumstats_table(fp, verbose=True)

In [None]:
# Create the lead-SNP table with verbose output.
# NOTE(review): presumably works on the table loaded in the previous cell -- confirm in preprocessing.py.
ft.create_leadsnp_table(verbose=True)

## Run a batch

In [None]:
# Process every input file in turn; a failure on one file must not stop the batch.
for path in my_input_folder[::]:  # [::] iterates over a copy of the list

    print(f'==> {path}')

    try:

        res = ft.loadmap_sumstats_table(
            path,
            verbose=False)

        # Skip the lead-SNP step when loading did not return 0.
        # NOTE(review): presumably a non-zero value signals that the load did
        # not succeed -- confirm in preprocessing.py.
        if res != 0:
            continue

        ft.create_leadsnp_table(verbose=False)

    # Catch Exception rather than using a bare `except:` so that Ctrl-C
    # (KeyboardInterrupt) and SystemExit still stop the run, and report what
    # went wrong instead of silently discarding it.
    except Exception as err:
        print(f"SKIPPING due to error: {type(err).__name__}: {err}")
     