# Fine-Mapping Preprocessing

Jupyter Notebook Tutorial

https://jupyter.org/install

In [None]:
!pip3 install -r requirements.txt

In [61]:
import os, glob
import pandas as pd
import importlib

import preprocessing
import cols
# Re-execute the local modules so edits made to them on disk are picked up
# without restarting the kernel.
importlib.reload(cols)
importlib.reload(preprocessing)

# Import the names after the reload so they refer to the freshly loaded modules.
from cols import Cols
from preprocessing import Preprocess

## Set up the API endpoint for GPT integration

### Run the following commands in your terminal:
Note: the OpenAI API is not free.

`$ export OPENAI_ORGANIZATION='<KEY>'`        # https://platform.openai.com/settings/organization/general

`$ export OPENAI_PROJECT='<KEY>'`             # https://platform.openai.com/settings/ -> project

`$ export OPENAI_API_KEY='<KEY>'`             # https://platform.openai.com/settings/profile?tab=api-keys

In [None]:
from openai import OpenAI

# Read the OpenAI credentials from the environment variables exported in the
# terminal. Each lookup yields None when its variable is not set.
organization = os.environ.get('OPENAI_ORGANIZATION')
project = os.environ.get('OPENAI_PROJECT')
api_key = os.environ.get('OPENAI_API_KEY')

# Client object used for the GPT integration.
openai_client = OpenAI(organization=organization, project=project, api_key=api_key)

Alternatively:

`$ export GEMINI_API_KEY='<KEY>'` # Gemini API key (https://ai.google.dev/gemini-api/docs/api-key): free tier at 15 requests per minute (RPM)

In [None]:
from openai import OpenAI
import google.generativeai as genai

# Configure the Gemini SDK with the key stored in GEMINI_API_KEY
# (None is passed when the variable is not set).
genai.configure(api_key=os.environ.get('GEMINI_API_KEY'))

# Model handle for 'gemini-pro'; used as the client in place of the OpenAI one.
client = genai.GenerativeModel('gemini-pro')

## Use path to folder containing your various sumstats files

Verify these paths are correct

In [None]:
# Folder that holds the summary-statistics files to process.
directory_of_sumstats = '/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/raw/European' # YOUR INPUT PATH

# glob returns matches in an unspecified, filesystem-dependent order; sorting
# makes the listing (and any "first file" taken from it) reproducible.
my_input_folder = sorted(glob.glob(os.path.join(directory_of_sumstats, '*')))

# Print every match so the input path can be checked by eye.
for path in my_input_folder:
    print(path)

# Expand '~' explicitly: unlike a shell, Python's file APIs do not generally
# expand it, so a literal '~' could end up as a directory named "~".
output_directory = os.path.expanduser('~') # YOUR OUTPUT PATH

# Number of input files found (shown as the cell output).
len(my_input_folder)

# Instantiate the Preprocess class with your:
- GPT client
- Directory where you want the output files located

In [None]:
from preprocessing import Preprocess

# Preprocess instance that uses the OpenAI client and writes to output_directory.
ft = Preprocess(client=openai_client, out_dir=output_directory)

# --- Optional parameters ---
# Significance cutoff; 5e-8 is already the default value.
ft.significance_threshold = 5e-8
# Ancestry label; 'EUR' is the default. It can be set once here (static) or
# reassigned per file inside the batch loop (dynamic).
# Ancestry may influence which column GPT selects: if a file has two p-value
# columns, e.g. pval_afr and pval_eur, GPT picks the one matching this value.
ft.ancestry = 'EUR'

## Run a MANUAL example or batch

In [62]:
# Configure the Gemini SDK from GEMINI_API_KEY and (re)create the model handle
# that the manual example below uses as its client.
genai.configure(api_key=os.environ.get('GEMINI_API_KEY'))
client = genai.GenerativeModel('gemini-pro')

In [63]:
# Rebuild the Preprocess instance around the Gemini `client`; output files go
# to output_directory.
ft = Preprocess(client=client, out_dir=output_directory)

In [64]:
# Iterator over the input files, so files can be pulled one at a time with next().
files = iter(my_input_folder)

In [65]:
# Take the next file path from the iterator; next() raises StopIteration once
# every file has been consumed.
fp = next(files)
fp  # echo the selected path as the cell output

'/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/raw/European/GCST90000583_buildGRCh37.tsv'

In [66]:
# List the column names of the file selected with next(files). Use `fp` here:
# `path` is only the leftover loop variable from the directory-listing cell
# (the last file printed), not the file chosen for this manual example.
ft.get_columns(fp)

['p_value',
 'chromosome',
 'base_pair_location',
 'effect_allele',
 'other_allele',
 'effect_allele_frequency',
 'odds_ratio',
 'beta',
 'standard_error',
 'markername',
 'freqse',
 'minfreq',
 'maxfreq',
 'direction',
 'hetisq',
 'hetchisq',
 'hetdf',
 'hetpval',
 'cases',
 'effective_cases',
 'n',
 'meta_analysis']

In [67]:
# Get the suggested subset of columns for the selected file. Use `fp` rather
# than `path`, which is the leftover loop variable from the directory-listing
# cell and may point at a different file.
ft.suggest_columns(fp)

['chromosome',
 'base_pair_location',
 'effect_allele',
 'other_allele',
 'beta',
 'standard_error',
 'p_value']

In [68]:
# Columns to use for the manual example, written out by hand.
# NOTE(review): this rebinds the name `cols`, which shadows the `cols` module
# imported at the top of the notebook; re-running importlib.reload(cols)
# afterwards would receive this list instead of the module.
cols = [
    'chromosome',
    'base_pair_location',
    'effect_allele',
    'other_allele',
    'beta',
    'standard_error',
    'p_value',
]

In [None]:
# Load the selected file with the hand-picked column list. Use `fp` rather than
# `path`, which is the leftover loop variable from the directory-listing cell
# and may point at a different file than the one inspected above.
ft.loadmap_sumstats_table(fp, manual_columns=cols, verbose=True)

In [None]:
# Create the lead-SNP table with verbose output; presumably this works on the
# summary statistics loaded by the preceding loadmap_sumstats_table call.
ft.create_leadsnp_table(verbose = True)

## Run an AUTOMATIC example

In [None]:
# File used for the automatic example (gzip-compressed TSV).
fp = '/gpfs/commons/groups/sanjana_lab/mdrabkin/gwas_data/raw/European/GCST90104541_buildGRCh37.tsv.gz'

In [None]:
# Automatic run: no manual_columns are passed, so column selection is left to
# Preprocess. The return value is kept in `res`.
res = ft.loadmap_sumstats_table(fp, verbose=True)

In [None]:
# Create the lead-SNP table with verbose output; presumably this works on the
# summary statistics loaded by the preceding loadmap_sumstats_table call.
ft.create_leadsnp_table(verbose=True)

## Run a batch

In [None]:
# Process every input file in turn. A failure on one file is reported and the
# batch moves on to the next file. (Slice my_input_folder here to run a subset.)
for path in my_input_folder:

    print(f'==> {path}')

    try:

        res = ft.loadmap_sumstats_table(
            path,
            verbose=False)

        # A non-zero return means the table was not loaded; skip the
        # lead-SNP step for this file.
        if res != 0:
            continue

        ft.create_leadsnp_table(verbose=False)

    except Exception as err:
        # Catch Exception rather than using a bare `except:` so that a kernel
        # interrupt (KeyboardInterrupt) still stops the batch, and report the
        # cause instead of hiding it.
        print(f"SKIPPING due to error: {type(err).__name__}: {err}")
     