# Overview

We want to create sets of variants that have already been identified for a number of phenotypes. 
We want variant-trait associations that satisfy:
1. Recorded in the GWAS Catalog
2. Associations for traits we've chosen
3. Supported by studies that didn't use the UK Biobank as their discovery cohort
4. Supported by studies using European data

We will perform this filtering as follows:
1. Download data on all studies in the UK Biobank
2. Filter for studies using European ancestry individuals in the discovery cohort
3. Filter for studies investigating one of the phenotypes we've chosen
4. Submit a query through the PubMed API to retrieve abstracts of the studies passing the above filters. Remove any studies whose titles or abstracts mention the UK Biobank.

In [1]:
library(tidyverse)
library(xml2)

── [1mAttaching packages[22m ─────────────────────────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.2     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.4     [32m✔[39m [34mdplyr  [39m 1.0.2
[32m✔[39m [34mtidyr  [39m 1.1.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.0

── [1mConflicts[22m ────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()



## 1. Specify traits of interest

In [2]:
traits_of_interest <- list(
    'Asthma' = 'EFO_0000270; EFO_0009759; EFO_1002011; EFO_0010638; EFO_0004591',
    'COPD' = 'EFO_0000341; EFO_0006505; EFO_0000464',
    'Dementia' = 'HP_0000726',
    'All cause dementia' = 'HP_0000726; EFO_0002608; EFO_0004718; Orphanet_282; EFO_0006792',
    'Motor neurone disease' = 'EFO_0000253; EFO_0001357',
    'Myocardial infarction' = 'EFO_0000612',
    'Parkinsons' = 'EFO_0002508; Orphanet_2828',
    'Stroke' = 'EFO_0000712; EFO_1001976; HP_0002140; EFO_0005524; EFO_1001504; EFO_0005669; EFO_0010177; EFO_0010178',
    'Ischemic stroke' = 'HP_0002140',
    'Idiopathic pulmonary fibrosis' = 'EFO_0000768',
    'Venous thromboembolism' = 'EFO_0004286'
)

traits_of_interest_df <- data.frame(trait = names(traits_of_interest), 
                                    code = as.character(traits_of_interest)) %>%
    separate_rows(code, sep = '; ')

traits_of_interest_df %>% head(2)

trait,code
<chr>,<chr>
Asthma,EFO_0000270
Asthma,EFO_0009759


## 2. Load GWAS Catalog data

In [3]:
associations_df <- read_tsv(
    '../../data/gwas_catalog_v1.0.2-associations_e104_r2021-08-16.tsv',
    col_types = cols(.default = col_character())
)

“45 parsing failures.
   row   col           expected actual                                                               file
151168 STUDY delimiter or quote        '../../data/gwas_catalog_v1.0.2-associations_e104_r2021-08-16.tsv'
151168 STUDY delimiter or quote      T '../../data/gwas_catalog_v1.0.2-associations_e104_r2021-08-16.tsv'
151168 STUDY delimiter or quote        '../../data/gwas_catalog_v1.0.2-associations_e104_r2021-08-16.tsv'
151168 STUDY delimiter or quote      T '../../data/gwas_catalog_v1.0.2-associations_e104_r2021-08-16.tsv'
151168 STUDY delimiter or quote        '../../data/gwas_catalog_v1.0.2-associations_e104_r2021-08-16.tsv'
...... ..... .................. ...... ..................................................................
See problems(...) for more details.
”


In [4]:
ancestry_df <- read_tsv('../../data/gwas_catalog-ancestry_r2021-08-16.tsv', 
                        col_types = cols(`.default` = col_character()))

ancestry_df %>% head(2)

STUDY ACCESSION,PUBMEDID,FIRST AUTHOR,DATE,INITIAL SAMPLE DESCRIPTION,REPLICATION SAMPLE DESCRIPTION,STAGE,NUMBER OF INDIVDUALS,BROAD ANCESTRAL CATEGORY,COUNTRY OF ORIGIN,COUNTRY OF RECRUITMENT,ADDITONAL ANCESTRY DESCRIPTION
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
GCST000176,18391950,Lettre G,2008-04-06,"15,821 European ancestry individuals","Up to 17,801 European ancestry individuals",initial,15821,European,NR,"Finland, Sweden, U.S., Italy, Germany",
GCST000176,18391950,Lettre G,2008-04-06,"15,821 European ancestry individuals","Up to 17,801 European ancestry individuals",replication,17801,European,NR,"Finland, U.S., Germany",


## 3. Filter associations for relevant traits and cohorts of European ancestry

In [5]:
# Filter for all relevant associations
relevant_associations_df <- associations_df %>% 
    select(MAPPED_TRAIT, MAPPED_TRAIT_URI, `STUDY ACCESSION`, PUBMEDID, `INITIAL SAMPLE SIZE`, 
           SNPS, SNP_ID_CURRENT, `P-VALUE`) %>%
    # Some associations are reported for multiple traits in one row. Separate to filter 
    # for traits relevant to the current work
    separate_rows(MAPPED_TRAIT_URI, sep = ', ') %>%
    mutate(code = str_extract(MAPPED_TRAIT_URI, '(?<=/)[A-Za-z0-9_]+$')) %>%
    select(-MAPPED_TRAIT_URI) %>%
    right_join(traits_of_interest_df, by = 'code')

relevant_associations_df %>% head(2)

MAPPED_TRAIT,STUDY ACCESSION,PUBMEDID,INITIAL SAMPLE SIZE,SNPS,SNP_ID_CURRENT,P-VALUE,code,trait
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
asthma,GCST000389,19426955,"359 European ancestry cases, 846 European ancestry controls",rs1588265,1588265,3e-08,EFO_0000270,Asthma
amyotrophic lateral sclerosis,GCST000820,20801717,"4,857 European ancestry cases, 8,987 European ancestry controls",rs4799088,4799088,9e-06,EFO_0000253,Motor neurone disease


In [6]:
european_studies_df <- ancestry_df %>% 
    filter(`BROAD ANCESTRAL CATEGORY` == 'European', STAGE == 'initial') %>%
    distinct(`STUDY ACCESSION`, PUBMEDID)

european_studies_df %>% nrow

european_studies_df %>% distinct(`STUDY ACCESSION`) %>% nrow

european_studies_df %>% distinct(`PUBMEDID`) %>% nrow

potential_studies_df <- european_studies_df %>% 
    inner_join(relevant_associations_df, by = c('STUDY ACCESSION', 'PUBMEDID'))

## 4. Remove studies using the UK Biobank

Use PubMed API to pull abstracts for all potential studies, removing those that contain "UK Biobank".

In [7]:
pubmed_ids <- potential_studies_df %>% 
    distinct(`PUBMEDID`) %>%
    pull %>%
    paste(collapse = ',')

potential_studies_df %>% distinct(`PUBMEDID`) %>% nrow

url <- str_glue('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id={pubmed_ids}&rettype=Abstract')
url

r <- httr::GET(url)

p <- httr::content(r, 'parsed')

articles <- xml_children(p)

In [8]:
pmids <- c()
titles <- c()
abstracts <- c()

for (article in articles) {
    pmid <- article %>% 
        xml_find_all('.//PMID') %>% 
        map_chr(xml_text) %>% 
        paste(collapse = ';')
    pmids <- append(pmids, pmid)
    
    title <- article %>% 
        xml_find_all('.//ArticleTitle') %>% 
        map_chr(xml_text) %>% 
        paste(collapse = ' ')
    titles <- append(titles, title)
    
    abstract <- article %>% 
        xml_find_all('.//AbstractText') %>% 
        map_chr(xml_text) %>% 
        paste(collapse = ' ')
    abstracts <- append(abstracts, abstract)
}

abstracts_df <- data.frame(pmids, titles, abstracts) %>% 
    mutate(
        ukb = str_detect(abstracts, regex('uk biobank', ignore_case = T)) |
              str_detect(titles, regex('uk biobank', ignore_case = T))
    )

# Number of studies in total
abstracts_df %>% nrow

# Number of studies referencing the UK Biobank (these will be removed)
abstracts_df %>% filter(ukb) %>% nrow

In [9]:
# Only associations for relevant traits and from relevant studies
markers_df <- abstracts_df %>%
    filter(!ukb) %>%
    separate_rows(pmids, sep = ';') %>%
    distinct(pmids) %>%
    inner_join(relevant_associations_df, by = c('pmids' = 'PUBMEDID')) %>%
    # SNPS not NA but SNP_ID_CURRENT is NA indicates haplotype not individual SNP
    filter(!is.na(SNP_ID_CURRENT)) %>%
    select(trait, MAPPED_TRAIT, code, `STUDY ACCESSION`, pmids, SNP_ID_CURRENT, `P-VALUE`)

markers_df %>% head(2)

trait,MAPPED_TRAIT,code,STUDY ACCESSION,pmids,SNP_ID_CURRENT,P-VALUE
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
Asthma,asthma,EFO_0000270,GCST000389,19426955,1588265,3e-08
Motor neurone disease,amyotrophic lateral sclerosis,EFO_0000253,GCST000820,20801717,4799088,9e-06


## 5. Summarize markers for each trait

In [10]:
markers_df %>%
    group_by(trait) %>%
    summarize(n_snps = n_distinct(SNP_ID_CURRENT, na.rm = T), .groups = 'drop')

trait,n_snps
<chr>,<int>
All cause dementia,68
Asthma,451
COPD,107
Idiopathic pulmonary fibrosis,21
Ischemic stroke,63
Motor neurone disease,79
Myocardial infarction,53
Parkinsons,215
Stroke,200
Venous thromboembolism,176


In [11]:
markers_df %>%
    group_by(trait) %>%
    mutate(n_trait_snps = n_distinct(SNP_ID_CURRENT, na.rm = T)) %>%
    group_by(trait, MAPPED_TRAIT, n_trait_snps) %>%
    summarize(n_mapped_trait_snps = n_distinct(SNP_ID_CURRENT), .groups = 'drop')

trait,MAPPED_TRAIT,n_trait_snps,n_mapped_trait_snps
<chr>,<chr>,<int>,<int>
All cause dementia,"age at onset, Frontotemporal dementia",68,8
All cause dementia,AIDS dementia,68,4
All cause dementia,"Alzheimer's disease, vascular dementia",68,2
All cause dementia,"amyotrophic lateral sclerosis, Frontotemporal dementia",68,1
All cause dementia,"brain infarction, neuritic plaque measurement, Lewy body dementia, cerebral amyloid angiopathy, neurofibrillary tangles measurement",68,17
All cause dementia,Frontotemporal dementia,68,21
All cause dementia,Lewy body dementia,68,17
Asthma,"age at onset, asthma",451,10
Asthma,asthma,451,179
Asthma,"asthma, traffic air pollution measurement",451,5


In [12]:
markers_df %>%
    distinct(trait, SNP_ID_CURRENT) %>%
    mutate(snp_id = str_c('rs', SNP_ID_CURRENT)) %>%
    select(trait, snp_id) %>%
    arrange(trait, snp_id) %>%
    write_tsv('../../data/markers/all_gwas_catalog.tsv')