In [1]:
library(tidyverse)

── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.2.1     ✔ purrr   0.3.2
✔ tibble  2.1.3     ✔ dplyr   0.8.3
✔ tidyr   1.0.0     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.4.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


In [3]:
gwas_df <- read_csv('../data/gwas_catalog/combined.csv')

gwas_df %>% head(2)

Parsed with column specification:
cols(
  .default = col_character(),
  `DATE ADDED TO CATALOG` = col_date(format = ""),
  PUBMEDID = col_double(),
  DATE = col_date(format = ""),
  CHR_POS = col_double(),
  UPSTREAM_GENE_DISTANCE = col_double(),
  DOWNSTREAM_GENE_DISTANCE = col_double(),
  MERGED = col_double(),
  SNP_ID_CURRENT = col_double(),
  INTERGENIC = col_double(),
  `P-VALUE` = col_double(),
  PVALUE_MLOG = col_double(),
  `OR or BETA` = col_double()
)
See spec(...) for full column specifications.


DATE ADDED TO CATALOG,PUBMEDID,FIRST AUTHOR,DATE,JOURNAL,LINK,STUDY,DISEASE/TRAIT,INITIAL SAMPLE SIZE,REPLICATION SAMPLE SIZE,⋯,PVALUE_MLOG,P-VALUE (TEXT),OR or BETA,95% CI (TEXT),PLATFORM [SNPS PASSING QC],CNV,MAPPED_TRAIT,MAPPED_TRAIT_URI,STUDY ACCESSION,GENOTYPING TECHNOLOGY
<date>,<dbl>,<chr>,<date>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<dbl>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
2019-07-12,31049640,Chen J,2019-05-02,Diabetologia,www.ncbi.nlm.nih.gov/pubmed/31049640,Genome-wide association study of type 2 diabetes in Africa.,Type 2 diabetes,"2,633 African ancestry cases, 1,714 African ancestry controls",,⋯,5.39794,,1.34,[1.17-1.53],"Affymetrix, Illumina [12148595] (imputed)",N,type II diabetes mellitus,http://www.ebi.ac.uk/efo/EFO_0001360,GCST008114,Genome-wide genotyping array
2019-07-12,31049640,Chen J,2019-05-02,Diabetologia,www.ncbi.nlm.nih.gov/pubmed/31049640,Genome-wide association study of type 2 diabetes in Africa.,Type 2 diabetes,"2,633 African ancestry cases, 1,714 African ancestry controls",,⋯,5.045757,,1.84,[1.42-2.38],"Affymetrix, Illumina [12148595] (imputed)",N,type II diabetes mellitus,http://www.ebi.ac.uk/efo/EFO_0001360,GCST008114,Genome-wide genotyping array


In [51]:
gwas_df %>%
    mutate(
        CHR_ID = if_else(CHR_ID == 'X', true = 'X', false = as.character(as.integer(CHR_ID)))
    ) %>% 
    select('STUDY ACCESSION', CHR_ID, CHR_POS, 'P-VALUE', 'P-VALUE (TEXT)') %>%
    pivot_wider(id_cols = c(CHR_ID, CHR_POS), names_from = 'STUDY ACCESSION', 
                values_from = 'P-VALUE') %>%
    drop_na(CHR_ID, CHR_POS) %>%
    filter(!all(is.na(GCST008114), is.na(GCST005047), is.na(GCST007847),))

“Values in `P-VALUE` are not uniquely identified; output will contain list-cols.
* Use `values_fn = list(P-VALUE = length)` to identify where the duplicates arise
* Use `values_fn = list(P-VALUE = summary_fun)` to summarise duplicates”

CHR_ID,CHR_POS,GCST008114,GCST005047,GCST007847
<chr>,<dbl>,<list<dbl>>,<list<dbl>>,<list<dbl>>
8,3359886,9e-06,,
6,41201241,2e-06,,
19,41341832,1e-06,,
7,102299678,9e-06,,
7,1729716,3e-06,,
12,98386008,3e-06,,
12,63394241,8e-06,,
11,2185703,2e-07,,
14,64761876,7e-06,,
6,104114435,4e-06,,


In [27]:
gwas_df %>%
    group_by(!!as.name('STUDY ACCESSION')) %>%
    summarize(
        n_row = n(),
        n_location = n_distinct(c(CHR_ID, CHR_POS)),
        n_snp = n_distinct(SNPS),
        n_snp_gene_ids = n_distinct(SNP_GENE_IDS),
        n_snp_id = n_distinct(SNP_ID_CURRENT),
        n_reported_genes = n_distinct(!!as.name('REPORTED GENE(S)')),
        n_mapped_genes = n_distinct(MAPPED_GENE)
    )

STUDY ACCESSION,n_row,n_location,n_snp,n_snp_gene_ids,n_snp_id,n_reported_genes,n_mapped_genes
<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
GCST005047,110,82,64,39,64,50,56
GCST007847,126,146,126,78,126,105,120
GCST008114,40,57,40,22,40,40,40


In [28]:
gwas_df %>%
    select(SNPS) %>%
    filter(SNPS %>% str_count('rs') > 1)

SNPS
<chr>


In [36]:
gwas_df %>%
    select('STUDY ACCESSION', CHR_ID, CHR_POS, 'P-VALUE') %>%
    pivot_wider(id_cols = c(CHR_ID, CHR_POS), names_from = 'STUDY ACCESSION', 
                values_from = 'P-VALUE') %>%
    drop_na(CHR_ID, CHR_POS) %>%
    

“Values in `P-VALUE` are not uniquely identified; output will contain list-cols.
* Use `values_fn = list(P-VALUE = length)` to identify where the duplicates arise
* Use `values_fn = list(P-VALUE = summary_fun)` to summarise duplicates”

CHR_ID,CHR_POS,GCST008114,GCST005047,GCST007847
<chr>,<dbl>,<list<dbl>>,<list<dbl>>,<list<dbl>>
8.0,3359886,9e-06,,
6.0,41201241,2e-06,,
19.0,41341832,1e-06,,
7.0,102299678,9e-06,,
7.0,1729716,3e-06,,
12.0,98386008,3e-06,,
12.0,63394241,8e-06,,
11.0,2185703,2e-07,,
14.0,64761876,7e-06,,
6.0,104114435,4e-06,,
