# UK Biobank COVID-19 data

http://biobank.ndph.ox.ac.uk/ukb/exinfo.cgi?src=COVID19_tests

In [1]:
library(tidyverse)

── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.3.0           ✔ purrr   0.3.4      
✔ tibble  3.0.1           ✔ dplyr   0.8.99.9003
✔ tidyr   1.0.0           ✔ stringr 1.4.0      
✔ readr   1.3.1           ✔ forcats 0.4.0      
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


## 1. Assign blood groups

We adapted the genotype-to-blood-group mapping reported by Melzer et al. (https://doi.org/10.1371/journal.pgen.1000072).

### Variant identities

| RSID | UKBB | Melzer et al. |
| ---- | ---- | ------------- |
| rs8176746 | G > T | C > A |
| rs8176719 | T > TC | del G |

### Variant map

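Genotypes are listed in the order rs8176746, then rs8176719. The `rs8176746_G / rs8176719_T` column gives the number of G alleles at rs8176746 and the number of T alleles at rs8176719.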

| Melzer et al. | UKBB | rs8176746_G / rs8176719_T | Phenotype |
| ------------- | ---- | -------| --------- |
| C/C del/del | G/G T/T | 2 / 2 | O |
| C/C del/G | G/G T/TC | 2 / 1 | A |
| C/C G/G | G/G TC/TC | 2 / 0 | A |
| C/A del/del | G/T T/T | 1 / 2 | O |
| A/A G/G | T/T TC/TC | 0 / 0 | B |
| C/A del/G | G/T T/TC | 1 / 1 | B |
| C/A G/G | G/T TC/TC | 1 / 0 | AB |

In [2]:
# Load the genotype table, parsing every column as a double, and keep only the
# sample identifiers, sex, and the two ABO variant columns used below.
imputed_genotypes <- select(
    read_tsv(
        file = "../ukbb/imputed_genotypes.raw",
        col_types = cols(.default = col_double())
    ),
    FID, IID, SEX, rs8176746_G, rs8176719_T
)

In [3]:
# Lookup from the genotype key "<rs8176746_G> / <rs8176719_T>" to the inferred
# ABO blood group, following the variant map above. The keys must match the
# strings produced by `unite()` below exactly, including the " / " separator.
snps_to_blood_group <- list(
    "2 / 2" = "O",
    "2 / 1" = "A",
    "2 / 0" = "A",
    "1 / 2" = "O",
    "1 / 1" = "B",
    "1 / 0" = "AB",
    "0 / 0" = "B"
)

# Build one genotype key per sample and translate it to a blood group.
# Any key that is not in the lookup (e.g. "0 / 1" or "0 / 2", which are absent
# from the variant map) is labelled "Other".
ukb_blood_groups <- imputed_genotypes %>%
    unite("genotypes", c(rs8176746_G, rs8176719_T), remove = FALSE, sep = " / ") %>%
    mutate(blood_group = recode(genotypes, !!!snps_to_blood_group, .default = "Other"))

# Zero-row preview: shows column names and types only.
ukb_blood_groups %>% head(0)

# Number of samples per inferred blood group.
ukb_blood_groups %>%
    count(blood_group)

# Keep only the identifiers, sex, and the inferred blood group for later joins.
ukb_inferred_bg <- ukb_blood_groups %>%
    select(FID, IID, SEX, blood_group)

FID,IID,SEX,genotypes,rs8176746_G,rs8176719_T,blood_group
<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>


blood_group,n
<chr>,<int>
A,211478
AB,17642
B,46860
O,210925
Other,137


## 2. SARS-CoV-2 infection status

In [4]:
# Infection test results are stored in a MySQL database.
# The password is read from a local file. All trailing line terminators (LF or
# CRLF, one or several) are stripped so they are not sent as part of the
# password; the rest of the file content is passed through unchanged.
# NOTE(review): the connection is not closed in the visible part of this
# notebook; call DBI::dbDisconnect(con) once the queries are done.
con <- DBI::dbConnect(
    RMariaDB::MariaDB(),
    host = "localhost",
    user = "mnz2108",
    dbname = "clinical_ukbb",
    password = read_file("../mysql_password.txt") %>% str_remove("[\r\n]+$")
)

In [5]:
# Reference the test-result table through the open connection and convert it
# to a local tibble.
ukb_cov_tests <- as_tibble(tbl(con, "covid19_result_050720"))

# Row count, followed by a zero-row preview (column names and types only).
nrow(ukb_cov_tests)
head(ukb_cov_tests, 0)

eid,specdate,spectype,laboratory,origin,result
<int>,<chr>,<int>,<int>,<int>,<int>


## 3. Combine information sources

In [6]:
# Attach test records to every participant with an inferred blood group, then
# reduce to a single row per participant (IID):
#   * cov_tested is 1 for rows that carry a non-missing test result, else 0
#   * a missing result (no matching test record) is replaced by 0
#   * within each participant, only rows holding the highest result are kept,
#     and of those only the first one
#   * rows whose blood group is "Other" are dropped
# Finally each participant is labelled with a cohort:
#   result == 1 -> "cov_pos"; otherwise tested -> "cov_neg";
#   otherwise -> "general_pop".
blood_groups_cov_info_df <- ukb_inferred_bg %>%
    left_join(ukb_cov_tests, by = c("IID" = "eid")) %>%
    select(FID, IID, SEX, blood_group, result) %>%
    mutate(
        # Computed before `result` is overwritten, so it still sees the NAs.
        cov_tested = as.integer(!is.na(result)),
        result = replace_na(result, 0)
    ) %>%
    group_by(IID) %>%
    filter(result == max(result) & blood_group != "Other") %>%
    filter(row_number() == 1L) %>%
    ungroup() %>%
    mutate(
        cohort = case_when(
            result == 1 ~ "cov_pos",
            cov_tested == 1 ~ "cov_neg",
            TRUE ~ "general_pop"
        )
    )

# Zero-row preview: column names and types only.
head(blood_groups_cov_info_df, 0)

FID,IID,SEX,blood_group,result,cov_tested,cohort
<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<int>,<chr>


## 4. Contingency table for meta-analysis

In [7]:
# Count participants per blood group and cohort, then spread the cohorts into
# columns so that each row is one blood group.
# `values_fill` makes a blood group / cohort combination with no participants
# appear as 0 instead of NA, which is what a table of counts should contain.
contingency <- blood_groups_cov_info_df %>%
    count(blood_group, cohort) %>%
    pivot_wider(
        names_from = cohort,
        values_from = n,
        values_fill = list(n = 0L)
    ) %>%
    # Fix the column order of the exported table.
    select(blood_group, cov_pos, cov_neg, general_pop)

contingency %>% write_tsv("../data/ukb_2020-05-07.tsv")

contingency

blood_group,cov_pos,cov_neg,general_pop
<chr>,<int>,<int>,<int>
A,466,830,210182
AB,40,54,17548
B,110,187,46563
O,412,761,209752


## 5. Export to create genetic files

`.gen` and `.sample` files are created in the next notebook using the table exported here.

In [8]:
# Restrict to tested participants and turn the blood group into indicator
# columns: one column per blood group, holding the participant's cov_tested
# value (1) in the column of their own group and 0 in the others.
# The test result is exported under the name `infection`.
genetic_analysis_cohort_df <- blood_groups_cov_info_df %>%
    filter(cov_tested == 1) %>%
    rename(infection = result) %>%
    pivot_wider(
        id_cols = c(FID, IID, SEX, infection),
        names_from = blood_group,
        values_from = cov_tested,
        values_fill = list(cov_tested = 0)
    )

write_tsv(genetic_analysis_cohort_df, "blood_groups.tsv")