# 4. Scores

In [21]:
try(library(tidyverse), silent=TRUE)
library(lubridate)
library(data.table)
library(glue)
library(jsonlite)
dataset_name = "210223_cvd_gp"
path = "/data/analysis/ag-reils/steinfej/code/umbrella/pre/ukbb"
data_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS/data"
dataset_path = glue("{data_path}/3_datasets_post/{dataset_name}")
dataset_path_pre = glue("{data_path}/2_datasets_pre/{dataset_name}")

In [2]:
#data = arrow::read_feather(glue("{dataset_path}/data_merged.feather"))
description = arrow::read_feather(glue("{dataset_path}/description.feather"))

In [3]:
files = c()
for (i in 0:9){
    files = c(files, glue("{dataset_path}/partition_{i}/test/data_imputed.feather"))
}

In [4]:
data = map(files, arrow::read_feather) %>% bind_rows()

data = data %>% mutate_at(c("sex", "overall_health_rating", "smoking_status", "ethnic_background"), as.factor)
data = data %>% mutate(sex=fct_relevel(sex, c("Male", "Female")),
                       overall_health_rating=fct_relevel(overall_health_rating, c("Excellent", "Good", "Fair", "Poor")),
                       smoking_status=fct_relevel(smoking_status, c("Current", "Previous", "Never")))

covariates = (data_description %>% filter(isTarget==FALSE) %>% filter(!dtype=="Date"))$covariate[-1]
targets = (data_description %>% filter(isTarget==TRUE))$covariate

load(file = glue("{path}data/phenotypes.rda"))
load(file = glue("{path}data/medications_list.rda"))
load(file = glue("{path}data/endpoints.rda"))
load(file = glue("{path}data/scores.rda"))
death_endpoints = c(death = c("death"), death_cvd = c("death_cvd"))
competing_endpoints = c(CE_QRISK3death=c("CE_QRISK3death"))
endpoints = append(endpoints, death_endpoints)
endpoints = append(endpoints, scores)

## Missing values

In [5]:
na_count <-data.frame(sapply(data, function(y) sum(length(which(is.na(y))))))
na_count %>% filter(sapply(data, function(y) sum(length(which(is.na(y)))))>0)

sapply.data..function.y..sum.length.which.is.na.y.....
<int>


In [6]:
map_smoking = jsonlite::fromJSON(str_replace_all(str_replace_all((description %>% filter(covariate == "smoking_status"))$mapping, "'", '"'), ", nan: -2", ""))
map_smoking = setNames(names(map_smoking), map_smoking)
map_gender = jsonlite::fromJSON(str_replace_all(str_replace_all((description %>% filter(covariate == "sex"))$mapping, "'", '"'), ", nan: -2", ""))
map_gender = setNames(names(map_gender), map_gender)
map_ethnicity = jsonlite::fromJSON(str_replace_all(str_replace_all((description %>% filter(covariate == "ethnic_background"))$mapping, "'", '"'), ", nan: -2", ""))
map_ethnicity = setNames(names(map_ethnicity), map_ethnicity)

In [7]:
data$smoking_status = recode(data$smoking_status, !!!map_smoking)
data$sex = recode(data$sex, !!!map_gender)
data$ethnic_background = recode(data$ethnic_background, !!!map_ethnicity)

# Scores

## FRAMINGHAM RISK SCORE (Anderson 1991, Wilson 1998)

In [None]:
women_bmi <- function(age, bmi, sbp, smoking, diabetes, treated_bp){
    b_age=2.72107
    b_bmi=0.51125
    b_smoking=0.61868
    b_diabetes=0.77763
    if (treated_bp==FALSE){b_sbp=2.81291}
    if (treated_bp==TRUE){b_sbp=2.88267}
    E = b_age*age + b_bmi*bmi + b_smoking*smoking + b_diabetes*diabetes + b_sbp*sbp
    return (1-0.94833^(E-26.0145))
}

women_chol <- function(age, chol, hdl_chol, sbp, smoking, diabetes, bp_treated){
    b_age=2.32888
    b_chol = 1.20904
    b_hdl_chol = -0.70833
    b_smoking=0.52873
    b_diabetes=0.69154
    if (bp_treated==FALSE){b_sbp=2.76157}
    if (bp_treated==TRUE){b_sbp=2.82263}
    E = b_age*age + b_chol*chol + b_hdl_chol*hdl_chol + b_smoking*smoking + b_diabetes*diabetes + b_sbp*sbp
    return (1-0.95012^(E-26.1931))
}

men_bmi <- function(age, bmi, sbp, smoking, diabetes, treated_bp){
    b_age=3.11296
    b_bmi=0.79277
    b_smoking=0.70953
    b_diabetes=0.53160
    if (treated_bp==FALSE){b_sbp=1.85508}
    if (treated_bp==TRUE){b_sbp=1.92672}
    E = b_age*age + b_bmi*bmi + b_smoking*smoking + b_diabetes*diabetes + b_sbp*sbp
    return (1-0.88431^(E-23.9388))
}

men_chol <- function(age, chol, hdl_chol, sbp, smoking, diabetes, bp_treated){
    b_age=3.06117
    b_chol = 1.12370
    b_hdl_chol = -0.93263
    b_smoking=0.65451
    b_diabetes=0.57367
    if (bp_treated==FALSE){b_sbp=1.93303}
    if (bp_treated==TRUE){b_sbp=1.99881}
    E = b_age*age + b_chol*chol + b_hdl_chol*hdl_chol + b_smoking*smoking + b_diabetes*diabetes + b_sbp*sbp
    return (1-0.88936^(E-23.9802))
}

calculateFRS <- function(sex, age, bmi, chol, hdl_chol, sbp, smoking, diabetes, bp_treated){
    print(bp_treated)
    if (sex=="M"){
        frs_bmi = men_bmi(age, bmi, sbp, smoking, diabetes, bp_treated)
        frs_chol = men_chol(age, chol, hdl_chol, smoking, diabetes, bp_treated)
    }
    if (sex=="F"){
        frs_bmi = women_bmi(age, bmi, sbp, smoking, diabetes, bp_treated)
        frs_chol = women_chol(age, chol, hdl_chol, smoking, diabetes, bp_treated)
    }  
    return (list(frs_bmi, frs_chol))
}


In [18]:
frs = calculateFRS(sex="M", age=30, bmi=22.5, chol=180, hdl_chol=45, sbp=125, smoking=0, diabetes=0, bp_treated=FALSE)

[1] FALSE


ERROR: Error in men_chol(age, chol, hdl_chol, smoking, diabetes, bp_treated): argument "bp_treated" is missing, with no default


In [7]:
frs

ERROR: Error in eval(expr, envir, enclos): object 'frs' not found


In [8]:
head(data)

eid,PGS000011,PGS000013,PGS000016,PGS000018,PGS000039,PGS000057,PGS000058,PGS000059,PGS000116,⋯,death_cvd_event,death_cvd_event_time,SCORE_event,SCORE_event_time,ASCVD_event,ASCVD_event_time,QRISK3_event,QRISK3_event_time,MACE_event,MACE_event_time
<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<int>,<dbl>,<int>,<dbl>,<int>,<dbl>,<int>,<dbl>,<int>,<dbl>
1000018,3.17077,14.18216,25.4786,-7.910004,0.1960993,3.690275,8.51323,4.060235,2.543097,⋯,0,10.6256,0,10.6256,0,10.335387,0,10.335387,0,10.335387
1000020,3.935565,14.20361,25.50886,-7.338207,0.3820538,4.17,8.665529,3.64,2.575014,⋯,0,12.35592,0,12.35592,0,12.065708,0,12.065708,0,12.065708
1000037,4.223654,14.24818,25.54972,-7.533061,0.1899884,4.288627,8.540908,5.541451,2.441255,⋯,0,11.62765,0,11.62765,1,7.969884,1,7.969884,0,11.33744
1000043,3.529575,14.1765,25.39414,-7.768648,0.2893883,4.13,8.515447,4.261098,2.740589,⋯,0,11.06913,0,11.06913,1,5.122519,1,5.122519,1,5.122519
1000051,3.618199,14.21691,25.39648,-7.486226,0.1776932,3.87,8.043932,4.207255,2.460508,⋯,0,14.05065,0,14.05065,0,13.760438,0,13.760438,0,13.760438
1000066,3.55504,14.19133,25.47087,-7.501505,0.453426,4.1,8.345422,3.521569,2.746012,⋯,0,10.53525,0,10.53525,0,10.245038,0,10.245038,0,10.245038


## ESC SCORE (Conroy 2003)

[Conroy 2003](http://eurheartj.oxfordjournals.org/content/24/11/987.full.pdf)

In [8]:
calculateRisk <- function(age, cholesterol, SBP, currentSmoker, betaSmoker, betaSBP, betaChol, coefs) {
  # step 1 risks
  Sage0 = exp(-exp(coefs["alpha"])*(age - 20)^coefs["p"])
  Sage10 = exp(-exp(coefs["alpha"])*(age - 10)^coefs["p"])
  # step 2 weights
  w = betaChol*(cholesterol - 6) + betaSBP*(SBP - 120) + betaSmoker*currentSmoker
  # step 3 weighted risks
  Sage   = (Sage0)^exp(w) 
  Sage1 = (Sage10)^exp(w) 
  # step 4 - 10 years survival
  S10 = Sage1/Sage
  # step 5 - endpoint
  Risk10 = 1 - S10
  Risk10
}

calculateScoreEur <- function(age, cholesterol, SBP, currentSmoker, gender = "Men", risk = "Low risk") {
  betaSmoker = c(0.71, 0.63)
  betaSBP    = c(0.018, 0.022)
  betaChol   = c(0.24, 0.02)
  
  coeffs <- array(c(-22.1, 4.71, -26.7, 5.64, -29.8, 6.36, -31.0, 6.62, -21.0, 4.62, -25.7, 5.47, -28.7, 6.23, -30.0, 6.42), 
                  c(2,2,2,2),
                  dimnames = list(c("alpha", "p"), c("CHD", "non CHD"), c("Male", "Female"), c("Low risk", "High risk")))
  
  # step 6 - score
  CVDrisk = calculateRisk(age, cholesterol, SBP, currentSmoker,
                          betaSmoker[1], betaSBP[1], betaChol[1], coeffs[,"CHD",gender,risk])
  NonCVDrisk = calculateRisk(age, cholesterol, SBP, currentSmoker,
                          betaSmoker[2], betaSBP[2], betaChol[2], coeffs[,"non CHD",gender,risk])
 
  CVDrisk + NonCVDrisk
}

In [9]:
sex="Male"
age_at_recruitment = 64
cholesterol = 6.8
systolic_blood_pressure = 140
current_smoker = 0

In [10]:
calculateScoreEur(age_at_recruitment, cholesterol, systolic_blood_pressure, current_smoker, sex,risk="Low risk")

In [11]:
temp = data.table(data %>% select(eid, age_at_recruitment, cholesterol, systolic_blood_pressure, smoking_status, sex) %>% mutate(current_smoker = case_when(smoking_status=="Current" ~ 1, TRUE ~ 0)))
SCORE_df = temp[, score_SCORE:=calculateScoreEur(age_at_recruitment, cholesterol, systolic_blood_pressure, current_smoker, sex,risk="Low risk"), by="eid"] %>% select(c(eid, score_SCORE))
SCORE_df

eid,score_SCORE
<int>,<dbl>
1000285,0.0343943723
1000686,0.0401695746
1000974,0.0709517856
1001073,0.0319272214
1001098,0.0022416772
1001119,0.0157146322
1001458,0.0653505615
1001517,0.0630663630
1001591,0.0071638047
1001741,0.0025406444


## ACC/AHA ASCVD (Goff 2014)

[Goof 2014](https://www.ahajournals.org/doi/pdf/10.1161/01.cir.0000437741.48606.98)

In [12]:
coefs_string = '"ln_age" "ln_age_squared" "ln_total_cholest" "ln_age_totcholest" "ln_hdlC" "ln_age_hdlC" "ln_treated_BP" "ln_age_BP" "ln_untreated_BP" "ln_age_ln_untreated_BP" "smoker" "nonsmoker" "ln_age_smoker" "diabetes" "nondiabetes" "meancoef" "baseline"
                "white_female" -29.799 4.884 13.54 -3.114 -13.578 3.149 2.019 0 1.957 0 7.574 0 -1.665 0.661 0 -29.18 0.9665
                "afroamer_female" 17.114 0 0.94 0 -18.92 4.475 29.291 -6.432 27.82 -6.087 0.691 0 0 0.874 0 86.61 0.9533
                "white_male" 12.344 0 11.853 -2.664 -7.99 1.769 1.797 0 1.764 0 7.837 0 -1.795 0.658 0 61.18 0.9144
                "afroamer_male" 2.469 0 0.302 0 -0.307 0 1.916 0 1.809 0 0.549 0 0 0.645 0 19.54 0.8954'

In [13]:
# from Appendix 7
coefs <- read.table(text=coefs_string, row.names=1)
#coefs

calculateASCVD <- function(coefs=coefs, sex="Male", ethnicity="White", age=53, cholesterol=2, hdl_cholesterol=1.1, systolic_blood_pressure=120, antihypertensives=1, diabetes=0, smoking=1) {
    
    if (!is.na(ethnicity)) {
            
        if ((sex=="Female")&(ethnicity != "Black")){const=coefs[1,]} 
        if ((sex=="Female")&(ethnicity=="Black")){const=coefs[2,]} 
        if ((sex=="Male")&(ethnicity != "Black")){const=coefs[3,]} 
        if ((sex=="Male")&(ethnicity=="Black")){const=coefs[4,]} 

        if (smoking==TRUE){smokc=const$smoker} else {smokc=const$nonsmoker}
        if (smoking==TRUE){smokcov=1} else {smokcov=0}

        if (antihypertensives==TRUE){BPc=const$ln_treated_BP} else {BPc=const$ln_untreated_BP}
        if (antihypertensives==TRUE){BPcov=const$ln_age_BP} else {BPcov=const$ln_age_ln_untreated_BP}

        if (diabetes==TRUE){diab=const$diabetes} else {diab=const$nondiabetes}

       # meancoef = const$meancoef

        calc = log(age)*const$ln_age+log(age)*log(age)*const$ln_age_squared+
            log(cholesterol*38.67)*const$ln_total_cholest+
            log(age)*log(cholesterol*38.67)*const$ln_age_totcholest+
            log(hdl_cholesterol*38.67)*const$ln_hdlC+
            log(age)*log(hdl_cholesterol*38.67)*const$ln_age_hdlC+
            smokc+smokcov*log(age)*const$ln_age_smoker+
            log(systolic_blood_pressure)*BPc+
            log(age)*log(systolic_blood_pressure)*BPcov+diab
        
        ASCVD<-(1-(const$baseline^exp(calc-const$meancoef)))
    } else {ASCVD=NA}
    return (ASCVD)
}

In [14]:
temp = data.table(data %>% select(eid, age_at_recruitment, ethnic_background, sex, cholesterol, hdl_cholesterol, systolic_blood_pressure, antihypertensives, diabetes2, smoking_status) %>%
                  mutate(current_smoker = case_when(smoking_status=="Current" ~ TRUE, TRUE ~ FALSE)))
ASCVD_df = temp[, score_ASCVD:=calculateASCVD(coefs, sex, ethnic_background, age_at_recruitment, cholesterol, hdl_cholesterol, systolic_blood_pressure, antihypertensives, diabetes2, current_smoker), by=eid] %>% select(c(eid, score_ASCVD))
head(ASCVD_df)

eid,score_ASCVD
<int>,<dbl>
1000285,0.14320501
1000686,0.13781988
1000974,0.38666508
1001073,0.07376216
1001098,0.01139464
1001119,0.04382689


In [15]:
temp

eid,age_at_recruitment,ethnic_background,sex,cholesterol,hdl_cholesterol,systolic_blood_pressure,antihypertensives,diabetes2,smoking_status,current_smoker,score_ASCVD
<int>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<lgl>,<lgl>,<chr>,<lgl>,<dbl>
1000285,64,White,Male,4.484,0.918,135.5,FALSE,FALSE,Never,FALSE,0.143205008
1000686,58,White,Male,7.139,1.159,147.5,FALSE,FALSE,Never,FALSE,0.137819883
1000974,69,White,Male,4.302,1.075,154.0,FALSE,TRUE,Previous,FALSE,0.386665084
1001073,64,White,Female,6.944,1.608,148.0,FALSE,FALSE,Never,FALSE,0.073762156
1001098,47,White,Female,5.389,1.477,139.5,FALSE,FALSE,Never,FALSE,0.011394640
1001119,62,White,Female,4.833,1.461,137.5,FALSE,FALSE,Previous,FALSE,0.043826887
1001458,59,White,Male,4.879,1.097,153.0,FALSE,FALSE,Current,TRUE,0.184407293
1001517,66,White,Male,6.353,1.879,143.5,FALSE,FALSE,Previous,FALSE,0.154071008
1001591,50,White,Male,5.675,1.418,112.5,FALSE,FALSE,Never,FALSE,0.028630793
1001741,47,White,Female,4.880,1.157,149.0,FALSE,FALSE,Previous,FALSE,0.015815727


## UK QRISK3 (Hippisley-Cox 2017)

[Hippisley-Cox 2017](https://www.bmj.com/content/357/bmj.j2099)

In [16]:
options(warn=-1)
library(QRISK3)
temp = head(data)
temp = as.data.frame(data %>% 
                     mutate_if(is.logical, as.integer) %>%
                     mutate(cholesterol_HDL_ratio=cholesterol/hdl_cholesterol, 
                            gender=case_when(sex=="Female"~1, sex=="Male"~0),
                            ethnicity=case_when(ethnic_background == "White" ~ 1,
                                                ethnic_background == "Asian" ~ 5,
                                                ethnic_background == "Black" ~ 1,
                                                ethnic_background == "Chinese" ~ 5,
                                                ethnic_background == "Mixed" ~ 1,
                                                TRUE ~ 1
                                               ),
                            smoke=case_when(smoking_status == "Never" ~ 1,
                                            smoking_status == "Previous" ~ 2,
                                            smoking_status == "Current" ~ 4,
                                            ),
                            age = as.numeric(age_at_recruitment),
                            std_systolic_blood_pressure = 9.002537727355957,
                           ) #%>% drop_na()
                    )

In [17]:
options(warn=0)
QRISK3_df = QRISK3_2017(data = temp, 
            patid="eid", 
            gender="gender", 
            age="age", 
            atrial_fibrillation="atrial_fibrillation", 
            atypical_antipsy="atypical_antipsychotics", 
            regular_steroid_tablets="glucocorticoids",
            erectile_disfunction="erectile_dysfunction",
            migraine="migraine",
            rheumatoid_arthritis="rheumatoid_arthritis",
            chronic_kidney_disease="chronic_kidney_disease",
            severe_mental_illness="severe_mental_illness",
            systemic_lupus_erythematosis="systemic_lupus_erythematosus",
            blood_pressure_treatment="antihypertensives",
            diabetes1="diabetes1",
            diabetes2="diabetes2",
            weight="weight",
            height="standing_height",
            ethiniciy="ethnicity",
            heart_attack_relative="fh_heart_disease",
            cholesterol_HDL_ratio = "cholesterol_HDL_ratio",
            systolic_blood_pressure = "systolic_blood_pressure",
            std_systolic_blood_pressure = "std_systolic_blood_pressure", ### MISSING!
            smoke = "smoke",
            townsend = "townsend_deprivation_index_at_recruitment")
QRISK3_df = QRISK3_df %>% mutate(score_QRISK3=QRISK3_2017/100) %>% select(c(eid, score_QRISK3)) 


This R package was based on open-sourced original QRISK3-2017 algorithm.

<https://qrisk.org/three/src.php> Copyright 2017 ClinRisk Ltd.


The risk score calculated from this R package can only be used for research purpose.


Please refer to QRISK3 website for more information

<https://qrisk.org/three/index.php>


Important: Please double check whether your variables are coded the same as the QRISK3 calculator


Height should have unit as (cm)

Weight should have unit as (kg)


Ethiniciy should be coded as: 

   Ethiniciy_category Ethinicity
1 White or not stated          1
2              Indian          2
3           Pakistani          3
4         Bangladeshi          4
5         Other Asian          5
6     Black Caribbean          6


Smoke should be coded as: 

               Smoke_category Smoke
1                  non-smoker     1
2                   ex-smoker     2
3 light smoker (less than 10)     3
4  moderate smoker (10 to 19)     4
5   heavy smoker (20 or over)     5


The 

In [18]:
nrow(QRISK3_df)

# JOIN SCORE OUTPUTS

In [23]:
score_df = SCORE_df %>% left_join(ASCVD_df, by="eid") %>% left_join(QRISK3_df, by="eid") %>% arrange(eid)
score_df %>% write_csv(glue("{dataset_path}/predictions_scores.csv"))

In [25]:
glue("{dataset_path}/predictions_scores.csv")

In [24]:
"/data/analysis/ag-reils/steinfej/data/2_datasets_pre/210223_cvd_gp/predictions_coxph.csv"

In [26]:
'/data/analysis/ag-reils/steinfej/data/3_datasets_post/210223_cvd_gp/predictions_scores.csv'