# Overview

This notebook is to generate the test dataset for the UKBB analysis.

In [262]:
library(lubridate)
library(readr)
library(dplyr, warn.conflicts = FALSE)
library(ggplot2)
library(DT)
library(tidyr)
library(gtools)
library(knitr)

In [263]:
N = 100000

# Generate 6-digit random ID numbers
id_numbers <- sample(100000:999999, N, replace = FALSE)

sex <- sample(0:1, N, replace = TRUE)
dob <- as.Date(runif(N, 50, 80) * 365, origin = "1895-01-01")

toy_data <- data.frame(ID = id_numbers, Sex = sex, DOB = dob)
toy_data$f.131057 <- 0

# Assign NA to 80% of the population
toy_data$f.131057[sample(1:N, N*0.80, replace = FALSE)] <- NA
# Assign 50 to 85% of the remaining 5%
toy_data$f.131057[sample(which(is.na(toy_data$f.131057)), N*0.20*0.85, replace = FALSE)] <- 50
# Assign other numbers to the remaining individuals
remaining_inds <- which(toy_data$f.131057==0)
toy_data$f.131057[which(toy_data$f.131057==0)] <- sample(c(40,51), length(remaining_inds), replace = TRUE)

toy_data<- toy_data %>%
    mutate(f.131057.isvalid = ifelse(is.na(f.131057), 0, 1))

generate_correlated_values <- function(input_values, correlation_value) {
  n <- length(input_values)
  z <- rnorm(n)
  y <- correlation_value * input_values + sqrt(1 - correlation_value^2) * z
  return(pmax(pmin(y, max(input_values)), min(input_values)))
}

# Generate "f.131369" column based on the correlation with "f.131057"
toy_data$f.131369.isvalid <- generate_correlated_values(as.numeric(toy_data$f.131057.isvalid), correlation_value)
toy_data$f.131369.tmp <- rbinom(nrow(toy_data), 1, toy_data$f.131369.isvalid)

toy_data$f.131369 <- ifelse(toy_data$f.131369.tmp == 0, NA, 
                           ifelse(toy_data$f.131369.tmp == 1, 
                                  sample(c(40, 50, 51), sum(toy_data$f.131369.tmp == 1), 
                                         replace = TRUE, 
                                         prob = c(0.10, 0.85, 0.05)),
                                  NA))

In [264]:
toy_data <- toy_data %>%
  select(!f.131369.isvalid & !f.131369.tmp) %>%
  rename(f.131057.0.0 = f.131057, f.131369.0.0 = f.131369) %>%
  mutate(
    f.131056.0.0 = ifelse(!is.na(f.131057.0.0) & !is.na(DOB) & (DOB + years(30) <= as.Date("2020-01-01")),
      runif(sum(!is.na(f.131057.0.0)),
        min = as.numeric(DOB + years(30)),
        max = as.Date("2020-01-01")
      ),
      NA
    ),
    f.131056.0.0 = as.Date(f.131056.0.0, origin = "1970-01-01"), # Assuming 1970 is the correct origin
  ) %>%
  mutate(
    f.131368.0.0 = ifelse(!is.na(f.131369.0.0) & !is.na(DOB) & (DOB + years(40) <= as.Date("2020-01-01")),
      runif(sum(!is.na(f.131369.0.0)),
        min = as.numeric(DOB + years(30)),
        max = as.Date("2020-01-01")
      ),
      NA
    ),
    f.131368.0.0 = as.Date(f.131368.0.0, origin = "1970-01-01"), # Assuming 1970 is the correct origin
    f.31.0.0 = Sex,
    f.33.0.0 = DOB
  )


[1m[22m[36mℹ[39m In argument: `f.131056.0.0 = ifelse(...)`.
[33m![39m NAs produced”
[1m[22m[36mℹ[39m In argument: `f.131368.0.0 = ifelse(...)`.
[33m![39m NAs produced”


## generate PRS

In [265]:
toy_data_tia_PRS_cases <- toy_data %>% filter(!is.na(f.131056.0.0))
toy_data_tia_PRS_ctrls <- toy_data %>% filter(is.na(f.131056.0.0))
toy_data_tia_PRS_cases$PRS = rnorm(n=nrow(toy_data_tia_PRS_cases),mean=0.45,sd=0.3)
toy_data_tia_PRS_cases$pheno_tia=1
toy_data_tia_PRS_ctrls$PRS = rnorm(n=nrow(toy_data_tia_PRS_ctrls),mean=0.3,sd=0.5)
toy_data_tia_PRS_ctrls$pheno_tia=0

toy_data_tia_PRS = rbind(toy_data_tia_PRS_cases,toy_data_tia_PRS_ctrls)

min_max_normalize <- function(x) {
  (x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
}

# Normalize the specified column
toy_data_tia_PRS$tia_PRS <- min_max_normalize(toy_data_tia_PRS$PRS)
head(toy_data_tia_PRS)
# toy_data_PRS <- toy_data_PRS %>% select(ID, PRS

Unnamed: 0_level_0,ID,Sex,DOB,f.131057.0.0,f.131057.isvalid,f.131369.0.0,f.131056.0.0,f.131368.0.0,f.31.0.0,f.33.0.0,PRS,pheno_tia,tia_PRS
Unnamed: 0_level_1,<int>,<int>,<date>,<dbl>,<dbl>,<dbl>,<date>,<date>,<int>,<date>,<dbl>,<dbl>,<dbl>
1,433625,1,1946-07-25,40,1,,2002-03-14,,1,1946-07-25,-0.1343569,1,0.3769788
2,969201,0,1946-03-28,40,1,50.0,1991-02-23,1980-05-11,0,1946-03-28,0.4562502,1,0.5195925
3,934809,0,1954-01-21,51,1,50.0,1993-12-09,2004-03-02,0,1954-01-21,0.3195133,1,0.4865747
4,506123,1,1959-05-19,50,1,,2001-07-18,,1,1959-05-19,0.4209205,1,0.5110615
5,180502,1,1961-08-13,40,1,50.0,2009-08-27,2016-04-20,1,1961-08-13,0.345569,1,0.4928664
6,465219,0,1960-06-20,50,1,40.0,2012-03-30,2018-05-26,0,1960-06-20,0.8379627,1,0.6117646


In [266]:
toy_data_stroke_PRS_cases <- toy_data %>% filter(!is.na(f.131368.0.0))
toy_data_stroke_PRS_ctrls <- toy_data %>% filter(is.na(f.131368.0.0))
toy_data_stroke_PRS_cases$PRS <- rnorm(n = nrow(toy_data_stroke_PRS_cases), mean = 0.6, sd = 0.3)
toy_data_stroke_PRS_cases$pheno_stroke <- 1
toy_data_stroke_PRS_ctrls$PRS <- rnorm(n = nrow(toy_data_stroke_PRS_ctrls), mean = 0.2, sd = 0.5)
toy_data_stroke_PRS_ctrls$pheno_stroke <- 0

toy_data_stroke_PRS <- rbind(toy_data_stroke_PRS_cases, toy_data_stroke_PRS_ctrls)

# Normalize the specified column
toy_data_stroke_PRS$stroke_PRS <- min_max_normalize(toy_data_stroke_PRS$PRS)

toy_data_stroke_PRS <- toy_data_stroke_PRS %>% select(ID, stroke_PRS)

toy_data_PRS <- merge(toy_data_tia_PRS, toy_data_stroke_PRS, by="ID") %>% select(ID, tia_PRS, stroke_PRS) %>% rename(TIA_PRS=tia_PRS)
head(toy_data_PRS)
# toy_data_PRS <- toy_data_PRS %>% select(ID, 

Unnamed: 0_level_0,ID,TIA_PRS,stroke_PRS
Unnamed: 0_level_1,<int>,<dbl>,<dbl>
1,100006,0.5664577,0.5474659
2,100031,0.4250724,0.4730236
3,100032,0.5231551,0.7543777
4,100051,0.4581116,0.5662735
5,100052,0.5058307,0.6957714
6,100062,0.4289423,0.6892172


In [267]:
write.table(toy_data, file = "~/student_test_2024/data/toy_data.tsv", sep = "\t", quote = FALSE, row.names = FALSE)
write.table(toy_data_PRS, file = "~/student_test_2024/data/toy_data_PRS.tsv", sep = "\t", quote = FALSE, row.names = FALSE)