# Overview

This notebook generates the toy (test) dataset used for the UK Biobank (UKBB) analysis.

In [92]:
library(readr)
library(dplyr, warn.conflicts = FALSE)
library(ggplot2)
library(DT)
library(tidyr)
library(gtools)
library(knitr)

In [93]:
# Number of simulated participants.
N <- 100000

# Generate unique 6-digit random ID numbers.
id_numbers <- sample(100000:999999, N, replace = FALSE)

# Generate sex information (0 or 1).
sex <- sample(0:1, N, replace = TRUE)

# Dates of birth: 50-80 years (365-day years) after the 1895-01-01 origin.
dob <- as.Date(runif(N, 50, 80) * 365, origin = "1895-01-01")

# Create a data frame.
toy_data <- data.frame(ID = id_numbers, Sex = sex, DOB = dob)

# Generate the "f.42009" column with the following target distribution:
#   95% of the population is missing (NA);
#   of the remaining 5%: 85% are coded 11, 10% are coded 21, and the rest
#   get one of the other codes (0, 1, 2, 12, 22) uniformly at random.
# Counts are rounded explicitly so sample()/rep() never silently truncate a
# floating-point product such as N * 0.05 * 0.85.
n_observed <- round(N * 0.05)
n_code_11 <- round(n_observed * 0.85)
n_code_21 <- round(n_observed * 0.10)
n_other <- n_observed - n_code_11 - n_code_21

observed_codes <- c(
  rep(11, n_code_11),
  rep(21, n_code_21),
  sample(c(0, 1, 2, 12, 22), n_other, replace = TRUE)
)

# Pick the non-missing rows without replacement; the draw is already in random
# order, so assigning the codes positionally places them at random rows and
# guarantees every observed row receives exactly one code.
observed_rows <- sample(seq_len(N), n_observed, replace = FALSE)
toy_data$f.42009 <- NA_real_
toy_data$f.42009[observed_rows] <- observed_codes

# Store the codes as character; missing entries stay NA.
toy_data$f.42009 <- as.character(toy_data$f.42009)

# Indicator column: 1 when f.42009 is present, 0 when it is missing.
toy_data$f.42009.isvalid <- ifelse(is.na(toy_data$f.42009), 0, 1)


In [94]:
# Blend `input_values` with standard-normal noise.
#
# Each output element is
#   correlation_value * input + sqrt(1 - correlation_value^2) * noise,
# then clamped to the [min, max] range of `input_values`.
#
# Arguments:
#   input_values      numeric vector to blend with noise.
#   correlation_value weight given to `input_values`; the noise term receives
#                     sqrt(1 - correlation_value^2).
# Returns a numeric vector the same length as `input_values`.
generate_correlated_values <- function(input_values, correlation_value) {
  noise <- rnorm(length(input_values))
  noise_weight <- sqrt(1 - correlation_value^2)
  blended <- correlation_value * input_values + noise_weight * noise

  lower_bound <- min(input_values)
  upper_bound <- max(input_values)
  pmax(pmin(blended, upper_bound), lower_bound)
}

# Weight linking the presence of f.42007 to the presence of f.42009.
# NOTE(review): `correlation_value` was used below but never defined in this
# notebook, so the cell failed in a fresh session. 0.8 is a placeholder --
# confirm the intended value.
correlation_value <- 0.8

# Derive a per-row probability that "f.42007" is present from the 0/1
# indicator "f.42009.isvalid". generate_correlated_values() clamps its output
# to the range of its input, so the result is usable as a probability.
toy_data$f.42007.isvalid <- generate_correlated_values(
  as.numeric(toy_data$f.42009.isvalid),
  correlation_value
)

# Bernoulli draw per row: 1 means the row will receive an f.42007 value.
toy_data$f.42007.tmp <- rbinom(nrow(toy_data), 1, toy_data$f.42007.isvalid)

In [95]:
# Fill "f.42007": rows where the binary column f.42007.tmp equals 1 receive a
# randomly drawn code; every other row stays NA.
#
# The codes are assigned by row index rather than through ifelse(): the sampled
# vector has one element per flagged row, and ifelse() would recycle it to the
# full column length, so flagged rows would share draws depending on position.
f42007_codes <- c(0, 1, 2, 11, 12, 21, 22)
# These weights sum to 1.03; sample() treats `prob` as relative weights.
f42007_weights <- c(0.01, 0.02, 0.03, 0.85, 0.01, 0.10, 0.01)

# which() drops NA comparisons, so rows with a missing flag are left as NA.
flagged_rows <- which(toy_data$f.42007.tmp == 1)

toy_data$f.42007 <- NA_real_
toy_data$f.42007[flagged_rows] <- sample(
  f42007_codes,
  length(flagged_rows),
  replace = TRUE,
  prob = f42007_weights
)

In [None]:
# Keep only the exported columns, renaming them to the f.<field>.0.0 naming
# scheme while selecting.
toy_data <- toy_data %>%
  select(
    ID,
    f.31.0.0 = Sex,
    f.33.0.0 = DOB,
    f.42009.0.0 = f.42009,
    f.42007.0.0 = f.42007
  )
   

In [99]:
# Preview the first six rows of the final table.
head(toy_data, n = 6L)

Unnamed: 0_level_0,ID,f.31.0.0,f.33.0.0,f.42009.0.0,f.42007.0.0
Unnamed: 0_level_1,<int>,<int>,<date>,<chr>,<dbl>
1,633627,0,1966-10-26,,
2,542438,0,1962-08-23,,
3,727287,1,1966-02-14,,
4,355926,1,1951-06-24,,
5,714555,0,1952-06-02,,
6,194930,0,1971-07-21,,


In [100]:
# Export the toy dataset as a tab-separated file with a header row, without
# quoting and without row names.
write.table(
  x = toy_data,
  file = "~/student_test_2024/data/toy_data.tsv",
  quote = FALSE,
  sep = "\t",
  row.names = FALSE
)
