-
Notifications
You must be signed in to change notification settings - Fork 0
/
NSSI_LONGITIDUNAL.Rmd
253 lines (190 loc) · 9.47 KB
/
NSSI_LONGITIDUNAL.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
---
title: "NSSI_Longitidunal"
author: "Yama Chang"
output: github_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(foreign)
library(haven)
```
## Clean data for longitidunal data
```{r}
## load data
baseline = read_sav('./data/baseline.sav')
baseline_v1 = read_sav('./data/baseline_v1.sav')
wave2 = read_sav('./data/wave2.sav')
wave3 = read_sav('./data/wave3.sav')
baseline = cbind(wave = 1, baseline) %>% janitor::clean_names()
baseline_v1 = cbind(baseline_v1 = 1, baseline_v1) %>% janitor::clean_names()
wave2 = cbind(wave = 2, wave2) %>% janitor::clean_names()
wave3 = cbind(wave = 3, wave3) %>% janitor::clean_names()
## merge datasets from baseline, wave2, wave3
# There are four major ways join dataframes x and y:
# Inner: keeps data that appear in both x and y
# Left: keeps data that appear in x
# Right: keeps data that appear in y
# Full: keeps data that appear in either x or y
# .x: 出現第一次
# .y: 出現第二次
# 無標記: 只出現一次/出現第三次
longitidunal_data =
full_join(baseline, wave2, by = "id") %>%
full_join(., wave3, by = "id")
longitidunal_data2 =
left_join(baseline, wave2, by = "id") %>%
left_join(., wave3, by = "id")
```
## NSSI Longitidunal Study
```{r}
## Select the data we need as NSSI
# select: 選要的column
# filter: 選要的row
# drop_na: remove rows with missing value
# mutate: change columns or create new ones
NSSI = longitidunal_data %>%
select(id, d81.x:d106.x, birth_sex.x: meps_idi.x, d81.y:d106.y, age.y:cope14, if_ideation:d106z, age:meps_idi.y)
```
## Clean for LASSO
```{r}
library(tidyverse)
library(caret)
library(glmnet)
library("mice")
## NSSI related variables in baseline
NSSI_1 = NSSI %>%
select(id, d91.x, d96, self_inj_year.x, d97.x, d81.x, d84.x, d87.x, d88.x, d89.x, fs.x, es.x, tis.x, ti_spride.x, ti_sshame.x, ti_salien.x, ti_spass.x, gend_dysph.x, tcs.x, safety.x, cc.x, soc_sup.x, fam_sup.x, frd_sup.x, sp_sup.x, bsi_som_t.x, bsi_dep_t.x, bsi_anx_t.x, gs_it.x, yrs_horm, if_surg.x)
## NSSI related variables in 12-month follow up
NSSI_2 = NSSI %>%
select(id, d91.y, self_inj_year.y, d97.y, d81.y, d84.y, d87.y, d88.y, d89.y, fs.y, es.y, tis.y, ti_spride.y, ti_sshame.y, ti_salien.y, ti_spass.y, gend_dysph.y, tcs.y, safety.y, cc.y, soc_sup.y, fam_sup.y, frd_sup.y, sp_sup.y, bsi_som_t.y, bsi_dep_t.y, bsi_anx_t.y, gs_it.y, if_surg.y)
## NSSI related variavbles in baseline & 12-month follow up
NSSI_LONG = left_join(NSSI_1, NSSI_2, by = "id")
## NSSI related variables in baseline
## outcome variable: 12-month follow up engagement in NSSI
NSSI_LASSO = NSSI %>%
select(id, self_inj_year.y, d91.x, d96, self_inj_year.x, d97.x, d81.x, d84.x, d87.x, d88.x, d89.x, fs.x, es.x, tis.x, ti_spride.x, ti_sshame.x, ti_salien.x, ti_spass.x, gend_dysph.x, tcs.x, safety.x, cc.x, soc_sup.x, fam_sup.x, frd_sup.x, sp_sup.x, bsi_som_t.x, bsi_dep_t.x, bsi_anx_t.x, gs_it.x, yrs_horm, if_surg.x)
## Transform predictors
# Subsitute all NA with 0
NSSI_LASSO[is.na(NSSI_LASSO)] = 0
# Transform to Y/N
NSSI_LASSO$d84.x_b = ifelse(NSSI_LASSO$d84.x >0, 1, 0)
NSSI_LASSO$d89.x_b = ifelse(NSSI_LASSO$d89.x >0, 1, 0)
NSSI_LASSO$yrs_horm_b = ifelse(NSSI_LASSO$yrs_horm >0, 1, 0)
# Transform to T/F with condition (>63)
NSSI_LASSO$bsi_som_t.x_b = ifelse(NSSI_LASSO$bsi_som_t.x >= 63, 1, 0)
NSSI_LASSO$bsi_dep_t.x_b = ifelse(NSSI_LASSO$bsi_dep_t.x >= 63, 1, 0)
NSSI_LASSO$bsi_anx_t.x_b = ifelse(NSSI_LASSO$bsi_anx_t.x >= 63, 1, 0)
# T score of 63 or higher for the GSI or a positive case for at least 2 of the subscales indicated a positive case for the GSI
NSSI_LASSO$gs_it.x_b = ifelse(NSSI_LASSO$gs_it.x >= 63, 1, 0)
NSSI_LASSO$gsi_b = ifelse(NSSI_LASSO$bsi_som_t.x_b + NSSI_LASSO$bsi_dep_t.x_b + NSSI_LASSO$bsi_anx_t.x_b >=2 | NSSI_LASSO$gs_it.x_b == 1, 1, 0)
# final dataset for LASSO analysis
NSSI_LASSO_final = NSSI_LASSO %>% select(self_inj_year.y, d91.x, d96, self_inj_year.x, d97.x, d81.x, d84.x_b, d87.x, d88.x, d89.x, d89.x_b, fs.x, es.x, tis.x, ti_spride.x, ti_sshame.x, ti_salien.x, ti_spass.x, gend_dysph.x, tcs.x, safety.x, cc.x, soc_sup.x, fam_sup.x, frd_sup.x, sp_sup.x, bsi_som_t.x_b, bsi_dep_t.x_b, bsi_anx_t.x_b, gsi_b, yrs_horm_b, if_surg.x)
```
## LASSO preparation
```{r}
# set seeds for reproducibility
set.seed(123)
# Split the data into training and test set
train_sample = NSSI_LASSO_final$self_inj_year.y %>%
createDataPartition(p = 0.8, list = FALSE)
train = NSSI_LASSO_final[train_sample, ]
test = NSSI_LASSO_final[-train_sample, ]
# Dumy code categorical predictor variables
# model.matrix() helps to create the matrix of predictors and also automatically converts categorical predictors to appropriate dummy variables, which is required for the glmnet() function.
x = model.matrix(self_inj_year.y ~ ., train)[,-1]
# Convert the outcome (class) to a numerical variable
y = as.numeric(train$self_inj_year.y)
```
## Fit the lasso penalized regression model
```{r}
library(glmnet)
# Find the best lambda using cross-validation
# Find the optimal value of lambda that minimizes the cross-validation error:
set.seed(123)
cv_lasso = cv.glmnet(x, y, alpha = 1, family = "binomial")
plot(cv_lasso)
cv_lasso$lambda.min
# Fit the final model on the training data
model = glmnet(x, y, alpha = 1, family = "binomial", lambda = cv_lasso$lambda.min)
# Display regression coefficients
coef(model)
# Make predictions on the test data
x_test = model.matrix(self_inj_year.y ~ ., test)[,-1]
probabilities = model %>% predict(newx = x_test)
predicted.classes = ifelse(probabilities > 0.5, "1", "0")
# Model accuracy
observed.classes = test$self_inj_year.y
mean(predicted.classes == observed.classes)
```
## Notes
```{r}
# Find the optimal value of lambda that minimizes the cross-validation error
# The plot displays the cross-validation error according to the log of lambda. The left dashed vertical line indicates that the log of the optimal value of lambda is approximately -5, which is the one that minimizes the prediction error. This lambda value will give the most accurate model. The exact value of lambda can be viewed as follow
set.seed(123)
cv_lasso = cv.glmnet(x, y, alpha = 1, family = "binomial")
plot(cv_lasso)
cv_lasso$lambda.min
# Using lambda.min as the best lambda, gives the following regression coefficients
coef(cv_lasso, cv_lasso$lambda.min)
# Using lambda.1se as the best lambda, gives the following regression coefficients
coef(cv_lasso, cv_lasso$lambda.1se)
# Setting lambda = lambda.1se produces a simpler model compared to lambda.min, but the model might be a little bit less accurate than the one obtained with lambda.min.
```
# Prediction accuracy
## AREA UNDER THE CURVE (AUC)
```{r}
# Using AREA UNDER THE CURVE (AUC) to assess the prediction accuracy
# How good is a classifier
library(ROCR)
a = as.data.frame(observed.classes)
pred = prediction(probabilities, a)
perf = performance(pred, 'tpr', 'fpr')
plot_auc = plot(perf)
ggsave("auc.png", plot_auc, dpi = 300)
AUC = performance(pred, 'auc')
AUC
```
preds = predict(cv_lasso, newx = x_test, type = 'response')
# Plot correlation
```{r}
# install.packages("GGally")
# install.packages("ggcorrplot")
library("GGally")
library(ggcorrplot)
# GGally
corr_plot_a = ggcorr(NSSI_LASSO_final,
label = TRUE,
label_alpha = FALSE,
digits = 2,
label_size = 1,
geom = "tile",
palette = "PuOr")
corr_plot_a
corr = round(cor(NSSI_LASSO_final), 1)
p = cor_pmat(NSSI_LASSO_final)
corr_plopt_b = ggcorrplot(corr, hc.order = TRUE, type = "lower",
lab = TRUE,
lab_size = 2,
outline.col = "white",
ggtheme = ggplot2::theme_gray,
colors = c("#6D9EC1", "white", "#E46726"),
legend.title = "Correlation",
title = "Correlation of predictors")
corr_plopt_b
ggsave("correlation.png", corr_plopt_b)
corr_plopt_c = ggcorrplot(corr, p.mat = p, hc.order = TRUE,
type = "lower",
lab = TRUE,
lab_size = 2,
outline.col = "white",
ggtheme = ggplot2::theme_gray,
colors = c("#6D9EC1", "white", "#E46726"),
legend.title = "Correlation",
title = "Correlation of predictors"
) + labs(caption="X = correlation non-significant at p < 0.05") +
scale_y_discrete(labels = c("Social Support","Family Support", "Peer Support", "Support from Important Others", "Pride", "Community Connectedness", "Transgender Congruence", "Sense of Safety", "Alienation", "Lifetime Suicide Frequency", "Past Year Suicide Frequency", "Past Year Suicide Attempt", "Somatization", "Anxiety", "Depression", "BSI-GSI", "Lifetime NSSI Frequency", "Past Year NSSI Frequency", "Felt Stigma", "Enacted Stigma", "Lifetime Suicide Ideation", "Lifetime Suicide Attempt", "Lifetime NSSI", "Past Year NSSI", "12-Month NSSI", "Past Year Suicidal Ideation", "Started surgery", "Had Surgery", "Gender Dysphoria", "Investment in Passing", "Internalized Transphobia")) +
scale_x_discrete(labels = c("Family Support", "Peer Support", "Support from Important Others", "Pride", "Community Connectedness", "Transgender Congruence", "Sense of Safety", "Alienation", "Lifetime Suicide Frequency", "Past Year Suicide Frequency", "Past Year Suicide Attempt", "Somatization", "Anxiety", "Depression", "BSI-GSI", "Lifetime NSSI Frequency", "Past Year NSSI Frequency", "Felt Stigma", "Enacted Stigma", "Lifetime Suicide Ideation", "Lifetime Suicide Attempt", "Lifetime NSSI", "Past Year NSSI", "12-Month NSSI", "Past Year Suicidal Ideation", "Started surgery", "Had Surgery", "Gender Dysphoria", "Investment in Passing", "Internalized Transphobia", "Shame"))
corr_plopt_c
ggsave("correlation_d.png", corr_plopt_c, width = 25, height = 25, units = "cm", dpi = 300)
```