-
Notifications
You must be signed in to change notification settings - Fork 2
/
LogP-Physchem Prediction Workflow.R
100 lines (60 loc) · 2.76 KB
/
LogP-Physchem Prediction Workflow.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#Author: Dan Zang, PhD
#Affiliation: Integrated Laboratory Systems, Inc.
#Contact: dzang@ils-inc.com
#Title: LogP.R
#Purpose: this code takes an input of PaDEL descriptors and uses the SVM model from the e1071 package to predict the octanol-water partition coefficient (LogP)
#Notes: a genetic algorithm was used to determine the most relevant molecular fingerprints from the PaDEL descriptors.
# Define output directory (also the location of the training data file)
outDir <- "C:/PropertyPrediction/LogP"

# Load LogP data: tab-separated with a header row; as.is = TRUE keeps
# character columns as character instead of converting them to factors.
LogPdata <- read.table(
  file.path(outDir, "LogP-FP600-MW.txt"),
  header = TRUE,
  sep = "\t",
  as.is = TRUE
)
dim(LogPdata) # 14193 606

# Row-indexing a data frame past its last row yields all-NA rows rather than
# an error, so fail early if the table is shorter than the split below needs.
if (nrow(LogPdata) < 14193) {
  stop(
    "Expected at least 14193 rows in LogP-FP600-MW.txt, found ",
    nrow(LogPdata),
    call. = FALSE
  )
}

# Columns 1:602 are kept for modeling: 600 fingerprint bits + MW + LogP
# NOTE(review): the original note here read "There are 11371 training samples
# and 2837 test samples. Total: 14208", which does not match the slices below
# (11360 + 2833 = 14193) -- confirm which figures are intended.
LogPdataTraining <- LogPdata[1:11360, 1:602]
LogPdataTest <- LogPdata[11361:14193, 1:602]
dim(LogPdataTraining) # 11360 602
dim(LogPdataTest) # 2833 602
# Load support vector machine (SVM) package
library(e1071)
# Build the SVM model with svm(): LogP is the response and every other column
# of the training table is a predictor. Kernel and SVM type are not set here,
# so svm()'s defaults apply.
LogPmodel <- svm(
  LogP ~ .,
  data = LogPdataTraining,
  cost = 150,
  epsilon = 0.025,
  gamma = 0.00014
)

# Correlation between experimental values and fitted values for the training
# set. The component is spelled out in full ("fitted") rather than relying on
# partial matching of `$`.
LogPTrainingCorr <- lm(LogPmodel$fitted ~ LogPdataTraining$LogP)
summary(LogPTrainingCorr)

# Predict LogP for the test set
LogPpred <- predict(LogPmodel, LogPdataTest)

# Correlation between experimental values and predicted values for test set
LogPTestCorr <- lm(LogPpred ~ LogPdataTest$LogP)
summary(LogPTestCorr)
# Persist the trained model and read it back from the same file.
# A full path is used so the session's working directory is left untouched.
modelFile <- file.path("C:/PropertyPrediction", "LogPmodel.rda")
save(LogPmodel, file = modelFile)
# Reloading restores the object named LogPmodel from disk; presumably this is
# the entry point when training above is skipped -- confirm.
load(modelFile)
# Names of the first 601 columns of the training table; these are the
# descriptor columns later selected from each input file before prediction
# (presumably the 600 fingerprint bits plus MW -- confirm column order).
LogPfingerprints <- colnames(LogPdata[, seq_len(601)])
# Input descriptor files: DSSTox-QSAR1-FP-MW.txt ... DSSTox-QSAR15-FP-MW.txt,
# in numeric order.
filenames <- sprintf("DSSTox-QSAR%d-FP-MW.txt", seq_len(15))
# Define input directory holding the descriptor files to be scored
inDir <- "C:/PropertyPrediction/Descriptors"

# For each descriptor file: read it, keep the columns named in
# LogPfingerprints, predict LogP with the trained model, and write the
# predictions to logPoutput<i>.txt in outDir (i = position in filenames).
# seq_along() keeps the loop from running when filenames is empty.
for (i in seq_along(filenames)) {
  data <- read.table(
    file.path(inDir, filenames[i]),
    header = TRUE,
    sep = "\t",
    as.is = TRUE
  )

  # Report exactly which descriptor columns are absent instead of the generic
  # "undefined columns selected" error from the subsetting below.
  missingCols <- setdiff(LogPfingerprints, names(data))
  if (length(missingCols) > 0) {
    stop(
      "Input file ", filenames[i], " is missing required column(s): ",
      paste(missingCols, collapse = ", "),
      call. = FALSE
    )
  }

  # select appropriate fingerprints
  modelData <- data[, LogPfingerprints]

  # run model (note: this reuses the name LogPpred from the test-set step)
  LogPpred <- predict(LogPmodel, modelData)

  # save output
  write.table(LogPpred, file.path(outDir, paste0("logPoutput", i, ".txt")))
}