-
Notifications
You must be signed in to change notification settings - Fork 3
/
training_sample_FVs.py
88 lines (78 loc) · 4.56 KB
/
training_sample_FVs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import sys,os,random
'''usage eg:
python2 training_sample_FVs.py spNeut.fvec spHard_ spSoft_ spPartialHard_ spPartialSoft_ 5 0,1,2,3,4,6,7,8,9,10 ./ neut.fvec,hard.fvec,linkedHard.fvec,soft.fvec,linkedSoft.fvec,partialHard.fvec,linkedPartialHard.fvec,partialSoft.fvec,linkedPartialSoft.fvec
'''
# Parse the nine positional command-line arguments; any other argument count prints the usage and exits.
if len(sys.argv)!=10:
    sys.exit("usage:\npython2 training_sample_FVs.py neutTrainingFileName hardTrainingFilesPrefix softTrainingFilesPrefix partialHardTrainingFilesPrefix partialSoftTrainingFilesPrefix sweepTrainingWindow linkedTrainingWindows sampledFVsDir sampledFVsFiles\n")
else:
    neutTrainingFileName, hardTrainingFilesPrefix, softTrainingFilesPrefix, partialHardTrainingFilesPrefix, partialSoftTrainingFilesPrefix, sweepTrainingWindow, linkedTrainingWindows, sampledFVsDir, sampledFVsFiles = sys.argv[1:]

# The requested window numbers do not change per file, so convert them once up front.
sweepWinNum=int(sweepTrainingWindow)
linkedWinNums=set(int(x) for x in linkedTrainingWindows.split(","))

# For every training-file prefix, collect the files whose window number equals sweepTrainingWindow
# (sweepFilePaths) and those whose window number is listed in linkedTrainingWindows (linkedFilePaths).
# Both dictionaries are keyed by the prefix exactly as given on the command line.
sweepFilePaths,linkedFilePaths = {},{}
for trainingFilePrefix in [hardTrainingFilesPrefix,softTrainingFilesPrefix,partialHardTrainingFilesPrefix,partialSoftTrainingFilesPrefix]:
    trainingSetDir,trainingFilePrefixDirless=os.path.split(trainingFilePrefix)
    sweepFilePaths[trainingFilePrefix]=[]
    linkedFilePaths[trainingFilePrefix]=[]
    # A prefix without a directory part (as in the usage example) means the current directory;
    # os.listdir("") would raise instead of listing it.
    for fileName in os.listdir(trainingSetDir or "."):
        if not fileName.startswith(trainingFilePrefixDirless):
            continue
        # The window number is whatever follows the prefix, up to the next "_" or ".". Parsing
        # relative to the prefix (rather than splitting the whole name on "_") keeps this working
        # when the prefix itself contains underscores.
        winStr=fileName[len(trainingFilePrefixDirless):].lstrip("_").split("_")[0].split(".")[0]
        try:
            winNum=int(winStr)
        except ValueError:
            # Shares the prefix but carries no integer window number: not a training file.
            continue
        filePath=os.path.join(trainingSetDir,fileName)
        if winNum==sweepWinNum:
            sweepFilePaths[trainingFilePrefix].append(filePath)
        elif winNum in linkedWinNums:
            linkedFilePaths[trainingFilePrefix].append(filePath)
def getExamplesFromFVFile(simFileName):
    '''Read one feature-vector file and return (header, examples).

    The first line of the file is returned as the header and each following line as one
    example; every line is stripped of surrounding whitespace.

    A file that cannot be opened or read, or that contains no lines at all, yields ("", [])
    so that callers can treat it as contributing no examples.
    '''
    try:
        # The context manager closes the handle even if reading fails part-way through.
        with open(simFileName) as simFile:
            lines=[line.strip() for line in simFile]
    except (IOError, OSError):
        # Only I/O failures (missing/unreadable file) are treated as "no examples";
        # programming errors are no longer silently swallowed.
        return "",[]
    if not lines:
        return "",[]
    return lines[0],lines[1:]
def getExamplesFromFVFileLs(simFileLs):
    '''Pool the examples of several feature-vector files.

    Each path in simFileLs is read with getExamplesFromFVFile. The examples of all files are
    concatenated in the order the paths are given, and the header of the last file that had a
    non-empty header is returned alongside them ("" if no file had one).
    '''
    lastHeader=""
    pooledExamples=[]
    for path in simFileLs:
        fileHeader,fileExamples=getExamplesFromFVFile(path)
        # Keep the previous header whenever this file did not provide one.
        lastHeader=fileHeader or lastHeader
        pooledExamples.extend(fileExamples)
    return lastHeader,pooledExamples
def _loadSweepAndLinked(prefix):
    '''Return (sweepHeader, sweepExamples, linkedHeader, linkedExamples) for one training-file prefix.'''
    sweepHeader,sweepExamples=getExamplesFromFVFileLs(sweepFilePaths[prefix])
    linkedHeader,linkedExamples=getExamplesFromFVFileLs(linkedFilePaths[prefix])
    return sweepHeader,sweepExamples,linkedHeader,linkedExamples

# Neutral examples come from a single file; every other class is pooled from the sweep-window
# and linked-window files collected for its prefix.
header,neutExamples=getExamplesFromFVFile(neutTrainingFileName)
hardHeader,hardExamples,linkedHardHeader,linkedHardExamples=_loadSweepAndLinked(hardTrainingFilesPrefix)
softHeader,softExamples,linkedSoftHeader,linkedSoftExamples=_loadSweepAndLinked(softTrainingFilesPrefix)
partialHardHeader,partialHardExamples,linkedPartialHardHeader,linkedPartialHardExamples=_loadSweepAndLinked(partialHardTrainingFilesPrefix)
partialSoftHeader,partialSoftExamples,linkedPartialSoftHeader,linkedPartialSoftExamples=_loadSweepAndLinked(partialSoftTrainingFilesPrefix)
def getMinButNonZeroExamples(lsLs):
    '''Return the length of the shortest non-empty list in lsLs.

    Empty lists are ignored, so the result is always at least 1.

    Raises ValueError (a subclass of Exception) when lsLs contains no non-empty list,
    since there is then no meaningful minimum.
    '''
    counts=[len(ls) for ls in lsLs if len(ls)>0]
    if not counts:
        raise ValueError("cannot determine the number of examples to keep: every example list is empty")
    return min(counts)
# Down-sample every sweep and linked class to the size of the smallest non-empty one: shuffle each
# list in place, then keep its first numExamplesToKeep entries. Empty classes stay empty.
# NOTE(review): neutExamples is not part of trainingSetLs, so the neutral class is written out
# in full and in file order -- confirm this is intended.
trainingSetLs=[hardExamples,linkedHardExamples,softExamples,linkedSoftExamples,partialHardExamples,linkedPartialHardExamples,partialSoftExamples,linkedPartialSoftExamples]
numExamplesToKeep=getMinButNonZeroExamples(trainingSetLs)
for i,classExamples in enumerate(trainingSetLs):
    random.shuffle(classExamples)
    trainingSetLs[i]=classExamples[:numExamplesToKeep]
hardExamples,linkedHardExamples,softExamples,linkedSoftExamples,partialHardExamples,linkedPartialHardExamples,partialSoftExamples,linkedPartialSoftExamples=trainingSetLs

# "none", "false" and "default" (case-insensitive) select the built-in output directory / file names.
if sampledFVsDir.lower() in ["none","false","default"]:
    sampledFVsDir='./'
if sampledFVsFiles.lower() in ["none","false","default"]:
    sampledFVsFiles=["neut.fvec","hard.fvec","linkedHard.fvec","soft.fvec","linkedSoft.fvec","partialHard.fvec","linkedPartialHard.fvec","partialSoft.fvec","linkedPartialSoft.fvec"]
else:
    sampledFVsFiles=sampledFVsFiles.split(",")
    # Validate user input explicitly: an assert would be stripped under "python -O".
    if len(sampledFVsFiles)!=9:
        sys.exit("sampledFVsFiles must list exactly 9 comma-separated file names (got %d)\n" %(len(sampledFVsFiles)))

# Use the hard-sweep header when there is one, otherwise fall back to the first non-empty header
# of any other class, so the output still gets a proper header line when the hard class is empty.
# Assumes all input files share the same feature columns -- TODO confirm.
outHeader=""
for candidateHeader in [hardHeader,linkedHardHeader,softHeader,linkedSoftHeader,partialHardHeader,linkedPartialHardHeader,partialSoftHeader,linkedPartialSoftHeader,header]:
    if candidateHeader:
        outHeader=candidateHeader
        break

# Write one file per non-empty class; each row is prefixed with a class label derived from the
# output file name with ".fvec" removed.
outExamples=[neutExamples,hardExamples,linkedHardExamples,softExamples,linkedSoftExamples,partialHardExamples,linkedPartialHardExamples,partialSoftExamples,linkedPartialSoftExamples]
for outFileName,classExamples in zip(sampledFVsFiles,outExamples):
    if not classExamples:
        continue
    classLabel=outFileName.replace(".fvec","")
    # The context manager guarantees the file is flushed and closed even if a write fails.
    with open(os.path.join(sampledFVsDir,outFileName),"w") as outFile:
        outFile.write("classLabel\t%s\n" %(outHeader))
        for example in classExamples:
            outFile.write("%s\t%s\n" %(classLabel,example))