@@ -80,7 +80,7 @@ def splitDataset(dataset, ratio, framesPerChunk, skipAction):
80
80
for i , (start , end ) in enumerate (dataset ):
81
81
trainingIdx = testingIdx = []
82
82
if (end - start ) < 2 * framesPerChunk :
83
- print ('Session %d is too short. Action: %s' % (i , skipAction ))
83
+ # print('Session %d is too short. Action: %s' % (i, skipAction))
84
84
if 'drop' == skipAction : continue
85
85
86
86
rng = np .arange (start , end )
@@ -93,6 +93,9 @@ def splitDataset(dataset, ratio, framesPerChunk, skipAction):
93
93
if 0 < len (trainingIdx ): trainingSet .append (trainingIdx )
94
94
if 0 < len (testingIdx ): testing .append (testingIdx )
95
95
continue
96
+ if (0 == len (trainingSet )) or (0 == len (testing )):
97
+ print ('No training or testing sets was created!' )
98
+ return [], []
96
99
# save training and testing sets
97
100
testing = np .sort (np .concatenate (testing ))
98
101
training = np .sort (np .concatenate (trainingSet ))
@@ -108,6 +111,7 @@ def splitDataset(dataset, ratio, framesPerChunk, skipAction):
108
111
109
112
def dropPadding (idx , padding ):
110
113
res = []
114
+ if len (idx ) < 2 : return res
111
115
# find consecutive frames chunks, save their start and end indices
112
116
gaps = np .where (1 < np .diff (idx ))[0 ]
113
117
gaps = np .concatenate (([0 ], 1 + gaps , [len (idx )]))
@@ -120,19 +124,43 @@ def dropPadding(idx, padding):
120
124
res = [chunk [padding :- padding ] for chunk in res ]
121
125
# remove chunks that are too short
122
126
res = [chunk for chunk in res if padding < len (chunk )]
123
- res = np .concatenate (res )
127
+ res = np .concatenate (res ) if 0 < len ( res ) else []
124
128
print ('Frames before: {}. Frames after: {}' .format (len (idx ), len (res )))
125
129
return res
126
130
127
- def processFolder (folder , timeDelta , testRatio , framesPerChunk , testPadding , skippedFrames , minimumFrames ):
131
+ def processFolder (folder , timeDelta , testRatio , framesPerChunk , testPadding , skippedFrames , minimumFrames , dropZeroDeltas ):
128
132
print ('Processing' , folder )
129
- dataset = loadNpz (folder )
130
- for k , v in dataset .items ():
131
- print (k , v .shape )
133
+ # load all.npz file if it exists
134
+ all_file = os .path .join (folder , 'all.npz' )
135
+ if os .path .exists (all_file ):
136
+ dataset = loadNpz (all_file )
137
+ else :
138
+ dataset = loadNpz (folder )
139
+ np .savez (all_file , ** dataset )
132
140
133
- if len (dataset ['time' ]) < minimumFrames :
141
+ # remove the npz files, except for all.npz
142
+ files = os .listdir (folder )
143
+ for fn in files :
144
+ if fn .endswith ('.npz' ) and not ('all.npz' == fn ):
145
+ os .remove (os .path .join (folder , fn ))
146
+ print ('Removed' , len (files ), 'files' )
147
+
148
+ if dropZeroDeltas : # drop frames with zero time deltas
149
+ deltas = np .diff (dataset ['time' ])
150
+ idx = np .where (0 == deltas )[0 ]
151
+ print ('Dropping {} frames with zero time deltas' .format (len (idx )))
152
+ dataset = {k : np .delete (v , idx ) for k , v in dataset .items ()}
153
+
154
+ N = len (dataset ['time' ])
155
+ # print total deltas statistics
156
+ print ('Dataset: {} frames' .format (N ))
157
+ deltas = np .diff (dataset ['time' ])
158
+ print ('Total time deltas: min={}, max={}, mean={}' .format (np .min (deltas ), np .max (deltas ), np .mean (deltas )))
159
+ deltas = None
160
+
161
+ if N < minimumFrames :
134
162
print ('Dataset is too short. Skipping...' )
135
- return 0 , 0
163
+ return 0 , 0 , True
136
164
# split dataset into sessions
137
165
sessions = Utils .extractSessions (dataset , float (timeDelta ))
138
166
# print sessions and their durations for debugging
@@ -142,14 +170,11 @@ def processFolder(folder, timeDelta, testRatio, framesPerChunk, testPadding, ski
142
170
session_time = dataset ['time' ][idx ]
143
171
delta = np .diff (session_time )
144
172
duration = session_time [- 1 ] - session_time [0 ]
145
- print ('Session {}: {} - {} ({}, {})' .format (i , start , end , end - start , duration ))
146
173
# print also min, max, and mean time deltas
147
- print ('Time deltas in session {}: min={}, max={}, mean={}' .format (i , np .min (delta ), np .max (delta ), np .mean (delta )))
174
+ print ('Session {} - {}: min={}, max={}, mean={}, frames={}, duration={} sec' .format (
175
+ start , end , np .min (delta ), np .max (delta ), np .mean (delta ), len (session_time ), duration
176
+ ))
148
177
continue
149
- # print total deltas statistics
150
- deltas = np .diff (dataset ['time' ])
151
- print ('Total time deltas: min={}, max={}, mean={}' .format (np .min (deltas ), np .max (deltas ), np .mean (deltas )))
152
- deltas = None
153
178
######################################################
154
179
# split each session into training and testing sets
155
180
training , testing = splitDataset (
@@ -158,10 +183,13 @@ def processFolder(folder, timeDelta, testRatio, framesPerChunk, testPadding, ski
158
183
framesPerChunk = int (framesPerChunk ),
159
184
skipAction = skippedFrames ,
160
185
)
161
-
162
186
if 0 < testPadding :
163
187
testing = dropPadding (testing , testPadding )
164
188
189
+ if (0 == len (training )) or (0 == len (testing )):
190
+ print ('No training or testing sets found!' )
191
+ return 0 , 0 , True
192
+
165
193
def saveSubset (filename , idx ):
166
194
print ('%s: %d frames' % (filename , len (idx )))
167
195
subset = {k : v [idx ] for k , v in dataset .items ()}
@@ -170,18 +198,12 @@ def saveSubset(filename, idx):
170
198
assert np .all (diff >= 0 ), 'Time is not monotonically increasing!'
171
199
np .savez (os .path .join (folder , filename ), ** subset )
172
200
return
173
-
174
- # remove the npz files
175
- files = os .listdir (folder )
176
- for fn in files :
177
- os .remove (os .path .join (folder , fn ))
178
- print ('Removed' , len (files ), 'files' )
179
201
# save training and testing sets
180
202
saveSubset ('train.npz' , training )
181
203
saveSubset ('test.npz' , testing )
182
204
183
205
print ('Processing ' , folder , 'done' )
184
- return len (testing ), len (training )
206
+ return len (testing ), len (training ), False
185
207
186
208
def main (args ):
187
209
stats = {
@@ -208,16 +230,19 @@ def main(args):
208
230
if not (sid in stats ['screenId' ]):
209
231
stats ['screenId' ].append (sid )
210
232
path = os .path .join (folder , placeId , userId , screenId )
211
- testFramesN , trainFramesN = processFolder (
233
+ testFramesN , trainFramesN , isSkipped = processFolder (
212
234
path ,
213
235
args .time_delta , args .test_ratio , args .frames_per_chunk ,
214
- args .test_padding , args .skipped_frames
236
+ args .test_padding , args .skipped_frames ,
237
+ minimumFrames = args .minimum_frames ,
238
+ dropZeroDeltas = args .drop_zero_deltas
215
239
)
216
- testFrames += testFramesN
217
- trainFrames += trainFramesN
218
- # store the number of frames per chunk
219
- sid = '%s/%s/%s' % (placeId , userId , screenId )
220
- framesPerChunk [sid ] = testFramesN + trainFramesN
240
+ if not isSkipped :
241
+ testFrames += testFramesN
242
+ trainFrames += trainFramesN
243
+ # store the number of frames per chunk
244
+ sid = '%s/%s/%s' % (placeId , userId , screenId )
245
+ framesPerChunk [sid ] = testFramesN + trainFramesN
221
246
continue
222
247
print ('Total: %d training frames, %d testing frames' % (trainFrames , testFrames ))
223
248
@@ -246,6 +271,7 @@ def main(args):
246
271
help = 'What to do with skipped frames ("train", "test", or "drop")'
247
272
)
248
273
parser .add_argument ('--minimum-frames' , type = int , default = 0 , help = 'Minimum number of frames in a dataset' )
274
+ parser .add_argument ('--drop-zero-deltas' , action = 'store_true' , help = 'Drop frames with zero time deltas' )
249
275
args = parser .parse_args ()
250
276
main (args )
251
277
pass
0 commit comments