Skip to content

Commit 159c4fb

Browse files
Filter frames by time delta
1 parent 0446554 commit 159c4fb

File tree

1 file changed

+55
-29
lines changed

1 file changed

+55
-29
lines changed

scripts/preprocess-remote.py

Lines changed: 55 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ def splitDataset(dataset, ratio, framesPerChunk, skipAction):
8080
for i, (start, end) in enumerate(dataset):
8181
trainingIdx = testingIdx = []
8282
if (end - start) < 2 * framesPerChunk:
83-
print('Session %d is too short. Action: %s' % (i, skipAction))
83+
# print('Session %d is too short. Action: %s' % (i, skipAction))
8484
if 'drop' == skipAction: continue
8585

8686
rng = np.arange(start, end)
@@ -93,6 +93,9 @@ def splitDataset(dataset, ratio, framesPerChunk, skipAction):
9393
if 0 < len(trainingIdx): trainingSet.append(trainingIdx)
9494
if 0 < len(testingIdx): testing.append(testingIdx)
9595
continue
96+
if (0 == len(trainingSet)) or (0 == len(testing)):
97+
print('No training or testing sets was created!')
98+
return [], []
9699
# save training and testing sets
97100
testing = np.sort(np.concatenate(testing))
98101
training = np.sort(np.concatenate(trainingSet))
@@ -108,6 +111,7 @@ def splitDataset(dataset, ratio, framesPerChunk, skipAction):
108111

109112
def dropPadding(idx, padding):
110113
res = []
114+
if len(idx) < 2: return res
111115
# find consecutive frames chunks, save their start and end indices
112116
gaps = np.where(1 < np.diff(idx))[0]
113117
gaps = np.concatenate(([0], 1 + gaps, [len(idx)]))
@@ -120,19 +124,43 @@ def dropPadding(idx, padding):
120124
res = [chunk[padding:-padding] for chunk in res]
121125
# remove chunks that are too short
122126
res = [chunk for chunk in res if padding < len(chunk)]
123-
res = np.concatenate(res)
127+
res = np.concatenate(res) if 0 < len(res) else []
124128
print('Frames before: {}. Frames after: {}'.format(len(idx), len(res)))
125129
return res
126130

127-
def processFolder(folder, timeDelta, testRatio, framesPerChunk, testPadding, skippedFrames, minimumFrames):
131+
def processFolder(folder, timeDelta, testRatio, framesPerChunk, testPadding, skippedFrames, minimumFrames, dropZeroDeltas):
128132
print('Processing', folder)
129-
dataset = loadNpz(folder)
130-
for k, v in dataset.items():
131-
print(k, v.shape)
133+
# load all.npz file if it exists
134+
all_file = os.path.join(folder, 'all.npz')
135+
if os.path.exists(all_file):
136+
dataset = loadNpz(all_file)
137+
else:
138+
dataset = loadNpz(folder)
139+
np.savez(all_file, **dataset)
132140

133-
if len(dataset['time']) < minimumFrames:
141+
# remove the npz files, except for all.npz
142+
files = os.listdir(folder)
143+
for fn in files:
144+
if fn.endswith('.npz') and not ('all.npz' == fn):
145+
os.remove(os.path.join(folder, fn))
146+
print('Removed', len(files), 'files')
147+
148+
if dropZeroDeltas: # drop frames with zero time deltas
149+
deltas = np.diff(dataset['time'])
150+
idx = np.where(0 == deltas)[0]
151+
print('Dropping {} frames with zero time deltas'.format(len(idx)))
152+
dataset = {k: np.delete(v, idx) for k, v in dataset.items()}
153+
154+
N = len(dataset['time'])
155+
# print total deltas statistics
156+
print('Dataset: {} frames'.format(N))
157+
deltas = np.diff(dataset['time'])
158+
print('Total time deltas: min={}, max={}, mean={}'.format(np.min(deltas), np.max(deltas), np.mean(deltas)))
159+
deltas = None
160+
161+
if N < minimumFrames:
134162
print('Dataset is too short. Skipping...')
135-
return 0, 0
163+
return 0, 0, True
136164
# split dataset into sessions
137165
sessions = Utils.extractSessions(dataset, float(timeDelta))
138166
# print sessions and their durations for debugging
@@ -142,14 +170,11 @@ def processFolder(folder, timeDelta, testRatio, framesPerChunk, testPadding, ski
142170
session_time = dataset['time'][idx]
143171
delta = np.diff(session_time)
144172
duration = session_time[-1] - session_time[0]
145-
print('Session {}: {} - {} ({}, {})'.format(i, start, end, end - start, duration))
146173
# print also min, max, and mean time deltas
147-
print('Time deltas in session {}: min={}, max={}, mean={}'.format(i, np.min(delta), np.max(delta), np.mean(delta)))
174+
print('Session {} - {}: min={}, max={}, mean={}, frames={}, duration={} sec'.format(
175+
start, end, np.min(delta), np.max(delta), np.mean(delta), len(session_time), duration
176+
))
148177
continue
149-
# print total deltas statistics
150-
deltas = np.diff(dataset['time'])
151-
print('Total time deltas: min={}, max={}, mean={}'.format(np.min(deltas), np.max(deltas), np.mean(deltas)))
152-
deltas = None
153178
######################################################
154179
# split each session into training and testing sets
155180
training, testing = splitDataset(
@@ -158,10 +183,13 @@ def processFolder(folder, timeDelta, testRatio, framesPerChunk, testPadding, ski
158183
framesPerChunk=int(framesPerChunk),
159184
skipAction=skippedFrames,
160185
)
161-
162186
if 0 < testPadding:
163187
testing = dropPadding(testing, testPadding)
164188

189+
if (0 == len(training)) or (0 == len(testing)):
190+
print('No training or testing sets found!')
191+
return 0, 0, True
192+
165193
def saveSubset(filename, idx):
166194
print('%s: %d frames' % (filename, len(idx)))
167195
subset = {k: v[idx] for k, v in dataset.items()}
@@ -170,18 +198,12 @@ def saveSubset(filename, idx):
170198
assert np.all(diff >= 0), 'Time is not monotonically increasing!'
171199
np.savez(os.path.join(folder, filename), **subset)
172200
return
173-
174-
# remove the npz files
175-
files = os.listdir(folder)
176-
for fn in files:
177-
os.remove(os.path.join(folder, fn))
178-
print('Removed', len(files), 'files')
179201
# save training and testing sets
180202
saveSubset('train.npz', training)
181203
saveSubset('test.npz', testing)
182204

183205
print('Processing ', folder, 'done')
184-
return len(testing), len(training)
206+
return len(testing), len(training), False
185207

186208
def main(args):
187209
stats = {
@@ -208,16 +230,19 @@ def main(args):
208230
if not (sid in stats['screenId']):
209231
stats['screenId'].append(sid)
210232
path = os.path.join(folder, placeId, userId, screenId)
211-
testFramesN, trainFramesN = processFolder(
233+
testFramesN, trainFramesN, isSkipped = processFolder(
212234
path,
213235
args.time_delta, args.test_ratio, args.frames_per_chunk,
214-
args.test_padding, args.skipped_frames
236+
args.test_padding, args.skipped_frames,
237+
minimumFrames=args.minimum_frames,
238+
dropZeroDeltas=args.drop_zero_deltas
215239
)
216-
testFrames += testFramesN
217-
trainFrames += trainFramesN
218-
# store the number of frames per chunk
219-
sid = '%s/%s/%s' % (placeId, userId, screenId)
220-
framesPerChunk[sid] = testFramesN + trainFramesN
240+
if not isSkipped:
241+
testFrames += testFramesN
242+
trainFrames += trainFramesN
243+
# store the number of frames per chunk
244+
sid = '%s/%s/%s' % (placeId, userId, screenId)
245+
framesPerChunk[sid] = testFramesN + trainFramesN
221246
continue
222247
print('Total: %d training frames, %d testing frames' % (trainFrames, testFrames))
223248

@@ -246,6 +271,7 @@ def main(args):
246271
help='What to do with skipped frames ("train", "test", or "drop")'
247272
)
248273
parser.add_argument('--minimum-frames', type=int, default=0, help='Minimum number of frames in a dataset')
274+
parser.add_argument('--drop-zero-deltas', action='store_true', help='Drop frames with zero time deltas')
249275
args = parser.parse_args()
250276
main(args)
251277
pass

0 commit comments

Comments
 (0)