Skip to content

Commit e615453

Browse files
collect some statistics
1 parent 5620d06 commit e615453

File tree

1 file changed

+31
-3
lines changed

1 file changed

+31
-3
lines changed

scripts/preprocess-remote.py

Lines changed: 31 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -130,6 +130,10 @@ def dropPadding(idx, padding):
130130

131131
def processFolder(folder, timeDelta, testRatio, framesPerChunk, testPadding, skippedFrames, minimumFrames, dropZeroDeltas):
132132
print('Processing', folder)
133+
stats = {
134+
'deltas': [],
135+
'durations': [],
136+
}
133137
# load all.npz file if it exists
134138
all_file = os.path.join(folder, 'all.npz')
135139
if os.path.exists(all_file):
@@ -160,7 +164,7 @@ def processFolder(folder, timeDelta, testRatio, framesPerChunk, testPadding, ski
160164

161165
if N < minimumFrames:
162166
print('Dataset is too short. Skipping...')
163-
return 0, 0, True
167+
return 0, 0, True, None
164168
# split dataset into sessions
165169
sessions = Utils.extractSessions(dataset, float(timeDelta))
166170
# print sessions and their durations for debugging
@@ -174,6 +178,8 @@ def processFolder(folder, timeDelta, testRatio, framesPerChunk, testPadding, ski
174178
print('Session {} - {}: min={}, max={}, mean={}, frames={}, duration={} sec'.format(
175179
start, end, np.min(delta), np.max(delta), np.mean(delta), len(session_time), duration
176180
))
181+
stats['deltas'].append(delta)
182+
stats['durations'].append(duration)
177183
continue
178184
######################################################
179185
# split each session into training and testing sets
@@ -205,7 +211,7 @@ def saveSubset(filename, idx):
205211
print(', '.join(['%s: %s' % (k, v.shape) for k, v in dataset.items()]))
206212

207213
print('Processing ', folder, 'done')
208-
return len(testing), len(training), False
214+
return len(testing), len(training), False, stats
209215

210216
def main(args):
211217
stats = {
@@ -219,6 +225,10 @@ def main(args):
219225
folder = args.folder
220226
foldersList = lambda x: [nm for nm in os.listdir(x) if os.path.isdir(os.path.join(x, nm))]
221227
subfolders = foldersList(folder)
228+
globalStats = {
229+
'deltas': [],
230+
'durations': [],
231+
}
222232
for placeId in subfolders:
223233
if not (placeId in stats['placeId']):
224234
stats['placeId'].append(placeId)
@@ -232,7 +242,7 @@ def main(args):
232242
if not (sid in stats['screenId']):
233243
stats['screenId'].append(sid)
234244
path = os.path.join(folder, placeId, userId, screenId)
235-
testFramesN, trainFramesN, isSkipped = processFolder(
245+
testFramesN, trainFramesN, isSkipped, new_stats = processFolder(
236246
path,
237247
args.time_delta, args.test_ratio, args.frames_per_chunk,
238248
args.test_padding, args.skipped_frames,
@@ -245,6 +255,8 @@ def main(args):
245255
# store the number of frames per chunk
246256
sid = '%s/%s/%s' % (placeId, userId, screenId)
247257
framesPerChunk[sid] = testFramesN + trainFramesN
258+
for k, v in new_stats.items():
259+
globalStats[k].extend(v)
248260
continue
249261
print('Total: %d training frames, %d testing frames' % (trainFrames, testFrames))
250262

@@ -255,6 +267,22 @@ def main(args):
255267
print('-' * 80)
256268
for k, v in framesPerChunk.items():
257269
print('%s: %d frames' % (k, v))
270+
###########################################
271+
def plot_histogram(data, title, filename):
    """Render a 100-bin histogram of *data* and save it to *filename*.

    Args:
        data: Values to histogram; passed straight to matplotlib's ``hist``.
        title: Text placed above the plot.
        filename: Destination path handed to ``savefig``; the image format
            is chosen by matplotlib from the file extension.
    """
    # Imported lazily so matplotlib is only required when plots are produced.
    import matplotlib.pyplot as plt

    # Draw on an explicit figure instead of pyplot's implicit "current"
    # figure, so that exactly this figure is the one that gets closed.
    fig, ax = plt.subplots()
    try:
        ax.hist(data, bins=100)
        ax.set_title(title)
        # grid() without arguments *toggles* the grid; request it explicitly.
        ax.grid(True)
        fig.savefig(filename)
    finally:
        # The original called plt.close() and then plt.clf(); clf() with no
        # open figure silently creates a fresh empty one that was never
        # closed. Closing the explicit figure releases it without that leak.
        plt.close(fig)
280+
281+
for k, v in globalStats.items():
282+
if 0 == len(v): continue
283+
v = np.concatenate(v)
284+
plot_histogram(v, 'Histogram of %s' % k, os.path.join(folder, '%s.png' % k))
285+
continue
258286
return
259287

260288
if __name__ == '__main__':

0 commit comments

Comments
 (0)