save file sizes in scan disk stage
weaverba137 committed Aug 8, 2017 (commit d1482dc, 1 parent: 7903771)
Showing 2 changed files with 121 additions and 111 deletions.
hpsspy/scan.py (20 changes: 11 additions & 9 deletions)
@@ -188,7 +188,7 @@ def process_missing(missing_cache, disk_root, hpss_root, dirmode='2770',
         disk_chdir = dirname(h)
         Lfile = join(get_tmpdir(), basename(h.replace('.tar', '.txt')))
         htar_dir = None
-        Lfile_lines = '\n'.join([basename(f) for f in missing[h]])+'\n'
+        Lfile_lines = '\n'.join([basename(f) for f in missing[h]['files']])+'\n'
         if test:
             logger.debug(Lfile_lines)
         else:
@@ -237,11 +237,11 @@ def process_missing(missing_cache, disk_root, hpss_root, dirmode='2770',
                 makedirs(dirname(h_file), mode=dirmode)
                 created_directories.add(dirname(h_file))
             logger.debug("hsi('put', '%s', ':', '%s')",
-                         join(disk_root, missing[h][0]), h_file)
+                         join(disk_root, missing[h]['files'][0]), h_file)
             if test:
                 out = "Test mode, skipping hsi command."
             else:
-                out = hsi('put', join(disk_root, missing[h][0]), ':', h_file)
+                out = hsi('put', join(disk_root, missing[h]['files'][0]), ':', h_file)
             logger.debug(out)
     chdir(start_directory)
     return
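
The two hunks above track a change in the shape of the missing-files cache that process_missing() consumes: each HPSS archive name now maps to a dict with 'files' and 'size' keys rather than a bare list of disk paths. A minimal sketch of the new layout; the archive and file names here are hypothetical, for illustration only:

    # Hypothetical entry in a missing_files_<release>.json cache after this
    # commit. Before: missing[h] was a plain list of file names.
    # After: missing[h] is a dict carrying the member files and a total size.
    missing = {
        "images/raw_2017.tar": {
            "files": ["images/raw/frame-001.fits",
                      "images/raw/frame-002.fits"],
            "size": 123456789
        }
    }
    # process_missing() now reads the member list via missing[h]['files'],
    # which is what Lfile_lines and the hsi 'put' calls are built from.
    members = missing["images/raw_2017.tar"]["files"]
    print(members)
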
@@ -274,16 +274,18 @@ def scan_disk(disk_roots, disk_files_cache, clobber=False):
     else:
         logger.info("No disk cache file, starting scan.")
         with open(disk_files_cache, 'w') as t:
+            t.write('Name,Size\n')
             try:
                 for disk_root in disk_roots:
                     logger.debug("Starting os.walk at %s.", disk_root)
                     for root, dirs, files in os.walk(disk_root):
                         logger.debug("Scanning disk directory %s.", root)
-                        disk_files = [join(root, f).replace(disk_root+'/',
-                                                            '')+'\n'
-                                      for f in files
-                                      if not islink(join(root, f))]
-                        t.writelines(disk_files)
+                        for f in files:
+                            fullname = join(root, f)
+                            if not islink(fullname):
+                                cachename = fullname.replace(disk_root+'/', '')
+                                size = os.stat(fullname).st_size
+                                t.write("{0},{1:d}\n".format(cachename, size))
             except OSError:
                 logger.error('Exception encountered while creating ' +
                              'disk cache file!')
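
With this hunk the disk cache becomes a two-column CSV (header Name,Size) instead of a bare list of paths, so file sizes are captured once, during the scan, rather than re-stat'ed later. A minimal, self-contained sketch of what the cache might contain and how it reads back; the file names and sizes are made up for illustration:

    import csv
    from io import StringIO

    # Hypothetical cache contents; real caches are written by scan_disk()
    # to disk_files_<release>.csv with exactly this header.
    cache = StringIO("Name,Size\n"
                     "images/raw/frame-001.fits,8640\n"
                     "images/raw/frame-002.fits,8640\n")
    for row in csv.DictReader(cache):
        # Size comes back as a string; consumers convert with int().
        print(row['Name'], int(row['Size']))
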
@@ -421,7 +423,7 @@ def main():
     # Read disk files and cache.
     #
     disk_files_cache = join(options.cache,
-                            'disk_files_{0}.txt'.format(options.release))
+                            'disk_files_{0}.csv'.format(options.release))
     logger.debug("disk_files_cache = '%s'", disk_files_cache)
     disk_roots = [release_root.replace(basename(config['root']), d)
                   for d in config['physical_disks']]
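
A practical side effect of the .txt to .csv rename: a cache left over from an earlier version is simply not found, so the first run after this change performs a fresh scan that records sizes. A sketch of the guard implied by the else branch in the scan_disk hunk above; need_rescan is a hypothetical helper, not a function in hpsspy, and the exact condition is an assumption based on the clobber keyword and the log message:

    from os.path import exists

    def need_rescan(disk_files_cache, clobber):
        # Reuse an existing cache unless clobber is set (assumed logic).
        return clobber or not exists(disk_files_cache)

    # Old disk_files_<release>.txt caches are ignored under the new name.
    print(need_rescan('disk_files_dr1.csv', False))  # 'dr1' is a placeholder
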
hpsspy/validate.py (212 changes: 110 additions & 102 deletions)
@@ -23,6 +23,7 @@ def main():
     import logging
     import re
     import json
+    import csv
     from os import environ, stat
     from os.path import basename, exists, isdir, join, splitext
     from sys import argv
@@ -117,110 +118,117 @@ def main():
     config = json_data['config']
     release_root = join(config['root'], options.release)
     hpss_release_root = join(config['hpss_root'], options.release)
-    if isdir(release_root):
-        #
-        # Read disk files and cache.
-        #
-        disk_files_cache = join(options.cache,
-                                ('disk_files_' +
-                                 '{0}.txt').format(options.release))
-        logger.debug("disk_files_cache = '%s'", disk_files_cache)
-        disk_roots = [release_root.replace(basename(config['root']), d)
-                      for d in config['physical_disks']]
-        status = scan_disk(disk_roots, disk_files_cache,
-                           clobber=options.clobber_disk)
-        if not status:
-            return 1
-        #
-        # Now that we've got the disk cache, assume HPSS is empty, and
-        # make sure all files get backed up.
-        #
-        nfiles = 0
-        nmissing = 0
-        nmultiple = 0
-        mapped_to_hpss = dict()
-        hpss_size = dict()
-        with open(disk_files_cache) as t:
-            for l in t:
-                f = l.strip()
-                if f in hpss_map["exclude"]:
-                    logger.info("%s skipped.", f)
-                else:
-                    section = f.split('/')[0]
-                    try:
-                        s = hpss_map[section]
-                    except KeyError:
-                        #
-                        # If the section is not described, that's not
-                        # good, but continue.
-                        #
-                        logger.error("%s is in a directory not " +
-                                     "described in the configuration!",
-                                     f)
-                        continue
-                    #
-                    # If the section is blank, that's OK.
-                    #
-                    if not s:
-                        logger.info("%s is in a directory not yet " +
-                                    "configured.",
-                                    f)
-                        continue
-                    #
-                    # Now check if it is mapped.
-                    #
-                    mapped = 0
-                    f_size = stat(join(release_root, f)).st_size
-                    for r in s:
-                        m = r[0].match(f)
-                        if m is not None:
-                            reName = r[0].sub(r[1], f)
-                            if reName in mapped_to_hpss:
-                                mapped_to_hpss[reName].append(f)
-                                hpss_size[reName] += f_size
-                            else:
-                                mapped_to_hpss[reName] = [f]
-                                hpss_size[reName] = f_size
-                            mapped += 1
-                            logger.debug("%s in %s.", f, reName)
-                    if mapped == 0:
-                        logger.error("%s is not mapped to any file on " +
-                                     "HPSS!", f)
-                        nmissing += 1
-                    if mapped > 1:
-                        logger.error("%s is mapped to multiple files on " +
-                                     "HPSS!", f)
-                        nmultiple += 1
-                nfiles += 1
-                if (nfiles % options.report) == 0:
-                    logger.info("%9d files scanned.", nfiles)
-        missing_files_cache = join(options.cache,
-                                   ('missing_files_{0}' +
-                                    '.json').format(options.release))
-        logger.debug("missing_files_cache = '%s'", missing_files_cache)
-        with open(missing_files_cache, 'w') as fp:
-            json.dump(mapped_to_hpss, fp, indent=2, separators=(',', ': '))
-        if nmissing > 0:
-            logger.critical("Not all files would be backed up with " +
-                            "this configuration!")
-            return 1
-        if nmultiple > 0:
-            logger.critical("Some files would be backed up more than " +
-                            "once with this configuration!")
-            return 1
-        for k in hpss_size:
-            logger.info('%s is %d bytes.', k, hpss_size[k])
-            if hpss_size[k]/1024/1024/1024 > options.limit:
-                logger.critical("HPSS file %s would be too large!", k)
-                return 1
-        #
-        # All files map to a file on HPSS, so print out the commands
-        # that would do a full backup.
-        #
-        logger.setLevel(logging.DEBUG)
-        process_missing(missing_files_cache, release_root,
-                        hpss_release_root, test=True)
-    else:
-        logger.critical("%s does not exist!", release_root)
-        return 1
+    if not isdir(release_root):
+        logger.critical("%s does not exist!", release_root)
+        return 1
+    #
+    # Read disk files and cache.
+    #
+    disk_files_cache = join(options.cache,
+                            ('disk_files_' +
+                             '{0}.csv').format(options.release))
+    logger.debug("disk_files_cache = '%s'", disk_files_cache)
+    disk_roots = [release_root.replace(basename(config['root']), d)
+                  for d in config['physical_disks']]
+    status = scan_disk(disk_roots, disk_files_cache,
+                       clobber=options.clobber_disk)
+    if not status:
+        return 1
+    #
+    # Now that we've got the disk cache, assume HPSS is empty, and
+    # make sure all files get backed up.
+    #
+    nfiles = 0
+    nmissing = 0
+    nmultiple = 0
+    mapped_to_hpss = dict()
+    pattern_used = dict()
+    with open(disk_files_cache) as t:
+        reader = csv.DictReader(t)
+        for row in reader:
+            f = row['Name']
+            if f in hpss_map["exclude"]:
+                logger.info("%s skipped.", f)
+            else:
+                section = f.split('/')[0]
+                try:
+                    s = hpss_map[section]
+                except KeyError:
+                    #
+                    # If the section is not described, that's not
+                    # good, but continue.
+                    #
+                    logger.error("%s is in a directory not " +
+                                 "described in the configuration!",
+                                 f)
+                    continue
+                #
+                # If the section is blank, that's OK.
+                #
+                if not s:
+                    logger.info("%s is in a directory not yet " +
+                                "configured.",
+                                f)
+                    continue
+                #
+                # Now check if it is mapped.
+                #
+                mapped = 0
+                for r in s:
+                    if r[0].pattern not in pattern_used:
+                        pattern_used[r[0].pattern] = 0
+                    m = r[0].match(f)
+                    if m is not None:
+                        pattern_used[r[0].pattern] += 1
+                        reName = r[0].sub(r[1], f)
+                        if reName in mapped_to_hpss:
+                            mapped_to_hpss[reName]['files'].append(f)
+                            mapped_to_hpss[reName]['size'] += int(row['Size'])
+                        else:
+                            mapped_to_hpss[reName] = {'files': [f],
+                                                      'size': int(row['Size'])}
+                        mapped += 1
+                        logger.debug("%s in %s.", f, reName)
+                if mapped == 0:
+                    logger.error("%s is not mapped to any file on " +
+                                 "HPSS!", f)
+                    nmissing += 1
+                if mapped > 1:
+                    logger.error("%s is mapped to multiple files on " +
+                                 "HPSS!", f)
+                    nmultiple += 1
+            nfiles += 1
+            if (nfiles % options.report) == 0:
+                logger.info("%9d files scanned.", nfiles)
+    missing_files_cache = join(options.cache,
+                               ('missing_files_{0}' +
+                                '.json').format(options.release))
+    logger.debug("missing_files_cache = '%s'", missing_files_cache)
+    with open(missing_files_cache, 'w') as fp:
+        json.dump(mapped_to_hpss, fp, indent=2, separators=(',', ': '))
+    if nmissing > 0:
+        logger.critical("Not all files would be backed up with " +
+                        "this configuration!")
+        return 1
+    if nmultiple > 0:
+        logger.critical("Some files would be backed up more than " +
+                        "once with this configuration!")
+        return 1
+    for p in pattern_used:
+        if pattern_used[p] == 0:
+            logger.critical("Pattern '%s' was never used!", p)
+            return 1
+    for k in mapped_to_hpss:
+        logger.info('%s is %d bytes.', k, mapped_to_hpss[k]['size'])
+        if mapped_to_hpss[k]['size']/1024/1024/1024 > options.limit:
+            logger.critical("HPSS file %s would be too large!", k)
+            return 1
+    #
+    # All files map to a file on HPSS, so print out the commands
+    # that would do a full backup.
+    #
+    logger.setLevel(logging.DEBUG)
+    process_missing(missing_files_cache, release_root,
+                    hpss_release_root, test=True)
     return 0
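
Two of the new checks above are easy to see with toy values: every compiled pattern in the configuration must match at least one disk file, and each projected HPSS archive must stay under options.limit gibibytes. A minimal, self-contained sketch of both checks; the patterns, file names, sizes, and limit are hypothetical, chosen only to mirror the r[0]/r[1] and pattern_used bookkeeping in the diff:

    import re

    # Hypothetical mapping section: (compiled regex, replacement) pairs.
    section = [(re.compile(r'images/raw/.*\.fits'), 'images/raw_2017.tar'),
               (re.compile(r'images/calib/.*\.fits'), 'images/calib_2017.tar')]
    pattern_used = {r.pattern: 0 for r, _ in section}
    for f in ['images/raw/frame-001.fits', 'images/raw/frame-002.fits']:
        for r, _ in section:
            if r.match(f) is not None:
                pattern_used[r.pattern] += 1
    # The calib pattern never matched, so validation would fail here.
    unused = [p for p, n in pattern_used.items() if n == 0]
    print(unused)

    # Size check: sizes are summed in bytes, the limit is expressed in GiB.
    limit = 200                # hypothetical options.limit
    size = 256 * 1024**3       # a 256 GiB archive
    print(size/1024/1024/1024 > limit)  # True: this archive would be rejected
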
