4 changes: 2 additions & 2 deletions README.md
@@ -164,8 +164,8 @@ trainer = Trainer(max_nb_epochs=1, train_percent_check=0.1)
trainer.fit(model)

# view tensorboard logs
print('View tensorboard logs by running\ntensorboard --logdir %s' % os.getcwd())
print('and going to http://localhost:6006 on your browser')
logging.info(f'View tensorboard logs by running\ntensorboard --logdir {os.getcwd()}')
logging.info('and going to http://localhost:6006 on your browser')
```

When you're all done you can even run the test set separately.
2 changes: 1 addition & 1 deletion docs/examples/Examples.md
@@ -119,7 +119,7 @@ def optimize_on_cluster(hyperparams):
job_display_name = job_display_name[0:3]

# run hopt
print('submitting jobs...')
logging.info('submitting jobs...')
cluster.optimize_parallel_cluster_gpu(
main,
nb_trials=hyperparams.nb_hopt_trials,
7 changes: 4 additions & 3 deletions pl_examples/basic_examples/lightning_module_template.py
@@ -2,6 +2,7 @@
Example template for defining a system
"""
import os
import logging
from argparse import ArgumentParser
from collections import OrderedDict

@@ -214,17 +215,17 @@ def __dataloader(self, train):

@pl.data_loader
def train_dataloader(self):
print('training data loader called')
logging.info('training data loader called')
return self.__dataloader(train=True)

@pl.data_loader
def val_dataloader(self):
print('val data loader called')
logging.info('val data loader called')
return self.__dataloader(train=False)

@pl.data_loader
def test_dataloader(self):
print('test data loader called')
logging.info('test data loader called')
return self.__dataloader(train=False)

@staticmethod
40 changes: 22 additions & 18 deletions pytorch_lightning/callbacks/pt_callbacks.py
@@ -1,6 +1,7 @@
import os
import shutil

import logging
import warnings
import numpy as np

from pytorch_lightning.pt_overrides.override_data_parallel import LightningDistributedDataParallel
Expand Down Expand Up @@ -91,7 +92,7 @@ def __init__(self, monitor='val_loss',
self.stopped_epoch = 0

if mode not in ['auto', 'min', 'max']:
print('EarlyStopping mode %s is unknown, fallback to auto mode.' % mode)
logging.info(f'EarlyStopping mode {mode} is unknown, fallback to auto mode.')
mode = 'auto'

if mode == 'min':
@@ -121,9 +122,10 @@ def on_epoch_end(self, epoch, logs=None):
current = logs.get(self.monitor)
stop_training = False
if current is None:
print('Early stopping conditioned on metric `%s` '
'which is not available. Available metrics are: %s' %
(self.monitor, ','.join(list(logs.keys()))), RuntimeWarning)
warnings.warn(
f'Early stopping conditioned on metric `{self.monitor}`'
f' which is not available. Available metrics are: {",".join(list(logs.keys()))}',
RuntimeWarning)

Collaborator: warnings.warn and logging.warning are completely different streams with different consequences, so please be sure that you want this behaviour... You may also consider raising it as an error.

Contributor Author: Hmm, I'm not familiar with this, but I noticed that the previous code used warnings.warn a lot, so I thought it was kind of a best practice.

Collaborator: I believe that was related to using print; in such a case it is better to use warnings. I'm not saying this is wrong, in this case it is good, just be aware of it... :]
stop_training = True
return stop_training

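As a side note to the warnings.warn vs. logging.warning thread above: the two really do go through different machinery with different controls. A minimal, self-contained sketch of the difference (plain standard library, not Lightning code):

```python
import logging
import warnings

logging.basicConfig(level=logging.INFO)

# logging.warning goes through logging handlers; verbosity is controlled
# by logger levels and handler configuration.
logging.warning('metric `val_loss` is not available')

# warnings.warn goes through the warnings filter machinery; it is shown
# once per location by default and can be filtered or escalated by the caller.
warnings.warn('metric `val_loss` is not available', RuntimeWarning)

# Escalate RuntimeWarning to an exception, as the reviewer suggests is possible.
warnings.simplefilter('error', RuntimeWarning)
try:
    warnings.warn('metric `val_loss` is not available', RuntimeWarning)
except RuntimeWarning as err:
    logging.info(f'warning raised as an error: {err}')
```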
@@ -141,7 +143,7 @@ def on_epoch_end(self, epoch, logs=None):

def on_train_end(self, logs=None):
if self.stopped_epoch > 0 and self.verbose > 0:
print('Epoch %05d: early stopping' % (self.stopped_epoch + 1))
logging.info(f'Epoch {self.stopped_epoch + 1:05d}: early stopping')
Collaborator: Not directly about this line, but we should make sure that all this new logging is tested (covered by tests).

Contributor Author: This did occur to me, but I didn't know how to test this logging, any ideas?

Collaborator: I usually test that the message does not crash the code...
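On the question of how to test these messages: one possible approach (not part of this PR; assumes pytest) is the built-in caplog fixture, which captures log records so a test can assert both that the call does not crash and that the expected text is emitted:

```python
import logging

def test_early_stopping_logs_message(caplog):
    # Hypothetical test sketch: emit a record the way the callback would
    # and assert on the captured text via pytest's caplog fixture.
    with caplog.at_level(logging.INFO):
        logging.info('Epoch %05d: early stopping', 6)

    assert 'Epoch 00006: early stopping' in caplog.text
```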



class ModelCheckpoint(Callback):
@@ -187,8 +189,9 @@ def __init__(self, filepath, monitor='val_loss', verbose=0,
self.prefix = prefix

if mode not in ['auto', 'min', 'max']:
print('ModelCheckpoint mode %s is unknown, '
'fallback to auto mode.' % (mode), RuntimeWarning)
warnings.warn(
f'ModelCheckpoint mode {mode} is unknown, '
'fallback to auto mode.', RuntimeWarning)
mode = 'auto'

if mode == 'min':
@@ -232,25 +235,26 @@ def on_epoch_end(self, epoch, logs=None):
if self.save_best_only:
current = logs.get(self.monitor)
if current is None:
print('Can save best model only with %s available,'
' skipping.' % (self.monitor), RuntimeWarning)
warnings.warn(
f'Can save best model only with {self.monitor} available,'
' skipping.', RuntimeWarning)
else:
if self.monitor_op(current, self.best):
if self.verbose > 0:
print('\nEpoch %05d: %s improved from %0.5f to %0.5f,'
' saving model to %s'
% (epoch + 1, self.monitor, self.best,
current, filepath))
logging.info(
f'\nEpoch {epoch + 1:05d}: {self.monitor} improved'
f' from {self.best:0.5f} to {current:0.5f},'
f' saving model to {filepath}')
self.best = current
self.save_model(filepath, overwrite=True)

else:
if self.verbose > 0:
print('\nEpoch %05d: %s did not improve' %
(epoch + 1, self.monitor))
logging.info(
f'\nEpoch {epoch + 1:05d}: {self.monitor} did not improve')
else:
if self.verbose > 0:
print('\nEpoch %05d: saving model to %s' % (epoch + 1, filepath))
logging.info(f'\nEpoch {epoch + 1:05d}: saving model to {filepath}')
self.save_model(filepath, overwrite=False)


@@ -291,6 +295,6 @@ def on_epoch_begin(self, epoch, trainer):
losses = [10, 9, 8, 8, 6, 4.3, 5, 4.4, 2.8, 2.5]
for i, loss in enumerate(losses):
should_stop = c.on_epoch_end(i, logs={'val_loss': loss})
print(loss)
logging.info(loss)
if should_stop:
break
3 changes: 2 additions & 1 deletion pytorch_lightning/root_module/memory.py
@@ -8,6 +8,7 @@
import numpy as np
import pandas as pd
import torch
import logging


class ModelSummary(object):
@@ -166,7 +167,7 @@ def print_mem_stack():  # pragma: no cover
for obj in gc.get_objects():
try:
if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
print(type(obj), obj.size())
logging.info(f'{type(obj)} {obj.size()}')
except Exception:
pass

3 changes: 2 additions & 1 deletion pytorch_lightning/root_module/root_module.py
@@ -10,6 +10,7 @@
from pytorch_lightning.root_module.memory import ModelSummary
from pytorch_lightning.root_module.model_saving import ModelIO
from pytorch_lightning.trainer.trainer_io import load_hparams_from_tags_csv
import logging


class LightningModule(GradInformation, ModelIO, ModelHooks):
@@ -240,7 +241,7 @@ def load_from_checkpoint(cls, checkpoint_path):

def summarize(self, mode):
model_summary = ModelSummary(self, mode=mode)
print(model_summary)
logging.info(model_summary)

def freeze(self):
for param in self.parameters():
3 changes: 2 additions & 1 deletion pytorch_lightning/trainer/amp_mixin.py
@@ -4,14 +4,15 @@
APEX_AVAILABLE = True
except ImportError:
APEX_AVAILABLE = False
import logging


class TrainerAMPMixin(object):

def init_amp(self, use_amp):
self.use_amp = use_amp and APEX_AVAILABLE
if self.use_amp:
print('using 16bit precision')
logging.info('using 16bit precision')

if use_amp and not APEX_AVAILABLE: # pragma: no cover
msg = """
5 changes: 3 additions & 2 deletions pytorch_lightning/trainer/ddp_mixin.py
@@ -1,6 +1,7 @@
import os
import re
import warnings
import logging

import torch
import torch.distributed as dist
@@ -59,7 +60,7 @@ def set_distributed_mode(self, distributed_backend, nb_gpu_nodes):
'To silence this warning set distributed_backend=ddp'
warnings.warn(w)

print('gpu available: {}, used: {}'.format(torch.cuda.is_available(), self.on_gpu))
logging.info(f'gpu available: {torch.cuda.is_available()}, used: {self.on_gpu}')

def configure_slurm_ddp(self, nb_gpu_nodes):
self.is_slurm_managing_tasks = False
@@ -107,7 +108,7 @@ def set_nvidia_flags(self, is_slurm_managing_tasks, data_parallel_device_ids):
gpu_str = ','.join([str(x) for x in data_parallel_device_ids])
os.environ["CUDA_VISIBLE_DEVICES"] = gpu_str

print(f'VISIBLE GPUS: {os.environ["CUDA_VISIBLE_DEVICES"]}')
logging.info(f'VISIBLE GPUS: {os.environ["CUDA_VISIBLE_DEVICES"]}')

def ddp_train(self, gpu_nb, model):
"""
6 changes: 5 additions & 1 deletion pytorch_lightning/trainer/trainer.py
@@ -4,6 +4,7 @@

import os
import warnings
import logging

import torch
import torch.distributed as dist
@@ -148,7 +149,7 @@ def __init__(self,
Running in fast_dev_run mode: will run a full train,
val loop using a single batch
'''
print(m)
logging.info(m)

# set default save path if user didn't provide one
self.default_save_path = default_save_path
@@ -234,6 +235,9 @@ def __init__(self,
self.amp_level = amp_level
self.init_amp(use_amp)

# set logging options
logging.basicConfig(level=logging.INFO)

@property
def slurm_job_id(self):
try:
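One thing to note about the logging.basicConfig(level=logging.INFO) call added in Trainer.__init__: basicConfig is a no-op once the root logger already has handlers, so a user who configures logging before constructing the Trainer keeps their own settings. A small illustrative sketch (standard library only, not Lightning code):

```python
import logging

# User configures logging first (hypothetical user script).
logging.basicConfig(level=logging.WARNING)

logging.info('not shown: root level is WARNING')

# What Trainer.__init__ now does; a no-op here because the root logger
# already has a handler, so the user's WARNING level is preserved.
logging.basicConfig(level=logging.INFO)

logging.info('still not shown')
logging.warning('shown: WARNING and above are emitted')
```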
17 changes: 9 additions & 8 deletions pytorch_lightning/trainer/trainer_io.py
@@ -3,6 +3,7 @@
import signal
import warnings
from subprocess import call
import logging

import torch
import torch.distributed as dist
@@ -87,7 +88,7 @@ def restore_state_if_checkpoint_exists(self, model):
if last_ckpt_name is not None:
last_ckpt_path = os.path.join(self.checkpoint_callback.filepath, last_ckpt_name)
self.restore(last_ckpt_path, self.on_gpu)
print(f'model and trainer restored from checkpoint: {last_ckpt_path}')
logging.info(f'model and trainer restored from checkpoint: {last_ckpt_path}')
did_restore = True

return did_restore
@@ -106,36 +107,36 @@ def register_slurm_signal_handlers(self):
pass

if on_slurm:
print('set slurm handle signals')
logging.info('set slurm handle signals')
signal.signal(signal.SIGUSR1, self.sig_handler)
signal.signal(signal.SIGTERM, self.term_handler)

def sig_handler(self, signum, frame):
if self.proc_rank == 0:
# save weights
print('handling SIGUSR1')
logging.info('handling SIGUSR1')
self.hpc_save(self.weights_save_path, self.logger)

# find job id
job_id = os.environ['SLURM_JOB_ID']
cmd = 'scontrol requeue {}'.format(job_id)

# requeue job
print('\nrequeing job {}...'.format(job_id))
logging.info(f'\nrequeueing job {job_id}...')
result = call(cmd, shell=True)

# print result text
if result == 0:
print('requeued exp ', job_id)
logging.info(f'requeued exp {job_id}')
else:
print('requeue failed...')
logging.info('requeue failed...')

# close experiment to avoid issues
self.logger.close()

def term_handler(self, signum, frame):
# save
print("bypassing sigterm")
logging.info("bypassing sigterm")

# --------------------
# MODEL SAVE CHECKPOINT
Expand Down Expand Up @@ -328,7 +329,7 @@ def hpc_load(self, folderpath, on_gpu):
# call model hook
model.on_hpc_load(checkpoint)

print(f'restored hpc model from: {filepath}')
logging.info(f'restored hpc model from: {filepath}')

def max_ckpt_in_folder(self, path, name_key='ckpt_'):
files = os.listdir(path)
4 changes: 2 additions & 2 deletions pytorch_lightning/trainer/training_tricks_mixin.py
@@ -1,5 +1,5 @@
import torch

import logging
from pytorch_lightning.callbacks import GradientAccumulationScheduler


@@ -14,7 +14,7 @@ def print_nan_gradients(self):
model = self.get_model()
for param in model.parameters():
if torch.isnan(param.grad.float()).any():
print(param, param.grad)
logging.info(f'{param} {param.grad}')

def configure_accumulated_gradients(self, accumulate_grad_batches):
self.accumulate_grad_batches = None
3 changes: 2 additions & 1 deletion tests/test_a_restore_models.py
@@ -1,4 +1,5 @@
import os
import logging

import pytest
import torch
@@ -44,7 +45,7 @@ def test_running_test_pretrained_model_ddp():
result = trainer.fit(model)

exp = logger.experiment
print(os.listdir(exp.get_data_path(exp.name, exp.version)))
logging.info(os.listdir(exp.get_data_path(exp.name, exp.version)))

# correct result and ok accuracy
assert result == 1, 'training failed to complete'