From 4c37f2948c6d2b49e5ff26599521afeac2564ce4 Mon Sep 17 00:00:00 2001 From: Benjamin Warner Date: Wed, 1 Mar 2023 01:55:43 -0600 Subject: [PATCH] Update docs and prep for release --- README.md | 4 +++- fastxtend/__init__.py | 2 +- fastxtend/callback/progresize.py | 1 - fastxtend/utils/__init__.py | 1 + nbs/callback.channelslast.ipynb | 3 ++- nbs/callback.cutmixup.ipynb | 3 ++- nbs/callback.ema.ipynb | 6 +++--- nbs/callback.progresize.ipynb | 1 - nbs/index.ipynb | 7 +++++-- nbs/metrics.ipynb | 6 +++--- nbs/optimizer.fused.ipynb | 4 +++- nbs/utils.ipynb | 8 ++++++-- nbs/vision.models.xresnet.ipynb | 18 ++++++++++++------ settings.ini | 2 +- 14 files changed, 42 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 3d89968..8a75f83 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,8 @@ addons for fastai - [Fused optimizers](optimizer.fused.html) which are 21 to 293 percent faster relative to fastai native optimizers. +- Fused implementations of modern optimizers, such as + [Adan](optimizer.adan.html) and [Lion](optimizer.lion.html). - Flexible [metrics](metrics.html) which can log on train, valid, or both. Backwards compatible with fastai metrics. - Easily use [multiple losses](multiloss.html) and log each individual @@ -120,7 +122,7 @@ compatible with the original fastai code. Use a fused ForEach optimizer: ``` python -Learner(..., opt_func=adam(fused=True)) +Learner(..., opt_func=adam(foreach=True)) ``` Log an accuracy metric on the training set as a smoothed metric and diff --git a/fastxtend/__init__.py b/fastxtend/__init__.py index f18e5d0..a11f0b4 100644 --- a/fastxtend/__init__.py +++ b/fastxtend/__init__.py @@ -1 +1 @@ -__version__ = "0.0.18" +__version__ = "0.0.19" diff --git a/fastxtend/callback/progresize.py b/fastxtend/callback/progresize.py index cb46c72..efdc206 100644 --- a/fastxtend/callback/progresize.py +++ b/fastxtend/callback/progresize.py @@ -110,7 +110,6 @@ def before_fit(self): self.learn.mixed_precision.autocast.__exit__(None, None, None) self.learn.loss.backward() - print('backwards') self.learn.opt.zero_grad() finally: diff --git a/fastxtend/utils/__init__.py b/fastxtend/utils/__init__.py index 28c4434..26fee71 100644 --- a/fastxtend/utils/__init__.py +++ b/fastxtend/utils/__init__.py @@ -6,6 +6,7 @@ # %% ../../nbs/utils.ipynb 1 # Contains code from: # fastai - Apache License 2.0 - Copyright (c) 2023 fast.ai +# mish-cuda - MIT License - Copyright (c) 2019 thomasbrandon https://github.com/thomasbrandon/mish-cuda # %% ../../nbs/utils.ipynb 3 import torch, random, gc diff --git a/nbs/callback.channelslast.ipynb b/nbs/callback.channelslast.ipynb index 0cf2e46..6e8d4bb 100644 --- a/nbs/callback.channelslast.ipynb +++ b/nbs/callback.channelslast.ipynb @@ -18,10 +18,11 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "With `MixedPrecision`, image models trained in channels last format on Tensor Cores can increase training throughput over contiguous format. PyTorch observed a [22% improvment](https://pytorch.org/tutorials/intermediate/memory_format_tutorial.html#performance-gains) in ResNet50 training speed using channels last and 8-35% improvement across a selection of models tested on a V100.\n", + "With `fastai.callback.fp16.MixedPrecision`, image models trained in channels last format on Tensor Cores can increase training throughput over contiguous format. 
PyTorch observed a [22% improvement](https://pytorch.org/tutorials/intermediate/memory_format_tutorial.html#performance-gains) in ResNet50 training speed using channels last and 8-35% improvement across a selection of models tested on a V100.\n",
     "\n",
     "Channels last format is compatible with modern GPUs (Volta, Turing, or newer) and modern CPUs (Ice Lake or newer).\n",
     "\n",
diff --git a/nbs/callback.cutmixup.ipynb b/nbs/callback.cutmixup.ipynb
index 3b632fb..1c59199 100644
--- a/nbs/callback.cutmixup.ipynb
+++ b/nbs/callback.cutmixup.ipynb
@@ -29,10 +29,11 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "fastxtend replaces fastai's `MixUp` and `CutMix` with backwards compatible versions that support optional training with `MultiLoss` via `MixHandlerX`. \n",
+    "fastxtend replaces `fastai.callback.mixup.MixUp` and `fastai.callback.mixup.CutMix` with backwards compatible versions that support optional training with `MultiLoss` via `MixHandlerX`. \n",
     "\n",
     "`CutMixUp` and `CutMixUpAugment` allow applying MixUp, CutMix, and Augmentations using one callback. Optionally element-wise on the same batch."
    ]
   },
diff --git a/nbs/callback.ema.ipynb b/nbs/callback.ema.ipynb
index 45eb2d4..0a4f8d1 100644
--- a/nbs/callback.ema.ipynb
+++ b/nbs/callback.ema.ipynb
@@ -64,7 +64,7 @@
     "\n",
     ": For Loop EMA Step vs Fused ForEach EMA Step {#tbl-fused}\n",
     "\n",
-    "[^faster]: `EMACallback` performance was benchmarked on a GeForce 3080 Ti using PyTorch 1.13.1, Cuda 11.7, Mixed Precision, and [Channels Last](callback.channelslast.html) (except DeBERTa and ViT). Results may differ on other models, hardware, and across benchmarking runs. Speedup is calculated from the total time spent on the EMA step and rounded down to the nearest whole number."
+    "[^faster]: `EMACallback` performance was benchmarked on a GeForce 3080 Ti using PyTorch 1.13.1, Cuda 11.7, Mixed Precision, and [Channels Last](callback.channelslast.html) (except DeBERTa and ViT). Results may differ on other models, hardware, and across benchmarking runs. Speedup is calculated from the total time spent on the EMA step."
    ]
   },
   {
@@ -224,7 +224,7 @@
     "\n",
     "If `all_buffers=False`, only persistent [buffers](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.register_buffer) are included in the EMA calculation.\n",
     "\n",
-    "If `skip_ema=True` (the default), then the EMA calculation will not apply if any other callback raises a `CancelBatchException` or `CancelStepException`. This is intended to handle the `MixedPrecision` AMP scaler and `GradientAccumulation` skipping the optimizer update step, which means the model weights won't have changed so the EMA step should not be calculated. If needed this behavior can be turned off. In general, this argument should be left unchanged."
+    "If `skip_ema=True` (the default), then the EMA calculation will not apply if any other callback raises a `CancelBatchException` or `CancelStepException`. This is intended to handle the `fastai.callback.fp16.MixedPrecision` AMP scaler and `fastai.callback.training.GradientAccumulation` skipping the optimizer update step, which means the model weights won't have changed so the EMA step should not be calculated. If needed this behavior can be turned off. In general, this argument should be left unchanged."
    ]
   },
   {
@@ -315,7 +315,7 @@
    "source": [
     "`EMAWarmupCallback` extends `EMACallback` by adding a schedulable EMA decay value from an initial value of `start_decay` to `final_decay` for the rest of training. The change in the EMA decay occurs between `start_epoch` and `final_epoch`.\n",
     "\n",
-    "The EMA warmup `schedule` can be one of [`SchedCos`](https://docs.fast.ai/callback.schedule.html#schedcos) (the default), [`SchedLin`](https://docs.fast.ai/callback.schedule.html#schedlin),[`SchedExp`](https://docs.fast.ai/callback.schedule.html#schedexp), [`SchedPoly`](https://docs.fast.ai/callback.schedule.html#schedpoly), or a custom [fastai annealer](https://docs.fast.ai/callback.schedule.html#annealer) based schedule. `SchedPoly` must be passed as partial function: `partial(SchedPoly, power=0.5)`.\n",
+    "The EMA warmup `schedule` can be one of [`SchedCos`](https://docs.fast.ai/callback.schedule.html#schedcos) (the default), [`SchedLin`](https://docs.fast.ai/callback.schedule.html#schedlin), [`SchedExp`](https://docs.fast.ai/callback.schedule.html#schedexp), [`SchedPoly`](https://docs.fast.ai/callback.schedule.html#schedpoly), or a custom [fastai annealer](https://docs.fast.ai/callback.schedule.html#annealer) based schedule. `SchedPoly` must be passed as a partial function: `partial(SchedPoly, power=0.5)`.\n",
     "\n",
     "::: {.callout-warning}\n",
     "EMAWarmupCallback does not support resumed training while EMA warmup is in progress. This is due to fastai not fully supporting resumable training.\n",
diff --git a/nbs/callback.progresize.ipynb b/nbs/callback.progresize.ipynb
index 4a03d6a..084a240 100644
--- a/nbs/callback.progresize.ipynb
+++ b/nbs/callback.progresize.ipynb
@@ -184,7 +184,6 @@
     "                self.learn.mixed_precision.autocast.__exit__(None, None, None)\n",
     "\n",
     "            self.learn.loss.backward()\n",
-    "            print('backwards')\n",
     "            self.learn.opt.zero_grad()\n",
     "\n",
     "        finally:\n",
diff --git a/nbs/index.ipynb b/nbs/index.ipynb
index fee41ff..d2e0f3d 100644
--- a/nbs/index.ipynb
+++ b/nbs/index.ipynb
@@ -18,6 +18,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -26,6 +27,7 @@
     "**General Features**\n",
     "\n",
     "* [Fused optimizers](optimizer.fused.html) which are 21 to 293 percent faster relative to fastai native optimizers.\n",
+    "* Fused implementations of modern optimizers, such as [Adan](optimizer.adan.html) and [Lion](optimizer.lion.html).\n",
     "* Flexible [metrics](metrics.html) which can log on train, valid, or both. Backwards compatible with fastai metrics.\n",
     "* Easily use [multiple losses](multiloss.html) and log each individual loss on train and valid.\n",
     "* A [simple profiler](callback.simpleprofiler.html) for profiling fastai training.\n",
@@ -103,13 +105,14 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "## Examples\n",
     "Use a fused ForEach optimizer:\n",
     "```python\n",
-    "Learner(..., opt_func=adam(fused=True))\n",
+    "Learner(..., opt_func=adam(foreach=True))\n",
     "```\n",
     "\n",
     "Log an accuracy metric on the training set as a smoothed metric and validation set like normal:\n",
@@ -120,7 +123,7 @@
     "\n",
     "Log multiple losses as individual metrics on train and valid:\n",
     "```python\n",
-    "mloss = MultiLoss(loss_funcs=[nn.MSELoss, nn.L1Loss], \n",
+    "mloss = MultiLoss(loss_funcs=[nn.MSELoss, nn.L1Loss],\n",
     "                  weights=[1, 3.5], loss_names=['mse_loss', 'l1_loss'])\n",
     "\n",
     "Learner(..., loss_func=mloss, metrics=RMSE(), cbs=MultiLossCallback)\n",
diff --git a/nbs/metrics.ipynb b/nbs/metrics.ipynb
index dbfecb2..3903807 100644
--- a/nbs/metrics.ipynb
+++ b/nbs/metrics.ipynb
@@ -77,10 +77,10 @@
     "fastxtend metrics add the following features to fastai metrics:\n",
     "\n",
     "1. 
fastxtend metrics can independently log on train, valid, or both train and valid\n", - "2. All fastxtend metrics can use the activation support of fastai's `AccumMetric`, inherited from `MetricX`\n", - "3. fastxtend metrics add `AvgSmoothMetric`, a metric version of `AvgSmoothLoss`\n", + "2. All fastxtend metrics can use the activation support of `fastai.metrics.AccumMetric`, inherited from `MetricX`\n", + "3. fastxtend metrics add `AvgSmoothMetricX`, a metric version of `fastai.learner.AvgSmoothLoss`\n", "\n", - "There are three main metric types: `AvgMetricX`, `AccumMetricX`, and `AvgSmoothMetricX`. These correspond one-to-one with fastai's `AvgMetric`, `AccumMetric`, and `AvgSmoothMetric`. fastxtend metrics inherit from fastai's `Metric` and run on `Learner` via a modified `Recorder` callback.\n", + "There are three main metric types: `AvgMetricX`, `AccumMetricX`, and `AvgSmoothMetricX`. These correspond one-to-one with `fastai.learner.AvgMetric`, `fastai.metrics.AccumMetric`, and `fastai.learner.AvgSmoothLoss`. fastxtend metrics inherit from `fastai.learner.Metric` and run on `fastai.learner.Learner` via a modified `fastai.learner.Recorder` callback.\n", "\n", "To jump to the fastxtend metrics reference, click [here](#metrics).\n", "\n", diff --git a/nbs/optimizer.fused.ipynb b/nbs/optimizer.fused.ipynb index 1e29e0f..b140750 100644 --- a/nbs/optimizer.fused.ipynb +++ b/nbs/optimizer.fused.ipynb @@ -33,7 +33,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "fastxtend's fused optimizers are 21 to 293 percent faster, drop-in replacements for fastai native optimizers. Like fastai optimizers, fastxtend fused optimizers support both discriminative learning rates across multiple parameter groups and per-parameter weight decay without any extra setup.\n", + "fastxtend's fused optimizers are 21 to 293 percent faster, drop-in replacements for fastai native optimizers.\n", + "\n", + "Like fastai optimizers, fastxtend fused optimizers support both discriminative learning rates across multiple parameter groups and per-parameter weight decay without any extra setup.\n", "\n", "While all fastai optimizers have vertically fused TorchScript implementations, only a subset have horizontally fused ForEach implementations. These optimizers, [SGD](#sgd-optimizer), [Adam](#adam-optimizer), [RAdam](#radam-optimizer), [Lamb](#lamb-optimizer), and [Ranger](#ranger-optimizer), usually outperform their TorchScript counterparts in all but the tiniest models.\n", "\n", diff --git a/nbs/utils.ipynb b/nbs/utils.ipynb index 32e4f89..b48765e 100644 --- a/nbs/utils.ipynb +++ b/nbs/utils.ipynb @@ -17,7 +17,8 @@ "source": [ "#|exporti\n", "# Contains code from:\n", - "# fastai - Apache License 2.0 - Copyright (c) 2023 fast.ai" + "# fastai - Apache License 2.0 - Copyright (c) 2023 fast.ai\n", + "# mish-cuda - MIT License - Copyright (c) 2019 thomasbrandon https://github.com/thomasbrandon/mish-cuda" ] }, { @@ -86,12 +87,15 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "A random state manager which provides some reproducibility without sacrificing potential training speed.\n", "\n", - "Unlike fast.ai's [`no_random`](https://docs.fast.ai/torch_core.html#no_random), `less_random` does not set `torch.backends.cudnn.benchmark = False` so it's possible to train faster. Training runs on the same GPU, PyTorch, & CUDA setup should be reproducible, but different hardware/software setup will probably have less reproducibility then using `no_random`." 
+ "Unlike `fastai.torch_core.no_random`, `less_random` does not set `torch.backends.cudnn.benchmark = False`. This allows PyTorch to select the fastest Cuda kernels and potentially train faster than `no_random`.\n", + "\n", + "`less_random` training runs on the same GPU, PyTorch, & Cuda setup should be close to `no_random` reproducibility, but different hardware/software setup will have less reproducibility than using `no_random`." ] }, { diff --git a/nbs/vision.models.xresnet.ipynb b/nbs/vision.models.xresnet.ipynb index 201eca2..c8bea88 100644 --- a/nbs/vision.models.xresnet.ipynb +++ b/nbs/vision.models.xresnet.ipynb @@ -29,10 +29,11 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "fastxtend's `XResNet` is backwards compatible with fastai's `XResNet`. \n", + "fastxtend's `XResNet` is backwards compatible with `fastai.vision.models.xresnet.XResNet`. \n", "\n", "It adds the following features to `XResNet`:\n", "\n", @@ -62,10 +63,11 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "## ResBlock -" + "## ResNet Blocks" ] }, { @@ -139,10 +141,11 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "## Squeeze and Excitation -" + "## Squeeze & Excitation Blocks" ] }, { @@ -175,10 +178,11 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "## Efficient Channel Attention -" + "## Efficient Channel Attention Blocks" ] }, { @@ -211,10 +215,11 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "## Shuffle Attention -" + "## Shuffle Attention Blocks" ] }, { @@ -247,10 +252,11 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "## Triplet Attention -" + "## Triplet Attention Blocks" ] }, { diff --git a/settings.ini b/settings.ini index 3b44f4b..4f4ee51 100644 --- a/settings.ini +++ b/settings.ini @@ -8,7 +8,7 @@ author = Benjamin Warner author_email = me@benjaminwarner.dev copyright = Benjamin Warner branch = main -version = 0.0.18 +version = 0.0.19 min_python = 3.8 audience = Developers language = English