Add documentation to burn core nn (#1746)
* Updated documentation for unfold4d

Added links between the struct and the config. Added a link to the related burn_tensor function in the documentation for the forward function.

* Changing nn relu module documentation to functional api

Moving the formula for relu from the module API to the functional API,
citing a paper relevant to relu,
and mentioning the functional API in the module API

* Linking gelu module API documentation to functional API documentation

* Linear module: adding documentation

Adding documentation to the Linear module,
mentioning that the LinearConfig struct
should be used when creating a Linear layer

Also adding links to the documentation that point people toward
the right path
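
As a rough sketch of the config-then-init pattern this documentation points people toward (illustrative only: the import paths, the NdArray backend, the 128/256 sizes, and the exact init signature are assumptions that can vary between Burn versions):

    use burn::backend::NdArray;
    use burn::nn::{Linear, LinearConfig};
    use burn::tensor::Tensor;

    fn main() {
        let device = Default::default();

        // Build the layer from its config instead of constructing the struct directly.
        let linear: Linear<NdArray> = LinearConfig::new(128, 256).init(&device);

        // The last input dimension must be d_input (128); the output's is d_output (256).
        let input = Tensor::<NdArray, 2>::zeros([4, 128], &device);
        let output = linear.forward(input);
        assert_eq!(output.dims(), [4, 256]);
    }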

* Updated documentation for dropout

Added links between the struct and the config. Added a link to the struct in the forward function for more info.

* embedding + swiglu

* RotaryEncoding: adding documentation

Adding documentation stating that RotaryEncoding should be created using a RotaryEncodingConfig

* prelu: adding documentation

Adding documentation to the prelu module:
- Linking forward function documentation to the functional API
- Citing the first paper to mention prelu
- Adding documentation saying that the prelu layer should be created using PReluConfig

* pos_encoding: adding documentation

* Updated documentation for mha

Added links for more info. Added shape info at some places.

* docs: Add documentation for Gru module

Provide documentation for the Gru module, including its configuration and usage. Include a link to the paper that introduced the Gated Recurrent Unit (GRU) and specify that the module should be created using GruConfig. Also, mention that the forward function returns a state tensor with specific dimensions.
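
A hypothetical usage sketch of what that looks like (the module path, the GruConfig::new arguments, and the forward signature with an optional initial state are assumptions to be checked against the actual Gru docs):

    use burn::backend::NdArray;
    use burn::nn::gru::{Gru, GruConfig};
    use burn::tensor::Tensor;

    fn main() {
        let device = Default::default();

        // d_input = 32, d_hidden = 64, bias enabled (illustrative values).
        let gru: Gru<NdArray> = GruConfig::new(32, 64, true).init(&device);

        // Input: `[batch_size, seq_length, d_input]`; no initial state provided.
        let input = Tensor::<NdArray, 3>::zeros([8, 10, 32], &device);

        // The returned state tensor is `[batch_size, seq_length, d_hidden]`.
        let state = gru.forward(input, None);
        assert_eq!(state.dims(), [8, 10, 64]);
    }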

* burn-core-nn-transformers: adding documentation

Adding documentation:
- Says to use config to create the layers
- Add the mathematical formula to the pwff forward pass (see the note below)
- Add citation in the pwff to the "Attention is all you need" paper
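
For reference, the cited paper defines the position-wise feed-forward network as

    FFN(x) = max(0, x W1 + b1) W2 + b2

applied identically at every position; the activation actually used by the Burn module may differ from the ReLU in the original formula.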

* Updated documentation: ConvTranspose1d and ConvTranspose2d

* docs: Add documentation for Lstm and BiLstm modules

Provide documentation for the Lstm and BiLstm modules, including their configurations and usage. Include links to the papers that introduced Long Short-Term Memory (LSTM) and Bidirectional LSTM. Specify that the modules should be created using LstmConfig and BiLstmConfig respectively.

* docs: Update documentation for ConvTranspose1d and ConvTranspose2d modules

* loss: Adding documentation to the loss layers

Adding documentation stating that the config should be used to create the layer

* chore: Refactor Conv1d module imports and update documentation

* docs: Add documentation for AdaptiveAvgPool1d and AdaptiveAvgPool2d modules

Added references to the burn_tensor associated functions. Added links between the struct and the config.

* Refactor Conv1d module imports and update documentation

* chore: Refactor Conv2d module imports and update documentation

* Add documentation for AvgPool1d and AvgPool2d modules

Added references to the burn_tensor associated functions. Added links between the struct and the config.

* Add documentation for MaxPool1d and MaxPool2d modules

Added references to the burn_tensor associated functions. Added links between the struct and the config.

* Add documentation for leaky_relu and remove Config generic

Added references to the burn_tensor associated functions. Added links between the struct and the config. Removed the backend generic from the config since it's not needed (might be a breaking change).

* refactor: Update BatchNormConfig initialization and add documentation.

* Added link to config in embedding struct documentation

* refactor: Update GroupNormConfig initialization and add documentation

* refactor: Update InstanceNormConfig initialization and add documentation

* feat: Update LayerNormConfig initialization and add documentation

* refactor: Update RmsNormConfig initialization and add documentation

* fixed: removed #derive accidentally

* Added missing backticks in pools' shapes

* Format nn doc

* Make config fields public in nn modules

* Update import statements in nn modules

Changed burn_tensor imports to crate::tensor

* Update import statements in nn modules' tests

Changed burn_tensor imports to crate::tensor

* breaking change refactor: Update GroupNormConfig and InstanceNormConfig initialization

* Make SwiGlu fields public

* grammar

* slashes

* input tensors grouping

* copy-pasta mistake

* a not an >:I

* Capitalization

* better desc

* math 'n ticks

* group_norm functional implementation
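
For reference, the functional form here is standard group normalization (Wu & He, 2018): the C channels are split into G groups, and each group is normalized with its own statistics before the learned affine transform:

    y = gamma * (x - mean_g) / sqrt(var_g + eps) + beta

where mean_g and var_g are computed over the channels of group g and the spatial dimensions. This describes the general technique, not necessarily the exact Burn implementation.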

* removed the ... struct

* decoder typo

* fmt

* referring to private fn in docs

---------

Co-authored-by: Thierry Cantin-Demers <piertcd@gmail.com>
Co-authored-by: mepatrick73 <pameu17@ulaval.ca>
3 people committed Jun 13, 2024
1 parent 4393b33 commit 5de1517
Showing 41 changed files with 467 additions and 289 deletions.
4 changes: 2 additions & 2 deletions crates/burn-core/src/nn/attention/mask.rs
@@ -1,6 +1,6 @@
use alloc::vec::Vec;

use burn_tensor::{backend::Backend, Bool, Data, ElementConversion, Int, Shape, Tensor};
use crate::tensor::{backend::Backend, Bool, Data, ElementConversion, Int, Shape, Tensor};

/// Generate an autoregressive attention mask.
///
@@ -89,9 +89,9 @@ pub fn generate_padding_mask<B: Backend>(
#[cfg(test)]
mod tests {
use super::*;
use crate::tensor::Data;
use crate::TestBackend;
use alloc::vec;
use burn_tensor::Data;

#[test]
fn test_generate_autoregressive_mask() {
30 changes: 20 additions & 10 deletions crates/burn-core/src/nn/attention/mha.rs
@@ -12,29 +12,29 @@ use crate::{
#[cfg(not(feature = "std"))]
use num_traits::Float;

/// Configuration to create a [Multi Head Attention](MultiHeadAttention) layer.
/// Configuration to create a [Multi Head Attention](MultiHeadAttention) layer using the [init function](MultiHeadAttentionConfig::init).
#[derive(Config)]
pub struct MultiHeadAttentionConfig {
/// The size of each linear layer.
d_model: usize,
pub d_model: usize,
/// The number of heads.
n_heads: usize,
pub n_heads: usize,
/// The dropout rate. Default: 0.1
#[config(default = 0.1)]
dropout: f64,
pub dropout: f64,
/// The minimum value a float can take. Default: -1.0e4
/// This is used to mask attention scores before calculating attention weights.
/// A value too low might result in NaN.
#[config(default = -1.0e4)]
min_float: f64,
pub min_float: f64,
/// Use "quiet softmax" instead of regular softmax.
///
/// - Usage may improve performance by allowing attention heads to deposit no information (if the sequence contains no information relevant to that head).
/// - Usage may reduce the entropy of weights in the model, enhancing quantization and compression.
///
/// Reference: <https://www.evanmiller.org/attention-is-off-by-one.html>
#[config(default = false)]
quiet_softmax: bool,
pub quiet_softmax: bool,
/// The type of function used to initialize neural network parameters
#[config(
default = "Initializer::KaimingUniform{gain:1.0/num_traits::Float::sqrt(3.0), fan_out_only:false}"
@@ -50,6 +50,8 @@ pub struct MultiHeadAttentionConfig {
/// - key: [Linear](nn::Linear) layer with `d_model` input and output features.
/// - value: [Linear](nn::Linear) layer with `d_model` input and output features.
/// - output: [Linear](nn::Linear) layer with `d_model` input and output features.
///
/// Should be created with [MultiHeadAttentionConfig].
#[derive(Module, Debug)]
pub struct MultiHeadAttention<B: Backend> {
query: nn::Linear<B>,
@@ -67,8 +69,11 @@ pub struct MultiHeadAttention<B: Backend> {
/// [Multihead attention](MultiHeadAttention) forward pass input argument.
#[derive(Debug, Clone)]
pub struct MhaInput<B: Backend> {
/// Shape `[batch_size, seq_length_1, d_model]`
query: Tensor<B, 3>,
/// Shape `[batch_size, seq_length_2, d_model]`
key: Tensor<B, 3>,
/// Shape `[batch_size, seq_length_2, d_model]`
value: Tensor<B, 3>,
mask_pad: Option<Tensor<B, 2, Bool>>,
mask_attn: Option<Tensor<B, 3, Bool>>,
@@ -101,6 +106,9 @@ impl MultiHeadAttentionConfig {
impl<B: Backend> MhaInput<B> {
/// Create a [multihead attention](MultiHeadAttention) input argument
/// by setting the query, key and value to the given tensor.
///
/// # Shape
/// - tensor: `[batch_size, seq_length, d_model]`
pub fn self_attn(tensor: Tensor<B, 3>) -> Self {
Self {
query: tensor.clone(),
@@ -138,15 +146,17 @@ impl<B: Backend> MhaInput<B> {
/// [Multihead attention](MultiHeadAttention) outputs.
#[derive(Debug, Clone)]
pub struct MhaOutput<B: Backend> {
/// The attention weights [batch_size, n_heads, seq_length_1, seq_length_2].
/// The attention weights `[batch_size, n_heads, seq_length_1, seq_length_2]`.
pub weights: Tensor<B, 4>,
/// The context tensor [batch_size, seq_length_1, d_model].
/// The context tensor `[batch_size, seq_length_1, d_model]`.
pub context: Tensor<B, 3>,
}

impl<B: Backend> MultiHeadAttention<B> {
/// Applies the forward pass on the input tensors.
///
/// See [MultiHeadAttention](MultiHeadAttention) for more information.
///
/// # Shapes
///
/// - query: `[batch_size, seq_length_1, d_model]`
@@ -310,10 +320,10 @@ impl<B: Backend, const D: usize> MhaLinearCache<B, D> {
#[cfg(test)]
mod tests {
use super::*;
use crate::tensor::Int;
use crate::tensor::{Distribution, Shape};
use crate::{nn::attention::generate_autoregressive_mask, TestBackend};
use alloc::vec::Vec;
use burn::tensor::{Distribution, Shape};
use burn_tensor::Int;

#[test]
fn test_self_attention_shapes() {
25 changes: 11 additions & 14 deletions crates/burn-core/src/nn/conv/conv1d.rs
@@ -3,15 +3,14 @@ use crate as burn;
use crate::config::Config;
use crate::module::Module;
use crate::module::Param;
use crate::nn::conv::checks;
use crate::nn::{Initializer, PaddingConfig1d};
use crate::tensor::backend::Backend;
use crate::tensor::module::conv1d;
use crate::tensor::ops::ConvOptions;
use crate::tensor::Tensor;
use burn_tensor::module::conv1d;
use burn_tensor::ops::ConvOptions;

use super::checks;

/// Configuration to create an [1D convolution](Conv1d) layer.
/// Configuration to create a [1D convolution](Conv1d) layer using the [init function](Conv1dConfig::init).
#[derive(Config, Debug)]
pub struct Conv1dConfig {
/// The number of input channels.
@@ -44,14 +43,10 @@ pub struct Conv1dConfig {

/// Applies a 1D convolution over input tensors.
///
/// # Params
///
/// - weight: Tensor of shape [channels_out, channels_in / groups, kernel_size]
///
/// - bias: Tensor of shape `[channels_out]`
/// Should be created with [Conv1dConfig].
#[derive(Module, Debug)]
pub struct Conv1d<B: Backend> {
/// Tensor of shape [channels_out, channels_in / groups, kernel_size]
/// Tensor of shape `[channels_out, channels_in / groups, kernel_size]`
pub weight: Param<Tensor<B, 3>>,
/// Tensor of shape `[channels_out]`
pub bias: Option<Param<Tensor<B, 1>>>,
@@ -102,10 +97,12 @@ impl Conv1dConfig {
impl<B: Backend> Conv1d<B> {
/// Applies the forward pass on the input tensor.
///
/// See [conv1d](crate::tensor::module::conv1d) for more information.
///
/// # Shapes
///
/// - input: [batch_size, channels_in, length_in],
/// - output: [batch_size, channels_out, length_out],
/// - input: `[batch_size, channels_in, length_in]`
/// - output: `[batch_size, channels_out, length_out]`
pub fn forward(&self, input: Tensor<B, 3>) -> Tensor<B, 3> {
let [_batch_size, _channels, length] = input.dims();
let padding = self
@@ -124,8 +121,8 @@ impl<B: Backend> Conv1d<B> {
#[cfg(test)]
mod tests {
use super::*;
use crate::tensor::Data;
use crate::TestBackend;
use burn_tensor::Data;

#[test]
fn initializer_default() {
22 changes: 10 additions & 12 deletions crates/burn-core/src/nn/conv/conv2d.rs
@@ -6,13 +6,13 @@ use crate::module::Param;
use crate::nn::Initializer;
use crate::nn::PaddingConfig2d;
use crate::tensor::backend::Backend;
use crate::tensor::module::conv2d;
use crate::tensor::ops::ConvOptions;
use crate::tensor::Tensor;
use burn_tensor::module::conv2d;
use burn_tensor::ops::ConvOptions;

use super::checks;
use crate::nn::conv::checks;

/// Configuration to create an [2D convolution](Conv2d) layer.
/// Configuration to create a [2D convolution](Conv2d) layer, using the [init function](Conv2dConfig::init).
#[derive(Config, Debug)]
pub struct Conv2dConfig {
/// The number of channels.
@@ -43,11 +43,7 @@ pub struct Conv2dConfig {

/// Applies a 2D convolution over input tensors.
///
/// # Params
///
/// - weight: Tensor of shape `[channels_out, channels_in / groups, kernel_size_1, kernel_size_2]`
///
/// - bias: Tensor of shape `[channels_out]`
/// Should be created with [Conv2dConfig].
#[derive(Module, Debug)]
pub struct Conv2d<B: Backend> {
/// Tensor of shape `[channels_out, channels_in / groups, kernel_size_1, kernel_size_2]`
@@ -106,10 +102,12 @@ impl Conv2dConfig {
impl<B: Backend> Conv2d<B> {
/// Applies the forward pass on the input tensor.
///
/// See [conv2d](crate::tensor::module::conv2d) for more information.
///
/// # Shapes
///
/// - input: [batch_size, channels_in, height_in, width_in],
/// - output: [batch_size, channels_out, height_out, width_out],
/// - input: `[batch_size, channels_in, height_in, width_in]`
/// - output: `[batch_size, channels_out, height_out, width_out]`
pub fn forward(&self, input: Tensor<B, 4>) -> Tensor<B, 4> {
let [_batch_size, _channels_in, height_in, width_in] = input.dims();
let padding =
@@ -127,8 +125,8 @@ impl<B: Backend> Conv2d<B> {
#[cfg(test)]
mod tests {
use super::*;
use crate::tensor::Data;
use crate::TestBackend;
use burn_tensor::Data;

#[test]
fn initializer_default() {
24 changes: 10 additions & 14 deletions crates/burn-core/src/nn/conv/conv_transpose1d.rs
@@ -3,15 +3,15 @@ use crate as burn;
use crate::config::Config;
use crate::module::Module;
use crate::module::Param;
use crate::nn::conv::checks;
use crate::nn::Initializer;
use crate::tensor::backend::Backend;
use crate::tensor::module::conv_transpose1d;
use crate::tensor::ops::ConvTransposeOptions;
use crate::tensor::Tensor;
use burn_tensor::module::conv_transpose1d;
use burn_tensor::ops::ConvTransposeOptions;

use super::checks;

/// Configuration to create an [1D transposed convolution](ConvTranspose1d) layer.
/// Configuration to create an [1D transposed convolution](ConvTranspose1d) layer
/// using the [init function](ConvTranspose1dConfig::init).
#[derive(Config, Debug)]
pub struct ConvTranspose1dConfig {
/// The number of channels.
@@ -44,12 +44,6 @@ pub struct ConvTranspose1dConfig {
}

/// Applies a 1D transposed convolution over input tensors.
///
/// # Params
///
/// - weight: Tensor of shape `[channels_in, channels_out / groups, kernel_size]`
///
/// - bias: Tensor of shape `[channels_out]`
#[derive(Module, Debug)]
pub struct ConvTranspose1d<B: Backend> {
/// Tensor of shape `[channels_in, channels_out / groups, kernel_size]`
@@ -104,10 +98,12 @@ impl ConvTranspose1dConfig {
impl<B: Backend> ConvTranspose1d<B> {
/// Applies the forward pass on the input tensor.
///
/// See also [conv_transpose1d](crate::tensor::module::conv_transpose1d).
///
/// # Shapes
///
/// - input: [batch_size, channels_in, length_in],
/// - output: [batch_size, channels_out, length_out],
/// - input: `[batch_size, channels_in, length_in]`
/// - output: `[batch_size, channels_out, length_out]`
pub fn forward(&self, input: Tensor<B, 3>) -> Tensor<B, 3> {
conv_transpose1d(
input,
@@ -127,8 +123,8 @@ impl<B: Backend> ConvTranspose1d<B> {
#[cfg(test)]
mod tests {
use super::*;
use crate::tensor::Data;
use crate::TestBackend;
use burn_tensor::Data;

#[test]
fn initializer_default() {
24 changes: 10 additions & 14 deletions crates/burn-core/src/nn/conv/conv_transpose2d.rs
@@ -1,17 +1,17 @@
use crate as burn;

use super::checks;
use crate::config::Config;
use crate::module::Module;
use crate::module::Param;
use crate::nn::conv::checks;
use crate::nn::Initializer;
use crate::tensor::backend::Backend;
use crate::tensor::module::conv_transpose2d;
use crate::tensor::ops::ConvTransposeOptions;
use crate::tensor::Tensor;

use burn_tensor::module::conv_transpose2d;
use burn_tensor::ops::ConvTransposeOptions;

/// Configuration to create an [2D transposed convolution](ConvTranspose2d) layer.
/// Configuration to create an [2D transposed convolution](ConvTranspose2d) layer
/// using the [init function](ConvTranspose2dConfig::init).
#[derive(Config, Debug)]
pub struct ConvTranspose2dConfig {
/// The number of channels.
@@ -44,12 +44,6 @@ pub struct ConvTranspose2dConfig {
}

/// Applies a 2D transposed convolution over input tensors.
///
/// # Params
///
/// - weight: Tensor of shape `[channels_in, channels_out / groups, kernel_size_1, kernel_size_2]`
///
/// - bias: Tensor of shape `[channels_out]`
#[derive(Module, Debug)]
pub struct ConvTranspose2d<B: Backend> {
/// Tensor of shape `[channels_in, channels_out / groups, kernel_size_1, kernel_size_2]`
@@ -105,10 +99,12 @@ impl ConvTranspose2dConfig {
impl<B: Backend> ConvTranspose2d<B> {
/// Applies the forward pass on the input tensor.
///
/// See also [conv_transpose2d](crate::tensor::module::conv_transpose2d).
///
/// # Shapes
///
/// - input: [batch_size, channels_in, height_in, width_in],
/// - output: [batch_size, channels_out, height_out, width_out],
/// - input: `[batch_size, channels_in, height_in, width_in]`
/// - output: `[batch_size, channels_out, height_out, width_out]`
pub fn forward(&self, input: Tensor<B, 4>) -> Tensor<B, 4> {
conv_transpose2d(
input,
@@ -128,8 +124,8 @@ impl<B: Backend> ConvTranspose2d<B> {
#[cfg(test)]
mod tests {
use super::*;
use crate::tensor::Data;
use crate::TestBackend;
use burn_tensor::Data;

#[test]
fn initializer_default() {
6 changes: 5 additions & 1 deletion crates/burn-core/src/nn/dropout.rs
@@ -5,7 +5,7 @@ use crate::module::Module;
use crate::tensor::backend::Backend;
use crate::tensor::{Distribution, Tensor};

/// Configuration to create a [Dropout](Dropout) layer.
/// Configuration to create a [Dropout](Dropout) layer using the [init function](DropoutConfig::init).
#[derive(Config, Debug)]
pub struct DropoutConfig {
/// The probability of randomly zeroes some elements of the input tensor during training.
@@ -18,6 +18,8 @@ pub struct DropoutConfig {
/// [Improving neural networks by preventing co-adaptation of feature detectors](https://arxiv.org/abs/1207.0580).
///
/// The input is also scaled during training to `1 / (1 - prob_keep)`.
///
/// Should be created with [DropoutConfig].
#[derive(Module, Clone, Debug)]
pub struct Dropout {
prob: f64,
@@ -33,6 +35,8 @@ impl DropoutConfig {
impl Dropout {
/// Applies the forward pass on the input tensor.
///
/// See [Dropout](Dropout) for more information.
///
/// # Shapes
///
/// - input: `[..., any]`