diff --git a/crates/burn-core/src/nn/attention/mask.rs b/crates/burn-core/src/nn/attention/mask.rs index a9ef538295..8bab9fbfc2 100644 --- a/crates/burn-core/src/nn/attention/mask.rs +++ b/crates/burn-core/src/nn/attention/mask.rs @@ -1,6 +1,6 @@ use alloc::vec::Vec; -use burn_tensor::{backend::Backend, Bool, Data, ElementConversion, Int, Shape, Tensor}; +use crate::tensor::{backend::Backend, Bool, Data, ElementConversion, Int, Shape, Tensor}; /// Generate an autoregressive attention mask. /// @@ -89,9 +89,9 @@ pub fn generate_padding_mask( #[cfg(test)] mod tests { use super::*; + use crate::tensor::Data; use crate::TestBackend; use alloc::vec; - use burn_tensor::Data; #[test] fn test_generate_autoregressive_mask() { diff --git a/crates/burn-core/src/nn/attention/mha.rs b/crates/burn-core/src/nn/attention/mha.rs index b8fba51efb..ed6eb49235 100644 --- a/crates/burn-core/src/nn/attention/mha.rs +++ b/crates/burn-core/src/nn/attention/mha.rs @@ -12,21 +12,21 @@ use crate::{ #[cfg(not(feature = "std"))] use num_traits::Float; -/// Configuration to create a [Multi Head Attention](MultiHeadAttention) layer. +/// Configuration to create a [Multi Head Attention](MultiHeadAttention) layer using the [init function](MultiHeadAttentionConfig::init). #[derive(Config)] pub struct MultiHeadAttentionConfig { /// The size of each linear layer. - d_model: usize, + pub d_model: usize, /// The number of heads. - n_heads: usize, + pub n_heads: usize, /// The dropout rate. Default: 0.1 #[config(default = 0.1)] - dropout: f64, + pub dropout: f64, /// The minimum value a float can take. Default: -1.0e4 /// This is used to mask attention scores before calculating attention weights. /// A value too low might result in NaN. #[config(default = -1.0e4)] - min_float: f64, + pub min_float: f64, /// Use "quiet softmax" instead of regular softmax. /// /// - Usage may improve performance by allowing attention heads to deposit no information (if the sequence contains no information relevant to that head). @@ -34,7 +34,7 @@ pub struct MultiHeadAttentionConfig { /// /// Reference: #[config(default = false)] - quiet_softmax: bool, + pub quiet_softmax: bool, /// The type of function used to initialize neural network parameters #[config( default = "Initializer::KaimingUniform{gain:1.0/num_traits::Float::sqrt(3.0), fan_out_only:false}" @@ -50,6 +50,8 @@ pub struct MultiHeadAttentionConfig { /// - key: [Linear](nn::Linear) layer with `d_model` input and output features. /// - value: [Linear](nn::Linear) layer with `d_model` input and output features. /// - output: [Linear](nn::Linear) layer with `d_model` input and output features. +/// +/// Should be created with [MultiHeadAttentionConfig]. #[derive(Module, Debug)] pub struct MultiHeadAttention { query: nn::Linear, @@ -67,8 +69,11 @@ pub struct MultiHeadAttention { /// [Multihead attention](MultiHeadAttention) forward pass input argument. #[derive(Debug, Clone)] pub struct MhaInput { + /// Shape `[batch_size, seq_length_1, d_model]` query: Tensor, + /// Shape `[batch_size, seq_length_2, d_model]` key: Tensor, + /// Shape `[batch_size, seq_length_2, d_model]` value: Tensor, mask_pad: Option>, mask_attn: Option>, @@ -101,6 +106,9 @@ impl MultiHeadAttentionConfig { impl MhaInput { /// Create a [multihead attention](MultiHeadAttention) input argument /// by setting the query, key and value to the given tensor. 
+ /// + /// # Shape + /// - tensor: `[batch_size, seq_length, d_model]` pub fn self_attn(tensor: Tensor) -> Self { Self { query: tensor.clone(), @@ -138,15 +146,17 @@ impl MhaInput { /// [Multihead attention](MultiHeadAttention) outputs. #[derive(Debug, Clone)] pub struct MhaOutput { - /// The attention weights [batch_size, n_heads, seq_length_1, seq_length_2]. + /// The attention weights `[batch_size, n_heads, seq_length_1, seq_length_2]`. pub weights: Tensor, - /// The context tensor [batch_size, seq_length_1, d_model]. + /// The context tensor `[batch_size, seq_length_1, d_model]`. pub context: Tensor, } impl MultiHeadAttention { /// Applies the forward pass on the input tensors. /// + /// See [MultiHeadAttention](MultiHeadAttention) for more information. + /// /// # Shapes /// /// - query: `[batch_size, seq_length_1, d_model]` @@ -310,10 +320,10 @@ impl MhaLinearCache { #[cfg(test)] mod tests { use super::*; + use crate::tensor::Int; + use crate::tensor::{Distribution, Shape}; use crate::{nn::attention::generate_autoregressive_mask, TestBackend}; use alloc::vec::Vec; - use burn::tensor::{Distribution, Shape}; - use burn_tensor::Int; #[test] fn test_self_attention_shapes() { diff --git a/crates/burn-core/src/nn/conv/conv1d.rs b/crates/burn-core/src/nn/conv/conv1d.rs index 4f232b7412..a14d668a82 100644 --- a/crates/burn-core/src/nn/conv/conv1d.rs +++ b/crates/burn-core/src/nn/conv/conv1d.rs @@ -3,15 +3,14 @@ use crate as burn; use crate::config::Config; use crate::module::Module; use crate::module::Param; +use crate::nn::conv::checks; use crate::nn::{Initializer, PaddingConfig1d}; use crate::tensor::backend::Backend; +use crate::tensor::module::conv1d; +use crate::tensor::ops::ConvOptions; use crate::tensor::Tensor; -use burn_tensor::module::conv1d; -use burn_tensor::ops::ConvOptions; -use super::checks; - -/// Configuration to create an [1D convolution](Conv1d) layer. +/// Configuration to create a [1D convolution](Conv1d) layer using the [init function](Conv1dConfig::init). #[derive(Config, Debug)] pub struct Conv1dConfig { /// The number of input channels. @@ -44,14 +43,10 @@ pub struct Conv1dConfig { /// Applies a 1D convolution over input tensors. /// -/// # Params -/// -/// - weight: Tensor of shape [channels_out, channels_in / groups, kernel_size] -/// -/// - bias: Tensor of shape `[channels_out]` +/// Should be created with [Conv1dConfig]. #[derive(Module, Debug)] pub struct Conv1d { - /// Tensor of shape [channels_out, channels_in / groups, kernel_size] + /// Tensor of shape `[channels_out, channels_in / groups, kernel_size]` pub weight: Param>, /// Tensor of shape `[channels_out]` pub bias: Option>>, @@ -102,10 +97,12 @@ impl Conv1dConfig { impl Conv1d { /// Applies the forward pass on the input tensor. /// + /// See [conv1d](crate::tensor::module::conv1d) for more information. 
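The multi-head attention docs above (config fields, `MhaInput` shapes, `MhaOutput`) pair naturally with a short usage sketch. This is a hypothetical example, assuming the re-exported paths of the public `burn` crate (inside `burn-core` the same items live under `crate::nn`); the tensor sizes are illustrative.

```rust
use burn::nn::attention::{MhaInput, MultiHeadAttentionConfig};
use burn::tensor::{backend::Backend, Distribution, Tensor};

fn self_attention_example<B: Backend>(device: &B::Device) -> Tensor<B, 3> {
    let [batch_size, seq_length, d_model] = [2, 6, 32];
    // 4 heads over a model dimension of 32; dropout is optional (default 0.1).
    let mha = MultiHeadAttentionConfig::new(d_model, 4).init::<B>(device);

    // For self-attention, query, key and value all come from the same tensor.
    let embeddings =
        Tensor::<B, 3>::random([batch_size, seq_length, d_model], Distribution::Default, device);
    let output = mha.forward(MhaInput::self_attn(embeddings));

    // `output.weights` is `[batch_size, n_heads, seq_length, seq_length]`,
    // `output.context` is `[batch_size, seq_length, d_model]`.
    output.context
}
```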
+ /// /// # Shapes /// - /// - input: [batch_size, channels_in, length_in], - /// - output: [batch_size, channels_out, length_out], + /// - input: `[batch_size, channels_in, length_in]` + /// - output: `[batch_size, channels_out, length_out]` pub fn forward(&self, input: Tensor) -> Tensor { let [_batch_size, _channels, length] = input.dims(); let padding = self @@ -124,8 +121,8 @@ impl Conv1d { #[cfg(test)] mod tests { use super::*; + use crate::tensor::Data; use crate::TestBackend; - use burn_tensor::Data; #[test] fn initializer_default() { diff --git a/crates/burn-core/src/nn/conv/conv2d.rs b/crates/burn-core/src/nn/conv/conv2d.rs index 877618d6c6..c7350d9916 100644 --- a/crates/burn-core/src/nn/conv/conv2d.rs +++ b/crates/burn-core/src/nn/conv/conv2d.rs @@ -6,13 +6,13 @@ use crate::module::Param; use crate::nn::Initializer; use crate::nn::PaddingConfig2d; use crate::tensor::backend::Backend; +use crate::tensor::module::conv2d; +use crate::tensor::ops::ConvOptions; use crate::tensor::Tensor; -use burn_tensor::module::conv2d; -use burn_tensor::ops::ConvOptions; -use super::checks; +use crate::nn::conv::checks; -/// Configuration to create an [2D convolution](Conv2d) layer. +/// Configuration to create a [2D convolution](Conv2d) layer, using the [init function](Conv2dConfig::init). #[derive(Config, Debug)] pub struct Conv2dConfig { /// The number of channels. @@ -43,11 +43,7 @@ pub struct Conv2dConfig { /// Applies a 2D convolution over input tensors. /// -/// # Params -/// -/// - weight: Tensor of shape `[channels_out, channels_in / groups, kernel_size_1, kernel_size_2]` -/// -/// - bias: Tensor of shape `[channels_out]` +/// Should be created with [Conv2dConfig]. #[derive(Module, Debug)] pub struct Conv2d { /// Tensor of shape `[channels_out, channels_in / groups, kernel_size_1, kernel_size_2]` @@ -106,10 +102,12 @@ impl Conv2dConfig { impl Conv2d { /// Applies the forward pass on the input tensor. /// + /// See [conv2d](crate::tensor::module::conv2d) for more information. + /// /// # Shapes /// - /// - input: [batch_size, channels_in, height_in, width_in], - /// - output: [batch_size, channels_out, height_out, width_out], + /// - input: `[batch_size, channels_in, height_in, width_in]` + /// - output: `[batch_size, channels_out, height_out, width_out]` pub fn forward(&self, input: Tensor) -> Tensor { let [_batch_size, _channels_in, height_in, width_in] = input.dims(); let padding = @@ -127,8 +125,8 @@ impl Conv2d { #[cfg(test)] mod tests { use super::*; + use crate::tensor::Data; use crate::TestBackend; - use burn_tensor::Data; #[test] fn initializer_default() { diff --git a/crates/burn-core/src/nn/conv/conv_transpose1d.rs b/crates/burn-core/src/nn/conv/conv_transpose1d.rs index 81c2372902..1a73191798 100644 --- a/crates/burn-core/src/nn/conv/conv_transpose1d.rs +++ b/crates/burn-core/src/nn/conv/conv_transpose1d.rs @@ -3,15 +3,15 @@ use crate as burn; use crate::config::Config; use crate::module::Module; use crate::module::Param; +use crate::nn::conv::checks; use crate::nn::Initializer; use crate::tensor::backend::Backend; +use crate::tensor::module::conv_transpose1d; +use crate::tensor::ops::ConvTransposeOptions; use crate::tensor::Tensor; -use burn_tensor::module::conv_transpose1d; -use burn_tensor::ops::ConvTransposeOptions; -use super::checks; - -/// Configuration to create an [1D transposed convolution](ConvTranspose1d) layer. +/// Configuration to create an [1D transposed convolution](ConvTranspose1d) layer +/// using the [init function](ConvTranspose1dConfig::init). 
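A minimal sketch of the 2D convolution configured as documented above, assuming the public `burn::nn::conv` paths; the channel counts and the `Same` padding choice are illustrative, not part of the patch.

```rust
use burn::nn::conv::{Conv2d, Conv2dConfig};
use burn::nn::PaddingConfig2d;
use burn::tensor::{backend::Backend, Distribution, Tensor};

fn conv_example<B: Backend>(device: &B::Device) {
    // 3 input channels, 16 output channels, 3x3 kernel, "same" padding.
    let conv: Conv2d<B> = Conv2dConfig::new([3, 16], [3, 3])
        .with_padding(PaddingConfig2d::Same)
        .init(device);

    let images = Tensor::<B, 4>::random([8, 3, 32, 32], Distribution::Default, device);
    let features = conv.forward(images);

    // `[batch_size, channels_out, height_out, width_out]`; spatial size is
    // preserved here because of the `Same` padding and unit stride.
    assert_eq!(features.dims(), [8, 16, 32, 32]);
}
```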
#[derive(Config, Debug)] pub struct ConvTranspose1dConfig { /// The number of channels. @@ -44,12 +44,6 @@ pub struct ConvTranspose1dConfig { } /// Applies a 1D transposed convolution over input tensors. -/// -/// # Params -/// -/// - weight: Tensor of shape `[channels_in, channels_out / groups, kernel_size]` -/// -/// - bias: Tensor of shape `[channels_out]` #[derive(Module, Debug)] pub struct ConvTranspose1d { /// Tensor of shape `[channels_in, channels_out / groups, kernel_size]` @@ -104,10 +98,12 @@ impl ConvTranspose1dConfig { impl ConvTranspose1d { /// Applies the forward pass on the input tensor. /// + /// See also [conv_transpose1d](crate::tensor::module::conv_transpose1d). + /// /// # Shapes /// - /// - input: [batch_size, channels_in, length_in], - /// - output: [batch_size, channels_out, length_out], + /// - input: `[batch_size, channels_in, length_in]` + /// - output: `[batch_size, channels_out, length_out]` pub fn forward(&self, input: Tensor) -> Tensor { conv_transpose1d( input, @@ -127,8 +123,8 @@ impl ConvTranspose1d { #[cfg(test)] mod tests { use super::*; + use crate::tensor::Data; use crate::TestBackend; - use burn_tensor::Data; #[test] fn initializer_default() { diff --git a/crates/burn-core/src/nn/conv/conv_transpose2d.rs b/crates/burn-core/src/nn/conv/conv_transpose2d.rs index 0f2640ddad..0d5132942d 100644 --- a/crates/burn-core/src/nn/conv/conv_transpose2d.rs +++ b/crates/burn-core/src/nn/conv/conv_transpose2d.rs @@ -1,17 +1,17 @@ use crate as burn; -use super::checks; use crate::config::Config; use crate::module::Module; use crate::module::Param; +use crate::nn::conv::checks; use crate::nn::Initializer; use crate::tensor::backend::Backend; +use crate::tensor::module::conv_transpose2d; +use crate::tensor::ops::ConvTransposeOptions; use crate::tensor::Tensor; -use burn_tensor::module::conv_transpose2d; -use burn_tensor::ops::ConvTransposeOptions; - -/// Configuration to create an [2D transposed convolution](ConvTranspose2d) layer. +/// Configuration to create an [2D transposed convolution](ConvTranspose2d) layer +/// using the [init function](ConvTranspose2dConfig::init). #[derive(Config, Debug)] pub struct ConvTranspose2dConfig { /// The number of channels. @@ -44,12 +44,6 @@ pub struct ConvTranspose2dConfig { } /// Applies a 2D transposed convolution over input tensors. -/// -/// # Params -/// -/// - weight: Tensor of shape `[channels_in, channels_out / groups, kernel_size_1, kernel_size_2]` -/// -/// - bias: Tensor of shape `[channels_out]` #[derive(Module, Debug)] pub struct ConvTranspose2d { /// Tensor of shape `[channels_in, channels_out / groups, kernel_size_1, kernel_size_2]` @@ -105,10 +99,12 @@ impl ConvTranspose2dConfig { impl ConvTranspose2d { /// Applies the forward pass on the input tensor. /// + /// See also [conv_transpose2d](crate::tensor::module::conv_transpose2d). 
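As a companion to the transposed-convolution docs above, a hedged sketch (public-crate paths assumed, sizes illustrative) showing how the output length grows with the stride.

```rust
use burn::nn::conv::{ConvTranspose1d, ConvTranspose1dConfig};
use burn::tensor::{backend::Backend, Distribution, Tensor};

fn upsample_example<B: Backend>(device: &B::Device) {
    // 16 input channels, 8 output channels, kernel size 4, stride 2.
    let deconv: ConvTranspose1d<B> = ConvTranspose1dConfig::new([16, 8], 4)
        .with_stride(2)
        .init(device);

    let input = Tensor::<B, 3>::random([2, 16, 50], Distribution::Default, device);
    let output = deconv.forward(input);

    // With the default zero padding:
    // length_out = (length_in - 1) * stride + kernel_size = 49 * 2 + 4 = 102.
    assert_eq!(output.dims(), [2, 8, 102]);
}
```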
+ /// /// # Shapes /// - /// - input: [batch_size, channels_in, height_in, width_in], - /// - output: [batch_size, channels_out, height_out, width_out], + /// - input: `[batch_size, channels_in, height_in, width_in]` + /// - output: `[batch_size, channels_out, height_out, width_out]` pub fn forward(&self, input: Tensor) -> Tensor { conv_transpose2d( input, @@ -128,8 +124,8 @@ impl ConvTranspose2d { #[cfg(test)] mod tests { use super::*; + use crate::tensor::Data; use crate::TestBackend; - use burn_tensor::Data; #[test] fn initializer_default() { diff --git a/crates/burn-core/src/nn/dropout.rs b/crates/burn-core/src/nn/dropout.rs index 92ea9cc320..e10dcf50b4 100644 --- a/crates/burn-core/src/nn/dropout.rs +++ b/crates/burn-core/src/nn/dropout.rs @@ -5,7 +5,7 @@ use crate::module::Module; use crate::tensor::backend::Backend; use crate::tensor::{Distribution, Tensor}; -/// Configuration to create a [Dropout](Dropout) layer. +/// Configuration to create a [Dropout](Dropout) layer using the [init function](DropoutConfig::init). #[derive(Config, Debug)] pub struct DropoutConfig { /// The probability of randomly zeroes some elements of the input tensor during training. @@ -18,6 +18,8 @@ pub struct DropoutConfig { /// [Improving neural networks by preventing co-adaptation of feature detectors](https://arxiv.org/abs/1207.0580). /// /// The input is also scaled during training to `1 / (1 - prob_keep)`. +/// +/// Should be created with [DropoutConfig]. #[derive(Module, Clone, Debug)] pub struct Dropout { prob: f64, @@ -33,6 +35,8 @@ impl DropoutConfig { impl Dropout { /// Applies the forward pass on the input tensor. /// + /// See [Dropout](Dropout) for more information. + /// /// # Shapes /// /// - input: `[..., any]` diff --git a/crates/burn-core/src/nn/embedding.rs b/crates/burn-core/src/nn/embedding.rs index 4ff23be900..3ad02f141d 100644 --- a/crates/burn-core/src/nn/embedding.rs +++ b/crates/burn-core/src/nn/embedding.rs @@ -5,16 +5,18 @@ use crate::config::Config; use crate::module::Module; use crate::module::Param; use crate::tensor::backend::Backend; +use crate::tensor::Int; use crate::tensor::Tensor; -use burn_tensor::Int; -/// Configuration to create an [Embedding](Embedding) layer. +use crate::tensor::module::embedding; + +/// Configuration to create an [Embedding](Embedding) layer using the [init function](EmbeddingConfig::init). #[derive(Config)] pub struct EmbeddingConfig { /// The number of embedding vectors. - n_embedding: usize, + pub n_embedding: usize, /// The size of each vector. - d_model: usize, + pub d_model: usize, /// The type of function used to initialize neural network parameters #[config(default = "Initializer::Normal{mean:0.0, std:1.0}")] pub initializer: Initializer, @@ -22,13 +24,10 @@ pub struct EmbeddingConfig { /// Lookup table to store a fix number of vectors. /// -/// # Params -/// -/// - weight: Matrix of shape `[n_embedding, d_model]` initialized from a normal distribution: -/// `N(0, 1)` +/// Should be created with [EmbeddingConfig]. #[derive(Module, Debug)] pub struct Embedding { - /// The learnable weights of the module of shape [n_embedding, d_model] initialized + /// The learnable weights of the module of shape `[n_embedding, d_model]` initialized /// from a normal distribution `N(0, 1)`. pub weight: Param>, } @@ -47,20 +46,22 @@ impl EmbeddingConfig { impl Embedding { /// Applies the forward pass on the input tensor. /// + /// See also [embedding](crate::tensor::module::embedding). 
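A small sketch of the dropout layer described above, assuming the public `burn::nn` paths; the probability and tensor shape are illustrative.

```rust
use burn::nn::{Dropout, DropoutConfig};
use burn::tensor::{backend::Backend, Distribution, Tensor};

fn dropout_example<B: Backend>(device: &B::Device) -> Tensor<B, 2> {
    // Zero out roughly half of the activations during training.
    let dropout: Dropout = DropoutConfig::new(0.5).init();

    let activations = Tensor::<B, 2>::random([4, 128], Distribution::Default, device);

    // Kept values are rescaled as described in the docs; on a plain (non-autodiff)
    // backend the layer passes the input through unchanged.
    dropout.forward(activations)
}
```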
+ /// /// # Shapes /// - /// - input: [batch_size, seq_length] - /// - output: [batch_size, d_model] + /// - input: `[batch_size, seq_length]` + /// - output: `[batch_size, d_model]` pub fn forward(&self, input: Tensor) -> Tensor { - burn_tensor::module::embedding(self.weight.val(), input) + embedding(self.weight.val(), input) } } #[cfg(test)] mod tests { use super::*; + use crate::tensor::Data; use crate::TestBackend; - use burn_tensor::Data; #[test] fn initializer_default() { diff --git a/crates/burn-core/src/nn/gelu.rs b/crates/burn-core/src/nn/gelu.rs index 37a395e37e..421f83452d 100644 --- a/crates/burn-core/src/nn/gelu.rs +++ b/crates/burn-core/src/nn/gelu.rs @@ -5,6 +5,7 @@ use crate::tensor::backend::Backend; use crate::tensor::Tensor; /// Applies the Gaussian Error Linear Units function element-wise. +/// See also [gelu](burn::tensor::activation::gelu) #[derive(Module, Clone, Debug, Default)] pub struct Gelu {} diff --git a/crates/burn-core/src/nn/initializer.rs b/crates/burn-core/src/nn/initializer.rs index 00f44c3578..7beb8b246b 100644 --- a/crates/burn-core/src/nn/initializer.rs +++ b/crates/burn-core/src/nn/initializer.rs @@ -1,4 +1,4 @@ -use burn_tensor::Shape; +use crate::tensor::Shape; use crate::config::Config; use crate::module::{Param, ParamId}; @@ -200,7 +200,7 @@ fn normal_draw>>( mod tests { use super::*; - use burn_tensor::{Data, ElementConversion}; + use crate::tensor::{Data, ElementConversion}; use num_traits::Pow; pub type TB = burn_ndarray::NdArray; diff --git a/crates/burn-core/src/nn/leaky_relu.rs b/crates/burn-core/src/nn/leaky_relu.rs index 6fb81331c0..1a230a4841 100644 --- a/crates/burn-core/src/nn/leaky_relu.rs +++ b/crates/burn-core/src/nn/leaky_relu.rs @@ -1,19 +1,20 @@ -use core::marker::PhantomData; - use crate as burn; use crate::config::Config; use crate::module::Module; use crate::tensor::backend::Backend; use crate::tensor::Tensor; +use crate::tensor::activation::leaky_relu; + /// Leaky ReLu layer. -#[derive(Module, Debug)] -pub struct LeakyRelu { +/// +/// Should be created with [LeakyReluConfig](LeakyReluConfig). +#[derive(Module, Clone, Debug)] +pub struct LeakyRelu { /// The negative slope. pub negative_slope: f64, - phantom: PhantomData, } -/// Configuration to create a [Leaky Relu](LeakyRelu) layer. +/// Configuration to create a [Leaky Relu](LeakyRelu) layer using the [init function](LeakyReluConfig::init). #[derive(Config, Debug)] pub struct LeakyReluConfig { /// The negative slope. Default is 0.01 @@ -22,39 +23,36 @@ pub struct LeakyReluConfig { } impl LeakyReluConfig { /// Initialize a new [Leaky Relu](LeakyRelu) Layer - pub fn init(&self) -> LeakyRelu { + pub fn init(&self) -> LeakyRelu { LeakyRelu { negative_slope: self.negative_slope, - phantom: PhantomData, } } } -impl LeakyRelu { +impl LeakyRelu { /// Forward pass for the Leaky ReLu layer. /// - /// # Arguments - /// - /// * `input` - The input tensor. - /// - /// # Returns + /// See [leaky_relu](crate::tensor::activation::leaky_relu) for more information. /// - /// The output tensor. 
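A usage sketch for the embedding lookup documented above, again assuming the public `burn` crate paths; vocabulary size and `d_model` are illustrative. The forward pass yields one `d_model` vector per token, i.e. a `[batch_size, seq_length, d_model]` tensor.

```rust
use burn::nn::{Embedding, EmbeddingConfig};
use burn::tensor::{backend::Backend, Int, Tensor};

fn embedding_example<B: Backend>(device: &B::Device) -> Tensor<B, 3> {
    // A vocabulary of 10 tokens, each mapped to a 4-dimensional vector.
    let embedding: Embedding<B> = EmbeddingConfig::new(10, 4).init(device);

    // Token ids of shape `[batch_size, seq_length]`.
    let tokens = Tensor::<B, 1, Int>::arange(0..6, device).reshape([2, 3]);

    // One `d_model` vector per token id.
    embedding.forward(tokens)
}
```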
- pub fn forward(&self, input: Tensor) -> Tensor { - crate::tensor::activation::leaky_relu(input, self.negative_slope) + /// # Shapes + /// - input: `[..., any]` + /// - output: `[..., any]` + pub fn forward(&self, input: Tensor) -> Tensor { + leaky_relu(input, self.negative_slope) } } #[cfg(test)] mod tests { use super::*; + use crate::tensor::Data; use crate::TestBackend; - use burn_tensor::Data; #[test] fn test_leaky_relu_forward() { let device = ::Device::default(); - let model: LeakyRelu = LeakyReluConfig::new().init(); + let model = LeakyReluConfig::new().init(); let input = Tensor::::from_data(Data::from([[0.4410, -0.2507]]), &device); let out = model.forward(input); assert_eq!(out.to_data(), Data::from([[0.4410, -0.002507]])); @@ -87,7 +85,7 @@ mod tests { ]; let device = ::Device::default(); - let model: LeakyRelu = LeakyReluConfig::new().init(); + let model = LeakyReluConfig::new().init(); let input_data = Tensor::::from_data(Data::from(input), &device); let actual_output = model.forward(input_data); actual_output diff --git a/crates/burn-core/src/nn/linear.rs b/crates/burn-core/src/nn/linear.rs index 6e4bc8e501..d54da5f6fd 100644 --- a/crates/burn-core/src/nn/linear.rs +++ b/crates/burn-core/src/nn/linear.rs @@ -7,7 +7,7 @@ use crate::tensor::{backend::Backend, Tensor}; use super::Initializer; -/// Configuration to create a [Linear](Linear) layer. +/// Configuration to create a [Linear](Linear) layer using the [init function](LinearConfig::init). #[derive(Config, Debug)] pub struct LinearConfig { /// The size of the input features. @@ -26,6 +26,8 @@ pub struct LinearConfig { /// Applies a linear transformation to the input tensor: /// +/// Should be created with [LinearConfig] +/// /// `O = IW + b` #[derive(Module, Debug)] pub struct Linear { @@ -84,8 +86,8 @@ impl Linear { #[cfg(test)] mod tests { use super::*; + use crate::tensor::{Data, Shape}; use crate::TestBackend; - use burn_tensor::{Data, Shape}; #[test] fn initializer_default() { diff --git a/crates/burn-core/src/nn/loss/binary_cross_entropy.rs b/crates/burn-core/src/nn/loss/binary_cross_entropy.rs index 6719ef04db..192e07ab2a 100644 --- a/crates/burn-core/src/nn/loss/binary_cross_entropy.rs +++ b/crates/burn-core/src/nn/loss/binary_cross_entropy.rs @@ -1,11 +1,11 @@ use crate as burn; +use crate::tensor::activation::log_sigmoid; +use crate::tensor::{backend::Backend, Int, Tensor}; use crate::{config::Config, module::Module}; use alloc::vec::Vec; -use burn_tensor::activation::log_sigmoid; -use burn_tensor::{backend::Backend, Int, Tensor}; -/// Configuration to create a [Binary Cross-entropy loss](BinaryCrossEntropyLoss). +/// Configuration to create a [Binary Cross-entropy loss](BinaryCrossEntropyLoss) using the [init function](BinaryCrossEntropyLossConfig::init). #[derive(Config, Debug)] pub struct BinaryCrossEntropyLossConfig { /// Create weighted binary cross-entropy with a weight for each class. @@ -17,11 +17,11 @@ pub struct BinaryCrossEntropyLossConfig { /// /// Hard labels {0, 1} will be changed to `y_smoothed = y(1 - a) + a / num_classes`. /// Alpha = 0 would be the same as default. - smoothing: Option, + pub smoothing: Option, /// Treat the inputs as logits, applying a sigmoid activation when computing the loss. #[config(default = false)] - logits: bool, + pub logits: bool, } impl BinaryCrossEntropyLossConfig { @@ -56,6 +56,8 @@ impl BinaryCrossEntropyLossConfig { } /// Calculate the binary cross entropy loss from the input logits and the targets. 
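The linear layer above lends itself to a short sketch of `O = IW + b` (public-crate paths assumed, feature sizes illustrative).

```rust
use burn::nn::{Linear, LinearConfig};
use burn::tensor::{backend::Backend, Distribution, Tensor};

fn linear_example<B: Backend>(device: &B::Device) {
    // Project 64 input features down to 16 output features.
    let linear: Linear<B> = LinearConfig::new(64, 16).init(device);

    let input = Tensor::<B, 3>::random([2, 10, 64], Distribution::Default, device);
    let output = linear.forward(input);

    // Only the last dimension is transformed.
    assert_eq!(output.dims(), [2, 10, 16]);
}
```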
+/// +/// Should be created using [BinaryCrossEntropyLossConfig] #[derive(Module, Debug)] pub struct BinaryCrossEntropyLoss { /// Weights for cross-entropy. @@ -146,8 +148,8 @@ impl BinaryCrossEntropyLoss { #[cfg(test)] mod tests { use super::*; + use crate::tensor::{activation::sigmoid, Data}; use crate::TestBackend; - use burn_tensor::{activation::sigmoid, Data}; #[test] fn test_binary_cross_entropy() { diff --git a/crates/burn-core/src/nn/loss/cross_entropy.rs b/crates/burn-core/src/nn/loss/cross_entropy.rs index 82b71797f1..0dea70b517 100644 --- a/crates/burn-core/src/nn/loss/cross_entropy.rs +++ b/crates/burn-core/src/nn/loss/cross_entropy.rs @@ -1,18 +1,18 @@ use crate as burn; +use crate::tensor::activation::log_softmax; +use crate::tensor::{backend::Backend, Bool, Int, Tensor}; use crate::{config::Config, module::Module}; use alloc::vec; use alloc::vec::Vec; -use burn_tensor::activation::log_softmax; -use burn_tensor::{backend::Backend, Bool, Int, Tensor}; -/// Configuration to create a [Cross-entropy loss](CrossEntropyLoss). +/// Configuration to create a [Cross-entropy loss](CrossEntropyLoss) using the [init function](CrossEntropyLossConfig::init). #[derive(Config, Debug)] pub struct CrossEntropyLossConfig { /// Create padded cross entropy. /// /// Prevents pad tokens from impacting loss calculation. - pad_tokens: Option>, + pub pad_tokens: Option>, /// Create weighted cross-entropy. /// @@ -21,18 +21,18 @@ pub struct CrossEntropyLossConfig { /// # Pre-conditions /// - The order of the weight vector should correspond to the label integer assignment. /// - Targets assigned negative Int's will not be allowed. - weights: Option>, + pub weights: Option>, /// Create cross-entropy with label smoothing. /// /// Hard labels {0, 1} will be changed to y_smoothed = y(1 - a) + a / nr_classes. /// Alpha = 0 would be the same as default. - smoothing: Option, + pub smoothing: Option, /// Create cross-entropy with probabilities as input instead of logits. /// #[config(default = true)] - logits: bool, + pub logits: bool, } impl CrossEntropyLossConfig { @@ -68,6 +68,8 @@ impl CrossEntropyLossConfig { } /// Calculate the cross entropy loss from the input logits and the targets. +/// +/// Should be created using [CrossEntropyLossConfig] #[derive(Module, Debug)] pub struct CrossEntropyLoss { pad_tokens: Option>, @@ -214,8 +216,8 @@ impl CrossEntropyLoss { #[cfg(test)] mod tests { use super::*; + use crate::tensor::{loss::cross_entropy_with_logits, Data, Distribution}; use crate::TestBackend; - use burn_tensor::{loss::cross_entropy_with_logits, Data, Distribution}; macro_rules! 
setup { () => {{ diff --git a/crates/burn-core/src/nn/loss/huber.rs b/crates/burn-core/src/nn/loss/huber.rs index a2ae13dd75..06e4b0f143 100644 --- a/crates/burn-core/src/nn/loss/huber.rs +++ b/crates/burn-core/src/nn/loss/huber.rs @@ -1,8 +1,8 @@ use crate as burn; +use crate::tensor::backend::Backend; +use crate::tensor::Tensor; use crate::{config::Config, module::Module}; -use burn_tensor::backend::Backend; -use burn_tensor::Tensor; use core::marker::PhantomData; use super::Reduction; @@ -124,8 +124,8 @@ impl HuberLoss { #[cfg(test)] mod tests { use super::*; + use crate::tensor::Data; use crate::TestBackend; - use burn_tensor::Data; type TestTensor = Tensor; #[test] diff --git a/crates/burn-core/src/nn/loss/mse.rs b/crates/burn-core/src/nn/loss/mse.rs index 00945d004b..0bd873a887 100644 --- a/crates/burn-core/src/nn/loss/mse.rs +++ b/crates/burn-core/src/nn/loss/mse.rs @@ -1,7 +1,7 @@ use crate::nn::loss::reduction::Reduction; use core::marker::PhantomData; -use burn_tensor::{backend::Backend, Tensor}; +use crate::tensor::{backend::Backend, Tensor}; /// Calculate the mean squared error loss from the input logits and the targets. #[derive(Clone, Debug)] @@ -55,8 +55,8 @@ impl MseLoss { #[cfg(test)] mod tests { use super::*; + use crate::tensor::Data; use crate::TestBackend; - use burn_tensor::Data; #[test] fn test_mse_loss() { diff --git a/crates/burn-core/src/nn/norm/batch.rs b/crates/burn-core/src/nn/norm/batch.rs index 38d0dc5cf2..9fdca5ed46 100644 --- a/crates/burn-core/src/nn/norm/batch.rs +++ b/crates/burn-core/src/nn/norm/batch.rs @@ -7,7 +7,7 @@ use crate::{ tensor::{backend::Backend, Tensor}, }; -/// Configuration to create a [BatchNorm](BatchNorm) layer. +/// Configuration to create a [BatchNorm](BatchNorm) layer using the [init function](BatchNormConfig::init). #[derive(Config, Debug)] pub struct BatchNormConfig { /// The number of features. @@ -23,18 +23,31 @@ pub struct BatchNormConfig { /// Applies Batch Normalization over a tensor as described in the paper [Batch Normalization](https://arxiv.org/abs/1502.03167) /// /// `Y = norm(X) * γ + β` +/// +/// Where: +/// - `X` is the input tensor +/// - `Y` is the output tensor +/// - `norm` is the normalization function +/// - `γ` is the learnable weight +/// - `β` is the learnable bias +/// +/// Should be created using [BatchNormConfig]. #[derive(Module, Debug)] pub struct BatchNorm { - gamma: Param>, - beta: Param>, - running_mean: RunningState>, - running_var: RunningState>, + /// The learnable weight gamma. + pub gamma: Param>, + /// The learnable weight beta. + pub beta: Param>, + /// The running mean. + pub running_mean: RunningState>, + /// The running variance. + pub running_var: RunningState>, momentum: f64, epsilon: f64, } impl BatchNormConfig { - /// Initialize a new [batch norm](BatchNorm) module. + /// Initializes a new [batch norm](BatchNorm) module. pub fn init(&self, device: &B::Device) -> BatchNorm { let gamma = Initializer::Ones.init([self.num_features], device); let beta = Initializer::Zeros.init([self.num_features], device); @@ -56,10 +69,16 @@ impl BatchNormConfig { impl BatchNorm { /// Applies the forward pass on the input tensor. /// + /// See [BatchNorm](BatchNorm) for more information. + /// /// # Shapes /// /// - input: `[batch_size, channels, ...]` /// - output: `[batch_size, channels, ...]` + /// + /// # Panics + /// + /// This function will panic if the input tensor has a dimension different from `D + 2`. 
pub fn forward(&self, input: Tensor) -> Tensor { // Should be move to a compilation error when const generic support that kind of // validation. https://github.com/rust-lang/rust/issues/76560 @@ -168,8 +187,8 @@ impl BatchNorm { #[cfg(test)] mod tests_1d { use super::*; + use crate::tensor::Data; use crate::{module::AutodiffModule, TestAutodiffBackend}; - use burn_tensor::Data; #[test] fn batch_norm_forward_train() { @@ -228,8 +247,8 @@ mod tests_1d { #[cfg(test)] mod tests_2d { use super::*; + use crate::tensor::Data; use crate::{module::AutodiffModule, TestAutodiffBackend}; - use burn_tensor::Data; #[test] fn batch_norm_forward_train() { diff --git a/crates/burn-core/src/nn/norm/group.rs b/crates/burn-core/src/nn/norm/group.rs index a19c2fbc04..91c04c3d1d 100644 --- a/crates/burn-core/src/nn/norm/group.rs +++ b/crates/burn-core/src/nn/norm/group.rs @@ -7,7 +7,7 @@ use crate::module::Param; use crate::tensor::backend::Backend; use crate::tensor::Tensor; -/// Configuration to create a [GroupNorm](GroupNorm) layer. +/// Configuration to create a [GroupNorm](GroupNorm) layer using the [init function](GroupNormConfig::init). #[derive(Debug, Config)] pub struct GroupNormConfig { /// The number of groups to separate the channels into @@ -24,17 +24,28 @@ pub struct GroupNormConfig { pub affine: bool, } -/// Applies Group Normalization over a mini-batch of inputs. +/// Applies Group Normalization over a mini-batch of inputs as described in the paper [Group Normalization](https://arxiv.org/abs/1803.08494). /// /// `Y = groupnorm(X) * γ + β` +/// +/// Where: +/// - `X` is the input tensor +/// - `Y` is the output tensor +/// - `γ` is the learnable weight +/// - `β` is the learnable bias +/// +/// Should be created using [GroupNormConfig](GroupNormConfig). #[derive(Module, Debug)] pub struct GroupNorm { - num_groups: usize, - num_channels: usize, - gamma: Option>>, - beta: Option>>, - epsilon: f64, - affine: bool, + /// The learnable weight + pub gamma: Option>>, + /// The learnable bias + pub beta: Option>>, + + pub(crate) num_groups: usize, + pub(crate) num_channels: usize, + pub(crate) epsilon: f64, + pub(crate) affine: bool, } impl GroupNormConfig { @@ -69,58 +80,95 @@ impl GroupNormConfig { impl GroupNorm { /// Applies the forward pass on the input tensor. /// + /// See [GroupNorm](GroupNorm) for more information. + /// /// # Shapes /// - /// - input: `[..., any, d_model]` - /// - output: `[..., any, d_model]` + /// - input: `[batch_size, num_channels, *]` + /// - output: `[batch_size, num_channels, *]` pub fn forward(&self, input: Tensor) -> Tensor { - let shape = input.shape(); - if shape.num_elements() <= 2 { + if input.shape().dims[1] != self.num_channels { panic!( - "input rank for GroupNorm should be at least 3, but got {}", - shape.num_elements() + "The number of channels in the input tensor should be equal to the number of channels in the GroupNorm module. 
Expected {}, got {}", + self.num_channels, + input.shape().dims[1] ); } - let batch_size = shape.dims[0]; - let num_channels = shape.dims[1]; + let gamma = self.gamma.as_ref().map(|x| x.val()); + let beta = self.beta.as_ref().map(|x| x.val()); - if num_channels != self.num_channels { - panic!( - "expected {} channels but got {}", - self.num_channels, num_channels - ); - } + group_norm( + input, + gamma, + beta, + self.num_groups, + self.epsilon, + self.affine, + ) + } +} - let hidden_size = - shape.dims[2..].iter().product::() * num_channels / self.num_groups; - let input = input.reshape([batch_size, self.num_groups, hidden_size]); +/// Applies Group Normalization over a mini-batch of inputs as described in the paper [Group Normalization](https://arxiv.org/abs/1803.08494). +/// +/// `Y = groupnorm(X) * γ + β` +/// +/// Where: +/// - `X` is the input tensor +/// - `Y` is the output tensor +/// - `γ` is the learnable weight +/// - `β` is the learnable bias +/// +pub(crate) fn group_norm( + input: Tensor, + gamma: Option>, + beta: Option>, + num_groups: usize, + epsilon: f64, + affine: bool, +) -> Tensor { + if (beta.is_none() || gamma.is_none()) && affine { + panic!("Affine is set to true, but gamma or beta is None"); + } - let mean = input.clone().sum_dim(2) / hidden_size as f64; - let input = input.sub(mean); + let shape = input.shape(); + if shape.num_elements() <= 2 { + panic!( + "input rank for GroupNorm should be at least 3, but got {}", + shape.num_elements() + ); + } - let var = input.clone().powf_scalar(2.).sum_dim(2) / hidden_size as f64; - let input_normalized = input.div(var.sqrt().add_scalar(self.epsilon)); + let batch_size = shape.dims[0]; + let num_channels = shape.dims[1]; - if self.affine { - let mut affine_shape = [1; D]; - affine_shape[1] = num_channels; + let hidden_size = shape.dims[2..].iter().product::() * num_channels / num_groups; + let input = input.reshape([batch_size, num_groups, hidden_size]); - input_normalized - .reshape(shape) - .mul(self.gamma.clone().unwrap().val().reshape(affine_shape)) - .add(self.beta.clone().unwrap().val().reshape(affine_shape)) - } else { - input_normalized.reshape(shape) - } + let mean = input.clone().sum_dim(2) / hidden_size as f64; + let input = input.sub(mean); + + let var = input.clone().powf_scalar(2.).sum_dim(2) / hidden_size as f64; + let input_normalized = input.div(var.sqrt().add_scalar(epsilon)); + + if affine { + let mut affine_shape = [1; D]; + affine_shape[1] = num_channels; + + input_normalized + .reshape(shape) + .mul(gamma.clone().unwrap().reshape(affine_shape)) + .add(beta.clone().unwrap().reshape(affine_shape)) + } else { + input_normalized.reshape(shape) } } #[cfg(test)] mod tests { use super::*; + use crate::tensor::Data; use crate::TestBackend; - use burn_tensor::Data; #[test] fn group_norm_forward_affine_false() { diff --git a/crates/burn-core/src/nn/norm/instance.rs b/crates/burn-core/src/nn/norm/instance.rs index 359d47307f..fb47505b60 100644 --- a/crates/burn-core/src/nn/norm/instance.rs +++ b/crates/burn-core/src/nn/norm/instance.rs @@ -1,45 +1,56 @@ use crate as burn; use crate::config::Config; -use crate::module::Module; +use crate::module::{Module, Param}; +use crate::nn::norm::group_norm; +use crate::nn::Initializer; use crate::tensor::{backend::Backend, Tensor}; -use super::{GroupNorm, GroupNormConfig}; - -/// Configuration to create a [InstanceNorm](InstanceNorm) layer. +/// Configuration to create a [InstanceNorm](InstanceNorm) layer using the [init function](InstanceNormConfig::init). 
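A sketch of the group normalization forward pass described above (public-crate paths assumed, sizes illustrative). As noted in the instance-norm refactor below, instance normalization is the special case where the number of groups equals the number of channels.

```rust
use burn::nn::{GroupNorm, GroupNormConfig};
use burn::tensor::{backend::Backend, Distribution, Tensor};

fn group_norm_example<B: Backend>(device: &B::Device) {
    // 6 channels split into 3 groups of 2 channels each.
    let norm: GroupNorm<B> = GroupNormConfig::new(3, 6).init(device);

    // `[batch_size, num_channels, *]`: any number of trailing dimensions.
    let input = Tensor::<B, 4>::random([2, 6, 8, 8], Distribution::Default, device);
    let output = norm.forward(input);

    // Normalization does not change the shape.
    assert_eq!(output.dims(), [2, 6, 8, 8]);
}
```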
#[derive(Debug, Config)] pub struct InstanceNormConfig { /// The number of channels expected in the input - num_channels: usize, + pub num_channels: usize, /// A value required for numerical stability. Default: 1e-5 #[config(default = 1e-5)] - epsilon: f64, + pub epsilon: f64, /// A boolean value that when set to `true`, this module has learnable /// per-channel affine parameters initialized to ones (for weights) /// and zeros (for biases). Default: `true` #[config(default = true)] - affine: bool, + pub affine: bool, } -/// Applies Instance Normalization over a tensor as described in the paper [Instance Normalization](https://arxiv.org/abs/1607.08022) +/// Applies Instance Normalization over a tensor as described in the paper [Instance Normalization](https://arxiv.org/abs/1607.08022) +/// +/// Should be created using [InstanceNormConfig](InstanceNormConfig). #[derive(Module, Debug)] pub struct InstanceNorm { - group_norm: GroupNorm, + /// The learnable weight + pub gamma: Option>>, + /// The learnable bias + pub beta: Option>>, + + num_channels: usize, + epsilon: f64, + affine: bool, } impl InstanceNormConfig { /// Initialize a new [instance norm](InstanceNorm) module. pub fn init(&self, device: &B::Device) -> InstanceNorm { - InstanceNorm { - group_norm: self.to_group_norm().init(device), - } - } + let (gamma, beta) = if self.affine { + let gamma = Initializer::Ones.init([self.num_channels], device); + let beta = Initializer::Zeros.init([self.num_channels], device); + + (Some(gamma), Some(beta)) + } else { + (None, None) + }; - fn to_group_norm(&self) -> GroupNormConfig { - GroupNormConfig { - // Group norm is equivalent to instance norm, when the number of groups is - // equal to the number of channels. - num_groups: self.num_channels, + InstanceNorm { + gamma, + beta, num_channels: self.num_channels, epsilon: self.epsilon, affine: self.affine, @@ -50,20 +61,28 @@ impl InstanceNormConfig { impl InstanceNorm { /// Applies the forward pass on the input tensor. /// + /// See also [InstanceNormConfig](InstanceNormConfig) for more information. + /// /// # Shapes /// - /// - input: `[..., any, d_model]` - /// - output: `[..., any, d_model]` + /// - input: `[batch_size, num_channels, *]` + /// - output: `[batch_size, num_channels, *]` pub fn forward(&self, input: Tensor) -> Tensor { - self.group_norm.forward(input) + // Instance norm is equivalent to group norm when the number of groups is equal to the number of channels. + let num_groups = self.num_channels; + + let gamma = self.gamma.as_ref().map(|x| x.val()); + let beta = self.beta.as_ref().map(|x| x.val()); + + group_norm(input, gamma, beta, num_groups, self.epsilon, self.affine) } } #[cfg(test)] mod tests { use super::*; + use crate::tensor::Data; use crate::TestBackend; - use burn_tensor::Data; #[test] fn instance_norm_forward_affine_false() { diff --git a/crates/burn-core/src/nn/norm/layer.rs b/crates/burn-core/src/nn/norm/layer.rs index c236c51b21..c0dc71afa8 100644 --- a/crates/burn-core/src/nn/norm/layer.rs +++ b/crates/burn-core/src/nn/norm/layer.rs @@ -1,13 +1,13 @@ use crate as burn; -use crate::nn::Initializer; use crate::config::Config; use crate::module::Module; use crate::module::Param; +use crate::nn::Initializer; use crate::tensor::backend::Backend; use crate::tensor::Tensor; -/// Configuration to create a [LayerNorm](LayerNorm) layer. +/// Configuration to create a [LayerNorm](LayerNorm) layer using the [init function](LayerNormConfig::init). 
#[derive(Debug, Config)] pub struct LayerNormConfig { /// The size of the input features. @@ -20,9 +20,19 @@ pub struct LayerNormConfig { /// Applies Layer Normalization over an input tensor as described in the paper [Layer Normalization](https://arxiv.org/abs/1607.06450). /// /// `Y = norm(X) * γ + β` +/// +/// Where: +/// - `X` is the input tensor +/// - `Y` is the output tensor +/// - `γ` is the learnable weight +/// - `β` is the learnable bias +/// +/// Should be created using [LayerNormConfig](LayerNormConfig). #[derive(Module, Debug)] pub struct LayerNorm { + /// The learnable weight. gamma: Param>, + /// The learnable bias. beta: Param>, epsilon: f64, } @@ -44,6 +54,8 @@ impl LayerNormConfig { impl LayerNorm { /// Applies the forward pass on the input tensor. /// + /// See the [LayerNorm](LayerNorm) documentation for more information. + /// /// # Shapes /// /// - input: `[..., any, d_model]` @@ -62,7 +74,7 @@ impl LayerNorm { #[cfg(test)] mod tests { use super::*; - use burn_tensor::Data; + use crate::tensor::Data; #[cfg(feature = "std")] use crate::{TestAutodiffBackend, TestBackend}; diff --git a/crates/burn-core/src/nn/norm/rms.rs b/crates/burn-core/src/nn/norm/rms.rs index 6c6f313b7d..2ac15df227 100644 --- a/crates/burn-core/src/nn/norm/rms.rs +++ b/crates/burn-core/src/nn/norm/rms.rs @@ -7,18 +7,22 @@ use crate::nn::Initializer; use crate::tensor::backend::Backend; use crate::tensor::Tensor; -/// Configuration to create a [RMS Norm](RmsNorm) layer. +/// Configuration to create a [RMS Norm](RmsNorm) layer using the [init function](RmsNormConfig::init). #[derive(Config)] pub struct RmsNormConfig { /// The size of the input features. - d_model: usize, + pub d_model: usize, /// A value required for numerical stability. Default: 1e-5 #[config(default = 1e-5)] - epsilon: f64, + pub epsilon: f64, } impl RmsNormConfig { /// Initialize a new [RMS Norm](RmsNorm) module. + /// + /// # Panics + /// + /// Panics if `epsilon` is not positive. pub fn init(&self, device: &B::Device) -> RmsNorm { assert!(self.epsilon > 0.0, "epsilon must be positive."); @@ -35,11 +39,18 @@ impl RmsNormConfig { /// /// `Y = X / sqrt(mean(X^2) + eps) * gamma` /// -/// where `eps` is a small value to avoid division by zero. +/// Where: +/// - `X` is the input tensor +/// - `Y` is the output tensor +/// - `gamma` is the learnable weight +/// - `mean` is the mean operation +/// - `eps` is a small value to avoid division by zero. +/// +/// Should be created using the [RmsNormConfig](RmsNormConfig) configuration. #[derive(Module, Debug)] pub struct RmsNorm { /// The learnable parameter to scale the normalized tensor - gamma: Param>, + pub gamma: Param>, /// A value required for numerical stability epsilon: f64, } @@ -47,6 +58,8 @@ pub struct RmsNorm { impl RmsNorm { /// Applies the forward pass on the input tensor. /// + /// See the [RmsNorm](RmsNorm) documentation for more information. 
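A minimal sketch of `LayerNorm`, which normalizes over the trailing `d_model` dimension as documented above (public-crate paths assumed, sizes illustrative).

```rust
use burn::nn::{LayerNorm, LayerNormConfig};
use burn::tensor::{backend::Backend, Distribution, Tensor};

fn layer_norm_example<B: Backend>(device: &B::Device) -> Tensor<B, 3> {
    // `Y = norm(X) * gamma + beta`, computed over the last dimension of size 64.
    let norm: LayerNorm<B> = LayerNormConfig::new(64).init(device);

    let input = Tensor::<B, 3>::random([2, 10, 64], Distribution::Default, device);
    norm.forward(input)
}
```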
+ /// /// # Shapes /// /// - input: `[..., any, d_model]` @@ -61,8 +74,8 @@ impl RmsNorm { #[cfg(test)] mod tests { use super::*; + use crate::tensor::Data; use crate::TestBackend; - use burn_tensor::Data; #[test] fn rms_norm_forward() { diff --git a/crates/burn-core/src/nn/padding.rs b/crates/burn-core/src/nn/padding.rs index db27bf4486..8f64340108 100644 --- a/crates/burn-core/src/nn/padding.rs +++ b/crates/burn-core/src/nn/padding.rs @@ -1,6 +1,6 @@ use crate as burn; -use burn_tensor::ops::conv::calculate_conv_padding; +use crate::tensor::ops::conv::calculate_conv_padding; use crate::config::Config; use crate::module::Module; diff --git a/crates/burn-core/src/nn/pool/adaptive_avg_pool1d.rs b/crates/burn-core/src/nn/pool/adaptive_avg_pool1d.rs index 9551db1db8..dd2c1d33c7 100644 --- a/crates/burn-core/src/nn/pool/adaptive_avg_pool1d.rs +++ b/crates/burn-core/src/nn/pool/adaptive_avg_pool1d.rs @@ -4,9 +4,10 @@ use crate::config::Config; use crate::module::Module; use crate::tensor::backend::Backend; use crate::tensor::Tensor; -use burn_tensor::module::adaptive_avg_pool1d; -/// Configuration to create a [1D adaptive avg pooling](AdaptiveAvgPool1d) layer. +use crate::tensor::module::adaptive_avg_pool1d; + +/// Configuration to create a [1D adaptive avg pooling](AdaptiveAvgPool1d) layer using the [init function](AdaptiveAvgPool1dConfig::init). #[derive(Config)] pub struct AdaptiveAvgPool1dConfig { /// The size of the output. @@ -14,6 +15,8 @@ pub struct AdaptiveAvgPool1dConfig { } /// Applies a 1D adaptive avg pooling over input tensors. +/// +/// Should be created with [AdaptiveAvgPool1dConfig]. #[derive(Module, Clone, Debug)] pub struct AdaptiveAvgPool1d { output_size: usize, @@ -31,10 +34,12 @@ impl AdaptiveAvgPool1dConfig { impl AdaptiveAvgPool1d { /// Applies the forward pass on the input tensor. /// + /// See [adaptive_avg_pool1d](crate::tensor::module::adaptive_avg_pool1d) for more information. + /// /// # Shapes /// - /// - input: [batch_size, channels, length], - /// - output: [batch_size, channels, length_out], + /// - input: `[batch_size, channels, length]` + /// - output: `[batch_size, channels, length_out]` pub fn forward(&self, input: Tensor) -> Tensor { adaptive_avg_pool1d(input, self.output_size) } diff --git a/crates/burn-core/src/nn/pool/adaptive_avg_pool2d.rs b/crates/burn-core/src/nn/pool/adaptive_avg_pool2d.rs index c5849fa84d..8d4d55d424 100644 --- a/crates/burn-core/src/nn/pool/adaptive_avg_pool2d.rs +++ b/crates/burn-core/src/nn/pool/adaptive_avg_pool2d.rs @@ -4,9 +4,10 @@ use crate::config::Config; use crate::module::Module; use crate::tensor::backend::Backend; use crate::tensor::Tensor; -use burn_tensor::module::adaptive_avg_pool2d; -/// Configuration to create a [2D adaptive avg pooling](AdaptiveAvgPool2d) layer. +use crate::tensor::module::adaptive_avg_pool2d; + +/// Configuration to create a [2D adaptive avg pooling](AdaptiveAvgPool2d) layer using the [init function](AdaptiveAvgPool2dConfig::init). #[derive(Config)] pub struct AdaptiveAvgPool2dConfig { /// The size of the output. @@ -14,6 +15,8 @@ pub struct AdaptiveAvgPool2dConfig { } /// Applies a 2D adaptive avg pooling over input tensors. +/// +/// Should be created with [AdaptiveAvgPool2dConfig]. #[derive(Module, Clone, Debug)] pub struct AdaptiveAvgPool2d { output_size: [usize; 2], @@ -31,10 +34,12 @@ impl AdaptiveAvgPool2dConfig { impl AdaptiveAvgPool2d { /// Applies the forward pass on the input tensor. /// + /// See [adaptive_avg_pool2d](crate::tensor::module::adaptive_avg_pool2d) for more information. 
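The adaptive pooling layers introduced above take any input length and reduce it to a fixed output size; a hedged sketch follows (public `burn::nn::pool` path assumed, sizes illustrative).

```rust
use burn::nn::pool::{AdaptiveAvgPool1d, AdaptiveAvgPool1dConfig};
use burn::tensor::{backend::Backend, Distribution, Tensor};

fn adaptive_pool_example<B: Backend>(device: &B::Device) {
    // Pools any input length down to a fixed output length of 4.
    let pool: AdaptiveAvgPool1d = AdaptiveAvgPool1dConfig::new(4).init();

    let input = Tensor::<B, 3>::random([2, 8, 100], Distribution::Default, device);
    let output = pool.forward(input);

    // `[batch_size, channels, length_out]`
    assert_eq!(output.dims(), [2, 8, 4]);
}
```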
+ /// /// # Shapes /// - /// - input: [batch_size, channels, height_in, width_in], - /// - output: [batch_size, channels, height_out, width_out], + /// - input: `[batch_size, channels, height_in, width_in]` + /// - output: `[batch_size, channels, height_out, width_out]` pub fn forward(&self, input: Tensor) -> Tensor { adaptive_avg_pool2d(input, self.output_size) } diff --git a/crates/burn-core/src/nn/pool/avg_pool1d.rs b/crates/burn-core/src/nn/pool/avg_pool1d.rs index 68e29fdde9..4b58eb3025 100644 --- a/crates/burn-core/src/nn/pool/avg_pool1d.rs +++ b/crates/burn-core/src/nn/pool/avg_pool1d.rs @@ -5,9 +5,10 @@ use crate::module::Module; use crate::nn::PaddingConfig1d; use crate::tensor::backend::Backend; use crate::tensor::Tensor; -use burn_tensor::module::avg_pool1d; -/// Configuration to create a [1D avg pooling](AvgPool1d) layer. +use crate::tensor::module::avg_pool1d; + +/// Configuration to create a [1D avg pooling](AvgPool1d) layer using the [init function](AvgPool1dConfig::init). #[derive(Config, Debug)] pub struct AvgPool1dConfig { /// The size of the kernel. @@ -25,7 +26,7 @@ pub struct AvgPool1dConfig { /// Applies a 1D avg pooling over input tensors. /// -/// See [AvgPool1dConfig](AvgPool1dConfig) for details. +/// Should be created with [AvgPool1dConfig](AvgPool1dConfig). /// /// # Remarks /// @@ -61,10 +62,12 @@ impl AvgPool1dConfig { impl AvgPool1d { /// Applies the forward pass on the input tensor. /// + /// See [avg_pool1d](crate::tensor::module::avg_pool1d) for more information. + /// /// # Shapes /// - /// - input: [batch_size, channels, length_in], - /// - output: [batch_size, channels, length_out], + /// - input: `[batch_size, channels, length_in]` + /// - output: `[batch_size, channels, length_out]` pub fn forward(&self, input: Tensor) -> Tensor { let [_batch_size, _channels, length] = input.dims(); let padding = self diff --git a/crates/burn-core/src/nn/pool/avg_pool2d.rs b/crates/burn-core/src/nn/pool/avg_pool2d.rs index 8ec63d0856..fb2aff8d2e 100644 --- a/crates/burn-core/src/nn/pool/avg_pool2d.rs +++ b/crates/burn-core/src/nn/pool/avg_pool2d.rs @@ -5,9 +5,10 @@ use crate::module::Module; use crate::nn::PaddingConfig2d; use crate::tensor::backend::Backend; use crate::tensor::Tensor; -use burn_tensor::module::avg_pool2d; -/// Configuration to create a [2D avg pooling](AvgPool2d) layer. +use crate::tensor::module::avg_pool2d; + +/// Configuration to create a [2D avg pooling](AvgPool2d) layer using the [init function](AvgPool2dConfig::init). #[derive(Config, Debug)] pub struct AvgPool2dConfig { /// The size of the kernel. @@ -25,7 +26,7 @@ pub struct AvgPool2dConfig { /// Applies a 2D avg pooling over input tensors. /// -/// See [AvgPool2dConfig](AvgPool2dConfig) for details. +/// Should be created with [AvgPool2dConfig](AvgPool2dConfig). /// /// # Remarks /// @@ -60,10 +61,12 @@ impl AvgPool2dConfig { impl AvgPool2d { /// Applies the forward pass on the input tensor. /// + /// See [avg_pool2d](crate::tensor::module::avg_pool2d) for more information. 
+ /// /// # Shapes /// - /// - input: [batch_size, channels, height_in, width_in], - /// - output: [batch_size, channels, height_out, width_out], + /// - input: `[batch_size, channels, height_in, width_in]` + /// - output: `[batch_size, channels, height_out, width_out]` pub fn forward(&self, input: Tensor) -> Tensor { let [_batch_size, _channels_in, height_in, width_in] = input.dims(); let padding = diff --git a/crates/burn-core/src/nn/pool/max_pool1d.rs b/crates/burn-core/src/nn/pool/max_pool1d.rs index eabed4603f..632ab6622d 100644 --- a/crates/burn-core/src/nn/pool/max_pool1d.rs +++ b/crates/burn-core/src/nn/pool/max_pool1d.rs @@ -5,9 +5,10 @@ use crate::module::Module; use crate::nn::PaddingConfig1d; use crate::tensor::backend::Backend; use crate::tensor::Tensor; -use burn_tensor::module::max_pool1d; -/// Configuration to create a [1D max pooling](MaxPool1d) layer. +use crate::tensor::module::max_pool1d; + +/// Configuration to create a [1D max pooling](MaxPool1d) layer using the [init function](MaxPool1dConfig::init). #[derive(Config, Debug)] pub struct MaxPool1dConfig { /// The size of the kernel. @@ -24,6 +25,8 @@ pub struct MaxPool1dConfig { } /// Applies a 1D max pooling over input tensors. +/// +/// Should be created with [MaxPool1dConfig](MaxPool1dConfig). #[derive(Module, Clone, Debug)] pub struct MaxPool1d { stride: usize, @@ -47,10 +50,12 @@ impl MaxPool1dConfig { impl MaxPool1d { /// Applies the forward pass on the input tensor. /// + /// See [max_pool1d](crate::tensor::module::max_pool1d) for more information. + /// /// # Shapes /// - /// - input: [batch_size, channels, length_in], - /// - output: [batch_size, channels, length_out], + /// - input: `[batch_size, channels, length_in]` + /// - output: `[batch_size, channels, length_out]` pub fn forward(&self, input: Tensor) -> Tensor { let [_batch_size, _channels, length] = input.dims(); let padding = self diff --git a/crates/burn-core/src/nn/pool/max_pool2d.rs b/crates/burn-core/src/nn/pool/max_pool2d.rs index 9697ab3529..63dee1326d 100644 --- a/crates/burn-core/src/nn/pool/max_pool2d.rs +++ b/crates/burn-core/src/nn/pool/max_pool2d.rs @@ -5,9 +5,10 @@ use crate::module::Module; use crate::nn::PaddingConfig2d; use crate::tensor::backend::Backend; use crate::tensor::Tensor; -use burn_tensor::module::max_pool2d; -/// Configuration to create an [2D max pooling](MaxPool2d) layer. +use crate::tensor::module::max_pool2d; + +/// Configuration to create a [2D max pooling](MaxPool2d) layer using the [init function](MaxPool2dConfig::init). #[derive(Debug, Config)] pub struct MaxPool2dConfig { /// The size of the kernel. @@ -24,6 +25,8 @@ pub struct MaxPool2dConfig { } /// Applies a 2D max pooling over input tensors. +/// +/// Should be created with [MaxPool2dConfig](MaxPool2dConfig). #[derive(Module, Clone, Debug)] pub struct MaxPool2d { stride: [usize; 2], @@ -47,10 +50,12 @@ impl MaxPool2dConfig { impl MaxPool2d { /// Applies the forward pass on the input tensor. /// + /// See [max_pool2d](crate::tensor::module::max_pool2d) for more information. 
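For the max pooling layers above, a short sketch with a 2x2 window and stride 2, which halves each spatial dimension (public-crate paths assumed, sizes illustrative).

```rust
use burn::nn::pool::{MaxPool2d, MaxPool2dConfig};
use burn::tensor::{backend::Backend, Distribution, Tensor};

fn max_pool_example<B: Backend>(device: &B::Device) {
    // 2x2 window moved with a stride of 2 in both directions.
    let pool: MaxPool2d = MaxPool2dConfig::new([2, 2]).with_strides([2, 2]).init();

    let input = Tensor::<B, 4>::random([1, 3, 32, 32], Distribution::Default, device);
    let output = pool.forward(input);

    // `[batch_size, channels, height_out, width_out]`
    assert_eq!(output.dims(), [1, 3, 16, 16]);
}
```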
+ /// /// # Shapes /// - /// - input: [batch_size, channels, height_in, width_in], - /// - output: [batch_size, channels, height_out, width_out], + /// - input: `[batch_size, channels, height_in, width_in]` + /// - output: `[batch_size, channels, height_out, width_out]` pub fn forward(&self, input: Tensor) -> Tensor { let [_batch_size, _channels_in, height_in, width_in] = input.dims(); let padding = diff --git a/crates/burn-core/src/nn/pos_encoding.rs b/crates/burn-core/src/nn/pos_encoding.rs index 0df639316a..703927dc3d 100644 --- a/crates/burn-core/src/nn/pos_encoding.rs +++ b/crates/burn-core/src/nn/pos_encoding.rs @@ -4,25 +4,25 @@ use crate as burn; use crate::config::Config; use crate::module::Module; use crate::tensor::backend::Backend; +use crate::tensor::Data; use crate::tensor::Tensor; -use burn_tensor::Data; #[cfg(not(feature = "std"))] use num_traits::Float; -/// Configuration to create an [PositionalEncoding](PositionalEncoding) layer. +/// Configuration to create a [PositionalEncoding](PositionalEncoding) layer using the [init function](PositionalEncodingConfig::init). #[derive(Config)] pub struct PositionalEncodingConfig { /// Maximum sequence size to use. #[config(default = "5_000")] - max_sequence_size: usize, + pub max_sequence_size: usize, /// The size of each vector. - d_model: usize, + pub d_model: usize, /// Max time scale to use. #[config(default = "10_000")] - max_timescale: usize, + pub max_timescale: usize, } /// Positional encoding layer for transformer models. @@ -37,6 +37,8 @@ pub struct PositionalEncodingConfig { /// The reference implementation can be found here: /// [LANGUAGE MODELING WITH NN.TRANSFORMER AND TORCHTEXT /// ](https://pytorch.org/tutorials/beginner/transformer_tutorial.html) +/// +/// Should be created using [PositionalEncodingConfig] #[derive(Module, Debug)] pub struct PositionalEncoding { sinusoids: Tensor, diff --git a/crates/burn-core/src/nn/prelu.rs b/crates/burn-core/src/nn/prelu.rs index d8ea5f90ab..f15c96481c 100644 --- a/crates/burn-core/src/nn/prelu.rs +++ b/crates/burn-core/src/nn/prelu.rs @@ -6,13 +6,15 @@ use crate::nn::Initializer; use crate::tensor::backend::Backend; use crate::tensor::Tensor; /// Parametric Relu layer. +/// +/// Should be created using [PReluConfig] #[derive(Module, Debug)] pub struct PRelu { /// the weights learnt for PReLu. can be of shape \[1\] or \[num_parameters\] in which case it must /// be the same as number of channels in the input tensor pub alpha: Param>, } -/// Configuration to create a [Parametric Relu](PRelu) layer. +/// Configuration to create a [Parametric Relu](PRelu) layer using the [init function](PReluConfig::init). #[derive(Config, Debug)] pub struct PReluConfig { /// The number of parameters. @@ -39,6 +41,8 @@ impl PRelu { /// /// - input: `[..., any]` /// - output: `[..., any]` + /// + /// See also [prelu](crate::tensor::activation::prelu) for more information. 
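A sketch of the sinusoidal positional encoding documented above, which adds a precomputed encoding to token embeddings (public-crate paths assumed, `d_model` and sequence length illustrative).

```rust
use burn::nn::{PositionalEncoding, PositionalEncodingConfig};
use burn::tensor::{backend::Backend, Distribution, Tensor};

fn positional_encoding_example<B: Backend>(device: &B::Device) -> Tensor<B, 3> {
    // Sinusoids are precomputed for up to `max_sequence_size` positions (default 5000).
    let pe: PositionalEncoding<B> = PositionalEncodingConfig::new(64).init(device);

    // Token embeddings of shape `[batch_size, seq_length, d_model]`.
    let embeddings = Tensor::<B, 3>::random([2, 10, 64], Distribution::Default, device);

    // Adds the sinusoidal encoding to every position; the shape is unchanged.
    pe.forward(embeddings)
}
```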
pub fn forward(&self, input: Tensor) -> Tensor { crate::tensor::activation::prelu(input, self.alpha.val()) } diff --git a/crates/burn-core/src/nn/relu.rs b/crates/burn-core/src/nn/relu.rs index bd8c92a4dd..262c393134 100644 --- a/crates/burn-core/src/nn/relu.rs +++ b/crates/burn-core/src/nn/relu.rs @@ -4,9 +4,9 @@ use crate::module::Module; use crate::tensor::backend::Backend; use crate::tensor::Tensor; -/// Applies the rectified linear unit function element-wise: +/// Applies the rectified linear unit function element-wise +/// See also [relu](burn::tensor::activation::relu) /// -/// `y = max(0, x)` #[derive(Module, Clone, Debug, Default)] pub struct Relu {} diff --git a/crates/burn-core/src/nn/rnn/gate_controller.rs b/crates/burn-core/src/nn/rnn/gate_controller.rs index 0c52f383f4..f20f7fa2a1 100644 --- a/crates/burn-core/src/nn/rnn/gate_controller.rs +++ b/crates/burn-core/src/nn/rnn/gate_controller.rs @@ -2,7 +2,7 @@ use crate as burn; use crate::module::Module; use crate::nn::{Initializer, Linear, LinearConfig}; -use burn_tensor::{backend::Backend, Tensor}; +use crate::tensor::{backend::Backend, Tensor}; /// A GateController represents a gate in an LSTM cell. An /// LSTM cell generally contains three gates: an input gate, diff --git a/crates/burn-core/src/nn/rnn/gru.rs b/crates/burn-core/src/nn/rnn/gru.rs index ddcada5526..fc29bdc192 100644 --- a/crates/burn-core/src/nn/rnn/gru.rs +++ b/crates/burn-core/src/nn/rnn/gru.rs @@ -4,13 +4,13 @@ use crate::config::Config; use crate::module::Module; use crate::nn::rnn::gate_controller; use crate::nn::Initializer; +use crate::tensor::activation; use crate::tensor::backend::Backend; use crate::tensor::Tensor; -use burn_tensor::activation; use super::gate_controller::GateController; -/// The configuration for a [gru](Gru) module. +/// Configuration to create a [gru](Gru) module using the [init function](GruConfig::init). #[derive(Config)] pub struct GruConfig { /// The size of the input features. @@ -24,7 +24,11 @@ pub struct GruConfig { pub initializer: Initializer, } -/// The Gru module. This implementation is for a unidirectional, stateless, Gru. +/// The Gru (Gated recurrent unit) module. This implementation is for a unidirectional, stateless, Gru. +/// +/// Introduced in the paper: [Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation](https://arxiv.org/abs/1406.1078). +/// +/// Should be created with [GruConfig]. #[derive(Module, Debug)] pub struct Gru { update_gate: GateController, @@ -73,13 +77,11 @@ impl Gru { /// Applies the forward pass on the input tensor. This GRU implementation /// returns a single state tensor with dimensions [batch_size, sequence_length, hidden_size]. /// - /// Parameters: - /// batched_input: The input tensor of shape [batch_size, sequence_length, input_size]. - /// state: An optional tensor representing an initial cell state with the same dimensions - /// as batched_input. If none is provided, one will be generated. - /// - /// Returns: - /// The resulting state tensor, with shape [batch_size, sequence_length, hidden_size]. + /// # Shapes + /// - batched_input: `[batch_size, sequence_length, input_size]`. + /// - state: An optional tensor representing an initial cell state with the same dimensions + /// as batched_input. If none is provided, one will be generated. + /// - output: `[batch_size, sequence_length, hidden_size]`. 
     pub fn forward(
         &self,
         batched_input: Tensor<B, 3>,
@@ -177,8 +179,8 @@ impl<B: Backend> Gru<B> {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::tensor::{Data, Distribution};
     use crate::{module::Param, nn::LinearRecord, TestBackend};
-    use burn_tensor::{Data, Distribution};
 
     /// Test forward pass with simple input vector.
     ///
diff --git a/crates/burn-core/src/nn/rnn/lstm.rs b/crates/burn-core/src/nn/rnn/lstm.rs
index 82025c6364..8690b16d27 100644
--- a/crates/burn-core/src/nn/rnn/lstm.rs
+++ b/crates/burn-core/src/nn/rnn/lstm.rs
@@ -4,9 +4,9 @@
 use crate::config::Config;
 use crate::module::Module;
 use crate::nn::rnn::gate_controller::GateController;
 use crate::nn::Initializer;
+use crate::tensor::activation;
 use crate::tensor::backend::Backend;
 use crate::tensor::Tensor;
-use burn_tensor::activation;
 
 /// A LstmState is used to store cell state and hidden state in LSTM.
 pub struct LstmState {
@@ -23,7 +23,7 @@ impl LstmState {
     }
 }
 
-/// The configuration for a [lstm](Lstm) module.
+/// Configuration to create a [Lstm](Lstm) module using the [init function](LstmConfig::init).
 #[derive(Config)]
 pub struct LstmConfig {
     /// The size of the input features.
@@ -38,6 +38,10 @@ pub struct LstmConfig {
 }
 
 /// The Lstm module. This implementation is for a unidirectional, stateless, Lstm.
+///
+/// Introduced in the paper: [Long Short-Term Memory](https://www.researchgate.net/publication/13853244).
+///
+/// Should be created with [LstmConfig].
 #[derive(Module, Debug)]
 pub struct Lstm<B: Backend> {
     /// The input gate regulates which information to update and store in the cell state at each time step.
@@ -171,7 +175,7 @@ impl<B: Backend> Lstm<B> {
     }
 }
 
-/// The configuration for a [Bidirectional LSTM](BiLstm) module.
+/// Configuration to create a [BiLstm](BiLstm) module using the [init function](BiLstmConfig::init).
 #[derive(Config)]
 pub struct BiLstmConfig {
     /// The size of the input features.
@@ -186,6 +190,10 @@ pub struct BiLstmConfig {
 }
 
 /// The BiLstm module. This implementation is for Bidirectional LSTM.
+///
+/// Introduced in the paper: [Framewise phoneme classification with bidirectional LSTM and other neural network architectures](https://www.cs.toronto.edu/~graves/ijcnn_2005.pdf).
+///
+/// Should be created with [BiLstmConfig].
 #[derive(Module, Debug)]
 pub struct BiLstm<B: Backend> {
     /// LSTM for the forward direction.
@@ -298,8 +306,8 @@ impl<B: Backend> BiLstm<B> {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::tensor::{Data, Device, Distribution};
     use crate::{module::Param, nn::LinearRecord, TestBackend};
-    use burn_tensor::{Data, Device, Distribution};
 
     #[cfg(feature = "std")]
     use crate::TestAutodiffBackend;
@@ -451,7 +459,7 @@ mod tests {
     #[test]
     #[cfg(feature = "std")]
     fn test_batched_backward_pass() {
-        use burn_tensor::Shape;
+        use crate::tensor::Shape;
         let device = Default::default();
         let lstm = LstmConfig::new(64, 32, true).init(&device);
         let shape: Shape<3> = [8, 10, 64].into();
diff --git a/crates/burn-core/src/nn/rope_encoding.rs b/crates/burn-core/src/nn/rope_encoding.rs
index b8f4e97903..9a10d5d0a8 100644
--- a/crates/burn-core/src/nn/rope_encoding.rs
+++ b/crates/burn-core/src/nn/rope_encoding.rs
@@ -2,25 +2,25 @@
 use crate::config::Config;
 use crate::module::Module;
 use crate::tensor::backend::Backend;
+use crate::tensor::Int;
 use crate::tensor::Tensor;
 use alloc::vec;
-use burn_tensor::Int;
 
 #[cfg(not(feature = "std"))]
 use num_traits::Float;
 
-/// Configuration to create a [RotaryEncoding](RotaryEncoding) layer.
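// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): using the Lstm module documented
// above. `LstmConfig::new(64, 32, true).init(&device)` mirrors the call in the
// tests shown in this diff; the `(output, state)` return of `forward` and the
// `None` initial state are assumptions about this crate version's API.
use burn::nn::{Lstm, LstmConfig};
use burn::tensor::{backend::Backend, Distribution, Tensor};

fn run_lstm<B: Backend>(device: &B::Device) -> Tensor<B, 3> {
    let lstm: Lstm<B> = LstmConfig::new(64, 32, true).init(device);
    // batched_input: [batch_size, sequence_length, input_size].
    let input = Tensor::<B, 3>::random([8, 10, 64], Distribution::Default, device);
    // output: [batch_size, sequence_length, hidden_size]; the final state is ignored here.
    let (output, _state) = lstm.forward(input, None);
    output
}
// ---------------------------------------------------------------------------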
+/// Configuration to create a [RotaryEncoding](RotaryEncoding) layer using the [init function](RotaryEncodingConfig::init).
 #[derive(Config, Debug)]
 pub struct RotaryEncodingConfig {
     /// Maximum sequence length of input
-    max_sequence_length: usize,
+    pub max_sequence_length: usize,
 
     /// Size of the input embedding or hidden dimension
-    d_model: usize,
+    pub d_model: usize,
 
     /// Scaling factor for frequency computation. Defaults to 10000.0
     #[config(default = "10000.0")]
-    theta: f32,
+    pub theta: f32,
 }
 
 impl RotaryEncodingConfig {
@@ -84,6 +84,8 @@ impl RotaryEncodingConfig {
 /// explicit relative position dependency in self-attention formulation.
 ///
 /// Introduced in the paper: [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864)
+///
+/// Should be created using [RotaryEncodingConfig].
 #[derive(Module, Debug)]
 pub struct RotaryEncoding<B: Backend> {
     /// Frequency Tensor of shape (max_sequence_length, d_model, 2) with real and imaginary components
diff --git a/crates/burn-core/src/nn/swiglu.rs b/crates/burn-core/src/nn/swiglu.rs
index 6949b4ba44..3dacbae68a 100644
--- a/crates/burn-core/src/nn/swiglu.rs
+++ b/crates/burn-core/src/nn/swiglu.rs
@@ -7,7 +7,7 @@ use crate::tensor::{backend::Backend, Tensor};
 
 use super::{Initializer, Linear, LinearConfig};
 
-/// Configuration to create a [SwiGlu](SwiGlu) activation layer.
+/// Configuration to create a [SwiGlu](SwiGlu) activation layer using the [init function](SwiGluConfig::init).
 #[derive(Config, Debug)]
 pub struct SwiGluConfig {
     /// The size of the input features.
@@ -29,16 +29,15 @@ pub struct SwiGluConfig {
 /// The SwiGLU activation function is defined as:
 /// `SwiGLU(x) = Swish(W_inner * x + b_inner) * (W_outer * x + b_outer)`
 ///
-/// # Params
-///
-/// - linear inner: The inner linear layer for Swish activation function
-///   with `d_input` input features and `d_output` output features.
-/// - linear outer: Outer Linear layer for element wise multiplication
-///   with `d_input` input features and `d_output` output features.
+/// Should be created with [SwiGluConfig].
 #[derive(Module, Debug)]
 pub struct SwiGlu<B: Backend> {
-    linear_inner: Linear<B>,
-    linear_outer: Linear<B>,
+    /// The inner linear layer for Swish activation function
+    /// with `d_input` input features and `d_output` output features.
+    pub linear_inner: Linear<B>,
+    /// The outer linear layer for element wise multiplication
+    /// with `d_input` input features and `d_output` output features.
+    pub linear_outer: Linear<B>,
 }
 
 impl SwiGluConfig {
@@ -58,11 +57,11 @@ impl SwiGluConfig {
     }
 }
 
 impl<B: Backend> SwiGlu<B> {
-    /// Applies the forward pass on the input tensor.
+    /// Applies the Swish Gated Linear Unit to the input tensor.
     ///
     /// # Shapes
     ///
-    /// - tensor: `[batch_size, seq_length, d_input]`
+    /// - input: `[batch_size, seq_length, d_input]`
     /// - output: `[batch_size, seq_length, d_output]`
     pub fn forward(&self, input: Tensor<B, 3>) -> Tensor<B, 3> {
         let x = self.linear_inner.forward(input.clone());
diff --git a/crates/burn-core/src/nn/transformer/decoder.rs b/crates/burn-core/src/nn/transformer/decoder.rs
index 7dd9ecfd6b..85fc50159f 100644
--- a/crates/burn-core/src/nn/transformer/decoder.rs
+++ b/crates/burn-core/src/nn/transformer/decoder.rs
@@ -1,5 +1,5 @@
+use crate::tensor::Bool;
 use alloc::vec::Vec;
-use burn_tensor::Bool;
 
 use crate::{
     self as burn,
@@ -17,7 +17,7 @@ use crate::{
     tensor::{backend::Backend, Tensor},
 };
 
-/// Configuration to create a [Transformer Decoder](TransformerDecoder) layer.
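// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): applying the SwiGlu layer above,
// i.e. `SwiGLU(x) = Swish(W_inner * x + b_inner) * (W_outer * x + b_outer)`,
// to a [batch_size, seq_length, d_input] tensor. `SwiGluConfig::new(d_input,
// d_output)` and `init(&device)` are assumed from the config shown above.
use burn::nn::{SwiGlu, SwiGluConfig};
use burn::tensor::{backend::Backend, Distribution, Tensor};

fn run_swiglu<B: Backend>(device: &B::Device) -> Tensor<B, 3> {
    let (d_input, d_output) = (64, 128);
    let swiglu: SwiGlu<B> = SwiGluConfig::new(d_input, d_output).init(device);
    let input = Tensor::<B, 3>::random([2, 16, d_input], Distribution::Default, device);
    // output: [batch_size, seq_length, d_output].
    swiglu.forward(input)
}
// ---------------------------------------------------------------------------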
+/// Configuration to create a [Transformer Decoder](TransformerDecoder) layer using the [init function](TransformerDecoderConfig::init).
 #[derive(Config)]
 pub struct TransformerDecoderConfig {
     /// The size of the model.
@@ -54,6 +54,8 @@ pub struct TransformerDecoderConfig {
 /// # Params
 ///
 /// - layers: transformer decoder layers with `d_model` input and output features.
+///
+/// Should be created using [TransformerDecoderConfig]
 #[derive(Module, Debug)]
 pub struct TransformerDecoder<B: Backend> {
     layers: Vec<TransformerDecoderLayer<B>>,
@@ -204,6 +206,7 @@ impl<B: Backend> TransformerDecoderLayer<B> {
         }
     }
 
+    /// Applies the TransformerDecoder forward pass to the input tensor.
     fn forward(&self, mut input: TransformerDecoderInput<B>) -> TransformerDecoderInput<B> {
         // Self attention residual path.
         let x = input.target;
@@ -401,8 +404,8 @@ impl<B: Backend> TransformerDecoder<B> {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::tensor::Distribution;
     use crate::{nn::attention::generate_autoregressive_mask, TestBackend};
-    use burn_tensor::Distribution;
 
     #[test]
     fn test_autoregressive_norm_last() {
diff --git a/crates/burn-core/src/nn/transformer/encoder.rs b/crates/burn-core/src/nn/transformer/encoder.rs
index 1eba8658bc..0eb226a3b9 100644
--- a/crates/burn-core/src/nn/transformer/encoder.rs
+++ b/crates/burn-core/src/nn/transformer/encoder.rs
@@ -1,5 +1,5 @@
+use crate::tensor::Bool;
 use alloc::vec::Vec;
-use burn_tensor::Bool;
 
 use crate::{
     self as burn,
@@ -17,7 +17,7 @@ use crate::{
     tensor::{backend::Backend, Tensor},
 };
 
-/// Configuration to create a [Transformer Encoder](TransformerEncoder) layer.
+/// Configuration to create a [Transformer Encoder](TransformerEncoder) layer using the [init function](TransformerEncoderConfig::init).
 #[derive(Config)]
 pub struct TransformerEncoderConfig {
     /// The size of the model.
@@ -54,6 +54,8 @@ pub struct TransformerEncoderConfig {
 /// # Params
 ///
 /// - layers: transformer encoder layers with `d_model` input and output features.
+///
+/// Should be created using [TransformerEncoderConfig]
 #[derive(Module, Debug)]
 pub struct TransformerEncoder<B: Backend> {
     layers: Vec<TransformerEncoderLayer<B>>,
@@ -338,8 +340,8 @@ impl<B: Backend> TransformerEncoderAutoregressiveCache<B> {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::tensor::Distribution;
     use crate::{nn::attention::generate_autoregressive_mask, TestBackend};
-    use burn_tensor::Distribution;
 
     #[test]
     fn test_autoregressive_norm_last() {
diff --git a/crates/burn-core/src/nn/transformer/pwff.rs b/crates/burn-core/src/nn/transformer/pwff.rs
index 30c258d5ec..bd168b6433 100644
--- a/crates/burn-core/src/nn/transformer/pwff.rs
+++ b/crates/burn-core/src/nn/transformer/pwff.rs
@@ -8,7 +8,7 @@ use crate::{
     tensor::{backend::Backend, Tensor},
 };
 
-/// Configuration to create a [position-wise feed-forward](PositionWiseFeedForward) layer.
+/// Configuration to create a [position-wise feed-forward](PositionWiseFeedForward) layer using the [init function](PositionWiseFeedForwardConfig::init).
 #[derive(Config)]
 pub struct PositionWiseFeedForwardConfig {
     /// The size of the input and output features.
@@ -25,12 +25,16 @@ pub struct PositionWiseFeedForwardConfig {
     pub initializer: Initializer,
 }
 
-/// Applies the position-wise feed-forward network to the input tensor.
+/// Applies the position-wise feed-forward network to the input tensor, as described in the paper [Attention Is All You Need](https://arxiv.org/pdf/1706.03762v7).
 ///
 /// # Params
 ///
 /// - linear inner: Linear layer with `d_model` input features and `d_ff` output features.
 /// - linear outer: Linear layer with `d_ff` input features and `d_model` output features.
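// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): building and running the
// TransformerEncoder documented above, with the autoregressive mask used in the
// tests. The `TransformerEncoderConfig::new(d_model, d_ff, n_heads, n_layers)`
// argument order is an assumption based on the config fields.
use burn::nn::attention::generate_autoregressive_mask;
use burn::nn::transformer::{
    TransformerEncoder, TransformerEncoderConfig, TransformerEncoderInput,
};
use burn::tensor::{backend::Backend, Distribution, Tensor};

fn encode<B: Backend>(device: &B::Device) -> Tensor<B, 3> {
    let (batch_size, seq_length, d_model) = (2, 16, 64);
    let encoder: TransformerEncoder<B> =
        TransformerEncoderConfig::new(d_model, 4 * d_model, 4, 2).init(device);
    let tokens =
        Tensor::<B, 3>::random([batch_size, seq_length, d_model], Distribution::Default, device);
    // Optional causal mask so that position i only attends to positions <= i.
    let mask = generate_autoregressive_mask::<B>(batch_size, seq_length, device);
    encoder.forward(TransformerEncoderInput::new(tokens).mask_attn(mask))
}
// ---------------------------------------------------------------------------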
+///
+/// `FFN(x) = max(0, xW1 + b1)W2 + b2`
+///
+/// Should be created using [PositionWiseFeedForwardConfig]
 #[derive(Module, Debug)]
 pub struct PositionWiseFeedForward<B: Backend> {
     linear_inner: Linear<B>,
diff --git a/crates/burn-core/src/nn/unfold.rs b/crates/burn-core/src/nn/unfold.rs
index 26711622e3..31acb1a87f 100644
--- a/crates/burn-core/src/nn/unfold.rs
+++ b/crates/burn-core/src/nn/unfold.rs
@@ -2,12 +2,13 @@ use crate as burn;
 
 use crate::config::Config;
 use crate::module::Module;
-use burn_tensor::backend::Backend;
-use burn_tensor::module::unfold4d;
-use burn_tensor::ops::UnfoldOptions;
-use burn_tensor::Tensor;
+use crate::tensor::backend::Backend;
+use crate::tensor::ops::UnfoldOptions;
+use crate::tensor::Tensor;
 
-/// Configuration to create an [unfold 4D](Unfold4d) layer.
+use crate::tensor::module::unfold4d;
+
+/// Configuration to create an [unfold 4d](Unfold4d) layer using the [init function](Unfold4dConfig::init).
 #[derive(Config, Debug)]
 pub struct Unfold4dConfig {
     /// The size of the kernel.
@@ -24,13 +25,15 @@ pub struct Unfold4dConfig {
 }
 
 /// Four-dimensional unfolding.
+///
+/// Should be created with [Unfold4dConfig].
 #[derive(Module, Clone, Debug)]
 pub struct Unfold4d {
     config: Unfold4dConfig,
 }
 
 impl Unfold4dConfig {
-    /// Initialize a new [unfold 4k](Unfold4d) module.
+    /// Initializes a new [Unfold4d] module.
     pub fn init(&self) -> Unfold4d {
         Unfold4d {
             config: self.clone(),
@@ -41,10 +44,12 @@ impl Unfold4dConfig {
 impl Unfold4d {
     /// Applies the forward pass on the input tensor.
     ///
+    /// See [unfold4d](crate::tensor::module::unfold4d) for more information.
+    ///
     /// # Shapes
     ///
-    /// input: `[batch_size, channels_in, height, width]`,
-    /// returns: `[batch_size, channels_in * kernel_size_1 * kernel_size_2, number of blocks]`,
+    /// - input: `[batch_size, channels_in, height, width]`
+    /// - returns: `[batch_size, channels_in * kernel_size_1 * kernel_size_2, number of blocks]`
     pub fn forward<B: Backend>(&self, input: Tensor<B, 4>) -> Tensor<B, 3> {
         unfold4d(
             input,
diff --git a/crates/burn-tensor/src/tensor/activation/base.rs b/crates/burn-tensor/src/tensor/activation/base.rs
index 77d9e04170..21cd516431 100644
--- a/crates/burn-tensor/src/tensor/activation/base.rs
+++ b/crates/burn-tensor/src/tensor/activation/base.rs
@@ -2,7 +2,10 @@ use crate::backend::Backend;
 use crate::check::TensorCheck;
 use crate::{check, Tensor};
 
-/// Applies the rectified linear unit function.
+/// Applies the rectified linear unit function as described in the paper [Deep Learning using
+/// Rectified Linear Units (ReLU)](https://arxiv.org/pdf/1803.08375).
+///
+/// `y = max(0, x)`
 pub fn relu<const D: usize, B: Backend>(tensor: Tensor<B, D>) -> Tensor<B, D> {
     tensor.relu()
 }
@@ -20,12 +23,12 @@ pub fn leaky_relu(
     ))
 }
 
-/// Applies the Gaussian Error Linear Units function as described in the paper in [Gaussian Error Linear Units (GELUs)](https://arxiv.org/pdf/1606.08415v3.pdf).
+/// Applies the Gaussian Error Linear Units function as described in the paper [Gaussian Error Linear Units (GELUs)](https://arxiv.org/pdf/1606.08415v3.pdf).
 pub fn gelu<const D: usize, B: Backend>(tensor: Tensor<B, D>) -> Tensor<B, D> {
     Tensor::from_primitive(B::gelu(tensor.primitive))
 }
 
-/// Applies Parametric ReLu activation
+/// Applies the Parametric ReLu activation function as described in the paper [Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification](https://arxiv.org/pdf/1502.01852).
 /// ` PReLu(x) = max(0,x) + \alpha * min(0,x)`
 /// tensor is assumed to be of shape \[batch_size, channels, ...\]
 /// alpha is assumed to be of shape \[channels\] or \[1\]
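// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the activation functions whose
// docs are extended above, matching the stated formulas `y = max(0, x)` and
// `PReLu(x) = max(0,x) + alpha * min(0,x)`. Float values are placeholders.
use burn::tensor::activation::{gelu, prelu, relu};
use burn::tensor::{backend::Backend, Tensor};

fn activations<B: Backend>(device: &B::Device) {
    let x = Tensor::<B, 2>::from_floats([[-1.0, 0.5], [2.0, -3.0]], device);
    // Element-wise max(0, x): negative entries become zero.
    let _relu_out = relu(x.clone());
    // Smooth GELU of the same values.
    let _gelu_out = gelu(x);
    // PReLU expects a [batch_size, channels, ...] tensor and alpha of shape [channels] or [1].
    let x3 = Tensor::<B, 3>::from_floats([[[-1.0, 2.0], [0.5, -0.25]]], device);
    let alpha = Tensor::<B, 1>::from_floats([0.1, 0.2], device);
    let _prelu_out = prelu(x3, alpha);
}
// ---------------------------------------------------------------------------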