diff --git a/crates/burn-core/src/nn/attention/mask.rs b/crates/burn-core/src/nn/attention/mask.rs index a9ef538295..8bab9fbfc2 100644 --- a/crates/burn-core/src/nn/attention/mask.rs +++ b/crates/burn-core/src/nn/attention/mask.rs @@ -1,6 +1,6 @@ use alloc::vec::Vec; -use burn_tensor::{backend::Backend, Bool, Data, ElementConversion, Int, Shape, Tensor}; +use crate::tensor::{backend::Backend, Bool, Data, ElementConversion, Int, Shape, Tensor}; /// Generate an autoregressive attention mask. /// @@ -89,9 +89,9 @@ pub fn generate_padding_mask( #[cfg(test)] mod tests { use super::*; + use crate::tensor::Data; use crate::TestBackend; use alloc::vec; - use burn_tensor::Data; #[test] fn test_generate_autoregressive_mask() { diff --git a/crates/burn-core/src/nn/attention/mha.rs b/crates/burn-core/src/nn/attention/mha.rs index b8fba51efb..ed6eb49235 100644 --- a/crates/burn-core/src/nn/attention/mha.rs +++ b/crates/burn-core/src/nn/attention/mha.rs @@ -12,21 +12,21 @@ use crate::{ #[cfg(not(feature = "std"))] use num_traits::Float; -/// Configuration to create a [Multi Head Attention](MultiHeadAttention) layer. +/// Configuration to create a [Multi Head Attention](MultiHeadAttention) layer using the [init function](MultiHeadAttentionConfig::init). #[derive(Config)] pub struct MultiHeadAttentionConfig { /// The size of each linear layer. - d_model: usize, + pub d_model: usize, /// The number of heads. - n_heads: usize, + pub n_heads: usize, /// The dropout rate. Default: 0.1 #[config(default = 0.1)] - dropout: f64, + pub dropout: f64, /// The minimum value a float can take. Default: -1.0e4 /// This is used to mask attention scores before calculating attention weights. /// A value too low might result in NaN. #[config(default = -1.0e4)] - min_float: f64, + pub min_float: f64, /// Use "quiet softmax" instead of regular softmax. /// /// - Usage may improve performance by allowing attention heads to deposit no information (if the sequence contains no information relevant to that head). @@ -34,7 +34,7 @@ pub struct MultiHeadAttentionConfig { /// /// Reference: #[config(default = false)] - quiet_softmax: bool, + pub quiet_softmax: bool, /// The type of function used to initialize neural network parameters #[config( default = "Initializer::KaimingUniform{gain:1.0/num_traits::Float::sqrt(3.0), fan_out_only:false}" @@ -50,6 +50,8 @@ pub struct MultiHeadAttentionConfig { /// - key: [Linear](nn::Linear) layer with `d_model` input and output features. /// - value: [Linear](nn::Linear) layer with `d_model` input and output features. /// - output: [Linear](nn::Linear) layer with `d_model` input and output features. +/// +/// Should be created with [MultiHeadAttentionConfig]. #[derive(Module, Debug)] pub struct MultiHeadAttention { query: nn::Linear, @@ -67,8 +69,11 @@ pub struct MultiHeadAttention { /// [Multihead attention](MultiHeadAttention) forward pass input argument. #[derive(Debug, Clone)] pub struct MhaInput { + /// Shape `[batch_size, seq_length_1, d_model]` query: Tensor, + /// Shape `[batch_size, seq_length_2, d_model]` key: Tensor, + /// Shape `[batch_size, seq_length_2, d_model]` value: Tensor, mask_pad: Option>, mask_attn: Option>, @@ -101,6 +106,9 @@ impl MultiHeadAttentionConfig { impl MhaInput { /// Create a [multihead attention](MultiHeadAttention) input argument /// by setting the query, key and value to the given tensor. 
+ /// + /// # Shape + /// - tensor: `[batch_size, seq_length, d_model]` pub fn self_attn(tensor: Tensor) -> Self { Self { query: tensor.clone(), @@ -138,15 +146,17 @@ impl MhaInput { /// [Multihead attention](MultiHeadAttention) outputs. #[derive(Debug, Clone)] pub struct MhaOutput { - /// The attention weights [batch_size, n_heads, seq_length_1, seq_length_2]. + /// The attention weights `[batch_size, n_heads, seq_length_1, seq_length_2]`. pub weights: Tensor, - /// The context tensor [batch_size, seq_length_1, d_model]. + /// The context tensor `[batch_size, seq_length_1, d_model]`. pub context: Tensor, } impl MultiHeadAttention { /// Applies the forward pass on the input tensors. /// + /// See [MultiHeadAttention](MultiHeadAttention) for more information. + /// /// # Shapes /// /// - query: `[batch_size, seq_length_1, d_model]` @@ -310,10 +320,10 @@ impl MhaLinearCache { #[cfg(test)] mod tests { use super::*; + use crate::tensor::Int; + use crate::tensor::{Distribution, Shape}; use crate::{nn::attention::generate_autoregressive_mask, TestBackend}; use alloc::vec::Vec; - use burn::tensor::{Distribution, Shape}; - use burn_tensor::Int; #[test] fn test_self_attention_shapes() { diff --git a/crates/burn-core/src/nn/conv/conv1d.rs b/crates/burn-core/src/nn/conv/conv1d.rs index 4f232b7412..a14d668a82 100644 --- a/crates/burn-core/src/nn/conv/conv1d.rs +++ b/crates/burn-core/src/nn/conv/conv1d.rs @@ -3,15 +3,14 @@ use crate as burn; use crate::config::Config; use crate::module::Module; use crate::module::Param; +use crate::nn::conv::checks; use crate::nn::{Initializer, PaddingConfig1d}; use crate::tensor::backend::Backend; +use crate::tensor::module::conv1d; +use crate::tensor::ops::ConvOptions; use crate::tensor::Tensor; -use burn_tensor::module::conv1d; -use burn_tensor::ops::ConvOptions; -use super::checks; - -/// Configuration to create an [1D convolution](Conv1d) layer. +/// Configuration to create a [1D convolution](Conv1d) layer using the [init function](Conv1dConfig::init). #[derive(Config, Debug)] pub struct Conv1dConfig { /// The number of input channels. @@ -44,14 +43,10 @@ pub struct Conv1dConfig { /// Applies a 1D convolution over input tensors. /// -/// # Params -/// -/// - weight: Tensor of shape [channels_out, channels_in / groups, kernel_size] -/// -/// - bias: Tensor of shape `[channels_out]` +/// Should be created with [Conv1dConfig]. #[derive(Module, Debug)] pub struct Conv1d { - /// Tensor of shape [channels_out, channels_in / groups, kernel_size] + /// Tensor of shape `[channels_out, channels_in / groups, kernel_size]` pub weight: Param>, /// Tensor of shape `[channels_out]` pub bias: Option>>, @@ -102,10 +97,12 @@ impl Conv1dConfig { impl Conv1d { /// Applies the forward pass on the input tensor. /// + /// See [conv1d](crate::tensor::module::conv1d) for more information. 
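The multi-head attention docs above (config fields, `MhaInput` shapes, `MhaOutput`) pair naturally with a short usage sketch. This is a hypothetical example, assuming the re-exported paths of the public `burn` crate (inside `burn-core` the same items live under `crate::nn`); the tensor sizes are illustrative.

```rust
use burn::nn::attention::{MhaInput, MultiHeadAttentionConfig};
use burn::tensor::{backend::Backend, Distribution, Tensor};

fn self_attention_example<B: Backend>(device: &B::Device) -> Tensor<B, 3> {
    let [batch_size, seq_length, d_model] = [2, 6, 32];
    // 4 heads over a model dimension of 32; dropout is optional (default 0.1).
    let mha = MultiHeadAttentionConfig::new(d_model, 4).init::<B>(device);

    // For self-attention, query, key and value all come from the same tensor.
    let embeddings =
        Tensor::<B, 3>::random([batch_size, seq_length, d_model], Distribution::Default, device);
    let output = mha.forward(MhaInput::self_attn(embeddings));

    // `output.weights` is `[batch_size, n_heads, seq_length, seq_length]`,
    // `output.context` is `[batch_size, seq_length, d_model]`.
    output.context
}
```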
+ /// /// # Shapes /// - /// - input: [batch_size, channels_in, length_in], - /// - output: [batch_size, channels_out, length_out], + /// - input: `[batch_size, channels_in, length_in]` + /// - output: `[batch_size, channels_out, length_out]` pub fn forward(&self, input: Tensor) -> Tensor { let [_batch_size, _channels, length] = input.dims(); let padding = self @@ -124,8 +121,8 @@ impl Conv1d { #[cfg(test)] mod tests { use super::*; + use crate::tensor::Data; use crate::TestBackend; - use burn_tensor::Data; #[test] fn initializer_default() { diff --git a/crates/burn-core/src/nn/conv/conv2d.rs b/crates/burn-core/src/nn/conv/conv2d.rs index 877618d6c6..c7350d9916 100644 --- a/crates/burn-core/src/nn/conv/conv2d.rs +++ b/crates/burn-core/src/nn/conv/conv2d.rs @@ -6,13 +6,13 @@ use crate::module::Param; use crate::nn::Initializer; use crate::nn::PaddingConfig2d; use crate::tensor::backend::Backend; +use crate::tensor::module::conv2d; +use crate::tensor::ops::ConvOptions; use crate::tensor::Tensor; -use burn_tensor::module::conv2d; -use burn_tensor::ops::ConvOptions; -use super::checks; +use crate::nn::conv::checks; -/// Configuration to create an [2D convolution](Conv2d) layer. +/// Configuration to create a [2D convolution](Conv2d) layer, using the [init function](Conv2dConfig::init). #[derive(Config, Debug)] pub struct Conv2dConfig { /// The number of channels. @@ -43,11 +43,7 @@ pub struct Conv2dConfig { /// Applies a 2D convolution over input tensors. /// -/// # Params -/// -/// - weight: Tensor of shape `[channels_out, channels_in / groups, kernel_size_1, kernel_size_2]` -/// -/// - bias: Tensor of shape `[channels_out]` +/// Should be created with [Conv2dConfig]. #[derive(Module, Debug)] pub struct Conv2d { /// Tensor of shape `[channels_out, channels_in / groups, kernel_size_1, kernel_size_2]` @@ -106,10 +102,12 @@ impl Conv2dConfig { impl Conv2d { /// Applies the forward pass on the input tensor. /// + /// See [conv2d](crate::tensor::module::conv2d) for more information. + /// /// # Shapes /// - /// - input: [batch_size, channels_in, height_in, width_in], - /// - output: [batch_size, channels_out, height_out, width_out], + /// - input: `[batch_size, channels_in, height_in, width_in]` + /// - output: `[batch_size, channels_out, height_out, width_out]` pub fn forward(&self, input: Tensor) -> Tensor { let [_batch_size, _channels_in, height_in, width_in] = input.dims(); let padding = @@ -127,8 +125,8 @@ impl Conv2d { #[cfg(test)] mod tests { use super::*; + use crate::tensor::Data; use crate::TestBackend; - use burn_tensor::Data; #[test] fn initializer_default() { diff --git a/crates/burn-core/src/nn/conv/conv_transpose1d.rs b/crates/burn-core/src/nn/conv/conv_transpose1d.rs index 81c2372902..1a73191798 100644 --- a/crates/burn-core/src/nn/conv/conv_transpose1d.rs +++ b/crates/burn-core/src/nn/conv/conv_transpose1d.rs @@ -3,15 +3,15 @@ use crate as burn; use crate::config::Config; use crate::module::Module; use crate::module::Param; +use crate::nn::conv::checks; use crate::nn::Initializer; use crate::tensor::backend::Backend; +use crate::tensor::module::conv_transpose1d; +use crate::tensor::ops::ConvTransposeOptions; use crate::tensor::Tensor; -use burn_tensor::module::conv_transpose1d; -use burn_tensor::ops::ConvTransposeOptions; -use super::checks; - -/// Configuration to create an [1D transposed convolution](ConvTranspose1d) layer. +/// Configuration to create an [1D transposed convolution](ConvTranspose1d) layer +/// using the [init function](ConvTranspose1dConfig::init). 
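A minimal sketch of the 2D convolution configured as documented above, assuming the public `burn::nn::conv` paths; the channel counts and the `Same` padding choice are illustrative, not part of the patch.

```rust
use burn::nn::conv::{Conv2d, Conv2dConfig};
use burn::nn::PaddingConfig2d;
use burn::tensor::{backend::Backend, Distribution, Tensor};

fn conv_example<B: Backend>(device: &B::Device) {
    // 3 input channels, 16 output channels, 3x3 kernel, "same" padding.
    let conv: Conv2d<B> = Conv2dConfig::new([3, 16], [3, 3])
        .with_padding(PaddingConfig2d::Same)
        .init(device);

    let images = Tensor::<B, 4>::random([8, 3, 32, 32], Distribution::Default, device);
    let features = conv.forward(images);

    // `[batch_size, channels_out, height_out, width_out]`; spatial size is
    // preserved here because of the `Same` padding and unit stride.
    assert_eq!(features.dims(), [8, 16, 32, 32]);
}
```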
#[derive(Config, Debug)] pub struct ConvTranspose1dConfig { /// The number of channels. @@ -44,12 +44,6 @@ pub struct ConvTranspose1dConfig { } /// Applies a 1D transposed convolution over input tensors. -/// -/// # Params -/// -/// - weight: Tensor of shape `[channels_in, channels_out / groups, kernel_size]` -/// -/// - bias: Tensor of shape `[channels_out]` #[derive(Module, Debug)] pub struct ConvTranspose1d { /// Tensor of shape `[channels_in, channels_out / groups, kernel_size]` @@ -104,10 +98,12 @@ impl ConvTranspose1dConfig { impl ConvTranspose1d { /// Applies the forward pass on the input tensor. /// + /// See also [conv_transpose1d](crate::tensor::module::conv_transpose1d). + /// /// # Shapes /// - /// - input: [batch_size, channels_in, length_in], - /// - output: [batch_size, channels_out, length_out], + /// - input: `[batch_size, channels_in, length_in]` + /// - output: `[batch_size, channels_out, length_out]` pub fn forward(&self, input: Tensor) -> Tensor { conv_transpose1d( input, @@ -127,8 +123,8 @@ impl ConvTranspose1d { #[cfg(test)] mod tests { use super::*; + use crate::tensor::Data; use crate::TestBackend; - use burn_tensor::Data; #[test] fn initializer_default() { diff --git a/crates/burn-core/src/nn/conv/conv_transpose2d.rs b/crates/burn-core/src/nn/conv/conv_transpose2d.rs index 0f2640ddad..0d5132942d 100644 --- a/crates/burn-core/src/nn/conv/conv_transpose2d.rs +++ b/crates/burn-core/src/nn/conv/conv_transpose2d.rs @@ -1,17 +1,17 @@ use crate as burn; -use super::checks; use crate::config::Config; use crate::module::Module; use crate::module::Param; +use crate::nn::conv::checks; use crate::nn::Initializer; use crate::tensor::backend::Backend; +use crate::tensor::module::conv_transpose2d; +use crate::tensor::ops::ConvTransposeOptions; use crate::tensor::Tensor; -use burn_tensor::module::conv_transpose2d; -use burn_tensor::ops::ConvTransposeOptions; - -/// Configuration to create an [2D transposed convolution](ConvTranspose2d) layer. +/// Configuration to create an [2D transposed convolution](ConvTranspose2d) layer +/// using the [init function](ConvTranspose2dConfig::init). #[derive(Config, Debug)] pub struct ConvTranspose2dConfig { /// The number of channels. @@ -44,12 +44,6 @@ pub struct ConvTranspose2dConfig { } /// Applies a 2D transposed convolution over input tensors. -/// -/// # Params -/// -/// - weight: Tensor of shape `[channels_in, channels_out / groups, kernel_size_1, kernel_size_2]` -/// -/// - bias: Tensor of shape `[channels_out]` #[derive(Module, Debug)] pub struct ConvTranspose2d { /// Tensor of shape `[channels_in, channels_out / groups, kernel_size_1, kernel_size_2]` @@ -105,10 +99,12 @@ impl ConvTranspose2dConfig { impl ConvTranspose2d { /// Applies the forward pass on the input tensor. /// + /// See also [conv_transpose2d](crate::tensor::module::conv_transpose2d). 
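As a companion to the transposed-convolution docs above, a hedged sketch (public-crate paths assumed, sizes illustrative) showing how the output length grows with the stride.

```rust
use burn::nn::conv::{ConvTranspose1d, ConvTranspose1dConfig};
use burn::tensor::{backend::Backend, Distribution, Tensor};

fn upsample_example<B: Backend>(device: &B::Device) {
    // 16 input channels, 8 output channels, kernel size 4, stride 2.
    let deconv: ConvTranspose1d<B> = ConvTranspose1dConfig::new([16, 8], 4)
        .with_stride(2)
        .init(device);

    let input = Tensor::<B, 3>::random([2, 16, 50], Distribution::Default, device);
    let output = deconv.forward(input);

    // With the default zero padding:
    // length_out = (length_in - 1) * stride + kernel_size = 49 * 2 + 4 = 102.
    assert_eq!(output.dims(), [2, 8, 102]);
}
```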
+ /// /// # Shapes /// - /// - input: [batch_size, channels_in, height_in, width_in], - /// - output: [batch_size, channels_out, height_out, width_out], + /// - input: `[batch_size, channels_in, height_in, width_in]` + /// - output: `[batch_size, channels_out, height_out, width_out]` pub fn forward(&self, input: Tensor) -> Tensor { conv_transpose2d( input, @@ -128,8 +124,8 @@ impl ConvTranspose2d { #[cfg(test)] mod tests { use super::*; + use crate::tensor::Data; use crate::TestBackend; - use burn_tensor::Data; #[test] fn initializer_default() { diff --git a/crates/burn-core/src/nn/dropout.rs b/crates/burn-core/src/nn/dropout.rs index 92ea9cc320..e10dcf50b4 100644 --- a/crates/burn-core/src/nn/dropout.rs +++ b/crates/burn-core/src/nn/dropout.rs @@ -5,7 +5,7 @@ use crate::module::Module; use crate::tensor::backend::Backend; use crate::tensor::{Distribution, Tensor}; -/// Configuration to create a [Dropout](Dropout) layer. +/// Configuration to create a [Dropout](Dropout) layer using the [init function](DropoutConfig::init). #[derive(Config, Debug)] pub struct DropoutConfig { /// The probability of randomly zeroes some elements of the input tensor during training. @@ -18,6 +18,8 @@ pub struct DropoutConfig { /// [Improving neural networks by preventing co-adaptation of feature detectors](https://arxiv.org/abs/1207.0580). /// /// The input is also scaled during training to `1 / (1 - prob_keep)`. +/// +/// Should be created with [DropoutConfig]. #[derive(Module, Clone, Debug)] pub struct Dropout { prob: f64, @@ -33,6 +35,8 @@ impl DropoutConfig { impl Dropout { /// Applies the forward pass on the input tensor. /// + /// See [Dropout](Dropout) for more information. + /// /// # Shapes /// /// - input: `[..., any]` diff --git a/crates/burn-core/src/nn/embedding.rs b/crates/burn-core/src/nn/embedding.rs index 4ff23be900..3ad02f141d 100644 --- a/crates/burn-core/src/nn/embedding.rs +++ b/crates/burn-core/src/nn/embedding.rs @@ -5,16 +5,18 @@ use crate::config::Config; use crate::module::Module; use crate::module::Param; use crate::tensor::backend::Backend; +use crate::tensor::Int; use crate::tensor::Tensor; -use burn_tensor::Int; -/// Configuration to create an [Embedding](Embedding) layer. +use crate::tensor::module::embedding; + +/// Configuration to create an [Embedding](Embedding) layer using the [init function](EmbeddingConfig::init). #[derive(Config)] pub struct EmbeddingConfig { /// The number of embedding vectors. - n_embedding: usize, + pub n_embedding: usize, /// The size of each vector. - d_model: usize, + pub d_model: usize, /// The type of function used to initialize neural network parameters #[config(default = "Initializer::Normal{mean:0.0, std:1.0}")] pub initializer: Initializer, @@ -22,13 +24,10 @@ pub struct EmbeddingConfig { /// Lookup table to store a fix number of vectors. /// -/// # Params -/// -/// - weight: Matrix of shape `[n_embedding, d_model]` initialized from a normal distribution: -/// `N(0, 1)` +/// Should be created with [EmbeddingConfig]. #[derive(Module, Debug)] pub struct Embedding { - /// The learnable weights of the module of shape [n_embedding, d_model] initialized + /// The learnable weights of the module of shape `[n_embedding, d_model]` initialized /// from a normal distribution `N(0, 1)`. pub weight: Param>, } @@ -47,20 +46,22 @@ impl EmbeddingConfig { impl Embedding { /// Applies the forward pass on the input tensor. /// + /// See also [embedding](crate::tensor::module::embedding). 
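A small sketch of the dropout layer described above, assuming the public `burn::nn` paths; the probability and tensor shape are illustrative.

```rust
use burn::nn::{Dropout, DropoutConfig};
use burn::tensor::{backend::Backend, Distribution, Tensor};

fn dropout_example<B: Backend>(device: &B::Device) -> Tensor<B, 2> {
    // Zero out roughly half of the activations during training.
    let dropout: Dropout = DropoutConfig::new(0.5).init();

    let activations = Tensor::<B, 2>::random([4, 128], Distribution::Default, device);

    // Kept values are rescaled as described in the docs; on a plain (non-autodiff)
    // backend the layer passes the input through unchanged.
    dropout.forward(activations)
}
```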
+ /// /// # Shapes /// - /// - input: [batch_size, seq_length] - /// - output: [batch_size, d_model] + /// - input: `[batch_size, seq_length]` + /// - output: `[batch_size, d_model]` pub fn forward(&self, input: Tensor) -> Tensor { - burn_tensor::module::embedding(self.weight.val(), input) + embedding(self.weight.val(), input) } } #[cfg(test)] mod tests { use super::*; + use crate::tensor::Data; use crate::TestBackend; - use burn_tensor::Data; #[test] fn initializer_default() { diff --git a/crates/burn-core/src/nn/gelu.rs b/crates/burn-core/src/nn/gelu.rs index 37a395e37e..421f83452d 100644 --- a/crates/burn-core/src/nn/gelu.rs +++ b/crates/burn-core/src/nn/gelu.rs @@ -5,6 +5,7 @@ use crate::tensor::backend::Backend; use crate::tensor::Tensor; /// Applies the Gaussian Error Linear Units function element-wise. +/// See also [gelu](burn::tensor::activation::gelu) #[derive(Module, Clone, Debug, Default)] pub struct Gelu {} diff --git a/crates/burn-core/src/nn/initializer.rs b/crates/burn-core/src/nn/initializer.rs index 00f44c3578..7beb8b246b 100644 --- a/crates/burn-core/src/nn/initializer.rs +++ b/crates/burn-core/src/nn/initializer.rs @@ -1,4 +1,4 @@ -use burn_tensor::Shape; +use crate::tensor::Shape; use crate::config::Config; use crate::module::{Param, ParamId}; @@ -200,7 +200,7 @@ fn normal_draw>>( mod tests { use super::*; - use burn_tensor::{Data, ElementConversion}; + use crate::tensor::{Data, ElementConversion}; use num_traits::Pow; pub type TB = burn_ndarray::NdArray; diff --git a/crates/burn-core/src/nn/leaky_relu.rs b/crates/burn-core/src/nn/leaky_relu.rs index 6fb81331c0..1a230a4841 100644 --- a/crates/burn-core/src/nn/leaky_relu.rs +++ b/crates/burn-core/src/nn/leaky_relu.rs @@ -1,19 +1,20 @@ -use core::marker::PhantomData; - use crate as burn; use crate::config::Config; use crate::module::Module; use crate::tensor::backend::Backend; use crate::tensor::Tensor; +use crate::tensor::activation::leaky_relu; + /// Leaky ReLu layer. -#[derive(Module, Debug)] -pub struct LeakyRelu { +/// +/// Should be created with [LeakyReluConfig](LeakyReluConfig). +#[derive(Module, Clone, Debug)] +pub struct LeakyRelu { /// The negative slope. pub negative_slope: f64, - phantom: PhantomData, } -/// Configuration to create a [Leaky Relu](LeakyRelu) layer. +/// Configuration to create a [Leaky Relu](LeakyRelu) layer using the [init function](LeakyReluConfig::init). #[derive(Config, Debug)] pub struct LeakyReluConfig { /// The negative slope. Default is 0.01 @@ -22,39 +23,36 @@ pub struct LeakyReluConfig { } impl LeakyReluConfig { /// Initialize a new [Leaky Relu](LeakyRelu) Layer - pub fn init(&self) -> LeakyRelu { + pub fn init(&self) -> LeakyRelu { LeakyRelu { negative_slope: self.negative_slope, - phantom: PhantomData, } } } -impl LeakyRelu { +impl LeakyRelu { /// Forward pass for the Leaky ReLu layer. /// - /// # Arguments - /// - /// * `input` - The input tensor. - /// - /// # Returns + /// See [leaky_relu](crate::tensor::activation::leaky_relu) for more information. /// - /// The output tensor. 
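A usage sketch for the embedding lookup documented above, again assuming the public `burn` crate paths; vocabulary size and `d_model` are illustrative. The forward pass yields one `d_model` vector per token, i.e. a `[batch_size, seq_length, d_model]` tensor.

```rust
use burn::nn::{Embedding, EmbeddingConfig};
use burn::tensor::{backend::Backend, Int, Tensor};

fn embedding_example<B: Backend>(device: &B::Device) -> Tensor<B, 3> {
    // A vocabulary of 10 tokens, each mapped to a 4-dimensional vector.
    let embedding: Embedding<B> = EmbeddingConfig::new(10, 4).init(device);

    // Token ids of shape `[batch_size, seq_length]`.
    let tokens = Tensor::<B, 1, Int>::arange(0..6, device).reshape([2, 3]);

    // One `d_model` vector per token id.
    embedding.forward(tokens)
}
```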
- pub fn forward(&self, input: Tensor) -> Tensor { - crate::tensor::activation::leaky_relu(input, self.negative_slope) + /// # Shapes + /// - input: `[..., any]` + /// - output: `[..., any]` + pub fn forward(&self, input: Tensor) -> Tensor { + leaky_relu(input, self.negative_slope) } } #[cfg(test)] mod tests { use super::*; + use crate::tensor::Data; use crate::TestBackend; - use burn_tensor::Data; #[test] fn test_leaky_relu_forward() { let device = ::Device::default(); - let model: LeakyRelu = LeakyReluConfig::new().init(); + let model = LeakyReluConfig::new().init(); let input = Tensor::::from_data(Data::from([[0.4410, -0.2507]]), &device); let out = model.forward(input); assert_eq!(out.to_data(), Data::from([[0.4410, -0.002507]])); @@ -87,7 +85,7 @@ mod tests { ]; let device = ::Device::default(); - let model: LeakyRelu = LeakyReluConfig::new().init(); + let model = LeakyReluConfig::new().init(); let input_data = Tensor::::from_data(Data::from(input), &device); let actual_output = model.forward(input_data); actual_output diff --git a/crates/burn-core/src/nn/linear.rs b/crates/burn-core/src/nn/linear.rs index 6e4bc8e501..d54da5f6fd 100644 --- a/crates/burn-core/src/nn/linear.rs +++ b/crates/burn-core/src/nn/linear.rs @@ -7,7 +7,7 @@ use crate::tensor::{backend::Backend, Tensor}; use super::Initializer; -/// Configuration to create a [Linear](Linear) layer. +/// Configuration to create a [Linear](Linear) layer using the [init function](LinearConfig::init). #[derive(Config, Debug)] pub struct LinearConfig { /// The size of the input features. @@ -26,6 +26,8 @@ pub struct LinearConfig { /// Applies a linear transformation to the input tensor: /// +/// Should be created with [LinearConfig] +/// /// `O = IW + b` #[derive(Module, Debug)] pub struct Linear { @@ -84,8 +86,8 @@ impl Linear { #[cfg(test)] mod tests { use super::*; + use crate::tensor::{Data, Shape}; use crate::TestBackend; - use burn_tensor::{Data, Shape}; #[test] fn initializer_default() { diff --git a/crates/burn-core/src/nn/loss/binary_cross_entropy.rs b/crates/burn-core/src/nn/loss/binary_cross_entropy.rs index 6719ef04db..192e07ab2a 100644 --- a/crates/burn-core/src/nn/loss/binary_cross_entropy.rs +++ b/crates/burn-core/src/nn/loss/binary_cross_entropy.rs @@ -1,11 +1,11 @@ use crate as burn; +use crate::tensor::activation::log_sigmoid; +use crate::tensor::{backend::Backend, Int, Tensor}; use crate::{config::Config, module::Module}; use alloc::vec::Vec; -use burn_tensor::activation::log_sigmoid; -use burn_tensor::{backend::Backend, Int, Tensor}; -/// Configuration to create a [Binary Cross-entropy loss](BinaryCrossEntropyLoss). +/// Configuration to create a [Binary Cross-entropy loss](BinaryCrossEntropyLoss) using the [init function](BinaryCrossEntropyLossConfig::init). #[derive(Config, Debug)] pub struct BinaryCrossEntropyLossConfig { /// Create weighted binary cross-entropy with a weight for each class. @@ -17,11 +17,11 @@ pub struct BinaryCrossEntropyLossConfig { /// /// Hard labels {0, 1} will be changed to `y_smoothed = y(1 - a) + a / num_classes`. /// Alpha = 0 would be the same as default. - smoothing: Option, + pub smoothing: Option, /// Treat the inputs as logits, applying a sigmoid activation when computing the loss. #[config(default = false)] - logits: bool, + pub logits: bool, } impl BinaryCrossEntropyLossConfig { @@ -56,6 +56,8 @@ impl BinaryCrossEntropyLossConfig { } /// Calculate the binary cross entropy loss from the input logits and the targets. 
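The linear layer above lends itself to a short sketch of `O = IW + b` (public-crate paths assumed, feature sizes illustrative).

```rust
use burn::nn::{Linear, LinearConfig};
use burn::tensor::{backend::Backend, Distribution, Tensor};

fn linear_example<B: Backend>(device: &B::Device) {
    // Project 64 input features down to 16 output features.
    let linear: Linear<B> = LinearConfig::new(64, 16).init(device);

    let input = Tensor::<B, 3>::random([2, 10, 64], Distribution::Default, device);
    let output = linear.forward(input);

    // Only the last dimension is transformed.
    assert_eq!(output.dims(), [2, 10, 16]);
}
```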
+/// +/// Should be created using [BinaryCrossEntropyLossConfig] #[derive(Module, Debug)] pub struct BinaryCrossEntropyLoss { /// Weights for cross-entropy. @@ -146,8 +148,8 @@ impl BinaryCrossEntropyLoss { #[cfg(test)] mod tests { use super::*; + use crate::tensor::{activation::sigmoid, Data}; use crate::TestBackend; - use burn_tensor::{activation::sigmoid, Data}; #[test] fn test_binary_cross_entropy() { diff --git a/crates/burn-core/src/nn/loss/cross_entropy.rs b/crates/burn-core/src/nn/loss/cross_entropy.rs index 82b71797f1..0dea70b517 100644 --- a/crates/burn-core/src/nn/loss/cross_entropy.rs +++ b/crates/burn-core/src/nn/loss/cross_entropy.rs @@ -1,18 +1,18 @@ use crate as burn; +use crate::tensor::activation::log_softmax; +use crate::tensor::{backend::Backend, Bool, Int, Tensor}; use crate::{config::Config, module::Module}; use alloc::vec; use alloc::vec::Vec; -use burn_tensor::activation::log_softmax; -use burn_tensor::{backend::Backend, Bool, Int, Tensor}; -/// Configuration to create a [Cross-entropy loss](CrossEntropyLoss). +/// Configuration to create a [Cross-entropy loss](CrossEntropyLoss) using the [init function](CrossEntropyLossConfig::init). #[derive(Config, Debug)] pub struct CrossEntropyLossConfig { /// Create padded cross entropy. /// /// Prevents pad tokens from impacting loss calculation. - pad_tokens: Option>, + pub pad_tokens: Option>, /// Create weighted cross-entropy. /// @@ -21,18 +21,18 @@ pub struct CrossEntropyLossConfig { /// # Pre-conditions /// - The order of the weight vector should correspond to the label integer assignment. /// - Targets assigned negative Int's will not be allowed. - weights: Option>, + pub weights: Option>, /// Create cross-entropy with label smoothing. /// /// Hard labels {0, 1} will be changed to y_smoothed = y(1 - a) + a / nr_classes. /// Alpha = 0 would be the same as default. - smoothing: Option, + pub smoothing: Option, /// Create cross-entropy with probabilities as input instead of logits. /// #[config(default = true)] - logits: bool, + pub logits: bool, } impl CrossEntropyLossConfig { @@ -68,6 +68,8 @@ impl CrossEntropyLossConfig { } /// Calculate the cross entropy loss from the input logits and the targets. +/// +/// Should be created using [CrossEntropyLossConfig] #[derive(Module, Debug)] pub struct CrossEntropyLoss { pad_tokens: Option>, @@ -214,8 +216,8 @@ impl CrossEntropyLoss { #[cfg(test)] mod tests { use super::*; + use crate::tensor::{loss::cross_entropy_with_logits, Data, Distribution}; use crate::TestBackend; - use burn_tensor::{loss::cross_entropy_with_logits, Data, Distribution}; macro_rules! 
setup { () => {{ diff --git a/crates/burn-core/src/nn/loss/huber.rs b/crates/burn-core/src/nn/loss/huber.rs index a2ae13dd75..06e4b0f143 100644 --- a/crates/burn-core/src/nn/loss/huber.rs +++ b/crates/burn-core/src/nn/loss/huber.rs @@ -1,8 +1,8 @@ use crate as burn; +use crate::tensor::backend::Backend; +use crate::tensor::Tensor; use crate::{config::Config, module::Module}; -use burn_tensor::backend::Backend; -use burn_tensor::Tensor; use core::marker::PhantomData; use super::Reduction; @@ -124,8 +124,8 @@ impl HuberLoss { #[cfg(test)] mod tests { use super::*; + use crate::tensor::Data; use crate::TestBackend; - use burn_tensor::Data; type TestTensor = Tensor; #[test] diff --git a/crates/burn-core/src/nn/loss/mse.rs b/crates/burn-core/src/nn/loss/mse.rs index 00945d004b..0bd873a887 100644 --- a/crates/burn-core/src/nn/loss/mse.rs +++ b/crates/burn-core/src/nn/loss/mse.rs @@ -1,7 +1,7 @@ use crate::nn::loss::reduction::Reduction; use core::marker::PhantomData; -use burn_tensor::{backend::Backend, Tensor}; +use crate::tensor::{backend::Backend, Tensor}; /// Calculate the mean squared error loss from the input logits and the targets. #[derive(Clone, Debug)] @@ -55,8 +55,8 @@ impl MseLoss { #[cfg(test)] mod tests { use super::*; + use crate::tensor::Data; use crate::TestBackend; - use burn_tensor::Data; #[test] fn test_mse_loss() { diff --git a/crates/burn-core/src/nn/norm/batch.rs b/crates/burn-core/src/nn/norm/batch.rs index 38d0dc5cf2..9fdca5ed46 100644 --- a/crates/burn-core/src/nn/norm/batch.rs +++ b/crates/burn-core/src/nn/norm/batch.rs @@ -7,7 +7,7 @@ use crate::{ tensor::{backend::Backend, Tensor}, }; -/// Configuration to create a [BatchNorm](BatchNorm) layer. +/// Configuration to create a [BatchNorm](BatchNorm) layer using the [init function](BatchNormConfig::init). #[derive(Config, Debug)] pub struct BatchNormConfig { /// The number of features. @@ -23,18 +23,31 @@ pub struct BatchNormConfig { /// Applies Batch Normalization over a tensor as described in the paper [Batch Normalization](https://arxiv.org/abs/1502.03167) /// /// `Y = norm(X) * γ + β` +/// +/// Where: +/// - `X` is the input tensor +/// - `Y` is the output tensor +/// - `norm` is the normalization function +/// - `γ` is the learnable weight +/// - `β` is the learnable bias +/// +/// Should be created using [BatchNormConfig]. #[derive(Module, Debug)] pub struct BatchNorm { - gamma: Param>, - beta: Param>, - running_mean: RunningState>, - running_var: RunningState>, + /// The learnable weight gamma. + pub gamma: Param>, + /// The learnable weight beta. + pub beta: Param>, + /// The running mean. + pub running_mean: RunningState>, + /// The running variance. + pub running_var: RunningState>, momentum: f64, epsilon: f64, } impl BatchNormConfig { - /// Initialize a new [batch norm](BatchNorm) module. + /// Initializes a new [batch norm](BatchNorm) module. pub fn init(&self, device: &B::Device) -> BatchNorm { let gamma = Initializer::Ones.init([self.num_features], device); let beta = Initializer::Zeros.init([self.num_features], device); @@ -56,10 +69,16 @@ impl BatchNormConfig { impl BatchNorm { /// Applies the forward pass on the input tensor. /// + /// See [BatchNorm](BatchNorm) for more information. + /// /// # Shapes /// /// - input: `[batch_size, channels, ...]` /// - output: `[batch_size, channels, ...]` + /// + /// # Panics + /// + /// This function will panic if the input tensor has a dimension different from `D + 2`. 
pub fn forward(&self, input: Tensor) -> Tensor { // Should be move to a compilation error when const generic support that kind of // validation. https://github.com/rust-lang/rust/issues/76560 @@ -168,8 +187,8 @@ impl BatchNorm { #[cfg(test)] mod tests_1d { use super::*; + use crate::tensor::Data; use crate::{module::AutodiffModule, TestAutodiffBackend}; - use burn_tensor::Data; #[test] fn batch_norm_forward_train() { @@ -228,8 +247,8 @@ mod tests_1d { #[cfg(test)] mod tests_2d { use super::*; + use crate::tensor::Data; use crate::{module::AutodiffModule, TestAutodiffBackend}; - use burn_tensor::Data; #[test] fn batch_norm_forward_train() { diff --git a/crates/burn-core/src/nn/norm/group.rs b/crates/burn-core/src/nn/norm/group.rs index a19c2fbc04..91c04c3d1d 100644 --- a/crates/burn-core/src/nn/norm/group.rs +++ b/crates/burn-core/src/nn/norm/group.rs @@ -7,7 +7,7 @@ use crate::module::Param; use crate::tensor::backend::Backend; use crate::tensor::Tensor; -/// Configuration to create a [GroupNorm](GroupNorm) layer. +/// Configuration to create a [GroupNorm](GroupNorm) layer using the [init function](GroupNormConfig::init). #[derive(Debug, Config)] pub struct GroupNormConfig { /// The number of groups to separate the channels into @@ -24,17 +24,28 @@ pub struct GroupNormConfig { pub affine: bool, } -/// Applies Group Normalization over a mini-batch of inputs. +/// Applies Group Normalization over a mini-batch of inputs as described in the paper [Group Normalization](https://arxiv.org/abs/1803.08494). /// /// `Y = groupnorm(X) * γ + β` +/// +/// Where: +/// - `X` is the input tensor +/// - `Y` is the output tensor +/// - `γ` is the learnable weight +/// - `β` is the learnable bias +/// +/// Should be created using [GroupNormConfig](GroupNormConfig). #[derive(Module, Debug)] pub struct GroupNorm { - num_groups: usize, - num_channels: usize, - gamma: Option>>, - beta: Option>>, - epsilon: f64, - affine: bool, + /// The learnable weight + pub gamma: Option>>, + /// The learnable bias + pub beta: Option>>, + + pub(crate) num_groups: usize, + pub(crate) num_channels: usize, + pub(crate) epsilon: f64, + pub(crate) affine: bool, } impl GroupNormConfig { @@ -69,58 +80,95 @@ impl GroupNormConfig { impl GroupNorm { /// Applies the forward pass on the input tensor. /// + /// See [GroupNorm](GroupNorm) for more information. + /// /// # Shapes /// - /// - input: `[..., any, d_model]` - /// - output: `[..., any, d_model]` + /// - input: `[batch_size, num_channels, *]` + /// - output: `[batch_size, num_channels, *]` pub fn forward(&self, input: Tensor) -> Tensor { - let shape = input.shape(); - if shape.num_elements() <= 2 { + if input.shape().dims[1] != self.num_channels { panic!( - "input rank for GroupNorm should be at least 3, but got {}", - shape.num_elements() + "The number of channels in the input tensor should be equal to the number of channels in the GroupNorm module. 
Expected {}, got {}", + self.num_channels, + input.shape().dims[1] ); } - let batch_size = shape.dims[0]; - let num_channels = shape.dims[1]; + let gamma = self.gamma.as_ref().map(|x| x.val()); + let beta = self.beta.as_ref().map(|x| x.val()); - if num_channels != self.num_channels { - panic!( - "expected {} channels but got {}", - self.num_channels, num_channels - ); - } + group_norm( + input, + gamma, + beta, + self.num_groups, + self.epsilon, + self.affine, + ) + } +} - let hidden_size = - shape.dims[2..].iter().product::() * num_channels / self.num_groups; - let input = input.reshape([batch_size, self.num_groups, hidden_size]); +/// Applies Group Normalization over a mini-batch of inputs as described in the paper [Group Normalization](https://arxiv.org/abs/1803.08494). +/// +/// `Y = groupnorm(X) * γ + β` +/// +/// Where: +/// - `X` is the input tensor +/// - `Y` is the output tensor +/// - `γ` is the learnable weight +/// - `β` is the learnable bias +/// +pub(crate) fn group_norm( + input: Tensor, + gamma: Option>, + beta: Option>, + num_groups: usize, + epsilon: f64, + affine: bool, +) -> Tensor { + if (beta.is_none() || gamma.is_none()) && affine { + panic!("Affine is set to true, but gamma or beta is None"); + } - let mean = input.clone().sum_dim(2) / hidden_size as f64; - let input = input.sub(mean); + let shape = input.shape(); + if shape.num_elements() <= 2 { + panic!( + "input rank for GroupNorm should be at least 3, but got {}", + shape.num_elements() + ); + } - let var = input.clone().powf_scalar(2.).sum_dim(2) / hidden_size as f64; - let input_normalized = input.div(var.sqrt().add_scalar(self.epsilon)); + let batch_size = shape.dims[0]; + let num_channels = shape.dims[1]; - if self.affine { - let mut affine_shape = [1; D]; - affine_shape[1] = num_channels; + let hidden_size = shape.dims[2..].iter().product::() * num_channels / num_groups; + let input = input.reshape([batch_size, num_groups, hidden_size]); - input_normalized - .reshape(shape) - .mul(self.gamma.clone().unwrap().val().reshape(affine_shape)) - .add(self.beta.clone().unwrap().val().reshape(affine_shape)) - } else { - input_normalized.reshape(shape) - } + let mean = input.clone().sum_dim(2) / hidden_size as f64; + let input = input.sub(mean); + + let var = input.clone().powf_scalar(2.).sum_dim(2) / hidden_size as f64; + let input_normalized = input.div(var.sqrt().add_scalar(epsilon)); + + if affine { + let mut affine_shape = [1; D]; + affine_shape[1] = num_channels; + + input_normalized + .reshape(shape) + .mul(gamma.clone().unwrap().reshape(affine_shape)) + .add(beta.clone().unwrap().reshape(affine_shape)) + } else { + input_normalized.reshape(shape) } } #[cfg(test)] mod tests { use super::*; + use crate::tensor::Data; use crate::TestBackend; - use burn_tensor::Data; #[test] fn group_norm_forward_affine_false() { diff --git a/crates/burn-core/src/nn/norm/instance.rs b/crates/burn-core/src/nn/norm/instance.rs index 359d47307f..fb47505b60 100644 --- a/crates/burn-core/src/nn/norm/instance.rs +++ b/crates/burn-core/src/nn/norm/instance.rs @@ -1,45 +1,56 @@ use crate as burn; use crate::config::Config; -use crate::module::Module; +use crate::module::{Module, Param}; +use crate::nn::norm::group_norm; +use crate::nn::Initializer; use crate::tensor::{backend::Backend, Tensor}; -use super::{GroupNorm, GroupNormConfig}; - -/// Configuration to create a [InstanceNorm](InstanceNorm) layer. +/// Configuration to create a [InstanceNorm](InstanceNorm) layer using the [init function](InstanceNormConfig::init). 
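A sketch of the group normalization forward pass described above (public-crate paths assumed, sizes illustrative). As noted in the instance-norm refactor below, instance normalization is the special case where the number of groups equals the number of channels.

```rust
use burn::nn::{GroupNorm, GroupNormConfig};
use burn::tensor::{backend::Backend, Distribution, Tensor};

fn group_norm_example<B: Backend>(device: &B::Device) {
    // 6 channels split into 3 groups of 2 channels each.
    let norm: GroupNorm<B> = GroupNormConfig::new(3, 6).init(device);

    // `[batch_size, num_channels, *]`: any number of trailing dimensions.
    let input = Tensor::<B, 4>::random([2, 6, 8, 8], Distribution::Default, device);
    let output = norm.forward(input);

    // Normalization does not change the shape.
    assert_eq!(output.dims(), [2, 6, 8, 8]);
}
```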
#[derive(Debug, Config)] pub struct InstanceNormConfig { /// The number of channels expected in the input - num_channels: usize, + pub num_channels: usize, /// A value required for numerical stability. Default: 1e-5 #[config(default = 1e-5)] - epsilon: f64, + pub epsilon: f64, /// A boolean value that when set to `true`, this module has learnable /// per-channel affine parameters initialized to ones (for weights) /// and zeros (for biases). Default: `true` #[config(default = true)] - affine: bool, + pub affine: bool, } -/// Applies Instance Normalization over a tensor as described in the paper [Instance Normalization](https://arxiv.org/abs/1607.08022) +/// Applies Instance Normalization over a tensor as described in the paper [Instance Normalization](https://arxiv.org/abs/1607.08022) +/// +/// Should be created using [InstanceNormConfig](InstanceNormConfig). #[derive(Module, Debug)] pub struct InstanceNorm { - group_norm: GroupNorm, + /// The learnable weight + pub gamma: Option>>, + /// The learnable bias + pub beta: Option>>, + + num_channels: usize, + epsilon: f64, + affine: bool, } impl InstanceNormConfig { /// Initialize a new [instance norm](InstanceNorm) module. pub fn init(&self, device: &B::Device) -> InstanceNorm { - InstanceNorm { - group_norm: self.to_group_norm().init(device), - } - } + let (gamma, beta) = if self.affine { + let gamma = Initializer::Ones.init([self.num_channels], device); + let beta = Initializer::Zeros.init([self.num_channels], device); + + (Some(gamma), Some(beta)) + } else { + (None, None) + }; - fn to_group_norm(&self) -> GroupNormConfig { - GroupNormConfig { - // Group norm is equivalent to instance norm, when the number of groups is - // equal to the number of channels. - num_groups: self.num_channels, + InstanceNorm { + gamma, + beta, num_channels: self.num_channels, epsilon: self.epsilon, affine: self.affine, @@ -50,20 +61,28 @@ impl InstanceNormConfig { impl InstanceNorm { /// Applies the forward pass on the input tensor. /// + /// See also [InstanceNormConfig](InstanceNormConfig) for more information. + /// /// # Shapes /// - /// - input: `[..., any, d_model]` - /// - output: `[..., any, d_model]` + /// - input: `[batch_size, num_channels, *]` + /// - output: `[batch_size, num_channels, *]` pub fn forward(&self, input: Tensor) -> Tensor { - self.group_norm.forward(input) + // Instance norm is equivalent to group norm when the number of groups is equal to the number of channels. + let num_groups = self.num_channels; + + let gamma = self.gamma.as_ref().map(|x| x.val()); + let beta = self.beta.as_ref().map(|x| x.val()); + + group_norm(input, gamma, beta, num_groups, self.epsilon, self.affine) } } #[cfg(test)] mod tests { use super::*; + use crate::tensor::Data; use crate::TestBackend; - use burn_tensor::Data; #[test] fn instance_norm_forward_affine_false() { diff --git a/crates/burn-core/src/nn/norm/layer.rs b/crates/burn-core/src/nn/norm/layer.rs index c236c51b21..c0dc71afa8 100644 --- a/crates/burn-core/src/nn/norm/layer.rs +++ b/crates/burn-core/src/nn/norm/layer.rs @@ -1,13 +1,13 @@ use crate as burn; -use crate::nn::Initializer; use crate::config::Config; use crate::module::Module; use crate::module::Param; +use crate::nn::Initializer; use crate::tensor::backend::Backend; use crate::tensor::Tensor; -/// Configuration to create a [LayerNorm](LayerNorm) layer. +/// Configuration to create a [LayerNorm](LayerNorm) layer using the [init function](LayerNormConfig::init). 
#[derive(Debug, Config)] pub struct LayerNormConfig { /// The size of the input features. @@ -20,9 +20,19 @@ pub struct LayerNormConfig { /// Applies Layer Normalization over an input tensor as described in the paper [Layer Normalization](https://arxiv.org/abs/1607.06450). /// /// `Y = norm(X) * γ + β` +/// +/// Where: +/// - `X` is the input tensor +/// - `Y` is the output tensor +/// - `γ` is the learnable weight +/// - `β` is the learnable bias +/// +/// Should be created using [LayerNormConfig](LayerNormConfig). #[derive(Module, Debug)] pub struct LayerNorm { + /// The learnable weight. gamma: Param>, + /// The learnable bias. beta: Param>, epsilon: f64, } @@ -44,6 +54,8 @@ impl LayerNormConfig { impl LayerNorm { /// Applies the forward pass on the input tensor. /// + /// See the [LayerNorm](LayerNorm) documentation for more information. + /// /// # Shapes /// /// - input: `[..., any, d_model]` @@ -62,7 +74,7 @@ impl LayerNorm { #[cfg(test)] mod tests { use super::*; - use burn_tensor::Data; + use crate::tensor::Data; #[cfg(feature = "std")] use crate::{TestAutodiffBackend, TestBackend}; diff --git a/crates/burn-core/src/nn/norm/rms.rs b/crates/burn-core/src/nn/norm/rms.rs index 6c6f313b7d..2ac15df227 100644 --- a/crates/burn-core/src/nn/norm/rms.rs +++ b/crates/burn-core/src/nn/norm/rms.rs @@ -7,18 +7,22 @@ use crate::nn::Initializer; use crate::tensor::backend::Backend; use crate::tensor::Tensor; -/// Configuration to create a [RMS Norm](RmsNorm) layer. +/// Configuration to create a [RMS Norm](RmsNorm) layer using the [init function](RmsNormConfig::init). #[derive(Config)] pub struct RmsNormConfig { /// The size of the input features. - d_model: usize, + pub d_model: usize, /// A value required for numerical stability. Default: 1e-5 #[config(default = 1e-5)] - epsilon: f64, + pub epsilon: f64, } impl RmsNormConfig { /// Initialize a new [RMS Norm](RmsNorm) module. + /// + /// # Panics + /// + /// Panics if `epsilon` is not positive. pub fn init(&self, device: &B::Device) -> RmsNorm { assert!(self.epsilon > 0.0, "epsilon must be positive."); @@ -35,11 +39,18 @@ impl RmsNormConfig { /// /// `Y = X / sqrt(mean(X^2) + eps) * gamma` /// -/// where `eps` is a small value to avoid division by zero. +/// Where: +/// - `X` is the input tensor +/// - `Y` is the output tensor +/// - `gamma` is the learnable weight +/// - `mean` is the mean operation +/// - `eps` is a small value to avoid division by zero. +/// +/// Should be created using the [RmsNormConfig](RmsNormConfig) configuration. #[derive(Module, Debug)] pub struct RmsNorm { /// The learnable parameter to scale the normalized tensor - gamma: Param>, + pub gamma: Param>, /// A value required for numerical stability epsilon: f64, } @@ -47,6 +58,8 @@ pub struct RmsNorm { impl RmsNorm { /// Applies the forward pass on the input tensor. /// + /// See the [RmsNorm](RmsNorm) documentation for more information. 
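A minimal sketch of `LayerNorm`, which normalizes over the trailing `d_model` dimension as documented above (public-crate paths assumed, sizes illustrative).

```rust
use burn::nn::{LayerNorm, LayerNormConfig};
use burn::tensor::{backend::Backend, Distribution, Tensor};

fn layer_norm_example<B: Backend>(device: &B::Device) -> Tensor<B, 3> {
    // `Y = norm(X) * gamma + beta`, computed over the last dimension of size 64.
    let norm: LayerNorm<B> = LayerNormConfig::new(64).init(device);

    let input = Tensor::<B, 3>::random([2, 10, 64], Distribution::Default, device);
    norm.forward(input)
}
```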
+ /// /// # Shapes /// /// - input: `[..., any, d_model]` @@ -61,8 +74,8 @@ impl RmsNorm { #[cfg(test)] mod tests { use super::*; + use crate::tensor::Data; use crate::TestBackend; - use burn_tensor::Data; #[test] fn rms_norm_forward() { diff --git a/crates/burn-core/src/nn/padding.rs b/crates/burn-core/src/nn/padding.rs index db27bf4486..8f64340108 100644 --- a/crates/burn-core/src/nn/padding.rs +++ b/crates/burn-core/src/nn/padding.rs @@ -1,6 +1,6 @@ use crate as burn; -use burn_tensor::ops::conv::calculate_conv_padding; +use crate::tensor::ops::conv::calculate_conv_padding; use crate::config::Config; use crate::module::Module; diff --git a/crates/burn-core/src/nn/pool/adaptive_avg_pool1d.rs b/crates/burn-core/src/nn/pool/adaptive_avg_pool1d.rs index 9551db1db8..dd2c1d33c7 100644 --- a/crates/burn-core/src/nn/pool/adaptive_avg_pool1d.rs +++ b/crates/burn-core/src/nn/pool/adaptive_avg_pool1d.rs @@ -4,9 +4,10 @@ use crate::config::Config; use crate::module::Module; use crate::tensor::backend::Backend; use crate::tensor::Tensor; -use burn_tensor::module::adaptive_avg_pool1d; -/// Configuration to create a [1D adaptive avg pooling](AdaptiveAvgPool1d) layer. +use crate::tensor::module::adaptive_avg_pool1d; + +/// Configuration to create a [1D adaptive avg pooling](AdaptiveAvgPool1d) layer using the [init function](AdaptiveAvgPool1dConfig::init). #[derive(Config)] pub struct AdaptiveAvgPool1dConfig { /// The size of the output. @@ -14,6 +15,8 @@ pub struct AdaptiveAvgPool1dConfig { } /// Applies a 1D adaptive avg pooling over input tensors. +/// +/// Should be created with [AdaptiveAvgPool1dConfig]. #[derive(Module, Clone, Debug)] pub struct AdaptiveAvgPool1d { output_size: usize, @@ -31,10 +34,12 @@ impl AdaptiveAvgPool1dConfig { impl AdaptiveAvgPool1d { /// Applies the forward pass on the input tensor. /// + /// See [adaptive_avg_pool1d](crate::tensor::module::adaptive_avg_pool1d) for more information. + /// /// # Shapes /// - /// - input: [batch_size, channels, length], - /// - output: [batch_size, channels, length_out], + /// - input: `[batch_size, channels, length]` + /// - output: `[batch_size, channels, length_out]` pub fn forward(&self, input: Tensor) -> Tensor { adaptive_avg_pool1d(input, self.output_size) } diff --git a/crates/burn-core/src/nn/pool/adaptive_avg_pool2d.rs b/crates/burn-core/src/nn/pool/adaptive_avg_pool2d.rs index c5849fa84d..8d4d55d424 100644 --- a/crates/burn-core/src/nn/pool/adaptive_avg_pool2d.rs +++ b/crates/burn-core/src/nn/pool/adaptive_avg_pool2d.rs @@ -4,9 +4,10 @@ use crate::config::Config; use crate::module::Module; use crate::tensor::backend::Backend; use crate::tensor::Tensor; -use burn_tensor::module::adaptive_avg_pool2d; -/// Configuration to create a [2D adaptive avg pooling](AdaptiveAvgPool2d) layer. +use crate::tensor::module::adaptive_avg_pool2d; + +/// Configuration to create a [2D adaptive avg pooling](AdaptiveAvgPool2d) layer using the [init function](AdaptiveAvgPool2dConfig::init). #[derive(Config)] pub struct AdaptiveAvgPool2dConfig { /// The size of the output. @@ -14,6 +15,8 @@ pub struct AdaptiveAvgPool2dConfig { } /// Applies a 2D adaptive avg pooling over input tensors. +/// +/// Should be created with [AdaptiveAvgPool2dConfig]. #[derive(Module, Clone, Debug)] pub struct AdaptiveAvgPool2d { output_size: [usize; 2], @@ -31,10 +34,12 @@ impl AdaptiveAvgPool2dConfig { impl AdaptiveAvgPool2d { /// Applies the forward pass on the input tensor. /// + /// See [adaptive_avg_pool2d](crate::tensor::module::adaptive_avg_pool2d) for more information. 
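The adaptive pooling layers introduced above take any input length and reduce it to a fixed output size; a hedged sketch follows (public `burn::nn::pool` path assumed, sizes illustrative).

```rust
use burn::nn::pool::{AdaptiveAvgPool1d, AdaptiveAvgPool1dConfig};
use burn::tensor::{backend::Backend, Distribution, Tensor};

fn adaptive_pool_example<B: Backend>(device: &B::Device) {
    // Pools any input length down to a fixed output length of 4.
    let pool: AdaptiveAvgPool1d = AdaptiveAvgPool1dConfig::new(4).init();

    let input = Tensor::<B, 3>::random([2, 8, 100], Distribution::Default, device);
    let output = pool.forward(input);

    // `[batch_size, channels, length_out]`
    assert_eq!(output.dims(), [2, 8, 4]);
}
```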
+ /// /// # Shapes /// - /// - input: [batch_size, channels, height_in, width_in], - /// - output: [batch_size, channels, height_out, width_out], + /// - input: `[batch_size, channels, height_in, width_in]` + /// - output: `[batch_size, channels, height_out, width_out]` pub fn forward(&self, input: Tensor) -> Tensor { adaptive_avg_pool2d(input, self.output_size) } diff --git a/crates/burn-core/src/nn/pool/avg_pool1d.rs b/crates/burn-core/src/nn/pool/avg_pool1d.rs index 68e29fdde9..4b58eb3025 100644 --- a/crates/burn-core/src/nn/pool/avg_pool1d.rs +++ b/crates/burn-core/src/nn/pool/avg_pool1d.rs @@ -5,9 +5,10 @@ use crate::module::Module; use crate::nn::PaddingConfig1d; use crate::tensor::backend::Backend; use crate::tensor::Tensor; -use burn_tensor::module::avg_pool1d; -/// Configuration to create a [1D avg pooling](AvgPool1d) layer. +use crate::tensor::module::avg_pool1d; + +/// Configuration to create a [1D avg pooling](AvgPool1d) layer using the [init function](AvgPool1dConfig::init). #[derive(Config, Debug)] pub struct AvgPool1dConfig { /// The size of the kernel. @@ -25,7 +26,7 @@ pub struct AvgPool1dConfig { /// Applies a 1D avg pooling over input tensors. /// -/// See [AvgPool1dConfig](AvgPool1dConfig) for details. +/// Should be created with [AvgPool1dConfig](AvgPool1dConfig). /// /// # Remarks /// @@ -61,10 +62,12 @@ impl AvgPool1dConfig { impl AvgPool1d { /// Applies the forward pass on the input tensor. /// + /// See [avg_pool1d](crate::tensor::module::avg_pool1d) for more information. + /// /// # Shapes /// - /// - input: [batch_size, channels, length_in], - /// - output: [batch_size, channels, length_out], + /// - input: `[batch_size, channels, length_in]` + /// - output: `[batch_size, channels, length_out]` pub fn forward(&self, input: Tensor) -> Tensor { let [_batch_size, _channels, length] = input.dims(); let padding = self diff --git a/crates/burn-core/src/nn/pool/avg_pool2d.rs b/crates/burn-core/src/nn/pool/avg_pool2d.rs index 8ec63d0856..fb2aff8d2e 100644 --- a/crates/burn-core/src/nn/pool/avg_pool2d.rs +++ b/crates/burn-core/src/nn/pool/avg_pool2d.rs @@ -5,9 +5,10 @@ use crate::module::Module; use crate::nn::PaddingConfig2d; use crate::tensor::backend::Backend; use crate::tensor::Tensor; -use burn_tensor::module::avg_pool2d; -/// Configuration to create a [2D avg pooling](AvgPool2d) layer. +use crate::tensor::module::avg_pool2d; + +/// Configuration to create a [2D avg pooling](AvgPool2d) layer using the [init function](AvgPool2dConfig::init). #[derive(Config, Debug)] pub struct AvgPool2dConfig { /// The size of the kernel. @@ -25,7 +26,7 @@ pub struct AvgPool2dConfig { /// Applies a 2D avg pooling over input tensors. /// -/// See [AvgPool2dConfig](AvgPool2dConfig) for details. +/// Should be created with [AvgPool2dConfig](AvgPool2dConfig). /// /// # Remarks /// @@ -60,10 +61,12 @@ impl AvgPool2dConfig { impl AvgPool2d { /// Applies the forward pass on the input tensor. /// + /// See [avg_pool2d](crate::tensor::module::avg_pool2d) for more information. 
+ /// /// # Shapes /// - /// - input: [batch_size, channels, height_in, width_in], - /// - output: [batch_size, channels, height_out, width_out], + /// - input: `[batch_size, channels, height_in, width_in]` + /// - output: `[batch_size, channels, height_out, width_out]` pub fn forward(&self, input: Tensor) -> Tensor { let [_batch_size, _channels_in, height_in, width_in] = input.dims(); let padding = diff --git a/crates/burn-core/src/nn/pool/max_pool1d.rs b/crates/burn-core/src/nn/pool/max_pool1d.rs index eabed4603f..632ab6622d 100644 --- a/crates/burn-core/src/nn/pool/max_pool1d.rs +++ b/crates/burn-core/src/nn/pool/max_pool1d.rs @@ -5,9 +5,10 @@ use crate::module::Module; use crate::nn::PaddingConfig1d; use crate::tensor::backend::Backend; use crate::tensor::Tensor; -use burn_tensor::module::max_pool1d; -/// Configuration to create a [1D max pooling](MaxPool1d) layer. +use crate::tensor::module::max_pool1d; + +/// Configuration to create a [1D max pooling](MaxPool1d) layer using the [init function](MaxPool1dConfig::init). #[derive(Config, Debug)] pub struct MaxPool1dConfig { /// The size of the kernel. @@ -24,6 +25,8 @@ pub struct MaxPool1dConfig { } /// Applies a 1D max pooling over input tensors. +/// +/// Should be created with [MaxPool1dConfig](MaxPool1dConfig). #[derive(Module, Clone, Debug)] pub struct MaxPool1d { stride: usize, @@ -47,10 +50,12 @@ impl MaxPool1dConfig { impl MaxPool1d { /// Applies the forward pass on the input tensor. /// + /// See [max_pool1d](crate::tensor::module::max_pool1d) for more information. + /// /// # Shapes /// - /// - input: [batch_size, channels, length_in], - /// - output: [batch_size, channels, length_out], + /// - input: `[batch_size, channels, length_in]` + /// - output: `[batch_size, channels, length_out]` pub fn forward(&self, input: Tensor) -> Tensor { let [_batch_size, _channels, length] = input.dims(); let padding = self diff --git a/crates/burn-core/src/nn/pool/max_pool2d.rs b/crates/burn-core/src/nn/pool/max_pool2d.rs index 9697ab3529..63dee1326d 100644 --- a/crates/burn-core/src/nn/pool/max_pool2d.rs +++ b/crates/burn-core/src/nn/pool/max_pool2d.rs @@ -5,9 +5,10 @@ use crate::module::Module; use crate::nn::PaddingConfig2d; use crate::tensor::backend::Backend; use crate::tensor::Tensor; -use burn_tensor::module::max_pool2d; -/// Configuration to create an [2D max pooling](MaxPool2d) layer. +use crate::tensor::module::max_pool2d; + +/// Configuration to create a [2D max pooling](MaxPool2d) layer using the [init function](MaxPool2dConfig::init). #[derive(Debug, Config)] pub struct MaxPool2dConfig { /// The size of the kernel. @@ -24,6 +25,8 @@ pub struct MaxPool2dConfig { } /// Applies a 2D max pooling over input tensors. +/// +/// Should be created with [MaxPool2dConfig](MaxPool2dConfig). #[derive(Module, Clone, Debug)] pub struct MaxPool2d { stride: [usize; 2], @@ -47,10 +50,12 @@ impl MaxPool2dConfig { impl MaxPool2d { /// Applies the forward pass on the input tensor. /// + /// See [max_pool2d](crate::tensor::module::max_pool2d) for more information. 
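For the max pooling layers above, a short sketch with a 2x2 window and stride 2, which halves each spatial dimension (public-crate paths assumed, sizes illustrative).

```rust
use burn::nn::pool::{MaxPool2d, MaxPool2dConfig};
use burn::tensor::{backend::Backend, Distribution, Tensor};

fn max_pool_example<B: Backend>(device: &B::Device) {
    // 2x2 window moved with a stride of 2 in both directions.
    let pool: MaxPool2d = MaxPool2dConfig::new([2, 2]).with_strides([2, 2]).init();

    let input = Tensor::<B, 4>::random([1, 3, 32, 32], Distribution::Default, device);
    let output = pool.forward(input);

    // `[batch_size, channels, height_out, width_out]`
    assert_eq!(output.dims(), [1, 3, 16, 16]);
}
```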
+ /// /// # Shapes /// - /// - input: [batch_size, channels, height_in, width_in], - /// - output: [batch_size, channels, height_out, width_out], + /// - input: `[batch_size, channels, height_in, width_in]` + /// - output: `[batch_size, channels, height_out, width_out]` pub fn forward(&self, input: Tensor) -> Tensor { let [_batch_size, _channels_in, height_in, width_in] = input.dims(); let padding = diff --git a/crates/burn-core/src/nn/pos_encoding.rs b/crates/burn-core/src/nn/pos_encoding.rs index 0df639316a..703927dc3d 100644 --- a/crates/burn-core/src/nn/pos_encoding.rs +++ b/crates/burn-core/src/nn/pos_encoding.rs @@ -4,25 +4,25 @@ use crate as burn; use crate::config::Config; use crate::module::Module; use crate::tensor::backend::Backend; +use crate::tensor::Data; use crate::tensor::Tensor; -use burn_tensor::Data; #[cfg(not(feature = "std"))] use num_traits::Float; -/// Configuration to create an [PositionalEncoding](PositionalEncoding) layer. +/// Configuration to create a [PositionalEncoding](PositionalEncoding) layer using the [init function](PositionalEncodingConfig::init). #[derive(Config)] pub struct PositionalEncodingConfig { /// Maximum sequence size to use. #[config(default = "5_000")] - max_sequence_size: usize, + pub max_sequence_size: usize, /// The size of each vector. - d_model: usize, + pub d_model: usize, /// Max time scale to use. #[config(default = "10_000")] - max_timescale: usize, + pub max_timescale: usize, } /// Positional encoding layer for transformer models. @@ -37,6 +37,8 @@ pub struct PositionalEncodingConfig { /// The reference implementation can be found here: /// [LANGUAGE MODELING WITH NN.TRANSFORMER AND TORCHTEXT /// ](https://pytorch.org/tutorials/beginner/transformer_tutorial.html) +/// +/// Should be created using [PositionalEncodingConfig] #[derive(Module, Debug)] pub struct PositionalEncoding { sinusoids: Tensor, diff --git a/crates/burn-core/src/nn/prelu.rs b/crates/burn-core/src/nn/prelu.rs index d8ea5f90ab..f15c96481c 100644 --- a/crates/burn-core/src/nn/prelu.rs +++ b/crates/burn-core/src/nn/prelu.rs @@ -6,13 +6,15 @@ use crate::nn::Initializer; use crate::tensor::backend::Backend; use crate::tensor::Tensor; /// Parametric Relu layer. +/// +/// Should be created using [PReluConfig] #[derive(Module, Debug)] pub struct PRelu { /// the weights learnt for PReLu. can be of shape \[1\] or \[num_parameters\] in which case it must /// be the same as number of channels in the input tensor pub alpha: Param>, } -/// Configuration to create a [Parametric Relu](PRelu) layer. +/// Configuration to create a [Parametric Relu](PRelu) layer using the [init function](PReluConfig::init). #[derive(Config, Debug)] pub struct PReluConfig { /// The number of parameters. @@ -39,6 +41,8 @@ impl PRelu { /// /// - input: `[..., any]` /// - output: `[..., any]` + /// + /// See also [prelu](crate::tensor::activation::prelu) for more information. 
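A sketch of the sinusoidal positional encoding documented above, which adds a precomputed encoding to token embeddings (public-crate paths assumed, `d_model` and sequence length illustrative).

```rust
use burn::nn::{PositionalEncoding, PositionalEncodingConfig};
use burn::tensor::{backend::Backend, Distribution, Tensor};

fn positional_encoding_example<B: Backend>(device: &B::Device) -> Tensor<B, 3> {
    // Sinusoids are precomputed for up to `max_sequence_size` positions (default 5000).
    let pe: PositionalEncoding<B> = PositionalEncodingConfig::new(64).init(device);

    // Token embeddings of shape `[batch_size, seq_length, d_model]`.
    let embeddings = Tensor::<B, 3>::random([2, 10, 64], Distribution::Default, device);

    // Adds the sinusoidal encoding to every position; the shape is unchanged.
    pe.forward(embeddings)
}
```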
pub fn forward(&self, input: Tensor) -> Tensor { crate::tensor::activation::prelu(input, self.alpha.val()) } diff --git a/crates/burn-core/src/nn/relu.rs b/crates/burn-core/src/nn/relu.rs index bd8c92a4dd..262c393134 100644 --- a/crates/burn-core/src/nn/relu.rs +++ b/crates/burn-core/src/nn/relu.rs @@ -4,9 +4,9 @@ use crate::module::Module; use crate::tensor::backend::Backend; use crate::tensor::Tensor; -/// Applies the rectified linear unit function element-wise: +/// Applies the rectified linear unit function element-wise +/// See also [relu](burn::tensor::activation::relu) /// -/// `y = max(0, x)` #[derive(Module, Clone, Debug, Default)] pub struct Relu {} diff --git a/crates/burn-core/src/nn/rnn/gate_controller.rs b/crates/burn-core/src/nn/rnn/gate_controller.rs index 0c52f383f4..f20f7fa2a1 100644 --- a/crates/burn-core/src/nn/rnn/gate_controller.rs +++ b/crates/burn-core/src/nn/rnn/gate_controller.rs @@ -2,7 +2,7 @@ use crate as burn; use crate::module::Module; use crate::nn::{Initializer, Linear, LinearConfig}; -use burn_tensor::{backend::Backend, Tensor}; +use crate::tensor::{backend::Backend, Tensor}; /// A GateController represents a gate in an LSTM cell. An /// LSTM cell generally contains three gates: an input gate, diff --git a/crates/burn-core/src/nn/rnn/gru.rs b/crates/burn-core/src/nn/rnn/gru.rs index ddcada5526..fc29bdc192 100644 --- a/crates/burn-core/src/nn/rnn/gru.rs +++ b/crates/burn-core/src/nn/rnn/gru.rs @@ -4,13 +4,13 @@ use crate::config::Config; use crate::module::Module; use crate::nn::rnn::gate_controller; use crate::nn::Initializer; +use crate::tensor::activation; use crate::tensor::backend::Backend; use crate::tensor::Tensor; -use burn_tensor::activation; use super::gate_controller::GateController; -/// The configuration for a [gru](Gru) module. +/// Configuration to create a [gru](Gru) module using the [init function](GruConfig::init). #[derive(Config)] pub struct GruConfig { /// The size of the input features. @@ -24,7 +24,11 @@ pub struct GruConfig { pub initializer: Initializer, } -/// The Gru module. This implementation is for a unidirectional, stateless, Gru. +/// The Gru (Gated recurrent unit) module. This implementation is for a unidirectional, stateless, Gru. +/// +/// Introduced in the paper: [Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation](https://arxiv.org/abs/1406.1078). +/// +/// Should be created with [GruConfig]. #[derive(Module, Debug)] pub struct Gru { update_gate: GateController, @@ -73,13 +77,11 @@ impl Gru { /// Applies the forward pass on the input tensor. This GRU implementation /// returns a single state tensor with dimensions [batch_size, sequence_length, hidden_size]. /// - /// Parameters: - /// batched_input: The input tensor of shape [batch_size, sequence_length, input_size]. - /// state: An optional tensor representing an initial cell state with the same dimensions - /// as batched_input. If none is provided, one will be generated. - /// - /// Returns: - /// The resulting state tensor, with shape [batch_size, sequence_length, hidden_size]. + /// # Shapes + /// - batched_input: `[batch_size, sequence_length, input_size]`. + /// - state: An optional tensor representing an initial cell state with the same dimensions + /// as batched_input. If none is provided, one will be generated. + /// - output: `[batch_size, sequence_length, hidden_size]`. 
     pub fn forward(
         &self,
         batched_input: Tensor<B, 3>,
@@ -177,8 +179,8 @@ impl<B: Backend> Gru<B> {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::tensor::{Data, Distribution};
     use crate::{module::Param, nn::LinearRecord, TestBackend};
-    use burn_tensor::{Data, Distribution};
 
     /// Test forward pass with simple input vector.
     ///
diff --git a/crates/burn-core/src/nn/rnn/lstm.rs b/crates/burn-core/src/nn/rnn/lstm.rs
index 82025c6364..8690b16d27 100644
--- a/crates/burn-core/src/nn/rnn/lstm.rs
+++ b/crates/burn-core/src/nn/rnn/lstm.rs
@@ -4,9 +4,9 @@
 use crate::config::Config;
 use crate::module::Module;
 use crate::nn::rnn::gate_controller::GateController;
 use crate::nn::Initializer;
+use crate::tensor::activation;
 use crate::tensor::backend::Backend;
 use crate::tensor::Tensor;
-use burn_tensor::activation;
 
 /// A LstmState is used to store cell state and hidden state in LSTM.
 pub struct LstmState {
@@ -23,7 +23,7 @@ impl LstmState {
     }
 }
 
-/// The configuration for a [lstm](Lstm) module.
+/// Configuration to create a [Lstm](Lstm) module using the [init function](LstmConfig::init).
 #[derive(Config)]
 pub struct LstmConfig {
     /// The size of the input features.
@@ -38,6 +38,10 @@ pub struct LstmConfig {
 }
 
 /// The Lstm module. This implementation is for a unidirectional, stateless, Lstm.
+///
+/// Introduced in the paper: [Long Short-Term Memory](https://www.researchgate.net/publication/13853244).
+///
+/// Should be created with [LstmConfig].
 #[derive(Module, Debug)]
 pub struct Lstm<B: Backend> {
     /// The input gate regulates which information to update and store in the cell state at each time step.
@@ -171,7 +175,7 @@ impl<B: Backend> Lstm<B> {
     }
 }
 
-/// The configuration for a [Bidirectional LSTM](BiLstm) module.
+/// Configuration to create a [BiLstm](BiLstm) module using the [init function](BiLstmConfig::init).
 #[derive(Config)]
 pub struct BiLstmConfig {
     /// The size of the input features.
@@ -186,6 +190,10 @@ pub struct BiLstmConfig {
 }
 
 /// The BiLstm module. This implementation is for Bidirectional LSTM.
+///
+/// Introduced in the paper: [Framewise phoneme classification with bidirectional LSTM and other neural network architectures](https://www.cs.toronto.edu/~graves/ijcnn_2005.pdf).
+///
+/// Should be created with [BiLstmConfig].
 #[derive(Module, Debug)]
 pub struct BiLstm<B: Backend> {
     /// LSTM for the forward direction.
@@ -298,8 +306,8 @@ impl<B: Backend> BiLstm<B> {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::tensor::{Data, Device, Distribution};
     use crate::{module::Param, nn::LinearRecord, TestBackend};
-    use burn_tensor::{Data, Device, Distribution};
 
     #[cfg(feature = "std")]
     use crate::TestAutodiffBackend;
@@ -451,7 +459,7 @@ mod tests {
     #[test]
     #[cfg(feature = "std")]
     fn test_batched_backward_pass() {
-        use burn_tensor::Shape;
+        use crate::tensor::Shape;
         let device = Default::default();
         let lstm = LstmConfig::new(64, 32, true).init(&device);
         let shape: Shape<3> = [8, 10, 64].into();
diff --git a/crates/burn-core/src/nn/rope_encoding.rs b/crates/burn-core/src/nn/rope_encoding.rs
index b8f4e97903..9a10d5d0a8 100644
--- a/crates/burn-core/src/nn/rope_encoding.rs
+++ b/crates/burn-core/src/nn/rope_encoding.rs
@@ -2,25 +2,25 @@
 use crate::config::Config;
 use crate::module::Module;
 use crate::tensor::backend::Backend;
+use crate::tensor::Int;
 use crate::tensor::Tensor;
 use alloc::vec;
-use burn_tensor::Int;
 
 #[cfg(not(feature = "std"))]
 use num_traits::Float;
 
-/// Configuration to create a [RotaryEncoding](RotaryEncoding) layer.
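// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): using the Lstm module documented
// above. `LstmConfig::new(64, 32, true).init(&device)` mirrors the call in the
// tests shown in this diff; the `(output, state)` return of `forward` and the
// `None` initial state are assumptions about this crate version's API.
use burn::nn::{Lstm, LstmConfig};
use burn::tensor::{backend::Backend, Distribution, Tensor};

fn run_lstm<B: Backend>(device: &B::Device) -> Tensor<B, 3> {
    let lstm: Lstm<B> = LstmConfig::new(64, 32, true).init(device);
    // batched_input: [batch_size, sequence_length, input_size].
    let input = Tensor::<B, 3>::random([8, 10, 64], Distribution::Default, device);
    // output: [batch_size, sequence_length, hidden_size]; the final state is ignored here.
    let (output, _state) = lstm.forward(input, None);
    output
}
// ---------------------------------------------------------------------------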
+/// Configuration to create a [RotaryEncoding](RotaryEncoding) layer using the [init function](RotaryEncodingConfig::init).
 #[derive(Config, Debug)]
 pub struct RotaryEncodingConfig {
     /// Maximum sequence length of input
-    max_sequence_length: usize,
+    pub max_sequence_length: usize,
 
     /// Size of the input embedding or hidden dimension
-    d_model: usize,
+    pub d_model: usize,
 
     /// Scaling factor for frequency computation. Defaults to 10000.0
     #[config(default = "10000.0")]
-    theta: f32,
+    pub theta: f32,
 }
 
 impl RotaryEncodingConfig {
@@ -84,6 +84,8 @@ impl RotaryEncodingConfig {
 /// explicit relative position dependency in self-attention formulation.
 ///
 /// Introduced in the paper: [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864)
+///
+/// Should be created using [RotaryEncodingConfig].
 #[derive(Module, Debug)]
 pub struct RotaryEncoding<B: Backend> {
     /// Frequency Tensor of shape (max_sequence_length, d_model, 2) with real and imaginary components
diff --git a/crates/burn-core/src/nn/swiglu.rs b/crates/burn-core/src/nn/swiglu.rs
index 6949b4ba44..3dacbae68a 100644
--- a/crates/burn-core/src/nn/swiglu.rs
+++ b/crates/burn-core/src/nn/swiglu.rs
@@ -7,7 +7,7 @@ use crate::tensor::{backend::Backend, Tensor};
 
 use super::{Initializer, Linear, LinearConfig};
 
-/// Configuration to create a [SwiGlu](SwiGlu) activation layer.
+/// Configuration to create a [SwiGlu](SwiGlu) activation layer using the [init function](SwiGluConfig::init).
 #[derive(Config, Debug)]
 pub struct SwiGluConfig {
     /// The size of the input features.
@@ -29,16 +29,15 @@ pub struct SwiGluConfig {
 /// The SwiGLU activation function is defined as:
 /// `SwiGLU(x) = Swish(W_inner * x + b_inner) * (W_outer * x + b_outer)`
 ///
-/// # Params
-///
-/// - linear inner: The inner linear layer for Swish activation function
-///   with `d_input` input features and `d_output` output features.
-/// - linear outer: Outer Linear layer for element wise multiplication
-///   with `d_input` input features and `d_output` output features.
+/// Should be created with [SwiGluConfig].
 #[derive(Module, Debug)]
 pub struct SwiGlu<B: Backend> {
-    linear_inner: Linear<B>,
-    linear_outer: Linear<B>,
+    /// The inner linear layer for Swish activation function
+    /// with `d_input` input features and `d_output` output features.
+    pub linear_inner: Linear<B>,
+    /// The outer linear layer for element wise multiplication
+    /// with `d_input` input features and `d_output` output features.
+    pub linear_outer: Linear<B>,
 }
 
 impl SwiGluConfig {
@@ -58,11 +57,11 @@ impl SwiGluConfig {
     }
 }
 
 impl<B: Backend> SwiGlu<B> {
-    /// Applies the forward pass on the input tensor.
+    /// Applies the Swish Gated Linear Unit to the input tensor.
     ///
     /// # Shapes
     ///
-    /// - tensor: `[batch_size, seq_length, d_input]`
+    /// - input: `[batch_size, seq_length, d_input]`
     /// - output: `[batch_size, seq_length, d_output]`
     pub fn forward(&self, input: Tensor<B, 3>) -> Tensor<B, 3> {
         let x = self.linear_inner.forward(input.clone());
diff --git a/crates/burn-core/src/nn/transformer/decoder.rs b/crates/burn-core/src/nn/transformer/decoder.rs
index 7dd9ecfd6b..85fc50159f 100644
--- a/crates/burn-core/src/nn/transformer/decoder.rs
+++ b/crates/burn-core/src/nn/transformer/decoder.rs
@@ -1,5 +1,5 @@
+use crate::tensor::Bool;
 use alloc::vec::Vec;
-use burn_tensor::Bool;
 
 use crate::{
     self as burn,
@@ -17,7 +17,7 @@ use crate::{
     tensor::{backend::Backend, Tensor},
 };
 
-/// Configuration to create a [Transformer Decoder](TransformerDecoder) layer.
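// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): applying the SwiGlu layer above,
// i.e. `SwiGLU(x) = Swish(W_inner * x + b_inner) * (W_outer * x + b_outer)`,
// to a [batch_size, seq_length, d_input] tensor. `SwiGluConfig::new(d_input,
// d_output)` and `init(&device)` are assumed from the config shown above.
use burn::nn::{SwiGlu, SwiGluConfig};
use burn::tensor::{backend::Backend, Distribution, Tensor};

fn run_swiglu<B: Backend>(device: &B::Device) -> Tensor<B, 3> {
    let (d_input, d_output) = (64, 128);
    let swiglu: SwiGlu<B> = SwiGluConfig::new(d_input, d_output).init(device);
    let input = Tensor::<B, 3>::random([2, 16, d_input], Distribution::Default, device);
    // output: [batch_size, seq_length, d_output].
    swiglu.forward(input)
}
// ---------------------------------------------------------------------------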
+/// Configuration to create a [Transformer Decoder](TransformerDecoder) layer using the [init function](TransformerDecoderConfig::init).
 #[derive(Config)]
 pub struct TransformerDecoderConfig {
     /// The size of the model.
@@ -54,6 +54,8 @@ pub struct TransformerDecoderConfig {
 /// # Params
 ///
 /// - layers: transformer decoder layers with `d_model` input and output features.
+///
+/// Should be created using [TransformerDecoderConfig]
 #[derive(Module, Debug)]
 pub struct TransformerDecoder<B: Backend> {
     layers: Vec<TransformerDecoderLayer<B>>,
@@ -204,6 +206,7 @@ impl<B: Backend> TransformerDecoderLayer<B> {
         }
     }
 
+    /// Applies the TransformerDecoder forward pass to the input tensor.
     fn forward(&self, mut input: TransformerDecoderInput<B>) -> TransformerDecoderInput<B> {
         // Self attention residual path.
         let x = input.target;
@@ -401,8 +404,8 @@ impl<B: Backend> TransformerDecoder<B> {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::tensor::Distribution;
     use crate::{nn::attention::generate_autoregressive_mask, TestBackend};
-    use burn_tensor::Distribution;
 
     #[test]
     fn test_autoregressive_norm_last() {
diff --git a/crates/burn-core/src/nn/transformer/encoder.rs b/crates/burn-core/src/nn/transformer/encoder.rs
index 1eba8658bc..0eb226a3b9 100644
--- a/crates/burn-core/src/nn/transformer/encoder.rs
+++ b/crates/burn-core/src/nn/transformer/encoder.rs
@@ -1,5 +1,5 @@
+use crate::tensor::Bool;
 use alloc::vec::Vec;
-use burn_tensor::Bool;
 
 use crate::{
     self as burn,
@@ -17,7 +17,7 @@ use crate::{
     tensor::{backend::Backend, Tensor},
 };
 
-/// Configuration to create a [Transformer Encoder](TransformerEncoder) layer.
+/// Configuration to create a [Transformer Encoder](TransformerEncoder) layer using the [init function](TransformerEncoderConfig::init).
 #[derive(Config)]
 pub struct TransformerEncoderConfig {
     /// The size of the model.
@@ -54,6 +54,8 @@ pub struct TransformerEncoderConfig {
 /// # Params
 ///
 /// - layers: transformer encoder layers with `d_model` input and output features.
+///
+/// Should be created using [TransformerEncoderConfig]
 #[derive(Module, Debug)]
 pub struct TransformerEncoder<B: Backend> {
     layers: Vec<TransformerEncoderLayer<B>>,
@@ -338,8 +340,8 @@ impl<B: Backend> TransformerEncoderAutoregressiveCache<B> {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::tensor::Distribution;
     use crate::{nn::attention::generate_autoregressive_mask, TestBackend};
-    use burn_tensor::Distribution;
 
     #[test]
     fn test_autoregressive_norm_last() {
diff --git a/crates/burn-core/src/nn/transformer/pwff.rs b/crates/burn-core/src/nn/transformer/pwff.rs
index 30c258d5ec..bd168b6433 100644
--- a/crates/burn-core/src/nn/transformer/pwff.rs
+++ b/crates/burn-core/src/nn/transformer/pwff.rs
@@ -8,7 +8,7 @@ use crate::{
     tensor::{backend::Backend, Tensor},
 };
 
-/// Configuration to create a [position-wise feed-forward](PositionWiseFeedForward) layer.
+/// Configuration to create a [position-wise feed-forward](PositionWiseFeedForward) layer using the [init function](PositionWiseFeedForwardConfig::init).
 #[derive(Config)]
 pub struct PositionWiseFeedForwardConfig {
     /// The size of the input and output features.
@@ -25,12 +25,16 @@ pub struct PositionWiseFeedForwardConfig {
     pub initializer: Initializer,
 }
 
-/// Applies the position-wise feed-forward network to the input tensor.
+/// Applies the position-wise feed-forward network to the input tensor, as described in the paper [Attention Is All You Need](https://arxiv.org/pdf/1706.03762v7).
 ///
 /// # Params
 ///
 /// - linear inner: Linear layer with `d_model` input features and `d_ff` output features.
 /// - linear outer: Linear layer with `d_ff` input features and `d_model` output features.
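// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): building and running the
// TransformerEncoder documented above, with the autoregressive mask used in the
// tests. The `TransformerEncoderConfig::new(d_model, d_ff, n_heads, n_layers)`
// argument order is an assumption based on the config fields.
use burn::nn::attention::generate_autoregressive_mask;
use burn::nn::transformer::{
    TransformerEncoder, TransformerEncoderConfig, TransformerEncoderInput,
};
use burn::tensor::{backend::Backend, Distribution, Tensor};

fn encode<B: Backend>(device: &B::Device) -> Tensor<B, 3> {
    let (batch_size, seq_length, d_model) = (2, 16, 64);
    let encoder: TransformerEncoder<B> =
        TransformerEncoderConfig::new(d_model, 4 * d_model, 4, 2).init(device);
    let tokens =
        Tensor::<B, 3>::random([batch_size, seq_length, d_model], Distribution::Default, device);
    // Optional causal mask so that position i only attends to positions <= i.
    let mask = generate_autoregressive_mask::<B>(batch_size, seq_length, device);
    encoder.forward(TransformerEncoderInput::new(tokens).mask_attn(mask))
}
// ---------------------------------------------------------------------------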
+///
+/// `FFN(x) = max(0, xW1 + b1)W2 + b2`
+///
+/// Should be created using [PositionWiseFeedForwardConfig]
 #[derive(Module, Debug)]
 pub struct PositionWiseFeedForward<B: Backend> {
     linear_inner: Linear<B>,
diff --git a/crates/burn-core/src/nn/unfold.rs b/crates/burn-core/src/nn/unfold.rs
index 26711622e3..31acb1a87f 100644
--- a/crates/burn-core/src/nn/unfold.rs
+++ b/crates/burn-core/src/nn/unfold.rs
@@ -2,12 +2,13 @@ use crate as burn;
 
 use crate::config::Config;
 use crate::module::Module;
-use burn_tensor::backend::Backend;
-use burn_tensor::module::unfold4d;
-use burn_tensor::ops::UnfoldOptions;
-use burn_tensor::Tensor;
+use crate::tensor::backend::Backend;
+use crate::tensor::ops::UnfoldOptions;
+use crate::tensor::Tensor;
 
-/// Configuration to create an [unfold 4D](Unfold4d) layer.
+use crate::tensor::module::unfold4d;
+
+/// Configuration to create an [unfold 4d](Unfold4d) layer using the [init function](Unfold4dConfig::init).
 #[derive(Config, Debug)]
 pub struct Unfold4dConfig {
     /// The size of the kernel.
@@ -24,13 +25,15 @@ pub struct Unfold4dConfig {
 }
 
 /// Four-dimensional unfolding.
+///
+/// Should be created with [Unfold4dConfig].
 #[derive(Module, Clone, Debug)]
 pub struct Unfold4d {
     config: Unfold4dConfig,
 }
 
 impl Unfold4dConfig {
-    /// Initialize a new [unfold 4k](Unfold4d) module.
+    /// Initializes a new [Unfold4d] module.
     pub fn init(&self) -> Unfold4d {
         Unfold4d {
             config: self.clone(),
@@ -41,10 +44,12 @@ impl Unfold4dConfig {
 impl Unfold4d {
     /// Applies the forward pass on the input tensor.
     ///
+    /// See [unfold4d](crate::tensor::module::unfold4d) for more information.
+    ///
     /// # Shapes
     ///
-    /// input: `[batch_size, channels_in, height, width]`,
-    /// returns: `[batch_size, channels_in * kernel_size_1 * kernel_size_2, number of blocks]`,
+    /// - input: `[batch_size, channels_in, height, width]`
+    /// - returns: `[batch_size, channels_in * kernel_size_1 * kernel_size_2, number of blocks]`
     pub fn forward<B: Backend>(&self, input: Tensor<B, 4>) -> Tensor<B, 3> {
         unfold4d(
             input,
diff --git a/crates/burn-tensor/src/tensor/activation/base.rs b/crates/burn-tensor/src/tensor/activation/base.rs
index 77d9e04170..21cd516431 100644
--- a/crates/burn-tensor/src/tensor/activation/base.rs
+++ b/crates/burn-tensor/src/tensor/activation/base.rs
@@ -2,7 +2,10 @@ use crate::backend::Backend;
 use crate::check::TensorCheck;
 use crate::{check, Tensor};
 
-/// Applies the rectified linear unit function.
+/// Applies the rectified linear unit function as described in the paper [Deep Learning using
+/// Rectified Linear Units (ReLU)](https://arxiv.org/pdf/1803.08375).
+///
+/// `y = max(0, x)`
 pub fn relu<const D: usize, B: Backend>(tensor: Tensor<B, D>) -> Tensor<B, D> {
     tensor.relu()
 }
@@ -20,12 +23,12 @@ pub fn leaky_relu(
     ))
 }
 
-/// Applies the Gaussian Error Linear Units function as described in the paper in [Gaussian Error Linear Units (GELUs)](https://arxiv.org/pdf/1606.08415v3.pdf).
+/// Applies the Gaussian Error Linear Units function as described in the paper [Gaussian Error Linear Units (GELUs)](https://arxiv.org/pdf/1606.08415v3.pdf).
 pub fn gelu<const D: usize, B: Backend>(tensor: Tensor<B, D>) -> Tensor<B, D> {
     Tensor::from_primitive(B::gelu(tensor.primitive))
 }
 
-/// Applies Parametric ReLu activation
+/// Applies the Parametric ReLu activation function as described in the paper [Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification](https://arxiv.org/pdf/1502.01852).
 /// ` PReLu(x) = max(0,x) + \alpha * min(0,x)`
 /// tensor is assumed to be of shape \[batch_size, channels, ...\]
 /// alpha is assumed to be of shape \[channels\] or \[1\]
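// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the activation functions whose
// docs are extended above, matching the stated formulas `y = max(0, x)` and
// `PReLu(x) = max(0,x) + alpha * min(0,x)`. Float values are placeholders.
use burn::tensor::activation::{gelu, prelu, relu};
use burn::tensor::{backend::Backend, Tensor};

fn activations<B: Backend>(device: &B::Device) {
    let x = Tensor::<B, 2>::from_floats([[-1.0, 0.5], [2.0, -3.0]], device);
    // Element-wise max(0, x): negative entries become zero.
    let _relu_out = relu(x.clone());
    // Smooth GELU of the same values.
    let _gelu_out = gelu(x);
    // PReLU expects a [batch_size, channels, ...] tensor and alpha of shape [channels] or [1].
    let x3 = Tensor::<B, 3>::from_floats([[[-1.0, 2.0], [0.5, -0.25]]], device);
    let alpha = Tensor::<B, 1>::from_floats([0.1, 0.2], device);
    let _prelu_out = prelu(x3, alpha);
}
// ---------------------------------------------------------------------------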