From 5f15e97f696126062aaeb65e2c92aef0080e56b9 Mon Sep 17 00:00:00 2001 From: quantumliu Date: Sun, 18 Feb 2024 13:56:34 +0800 Subject: [PATCH 01/27] Jittor backend 'jittor_backend.py' is added. --- tensorlayerx/backend/ops/jittor_backend.py | 1846 ++++++++++++++++++++ 1 file changed, 1846 insertions(+) create mode 100644 tensorlayerx/backend/ops/jittor_backend.py diff --git a/tensorlayerx/backend/ops/jittor_backend.py b/tensorlayerx/backend/ops/jittor_backend.py new file mode 100644 index 0000000..2b13457 --- /dev/null +++ b/tensorlayerx/backend/ops/jittor_backend.py @@ -0,0 +1,1846 @@ +#! /usr/bin/python +# -*- coding: utf-8 -*- + +from __future__ import absolute_import, division, print_function +from .jittor_nn import nchw_to_nhwc, nhwc_to_nchw +import jittor as jt +# import jittor.nn.functional as F +import numpy as np +import random + + +_dtypeDict = { + 'DType': None, + 'float16': jt.float16, + 'float32': jt.float32, + 'float64': jt.float64, + 'int8': jt.int8, + 'int16': jt.int16, + 'int32': jt.int32, + 'int64': jt.int64, + 'uint8': jt.uint8, + 'uint16': None, + 'uint32': None, + 'uint64': None, + 'bool': jt.bool, + 'complex64': jt.complex64, + 'complex128': jt.complex128 +} + +DType = None +float16 = jt.float16 +float32 = jt.float32 +float64 = jt.float64 +int8 = jt.int8 +int16 = jt.int16 +int32 = jt.int32 +int64 = jt.int64 +uint8 = jt.uint8 +uint16 = jt.uint16 +uint32 = jt.uint32 +uint64 = jt.uint64 +bool = None +complex64 = None +complex128 = None + + +def set_context(**kwargs): + raise Exception("Using PyTorch backend,You don't need to set context") + + +def get_tensor_shape(x): + return list(x.shape()) + + +# initializers +def zeros(shape, dtype=None, device = None): + """ + Creates a tensor with all elements set to zero. + + Parameters + ---------- + shape : A list of integers + a tuple of integers, or a 1-D Tensor of type int32. + dtype : tensor + The DType of an element in the resulting Tensor + + Returns + ------- + A Tensor with all elements set to zero. + + """ + if device == 'gpu': + jt.flags.use_cuda = 1 + + return jt.zeros(size=shape, dtype=dtype) + + +def ones(shape, dtype=None, device = None): + """ + Creates a tensor with all elements set to ones. + + Parameters + ---------- + shape : A list of integers + a tuple of integers, or a 1-D Tensor of type int32. + dtype : tensor + The DType of an element in the resulting Tensor + + Returns + ------- + A Tensor with all elements set to zero. + + """ + if device == 'gpu': + jt.flags.use_cuda = 1 + + return jt.ones(size=shape, dtype=dtype) + + +def constant(value, dtype=None, shape=None, device =None): + """ + Creates a constant tensor from a tensor-like object. + + Parameters + ---------- + value : int + A constant value (or list) of output type dtype. + dtype : tensor + The type of the elements of the resulting tensor. + shape : tuple + Optional dimensions of resulting tensor. + + Returns + ------- + A Constant Tensor. + + """ + if device == 'gpu': + jt.flags.use_cuda = 1 + w = jt.empty(size=shape, dtype=dtype) + return jt.nn.init.constant_(w, value) + + +def random_uniform(shape, minval=0, maxval=1, dtype=None, seed=None): + """ + Outputs random values from a uniform distribution. + + Parameters + ---------- + shape : tuple + A 1-D integer Tensor or Python array. The shape of the output tensor. + minval : int + The lower bound on the range of random values to generate (inclusive). Defaults to 0. + maxval : int + The upper bound on the range of random values to generate (exclusive). Defaults to 1 if dtype is floating point. 
+ dtype : tensor + The type of the output: float16, float32, float64, int32, or int64. + seed : int + Used in combination with tf.random.set_seed to create a reproducible sequence of tensors across multiple calls. + Returns + ------- + A tensor of the specified shape filled with random uniform values. + + """ + + if seed is None: + jt.random.seed() + else: + jt.random.manual_seed(seed) + w = jt.randn(size=shape, dtype=dtype) + out = w.uniform_(minval, maxval) + return out + + +def random_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None): + """ + Outputs random values from a normal distribution. + + Parameters + ---------- + shape : tuple + A 1-D integer Tensor or Python array. The shape of the output tensor. + mean : float + The mean of the normal distribution + stddev : float + The standard deviation of the normal distribution. + dtype : tensor + The type of the output. + seed : A Python integer + Used to create a random seed for the distribution + + Returns + ------- + A tensor of the specified shape filled with random normal values. + + """ + + if seed is None: + jt.random.seed() + else: + jt.random.manual_seed(seed) + w = jt.randn(size=shape, dtype=dtype) + out = w.normal_(mean=mean, std=stddev) + return out + + +def truncated_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None): + """ + Outputs random values from a truncated normal distribution. + + Parameters + ---------- + shape : tuple + A 1-D integer Tensor or Python array. The shape of the output tensor. + mean : float + The mean of the normal distribution + stddev : float + The standard deviation of the normal distribution. + dtype : tensor + The type of the output. + seed : A Python integer + Used to create a random seed for the distribution + + Returns + ------- + A tensor of the specified shape filled with random truncated normal values. + + """ + + tensor = jt.empty(size=shape, dtype=dtype) + out = jt.nn.init.trunc_normal_(tensor, mean=mean, std=stddev) + return out + + +def he_normal(shape, a = 0, mode = 'fan_in', nonlinearity='leaky_relu', dtype=None, seed=None): + """ + He normal initializer. + + Parameters + ---------- + seed : A Python integer. + Used to seed the random generator. + shape : tuple + A 1-D integer Tensor or Python array. The shape of the output tensor. + dtype : tensor + The type of the output. + + Returns + ------- + A tensor of the specified shape filled with he normal values. + """ + + tensor = jt.empty(size=shape, dtype=dtype) + out = jt.nn.init.kaiming_normal_(tensor, a=a, mode = mode, nonlinearity = nonlinearity) + return out + +def he_uniform(shape, a = 0, mode = 'fan_in', nonlinearity='leaky_relu', dtype=None, seed=None): + + tensor = jt.empty(size=shape, dtype=dtype) + out = jt.nn.init.kaiming_uniform_(tensor, a=a, mode = mode, nonlinearity = nonlinearity) + return out + +def xavier_normal(shape, gain = 1.0, dtype=None, seed=None): + _tensor = jt.empty(size=shape, dtype=dtype) + return jt.nn.init.xavier_normal_(_tensor, gain) + + +def xavier_uniform(shape, gain = 1.0, dtype=None, seed=None): + _tensor = jt.empty(size=shape, dtype=dtype) + return jt.nn.init.xavier_uniform_(_tensor, gain) + + +def Variable(initial_value, name=None, trainable=True): + """ + Creates a new variable with value initial_value. + + Parameters + ---------- + initial_value : tensor + A Tensor, or Python object convertible to a Tensor + name : str + Optional name for the variable. Defaults to 'Variable' and gets uniquified automatically. 
+ Returns + ------- + Variable + """ + return jt.nn.Parameter(data=initial_value, requires_grad=trainable) + + +class MatMul(object): + + def __init__(self, transpose_a=False, transpose_b=False): + self.transpose_a = transpose_a + self.transpose_b = transpose_b + + def __call__(self, a, b): + return jt.matmul(a, b) + + +def matmul(a, b, transpose_a=False, transpose_b=False): + """ + Multiplies matrix a by matrix b, producing a * b. + + Parameters + ---------- + a : tensor + type float16, float32, float64, int32, complex64, complex128 and rank > 1. + b : tensor + with same type and rank as a. + + Returns + ------- + A Tensor of the same type as a and b + """ + return jt.matmul(a, b) + + +def add(value, bias): + """ + Returns x + y element-wise. + + Parameters + ---------- + value : tensor. + Must be one of the following types: bfloat16, half, float32, float64, + uint8, int8, int16, int32, int64, complex64, complex128, string. + bias : tensor + Must have the same type as a + + Returns + ------- + A Tensor. Has the same type as a. + """ + return jt.add(value, bias) + + +def dtypes(dt): + """ + Data dtypes. + + Parameters + ---------- + dt : string + It could be 'uint8', 'uint16', 'uint32', 'uint64', 'int8', 'int16', + 'int32', 'int64', 'float16', 'float32', 'float64', 'DType'. + + Returns + ------- + Data dtypes + """ + if dt not in _dtypeDict.keys(): + raise Exception("Unsupported dtype: {}".format(dt)) + return _dtypeDict[dt] + + +class Maximum(object): + + def __init__(self): + pass + + def __call__(self, x, y): + return jt.maximum(x, y) + + +class Minimum(object): + + def __init__(self): + pass + + def __call__(self, x, y): + return jt.minimum(x, y) + + +def minimum(x, y): + """ + Returns the min of x and y (i.e. x < y ? x : y) element-wise. + + Parameters + ---------- + x : tensor. + Must be one of the following types: bfloat16, half, float32, float64, int32, int64. + y : A Tensor. + Must have the same type as x. + + Returns + ------- + A Tensor. Has the same type as x + """ + + return jt.minimum(x, y) + + +class FlattenReshape(object): + + def __init__(self): + pass + + def __call__(self, inputs): + dim = 1 + for d in get_tensor_shape(inputs)[1:]: + dim *= d + return jt.reshape(inputs, [-1, dim]) + + +class Reshape(object): + + def __init__(self, shape): + self.shape = shape + + def __call__(self, tensor): + return jt.reshape(tensor, self.shape) + + +def reshape(tensor, shape): + """ + Reshapes a tensor. + + Parameters + ---------- + tensor : tensor + A Tensor. + shape : tensor + Defines the shape of the output tensor. + Returns + ------- + A Tensor. Has the same type as tensor + """ + + return jt.reshape(tensor, shape) + + +class Concat(object): + + def __init__(self, axis=0): + super(Concat, self).__init__() + self.axis = axis + + def __call__(self, values): + return jt.concat(tensors=values, dim=self.axis) + + +def concat(values, axis=0): + """ + Concatenates tensors along one dimension. + + Parameters + ---------- + values : list + A list of Tensor objects or a single Tensor + axis : int + 0-D int32 Tensor. Dimension along which to concatenate + Returns + ------- + A Tensor resulting from concatenation of the input tensors. + """ + + return jt.concat(values, axis) + + +def convert_to_tensor(value, dtype=None, device = None): + """ + Converts the given value to a Tensor. + + Parameters + ---------- + value : object + An object whose type has a registered Tensor conversion function. + dtype : optional + Optional element type for the returned tensor. 
If missing, the type is inferred from the type of value. + + Returns + ------- + A Tensor based on value. + """ + if isinstance(dtype, str): + dtype = _dtypeDict[dtype] + if device == 'gpu': + jt.flags.use_cuda = 1 + return jt.array(value, dtype=dtype) + + +def convert_to_numpy(value): + try: + return value.numpy() + except: + return value.cpu().detach().numpy() + + +def sqrt(x): + """ + Computes square root of x element-wise. + + Parameters + ---------- + x : tensor + Must be one of the following types: bfloat16, half, float32, float64, complex64, complex128. + + Returns + ------- + A Tensor. Has the same type as x. + """ + return jt.sqrt(x) + + +class ReduceSum(object): + + def __init__(self, axis=None, keepdims=False): + self.axis = axis + self.keepdims = keepdims + + def __call__(self, input): + if self.axis is not None: + return jt.sum(input=input, dim=self.axis) + else: + return jt.sum(input=input) + + +class ReduceMean(object): + + def __init__(self, axis=None, keepdims=False): + self.axis = axis + self.keepdims = keepdims + + def __call__(self, inputs): + if self.axis is not None: + return jt.mean(input=inputs, dim=self.axis, keepdim=self.keepdims) + else: + return jt.mean(inputs) + + +def reduce_mean(input_tensor, axis=None, keepdims=False): + """ + Computes the mean of elements across dimensions of a tensor. + + Parameters + ---------- + input_tensor : tensor + The tensor to reduce. Should have numeric type. + axis : list + The dimensions to reduce. If None (the default), reduces all dimensions. + Must be in the range [-rank(input_tensor), rank(input_tensor)). + name : str + A name for the operation (optional). + + Returns + ------- + The reduced tensor. + """ + + if axis is not None: + return jt.mean(input_tensor, dim=axis, keepdim=keepdims) + else: + return jt.mean(input_tensor) + + +class ReduceMax(object): + + def __init__(self, axis=None, keepdims=False): + self.axis = axis + self.keepdims = keepdims + + + def __call__(self, inputs): + if self.axis is not None: + if isinstance(self.axis, (list, tuple)): + out = inputs + for dim in self.axis[::-1]: + out = jt.max(out, dim=dim, keepdim=self.keepdims).values + return out + else: + return jt.max(inputs, dim=self.axis, keepdim=self.keepdims).values + else: + return jt.max(inputs) + + +def reduce_max(input_tensor, axis=None, keepdims=False): + """ + Computes the maximum of elements across dimensions of a tensor. + + Parameters + ---------- + input_tensor : tensor + The tensor to reduce. Should have real numeric type. + axis : int + The dimensions to reduce. If None (the default), reduces all dimensions. + Must be in the range [-rank(input_tensor), rank(input_tensor)). + name : str + A name for the operation (optional). + + Returns + ------- + The reduced tensor. + """ + + if axis is not None: + return jt.max(input_tensor, dim=axis, keepdim=keepdims).values + else: + return jt.max(input_tensor) + + +def reduce_min(input_tensor, axis=None, keepdims=False): + """ + Computes the minimum of elements across dimensions of a tensor. + + Parameters + ---------- + input_tensor : tensor + The tensor to reduce. Should have real numeric type. + axis : int + The dimensions to reduce. If None (the default), reduces all dimensions. + Must be in the range [-rank(input_tensor), rank(input_tensor)). + name : str + A name for the operation (optional). + + Returns + ------- + The reduced tensor. 
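+
+    Examples
+    --------
+    A minimal usage sketch (illustrative only; the tensor values are made up
+    and the axis path assumes the installed Jittor exposes a torch-style
+    jt.min call):
+
+    >>> x = jt.array([[1.0, 2.0], [3.0, 0.5]])
+    >>> reduce_min(x)           # global minimum
+    >>> reduce_min(x, axis=1)   # minimum of each row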
+ """ + + if axis is not None: + return jt.min(input_tensor, dim=axis, keepdim=keepdims).values + else: + return jt.min(input_tensor) + + +class Pad2d(object): + def __init__(self, padding, mode='constant', value=0.0, data_format="NCHW", name=None): + self.padding = padding + self._mode = mode + self._value = value + self._data_format = data_format + self._name = name + + def __call__(self, x): + if self._data_format == "NHWC": + x = nhwc_to_nchw(x) + output = jt.nn.functional.pad(x, self.padding, self._mode, value=self._value) + if self._data_format == "NHWC": + output = nchw_to_nhwc(output) + return output + + +class Pad(object): + + def __init__(self, paddings, mode="REFLECT", constant_values=0.0): + if mode not in ['CONSTANT', 'REFLECT', 'SYMMETRIC']: + raise Exception("Unsupported mode: {}".format(mode)) + self.paddings = self.correct_paddings(paddings) + self.mode = mode.lower() + self.constant_values = constant_values + + def __call__(self, x): + if self.mode in ['symmetric', 'reflect']: + if len(x.shape) == 3 and self.paddings[0:2] + self.paddings[4:] == (0, 0, 0, 0): + self.paddings = (self.paddings[2], self.paddings[3]) + x = jt.transpose(x, 1, 2) + elif len(x.shape) == 4 and self.paddings[0:2] + self.paddings[6:] == (0, 0, 0, 0): + self.paddings = (self.paddings[2:6])[::-1] + x = jt.transpose(x, 1, 3) + elif len(x.shape) == 5 and self.paddings[0:2] + self.paddings[8:] == (0, 0, 0, 0): + self.paddings = (self.paddings[2:8])[::-1] + x = jt.transpose(x, 1, 4) + else: + raise NotImplementedError("Only constant padding is implemented for arbitrary dimensions.") + + out = jt.nn.functional.pad(x, self.paddings, mode=self.mode, value=self.constant_values) + + if self.mode in ['symmetric', 'reflect']: + if len(x.shape) == 3: + out = jt.transpose(out, 1, 2) + if len(x.shape) == 4: + out = jt.transpose(out, 1, 3) + if len(x.shape) == 5: + out = jt.transpose(out, 1, 4) + return out + + def correct_paddings(self, paddings): + paddings = paddings[::-1] + _padding = [] + for p_i in paddings: + for pj in p_i: + _padding.append(pj) + return tuple(_padding) + + +def pad(tensor, paddings, mode='CONSTANT', constant_values=0): + """ + Pads a tensor. + + Parameters + ---------- + tensor : tensor + A Tensor. + paddings : tensor + A Tensor of type int32. + mode : str + One of "CONSTANT", "REFLECT", or "SYMMETRIC" (case-insensitive) + constant_values : int + In "CONSTANT" mode, the scalar pad value to use. Must be same type as tensor. + + Returns + ------- + A Tensor. Has the same type as tensor. + """ + pad_obj = Pad(paddings, mode, constant_values=constant_values) + return pad_obj(tensor) + + +class Unstack(object): + + def __init__(self, axis, num=None): + self.axis = axis + self.num = num + + def __call__(self, values): + out = [] + for o in jt.chunk(values, chunks=self.num, dim=self.axis): + out.append(jt.squeeze(o)) + return out + + +class Stack(object): + + def __init__(self, axis=0): + self.axis = axis + + def __call__(self, values): + return jt.stack(values, dim=self.axis) + + +def stack(values, axis=0): + """ + Stacks a list of rank-R tensors into one rank-(R+1) tensor. + + Parameters + ---------- + values : list + A list of Tensor objects with the same shape and type. + axis : int + An int. The axis to stack along. Defaults to the first dimension. + Negative values wrap around, so the valid range is [-(R+1), R+1). + + Returns + ------- + A stacked Tensor with the same type as values. 
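+
+    Examples
+    --------
+    A minimal usage sketch (values are illustrative):
+
+    >>> a = jt.array([1.0, 2.0])
+    >>> b = jt.array([3.0, 4.0])
+    >>> stack([a, b], axis=0)   # shape [2, 2], a and b become the rows
+    >>> stack([a, b], axis=1)   # shape [2, 2], elements paired position-wise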
+ """ + + return jt.stack(values, dim=axis) + + +class Meshgrid(object): + + def __init__(self, indexing='xy'): + super(Meshgrid, self).__init__() + self.index = indexing + + def __call__(self, *inputs): + return jt.meshgrid(*inputs, indexing=self.index) + + +def meshgrid(*args, **kwargs): + """ + Broadcasts parameters for evaluation on an N-D grid. + + Parameters + ---------- + x : tensor + Tensors with rank 1. + y : tensor + Tensors with rank 1. + + Returns + ------- + A list of N Tensors with rank N. + """ + + return jt.meshgrid(*args) + + +def arange(start, limit=None, delta=1, dtype=None): + """ + Creates a sequence of numbers. + + Parameters + ---------- + start : tensor + A 0-D Tensor (scalar). Acts as first entry in the range if limit is not None; + otherwise, acts as range limit and first entry defaults to 0. + limit : tensor + A 0-D Tensor (scalar). Upper limit of sequence, exclusive. If None, + defaults to the value of start while the first entry of the range defaults to 0. + delta : tensor + A 0-D Tensor (scalar). Number that increments start. Defaults to 1. + dtype : None or dtype + The type of the elements of the resulting tensor. + + Returns + ------- + An 1-D Tensor of type dtype. + """ + + return jt.arange(start=start, end=limit, step=delta, dtype=dtype) + + +class ExpandDims(object): + + def __init__(self, axis=0): + self.axis = axis + + def __call__(self, input): + return jt.unsqueeze(input=input, dim=self.axis) + + +def expand_dims(input, axis): + """ + Inserts a dimension of 1 into a tensor's shape. + + Parameters + ---------- + input : tensor + A Tensor. + axis : int + 0-D (scalar). Specifies the dimension index at which to expand the shape of input. + Must be in the range [-rank(input) - 1, rank(input)]. + + Returns + ------- + A Tensor with the same data as input, but its shape has an additional dimension of size 1 added. + """ + + return jt.unsqueeze(input, axis) + + +class Tile(object): + + def __init__(self): + pass + + def __call__(self, input, multiples): + return jt.tile(input, dims=multiples) + + +def tile(input, multiples): + """ + Constructs a tensor by tiling a given tensor. + + Parameters + ---------- + input : tensor + A Tensor. 1-D or higher. + multiples : tensor + Must be one of the following types: int32, int64. 1-D. + Length must be the same as the number of dimensions in input + + Returns + ------- + A Tensor. Has the same type as input. + """ + + return jt.tile(input, multiples) + + +class Cast(object): + + def __init__(self, dtype=None): + self.dtype = dtype + + def __call__(self, x): + return x.type(self.dtype) + + +def cast(x, dtype=None): + """ + Casts a tensor to a new type. + + Parameters + ---------- + x : tensor + A Tensor or SparseTensor or IndexedSlices of numeric type. + It could be uint8, uint16, uint32, uint64, int8, int16, int32, int64, float16, float32, float64. + dtype : dtpye + The destination type. The list of supported dtypes is the same as x + + Returns + ------- + A Tensor or SparseTensor or IndexedSlices with same shape as x and same type as dtype. + """ + + return x.type(dtype) + + +class Transpose(object): + + def __init__(self, perm, conjugate=False): + self.perm = perm + self.conjugate = conjugate + + def __call__(self, a): + return transpose(a, self.perm, self.conjugate) + + +def transpose(a, perm=None, conjugate=False): + """ + Transposes a. + + Parameters + ---------- + a : tensor + A Tensor. + perm : list / int + A permutation of the dimensions of a. 
+ conjugate : bool + Setting it to True is mathematically equivalent to tf.math.conj(tf.transpose(input)). + + Returns + ------- + A transposed Tensor. + """ + if perm == None: + if len(a.shape) <= 2: + return jt.t(a) + if len(a.shape) == 3: + perm = [2, 1, 0] + if len(a.shape) == 4: + perm = [3, 2, 1, 0] + if len(a.shape) == 5: + perm = [4, 3, 2, 1, 0] + out = jt.permute(a, perm) + if conjugate: + out = jt.conj_physical(out) + return out + + +def gather_nd(params, indices, batch_dims=0): + """ + Gather slices from params into a Tensor with shape specified by indices. + + Parameters + ---------- + params : tensor + The tensor from which to gather values. + indices : tensor + Must be one of the following types: int32, int64. Index tensor. + batch_dims : int + An integer or a scalar 'Tensor'. The number of batch dimensions. + + Returns + ------- + A Tensor. Has the same type as params. + """ + + out_shape = indices.shape[:-1] + indices = indices.unsqueeze(0).transpose(0, -1) + ndim = indices.shape[0] + indices = indices.long() + idx = jt.zeros_like(indices[0], device=indices.device).long() + m = 1 + + for i in range(ndim)[::-1]: + idx += indices[i] * m + m *= params.size(i) + out = jt.take(params, idx) + return out.view(out_shape) + +def scatter_nd(indices, updates, shape): + raise NotImplementedError + + +class ClipGradByValue(object): + def __init__(self, clip_min=-1, clip_max=1): + self.min = clip_min + self.max = clip_max + + def __call__(self, inputs): + jt.nn.utils.clip_grad_value_(inputs, clip_value=self.max) + + +class ClipGradByNorm(object): + def __init__(self, clip_norm=0.1): + self.clip_norm = clip_norm + + def __call__(self, inputs): + jt.nn.utils.clip_grad_norm_(inputs, max_norm=self.clip_norm, norm_type=2) + + +class ClipByGlobalNorm(object): + def __init__(self, clip_norm): + self.clip_norm = clip_norm + + def __call__(self, inputs): + raise NotImplementedError + + +def clip_by_value(t, clip_value_min, clip_value_max): + """ + Clips tensor values to a specified min and max. + + Parameters + ---------- + t : tensor + A Tensor or IndexedSlices + clip_value_min : tensor + A 0-D (scalar) Tensor, or a Tensor with the same shape as t. The minimum value to clip by + clip_value_max : tensor + A 0-D (scalar) Tensor, or a Tensor with the same shape as t. The minimum value to clip by + + Returns + ------- + A clipped Tensor or IndexedSlices. + """ + + t_min = clip_value_min + t_max = clip_value_max + + result = (t >= t_min) * t + (t < t_min) * t_min + result = (result <= t_max) * result + (result > t_max) * t_max + return result + + +def split(value, num_or_size_splits, axis=0): + """ + Splits a tensor into sub tensors. + + Parameters + ---------- + value : tensor + The Tensor to split. + num_or_size_splits : list + Either an integer indicating the number of splits along split_dim or a 1-D integer Tensor or + Python list containing the sizes of each output tensor along split_dim. + axis : int + The dimension along which to split. Must be in the range [-rank(value), rank(value)). Defaults to 0. + num : int + used to specify the number of outputs when it cannot be inferred from the shape of size_splits. + + Returns + ------- + Tensor objects resulting from splitting value. 
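+
+    Examples
+    --------
+    A minimal usage sketch (values are illustrative; an int split must divide
+    the size of the chosen axis, and list-form sizes are assumed to be
+    supported by the underlying jt.split):
+
+    >>> x = jt.array([[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]])
+    >>> split(x, 3, axis=1)        # three tensors of shape [2, 1]
+    >>> split(x, [1, 2], axis=1)   # a [2, 1] tensor and a [2, 2] tensor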
+ """ + if isinstance(num_or_size_splits, int): + nums = value.size(axis) + if nums % num_or_size_splits != 0: + raise ValueError("Expected input_axis_nums % num_or_size_splits == 0, but received input_axis_nums % num_or_size_splits = 0") + else: + num_or_size_splits = int(nums / num_or_size_splits) + return jt.split(value, num_or_size_splits, dim=axis) + + +class Floor(object): + + def __call__(self, x): + return jt.floor(x) + + +def floor(x): + return jt.floor(x) + + +def gather(params, indices, axis = None): + if axis is None: + axis = 0 + if axis < 0: + axis = len(params.shape) + axis + if axis == 0: + return params[indices] + elif axis == 1: + return params[:, indices] + elif axis == 2: + return params[:, :, indices] + elif axis == 3: + return params[:,:,:, indices] + + +def linspace(start, stop, num): + return jt.linspace(start=start, end=stop, steps=num) + + +def slice(inputs, starts, sizes): + + ends = [starts[i] + sizes[i] for i in range(len(starts))] + + if len(inputs.shape) == 1: + return inputs[starts[0] : ends[0]] + if len(inputs.shape) == 2: + return inputs[starts[0] : ends[0], starts[1]:ends[1]] + if len(inputs.shape) == 3: + return inputs[starts[0] : ends[0], starts[1]:ends[1], starts[2]:ends[2]] + if len(inputs.shape) == 4: + return inputs[starts[0]: ends[0], starts[1]:ends[1], starts[2]:ends[2], starts[3]:ends[3]] + if len(inputs.shape) == 5: + return inputs[starts[0]: ends[0], starts[1]:ends[1], starts[2]:ends[2], starts[3]:ends[3], starts[4]:ends[4]] + + +def add_n(inputs): + a = inputs[0] + for b in inputs[1:]: + a += b + return a + + +class OneHot(object): + + def __init__(self, depth=-1, on_value=None, off_value=None, axis=None, dtype=None): + self.depth = depth + self.on_value = on_value + self.off_value = off_value + self.axis = axis + self.dtype = dtype + + def __call__(self, inputs): + if [self.on_value, self.off_value] == [None, None]: + return jt.nn.functional.one_hot(inputs, self.depth) + else: + out = jt.nn.functional.one_hot(inputs, self.depth) + out = cast(out, jt.float64) + out = jt.where(out == 1, self.on_value, out) + out = jt.where(out == 0, self.off_value, out) + out = cast(out, jt.int) + return out + + +class L2Normalize(object): + + def __init__(self, axis=None, epsilon=1e-12): + self.axis = axis + self.epsilon = epsilon + + def __call__(self, input, *args, **kwargs): + + return jt.nn.functional.normalize(input, p = 2, dim=self.axis, eps=self.epsilon) + + + +class EmbeddingLookup(object): + + def __init__(self, max_norm=None): + self.max_norm = max_norm + self.padding_idx = None + self.norm_type = 2.0 + self.scale_grad_by_freq = False + self.sparse = False + + def __call__(self, params, ids): + Warning("Parameters max_norm, padding_idx, norm_type, scale_grad_by_freq, and sparse are not supported in Jittor backend.") + return jt.nn.embedding( + ids, params ) + + +class NCELoss(object): + + def __init__(self, num_true=1, sampled_values=None, remove_accidental_hits=False): + self.num_true = num_true + self.sampled_values = sampled_values + self.remove_accidental_hits = remove_accidental_hits + + def __call__(self, weights, biases, labels, inputs, num_sampled, num_classes): + raise NotImplementedError + + +class NotEqual(object): + + def __init__(self): + pass + + def __call__(self, x, y): + return jt.ne(x, y) + + +class CountNonzero(object): + + def __init__(self, keepdims=None, dtype=None): + self.keepdims = keepdims + self.dtype = dtype + + def __call__(self, input, axis=None): + + return jt.count_nonzero(input, dim=axis) + + +class Resize: + + def 
__init__(self, scale, method, antialias=False, data_format='channels_last'): + self.method = method + self.antialias = antialias + self.scale = scale + self.data_format = data_format + + def __call__(self, inputs): + if self.data_format == "channels_last": + inputs = nhwc_to_nchw(inputs) + outputs = jt.nn.interpolate(inputs, scale_factor=self.scale, mode=self.method, align_corners=self.antialias) + if self.data_format == "channels_last": + outputs = nchw_to_nhwc(outputs) + return outputs + + +def resize(inputs, output_size, method, antialias): + return jt.nn.interpolate(inputs, size=output_size, mode=method, align_corners=antialias) + + +class ZeroPadding1D(object): + + def __init__(self, padding, data_format): + if data_format == 'channels_first': + padding = ((0, 0), (0, 0), padding) + elif data_format == 'channels_last': + padding = ((0, 0), padding, (0, 0)) + else: + raise ValueError('data_format must be channels_first or channels_last.') + self.pad = Pad(paddings=padding) + + def __call__(self, inputs): + return self.pad(inputs) + + +class ZeroPadding2D(object): + + def __init__(self, padding, data_format): + if data_format == 'channels_first': + padding = ((0, 0), (0, 0), padding[0], padding[1]) + elif data_format == 'channels_last': + padding = ((0, 0), padding[0], padding[1], (0, 0)) + else: + raise ValueError('data_format must be channels_first or channels_last.') + self.pad = Pad(paddings=padding) + + def __call__(self, inputs): + return self.pad(inputs) + + +class ZeroPadding3D(object): + + def __init__(self, padding, data_format): + if data_format == 'channels_first': + padding = ((0, 0), (0, 0), padding[0], padding[1], padding[2]) + elif data_format == 'channels_last': + padding = ((0, 0), padding[0], padding[2], padding[1], (0, 0)) + else: + raise ValueError('data_format must be channels_first or channels_last.') + self.pad = Pad(paddings=padding) + + def __call__(self, inputs): + return self.pad(inputs) + + +class Sign(object): + + def __init__(self): + pass + + def __call__(self, x): + return jt.sign(x) + + +class Ceil(object): + + def __call__(self, x): + return jt.ceil(x) + + +def ceil(x): + return jt.ceil(x) + + +def multiply(x, y): + return jt.multiply(x, y) + + +def divide(x, y): + return jt.divide(x, y) + + +def identity(x): + + raise NotImplementedError + + +class BatchToSpace(object): + + def __init__(self, block_size, crops): + self.bolock_size = block_size + self.crops = crops + + def __call__(self, input_x): + raise NotImplementedError + + +class DepthToSpace(object): + + def __init__(self, block_size, data_format): + self.block_size = block_size + self.data_format = data_format + + def __call__(self, input): + if self.data_format == 'channels_last': + input = nhwc_to_nchw(input) + output = jt.nn.functional.pixel_shuffle(input, upscale_factor=self.block_size) + if self.data_format == 'channels_last': + output = nchw_to_nhwc(output) + return output + +def triu(data, diagonal=0): + + return jt.triu(data, diagonal) + + +def tril(data, diagonal=0): + + return jt.tril(data, diagonal) + + +def abs(x): + return jt.abs(x) + + +def acos(x): + return jt.acos(x) + + +def acosh(x): + return jt.acosh(x) + + +def angle(x): + return jt.angle(x) + + +def argmax(x, axis=None, keepdim=False, dtype='int64'): + return jt.argmax(x, dim=axis, keepdim=keepdim) + + +def argmin(x, axis=None, dtype='int64'): + return jt.argmin(x, dim=axis) + + +def asin(x): + return jt.asin(x) + + +def asinh(x): + return jt.asinh(x) + + +def atan(x): + return jt.atan(x) + + +def atanh(x): + return 
jt.atanh(x) + + +def cos(x): + return jt.cos(x) + + +def cosh(x): + return jt.cosh(x) + + +def count_nonzero(x, axis=None, keepdims=None, dtype="int64"): + return jt.count_nonzero(x, dim=axis) + + +def cumprod(x, axis=0, exclusive=False, reverse=False): + return jt.cumprod(x, dim=axis) + + +def cumsum(x, axis=0, exclusive=False, reverse=False): + return jt.cumsum(x, dim=axis) + + +def equal(x, y): + return jt.equal(x, y) + + +def exp(x): + return jt.exp(x) + + +def floordiv(x, y): + return jt.floor_divide(x, y) + + +def floormod(x, y): + return jt.fmod(x, y) + + +def greater(x, y): + return jt.greater(x, y) + + +def greater_equal(x, y): + return jt.greater_equal(x, y) + + +def is_inf(x): + return jt.isinf(x) + + +def is_nan(x): + return jt.isnan(x) + + +def l2_normalize(x, axis=None, eps=1e-12): + axis = 0 if axis is None else axis + return jt.misc.normalize(x, p=2.0, dim=axis, eps=eps) + + +def less(x, y): + return jt.less(x, y) + + +def less_equal(x, y): + return jt.less_equal(x, y) + + +def log(x): + return jt.log(x) + + +def log_sigmoid(x): + return jt.log(1 / (1 + jt.exp(-x))) + + +def maximum(x, y): + return jt.maximum(x, y) + + +def negative(x): + return jt.negative(x) + + +def not_equal(x, y): + return jt.not_equal(x, y) + + +def pow(x, y): + return jt.pow(x, y) + + +def real(x): + return x + + +def reciprocal(x): + return 1/x + + +def reduce_prod(x, axis=None, keepdims=False): + if axis is not None: + return jt.prod(x, dim=axis, keepdim=keepdims) + else: + return jt.prod(x) + +def reduce_std(x, axis=None, keepdims=False): + if axis is not None: + return jt.std(x, dim=axis, keepdim=keepdims) + else: + return jt.std(x) + +def reduce_sum(x, axis=None, keepdims=False): + if axis is not None: + return jt.sum(x, dim=axis, keepdim=keepdims) + else: + return jt.sum(x) + + +def reduce_variance(x, axis=None, keepdims=False): + if axis is not None: + return jt.var(x, dim=axis, keepdim=keepdims) + else: + return jt.var(x) + + +def round(x): + return jt.round(x) + + +def rsqrt(x): + return jt.rsqrt(x) + + +def segment_max(x, segment_ids): + segment_ids = jt.array(segment_ids, dtype=jt.int64) + num_segments = len(jt.unique(segment_ids)) + return unsorted_segment_max(x, segment_ids, num_segments) + + +def segment_mean(x, segment_ids): + segment_ids = jt.array(segment_ids, dtype=jt.int64) + num_segments = len(jt.unique(segment_ids)) + return unsorted_segment_mean(x, segment_ids, num_segments) + + +def segment_min(x, segment_ids): + segment_ids = jt.array(segment_ids, dtype=jt.int64) + num_segments = len(jt.unique(segment_ids)) + return unsorted_segment_min(x, segment_ids, num_segments) + + +def segment_prod(x, segment_ids): + raise NotImplementedError + + +def segment_sum(x, segment_ids): + segment_ids = jt.array(segment_ids, dtype=jt.int64) + num_segments = len(jt.unique(segment_ids)) + return unsorted_segment_sum(x, segment_ids, num_segments) + + +def sigmoid(x): + return jt.sigmoid(x) + + +def sign(x): + return jt.sign(x) + + +def sin(x): + return jt.sin(x) + + +def sinh(x): + return jt.sinh(x) + + +def softplus(x): + """ + Computes softplus: log(exp(features) + 1). + + Parameters + ---------- + x : tensor + Must be one of the following types: half, bfloat16, float32, float64. + + Returns + ------- + A Tensor. Has the same type as features. 
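+
+    Examples
+    --------
+    A minimal usage sketch (values are illustrative):
+
+    >>> x = jt.array([-1.0, 0.0, 1.0])
+    >>> softplus(x)   # log(1 + exp(x)) element-wise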
+ """ + + # Computes softplus: (1/b) * log(1 + exp(features*b)) ; b=1 + return jt.nn.softplus(x) + + +def square(x): + return jt.square(x) + + +def squared_difference(x, y): + return jt.square(x-y) + + +def subtract(x, y): + return jt.subtract(x, y) + + +def tan(x): + return jt.tan(x) + + +def tanh(x): + """ + Computes hyperbolic tangent of x element-wise. + + Parameters + ---------- + x : tensor + Must be one of the following types: bfloat16, half, float32, float64, complex64, complex128. + + Returns + ------- + A Tensor. Has the same type as x. + """ + + return jt.tanh(x) + + +def any(x, axis=None, keepdims=False): + if axis is not None: + return jt.any(x, dim=axis, keepdim=keepdims) + else: + return jt.any(x) + +def all(x, axis=None, keepdims=False): + if axis is not None: + return jt.all(x, dim=axis, keepdim=keepdims) + else: + return jt.all(x) + + +def logical_and(x, y): + return jt.logical_and(x, y) + + +def logical_or(x, y): + return jt.logical_or(x, y) + + +def logical_not(x): + return jt.logical_not(x) + + +def logical_xor(x, y): + return jt.logical_xor(x, y) + + +def argsort(x, axis=-1, descending=False): + return jt.argsort(x, dim=axis, descending=descending) + + +def bmm(x, y): + return jt.bmm(x, y) + + +def where(condition, x, y): + return jt.where(condition,x, y) + + +def ones_like(x, dtype=None): + return jt.ones_like(x, dtype=dtype) + + +def zeros_like(x, dtype=None): + return jt.zeros_like(x, dtype=dtype) + + +def squeeze(x, axis=None): + return jt.squeeze(x, dim=axis) + + +def unsorted_segment_sum(x, segment_ids, num_segments): + + segment_ids = jt.array(segment_ids, dtype=jt.int64) + assert x.shape[0] == segment_ids.shape[0], "the length of segment_ids should be equal to data.shape[0]." + if len(segment_ids.shape) == 1: + s = jt.prod(jt.array(x.shape[1:])).to(jt.int32) + segment_ids = segment_ids.repeat_interleave(s).view(segment_ids.shape[0], *x.shape[1:]) + + assert x.shape == segment_ids.shape, "data.shape and segment_ids.shape should be equal" + + shape = [num_segments] + list(x.shape[1:]) + tensor = jt.zeros(*shape).to(x.dtype).scatter_add(0, segment_ids, x) + return tensor + + +def unsorted_segment_mean(x, segment_ids, num_segments): + + segment_ids = jt.array(segment_ids, dtype=jt.int64) + assert x.shape[0] == segment_ids.shape[0], "the length of segment_ids should be equal to data.shape[0]." + res = [] + for i in range(num_segments): + mask_index = segment_ids == i + if jt.any(mask_index): + a = jt.mean(x[mask_index], 0) + res.append(a) + else: + a = jt.zeros_like(x[0]) + res.append(a) + if res[0].shape == [1]: + return jt.concat(res, 0) + else: + return jt.stack(res, 0) + +def unsorted_segment_min(x, segment_ids, num_segments): + + segment_ids = jt.array(segment_ids, dtype=jt.int64) + assert x.shape[0] == segment_ids.shape[0], "the length of segment_ids should be equal to data.shape[0]." + res = [] + for i in range(num_segments): + mask_index = segment_ids == i + if jt.any(mask_index): + res.append(jt.min(x[mask_index], 0)[0]) + else: + a = jt.zeros_like(x[0]) + a.fill_(jt.array(float('inf')).to(a.dtype)) + res.append(a) + if res[0].shape == [1]: + return jt.concat(res, 0) + else: + return jt.stack(res, 0) + + +def unsorted_segment_max(x, segment_ids, num_segments): + + segment_ids = jt.array(segment_ids, dtype=jt.int64) + assert x.shape[0] == segment_ids.shape[0], "the length of segment_ids should be equal to data.shape[0]." 
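+    # Reduce per segment: for every segment id, keep the element-wise maximum of
+    # the rows of x assigned to that id; segments that receive no rows are
+    # filled with -inf.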
+ res = [] + for i in range(num_segments): + mask_index = segment_ids == i + if jt.any(mask_index): + res.append(jt.max(x[mask_index], 0)[0]) + else: + a = jt.zeros_like(x[0]) + a.fill_(jt.array(float('-inf')).to(a.dtype)) + res.append(a) + if res[0].shape == [1]: + return jt.concat(res, 0) + else: + return jt.stack(res, 0) + +def set_seed(seed): + + jt.misc.set_global_seed(seed) + +def is_tensor(x): + + return isinstance(x, jt.Tensor) + +def tensor_scatter_nd_update(tensor, indices, updates): + tensor = jt.array(tensor) + indices = jt.array(indices, dtype=jt.long) + updates = jt.array(updates) + indices = jt.flatten(indices) + tensor[indices] = updates + return tensor + +def diag(input, diagonal=0): + + return jt.diag(input, diagonal) + +def mask_select(x, mask, axis = 0): + if axis is None: + axis = 0 + if axis < 0: + axis = len(x.shape) + axis + if x.shape == mask.shape: + return jt.masked_select(x, mask) + if axis == 0: + return x[mask] + elif axis == 1: + return x[:, mask] + elif axis == 2: + return x[:, :, mask] + elif axis == 3: + return x[:,:,:, mask] + +def eye(n, m=None, dtype=None): + if m is None: + m = n + return jt.init.eye((n,m), dtype =dtype) + + +def einsum(equation, *operands): + return jt.linalg.einsum(equation, *operands) + + +class Einsum(object): + def __init__(self, equation): + super(Einsum, self).__init__() + self.equation = equation + + def __call__(self, *args): + return jt.einsum(self.equation, *args) + +def set_device(device = 'GPU', id = 0): + if device == 'GPU': + jt.flags.use_cuda = 1 + +def distributed_init(backend="cncl"): + jt.distributed.init_process_group(backend=backend) + +def distributed_model(module, device_ids=None, output_device=None, + dim=0, broadcast_buffers=True, process_group=None, bucket_cap_mb=25, + find_unused_parameters=False, check_reduction=False, gradient_as_bucket_view=False): + return jt.nn.parallel.DistributedDataParallel(module, device_ids=device_ids, + output_device=output_device, + dim=dim, broadcast_buffers=broadcast_buffers, + process_group=process_group, bucket_cap_mb=bucket_cap_mb, + find_unused_parameters=find_unused_parameters, + check_reduction=check_reduction, + gradient_as_bucket_view=gradient_as_bucket_view) + +def scatter_update(tensor, indices, updates): + tensor = jt.array(tensor) + indices = jt.array(indices, dtype=jt.long) + updates = jt.array(updates) + tensor[indices] = updates + return tensor + +def get_device(): + flag_gpu = jt.flags.use_cuda + if flag_gpu: + id = 0 + device = 'GPU:' + str(id) + else: + device = 'CPU' + + return device + +def to_device(tensor, device='GPU', id=0): + device = device.lower() + if device == 'gpu': + jt.flags.use_cuda = 1 + return tensor + +def roll(input, shifts, dims=None): + + return jt.roll(input, shifts, dims) + + +def logsoftmax(input, dim=None): + + return jt.nn.log_softmax(input, dim) + +def topk(input, k, dim=-1, largest=True, sorted=True): + + return jt.topk(input, k, dim, largest, sorted) + +def numel(input): + + return jt.size(input) + + + +def histogram(input, bins=100, min=0, max=0, name=None): + raise NotImplementedError + + +def flatten(x, start_axis=0, stop_axis=-1, name=None): + raise NotImplementedError + + +def interpolate(x, + size=None, + scale_factor=None, + mode='nearest', + align_corners=False, + align_mode=0, + data_format='NCHW', + name=None): + raise NotImplementedError + + +def index_select(x, index, axis=0, name=None): + raise NotImplementedError + + +def dot(x, y, name=None): + raise NotImplementedError + + +class Swish(object): + def __init__(self): 
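+        # Swish (x * sigmoid(x)) is not wired up for the Jittor backend yet;
+        # __call__ below raises NotImplementedError.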
+ pass + + def __call__(self, x): + raise NotImplementedError + +def expand(x, shape): + + raise NotImplementedError + +def unique(x, return_index=False, return_inverse=False, return_counts=False, axis=None, dtype='int64'): + + raise NotImplementedError + + +def flip(x, axis): + + return jt.flip(x, axis) + + +def mv(x, vec): + + return jt.matmul(x, vec) + From af9a586f9074c251294ada7c50488f303b56753e Mon Sep 17 00:00:00 2001 From: Hisham Date: Wed, 6 Mar 2024 15:31:33 +0800 Subject: [PATCH 02/27] Created Jittor_nn.py File. The file is a copy of oneflow_nn.py. the code has been adjusted to Jittor for the first 565 lines --- tensorlayerx/backend/ops/Jittor_nn.py | 2272 ++++++++++++++++++++ tensorlayerx/backend/ops/jitter_backend.py | 0 2 files changed, 2272 insertions(+) create mode 100644 tensorlayerx/backend/ops/Jittor_nn.py create mode 100644 tensorlayerx/backend/ops/jitter_backend.py diff --git a/tensorlayerx/backend/ops/Jittor_nn.py b/tensorlayerx/backend/ops/Jittor_nn.py new file mode 100644 index 0000000..a579c9a --- /dev/null +++ b/tensorlayerx/backend/ops/Jittor_nn.py @@ -0,0 +1,2272 @@ +#! /usr/bin/python +# -*- coding: utf-8 -*- + + +# Unified nn API for TensorLayerX, using Jittor as backend. +# Similar to file ./mindspore_nn.py and ./oneflow_nn.py + +import jittor as jt +import jittor.nn as nn + + +def padding_format(padding): + """ + Checks that the padding format correspond format. + + Parameters + ---------- + padding : str + Must be one of the following:"same", "SAME", "VALID", "valid" + + Returns + ------- + str "SAME" or "VALID" + """ + + if padding in ["SAME", "same"]: + padding = "same" + elif padding in ["VALID", "valid"]: + padding = "valid" + elif padding == None: + padding = None + elif isinstance(padding, tuple) or isinstance(padding, int): + return padding + else: + raise Exception("Unsupported padding: " + str(padding)) + return padding + +def preprocess_padding(padding, dim='2d'): + check_padding(padding, dim) + if dim == '1d': + out_padding = (0, 0, padding, padding) + elif dim == '2d': + if isinstance(padding, tuple): + out_padding = (padding[0], padding[0], padding[1], padding[1]) + else: + out_padding = padding + elif dim == '3d': + if isinstance(padding, tuple): + out_padding = (padding[0], padding[0], padding[1], padding[1], padding[2], padding[2]) + else: + out_padding = padding + else: + raise RuntimeError("Unsupported input dimensions.") + return out_padding + + + +def check_padding(padding, dim='2d'): + if dim == '1d' and isinstance(object, tuple): + raise RuntimeError("expected padding to be a single integer value or a list of 1 values to match the convolution dimensions.") + if dim == '2d' and isinstance(padding, tuple) and len(padding) > 2: + raise RuntimeError("expected padding to be a single integer value or a list of 2 values to match the convolution dimensions.") + if dim == '3d' and isinstance(padding, tuple) and len(padding) > 3: + raise RuntimeError("expected padding to be a single integer value or a list of 3 values to match the convolution dimensions.") + + +def preprocess_1d_format(data_format, padding): + """ + Checks that the 1-D dataformat format correspond format. 
+ + Parameters + ---------- + data_format : str + Must be one of the following:"channels_last","NWC","NCW","channels_first" + padding : str + Must be one of the following:"same","valid","SAME","VALID" + + Returns + ------- + str "NWC" or "NCW" and "SAME" or "VALID" + """ + + if data_format in ["channels_last", "NWC", "NLC"]: + data_format = "NLC" + elif data_format in ["channels_first", "NCW", "NCL"]: + data_format = "NCL" + elif data_format == None: + data_format = None + else: + raise Exception("Unsupported data format: " + str(data_format)) + padding = padding_format(padding) + return data_format, padding + + +def preprocess_2d_format(data_format, padding): + """ + Checks that the 2-D dataformat format correspond format. + + Parameters + ---------- + data_format : str + Must be one of the following:"channels_last","NHWC","NCHW","channels_first" + padding : str + Must be one of the following:"same","valid","SAME","VALID" + + Returns + ------- + str "NHWC" or "NCHW" and "SAME" or "VALID" + """ + + if data_format in ["channels_last", "NHWC"]: + data_format = "NHWC" + elif data_format in ["channels_first", "NCHW"]: + data_format = "NCHW" + elif data_format == None: + data_format = None + else: + raise Exception("Unsupported data format: " + str(data_format)) + padding = padding_format(padding) + return data_format, padding + + +def preprocess_3d_format(data_format, padding): + """ + Checks that the 3-D dataformat format correspond format. + + Parameters + ---------- + data_format : str + Must be one of the following:"channels_last","NDHWC","NCDHW","channels_first" + padding : str + Must be one of the following:"same","valid","SAME","VALID" + + Returns + ------- + str "NDHWC" or "NCDHW" and "SAME" or "VALID" + """ + + if data_format in ['channels_last', 'NDHWC']: + data_format = 'NDHWC' + elif data_format in ['channels_first', 'NCDHW']: + data_format = 'NCDHW' + elif data_format == None: + data_format = None + else: + raise Exception("Unsupported data format: " + str(data_format)) + padding = padding_format(padding) + return data_format, padding + + +def nchw_to_nhwc(x): + """ + Channels first to channels last + + Parameters + ---------- + x : tensor + channels first tensor data + + Returns + ------- + channels last tensor data + """ + if len(P.Shape()(x)) == 3: + x = P.Transpose()(x, (0, 2, 1)) + elif len(P.Shape()(x)) == 4: + x = P.Transpose()(x, (0, 2, 3, 1)) + elif len(P.Shape()(x)) == 5: + x = P.Transpose()(x, (0, 2, 3, 4, 1)) + # else: + # raise Exception("Unsupported dimensions") + return x + +def nhwc_to_nchw(x): + """ + Channles last to channels first + + Parameters + ---------- + x : tensor + channels last tensor data + + Returns + ------- + channels first tensor data + """ + + if len(P.Shape()(x)) == 3: + x = P.Transpose()(x, (0, 2, 1)) + elif len(P.Shape()(x)) == 4: + x = P.Transpose()(x, (0, 3, 1, 2)) + elif len(P.Shape()(x)) == 5: + x = P.Transpose()(x, (0, 4, 1, 2, 3)) + # else: + # raise Exception("Unsupported dimensions") + return x + +class ReLU(object): + + def __init__(self): + pass + + def __call__(self, x): + return nn.relu(x) + + +def relu(x): + """ + Computes rectified linear: max(features, 0). + + Parameters + ---------- + x : tensor + Must be one of the following types: float32, float64, int32, uint8, int16, + int8, int64, bfloat16, uint16, half, uint32, uint64, qint8. + + Returns + ------- + A Tensor. Has the same type as features. 
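+
+    Examples
+    --------
+    A minimal usage sketch (values are illustrative):
+
+    >>> x = jt.array([-1.0, 0.0, 2.0])
+    >>> relu(x)   # -> [0.0, 0.0, 2.0]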
+ """ + + return nn.relu(x) + + +class ELU(object): + + def __init__(self, alpha=1.0): + self.alpha = alpha + + def __call__(self, x): + return nn.elu(x, alpha=self.alpha) + + +def elu(x, alpha=1.0): + """ + Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise. + + See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs) + ](http://arxiv.org/abs/1511.07289) + + Parameters + ---------- + x : tensor + Must be one of the following types: half, bfloat16, float32, float64. + + Returns + ------- + A Tensor with the same type as features. + """ + + return nn.elu(x, alpha=alpha) + + +class ReLU6(object): + + def __call__(self, x): + return nn.relu6(x) + + +def relu6(x): + """ + Computes Rectified Linear 6: min(max(features, 0), 6). + + Parameters + ---------- + x : tensor + Must be one of the following types: float32, float64, int32, uint8, int16, + int8, int64, bfloat16, uint16, half, uint32, uint64, qint8. + + Returns + ------- + A Tensor with the same type as features. + """ + + return nn.relu6(x) + + +class LeakyReLU(object): +# jittor.nn. leaky_relu ( x , scale = 0.01 ) + def __init__(self, negative_slope=0.01): + self.negative_slope = negative_slope + + def __call__(self, x): + return nn.leaky_relu(x, scale=self.negative_slope) + + +def leaky_relu(x, negative_slope=0.01): + """ + Compute the Leaky ReLU activation function. + + Parameters + ---------- + x : tensor + representing preactivation values. Must be one of the following types: + float16, float32, float64, int32, int64. + + Returns + ------- + The activation value. + """ + + return nn.leaky_relu(x, scale=negative_slope) + + +class Softplus(object): + + def __init__(self): + pass + + def __call__(self, x): + return nn.softplus(x) + + +class Tanh(object): +# jittor.nn.hardtanh(x, min_val=-1, max_val=1) + def __init__(self): + super(Tanh, self).__init__() + self.tanh = nn.Sigmoid() + + def __call__(self, x): + return self.tanh(x) + + +class Sigmoid(object): +# classjittor.nn.Sigmoid + def __init__(self): + super(Sigmoid, self).__init__() + self.sigmoid = nn.Sigmoid() + pass + + def __call__(self, x): + return self.sigmoid(x) + + +def sigmoid(x): + """ + Computes sigmoid of x element-wise. + + Parameters + ---------- + x : tensor + A Tensor with type float16, float32, float64, complex64, or complex128. + + Returns + ------- + A Tensor with the same type as x. + """ + outputs = nn.Sigmoid() + return outputs(x) + + +class Softmax(object): + + def __init__(self, axis=None): + self.axis = axis + + def __call__(self, x): + return nn.softmax(x, dim=self.axis) + + +def softmax(logits, axis=None): + """ + Computes softmax activations. + + Parameters + ---------- + logits : tensor + Must be one of the following types: half, float32, float64. + axis : int + The dimension softmax would be performed on. The default is -1 which indicates the last dimension. + + Returns + ------- + A Tensor. Has the same type and shape as logits. + """ + + return nn.softmax(logits, axis) + + +class GeLU(object): + + def __init__(self): + pass + + def __call__(self, x): + return nn.gelu(x) + + +def gelu(x): + + return nn.gelu(x) + + +class Dropout(object): + + def __init__(self, p=0.5, seed=0 , is_train=False): + self.p = p + self.seed = seed + self.is_train = is_train + def __call__(self, inputs): + return nn.dropout(inputs, p=self.p, is_train=self.is_train) + +def dropout(x, p=0.5, is_train=False): + return nn.dropout(x , p=p, is_train=is_train) + + +class BiasAdd(object): + """ + Adds bias to value. 
+ + Parameters + ---------- + x : tensor + A Tensor with type float, double, int64, int32, uint8, int16, int8, complex64, or complex128. + bias : tensor + Must be the same type as value unless value is a quantized type, + in which case a different quantized type may be used. + Returns + ------- + A Tensor with the same type as value. + """ + + def __init__(self, data_format='channels_last'): + super(BiasAdd, self).__init__() + if data_format in ['channels_first', 'NCL', 'NCW', 'NCHW', 'NCDHW']: + self.data_format = 'channels_first' + elif data_format in ['channels_last', 'NLC', 'NWC', 'NHWC', 'NDHWC']: + self.data_format = 'channels_last' + else: + raise ("Unsupported data format: " + str(data_format)) + + def __call__(self, x, bias): + if len(x.shape) > 2 and self.data_format == 'channels_first': + x = nchw_to_nhwc(x) + outputs = x + bias + if len(x.shape) > 2 and self.data_format == 'channels_first': + outputs = nhwc_to_nchw(outputs) + return outputs + + +def bias_add(x, bias, data_format=None): + """ + Adds bias to value. + + Parameters + ---------- + x : tensor + A Tensor with type float, double, int64, int32, uint8, int16, int8, complex64, or complex128. + bias : tensor + Must be the same type as value unless value is a quantized type, + in which case a different quantized type may be used. + data_format : A string. + 'N...C' and 'NC...' are supported. + name : str + A name for the operation (optional). + Returns + ------- + A Tensor with the same type as value. + """ + + add_obj = BiasAdd(data_format=data_format) + return add_obj(x, bias) + + +def same_padding(input, weight, strides, dilations): + # H(in) + 2* padding[0] - dilation[0] * (Ksize[0] - 1) - 1 + # H(out) = = floor( -------------------------------------------------------------- + 1 ) + # stride[0] + if isinstance(weight, flow.Tensor): + if len(input.shape) == 3: + filter_rows = weight.size(2) + if len(input.shape) == 4: + filter_rows = weight.size(2) + filter_cols = weight.size(3) + elif len(input.shape) == 5: + filter_rows = weight.size(2) + filter_cols = weight.size(3) + filter_depth = weight.size(4) + else: + if len(input.shape) == 3: + filter_rows = weight[0] + elif len(input.shape) == 4: + filter_rows = weight[0] + filter_cols = weight[1] + elif len(input.shape) == 5: + filter_rows = weight[0] + filter_cols = weight[1] + filter_depth = weight[2] + + if len(input.shape) == 3: + input_rows = input.size(2) + out_rows = (input_rows + strides - 1) // strides + padding_rows = max(0, (out_rows - 1) * strides + (filter_rows - 1) * dilations + 1 - input_rows) + rows_odd = (padding_rows % 2 != 0) + return rows_odd, padding_rows + + if len(input.shape) == 4: + input_rows = input.size(2) + input_cols = input.size(3) + + # filter_rows = weight.size(2) + # filter_cols = weight.size(3) + + out_rows = (input_rows + strides[0] - 1) // strides[0] + out_cols = (input_cols + strides[1] - 1) // strides[1] + + padding_rows = max(0, (out_rows - 1) * strides[0] + (filter_rows - 1) * dilations[0] + 1 - input_rows) + padding_cols = max(0, (out_cols - 1) * strides[1] + (filter_cols - 1) * dilations[1] + 1 - input_cols) + + rows_odd = (padding_rows % 2 != 0) + cols_odd = (padding_cols % 2 != 0) + return rows_odd, cols_odd, padding_rows, padding_cols + + if len(input.shape) == 5: + input_rows = input.size(2) + input_cols = input.size(3) + input_depth = input.size(4) + + # filter_rows = weight.size(2) + # filter_cols = weight.size(3) + # filter_depth = weight.size(4) + + out_rows = (input_rows + strides[0] - 1) // strides[0] + out_cols = 
(input_cols + strides[1] - 1) // strides[1] + out_depth = (input_depth + strides[2] - 1) // strides[2] + + padding_rows = max(0, (out_rows - 1) * strides[0] + (filter_rows - 1) * dilations[0] + 1 - input_rows) + padding_cols = max(0, (out_cols - 1) * strides[1] + (filter_cols - 1) * dilations[1] + 1 - input_cols) + padding_depth = max(0, (out_depth - 1) * strides[2] + (filter_depth - 1) * dilations[2] + 1 - input_depth) + + rows_odd = (padding_rows % 2 != 0) + cols_odd = (padding_cols % 2 != 0) + depth_odd = (padding_depth % 2 != 0) + return rows_odd, cols_odd, depth_odd, padding_rows, padding_cols, padding_depth + + +class Conv1D(object): + + def __init__(self, stride, padding, data_format='NWC', dilations=None, out_channel=None, k_size=None, groups=1): + self.stride = stride + self.dilations = dilations + self.groups = groups + self.data_format, self.padding = preprocess_1d_format(data_format, padding) + + def __call__(self, input, filters): + if self.data_format == 'NLC': + input = nhwc_to_nchw(input) + if self.padding == 'same': + out = self.conv1d_same_padding(input, filters) + else: + out = F.conv1d(input, filters, stride=self.stride, padding=self.padding, + dilation=self.dilations, groups=self.groups) + if self.data_format == 'NLC': + out = nchw_to_nhwc(out) + + return out + + def conv1d_same_padding(self, input, filters): + rows_odd, padding_rows = same_padding(input, filters, self.stride, 1) + if rows_odd: + input = F.pad(input, [0, int(rows_odd)], 'replicate') + return F.conv1d(input, filters, stride=self.stride, padding=(padding_rows // 2), groups=self.groups) + + +def conv1d(input, filters, stride, padding, data_format='NWC', dilations=None): + """ + Computes a 1-D convolution given 3-D input and filter tensors. + + Parameters + ---------- + input : tensor + A 3D Tensor. Must be of type float16, float32, or float64 + filters : tensor + A 3D Tensor. Must have the same type as input. + stride : int of list + An int or list of ints that has length 1 or 3. The number of entries by which the filter is moved right at each step. + padding : string + 'SAME' or 'VALID' + data_format : string + An optional string from "NWC", "NCW". Defaults to "NWC", the data is stored in the order of + [batch, in_width, in_channels]. The "NCW" format stores data as [batch, in_channels, in_width]. + dilations : int or list + An int or list of ints that has length 1 or 3 which defaults to 1. + The dilation factor for each dimension of input. If set to k > 1, + there will be k-1 skipped cells between each filter element on that dimension. + Dilations in the batch and depth dimensions must be 1. + name : string + A name for the operation (optional). + Returns + ------- + A Tensor. Has the same type as input. 
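+
+    Examples
+    --------
+    Intended call pattern, shown as a shape-only sketch (values are
+    illustrative; a torch-style [out_channels, in_channels, kernel_size]
+    filter layout is assumed, since that is what the wrapper forwards to the
+    underlying convolution):
+
+    >>> x = jt.randn(8, 32, 16)    # NWC: batch, width, channels
+    >>> w = jt.randn(64, 16, 3)    # 64 filters of width 3 over 16 channels
+    >>> y = conv1d(x, w, stride=1, padding='SAME')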
+ """ + + return Conv1D(stride=stride, padding=padding, data_format=data_format, dilations=dilations)(input, filters) + + +class Conv2D(object): + + def __init__(self, strides, padding, data_format='NHWC', dilations=None, out_channel=None, k_size=None, groups=1): + self.data_format, self.padding = preprocess_2d_format(data_format, padding) + if self.data_format is 'NHWC': + self.strides = (strides[1], strides[2]) + self.dilations = (dilations[1], dilations[2]) + elif self.data_format is 'NCHW': + self.strides = (strides[2], strides[3]) + self.dilations = (dilations[2], dilations[3]) + self.groups = groups + + def __call__(self, input, filters): + if self.data_format == 'NHWC': + input = nhwc_to_nchw(input) + + if self.padding == 'same': + output = self.conv2d_same_padding(input, filters) + else: + output = F.conv2d(input, filters, stride=self.strides, padding=self.padding, + dilation=self.dilations, groups=self.groups) + + if self.data_format == 'NHWC': + output = nchw_to_nhwc(output) + return output + + def conv2d_same_padding(self, input, weight, bias=None): + rows_odd, cols_odd, padding_rows, padding_cols = same_padding(input, weight, self.strides, self.dilations) + if rows_odd or cols_odd: + input = F.pad(input, [0, int(cols_odd), 0, int(rows_odd)]) + + return F.conv2d( + input, weight, bias, self.strides, padding=(padding_rows // 2, padding_cols // 2), dilation=self.dilations, + groups=self.groups + ) + + +def conv2d(input, filters, strides, padding, data_format='NHWC', dilations=None): + """ + Computes a 2-D convolution given 4-D input and filters tensors. + + Parameters + ---------- + input : tensor + Must be one of the following types: half, bfloat16, float32, float64. A 4-D tensor. + The dimension order is interpreted according to the value of data_format, see below for details. + filters : tensor + Must have the same type as input. A 4-D tensor of shape [filter_height, filter_width, in_channels, out_channels] + strides : int of list + The stride of the sliding window for each dimension of input. If a single value is given it is replicated in the H and W dimension. + By default the N and C dimensions are set to 1. The dimension order is determined by the value of data_format, see below for details. + padding : string + "SAME" or "VALID" + data_format : string + "NHWC", "NCHW". Defaults to "NHWC". + dilations : list or ints + list of ints that has length 1, 2 or 4, defaults to 1. The dilation factor for each dimension ofinput. + name : string + A name for the operation (optional). + + Returns + ------- + A Tensor. Has the same type as input. 
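+
+    Examples
+    ---------
+    A minimal sketch that mirrors the documented interface. Note the docstring above
+    describes the TensorFlow filter layout, while the filters are forwarded unchanged
+    to a channels-first convolution, so the layout shown here is an assumption.
+
+    >>> import jittor as jt
+    >>> x = jt.randn(4, 32, 32, 16)      # NHWC input
+    >>> filters = jt.randn(8, 16, 3, 3)  # assumed [out_channels, in_channels, k_h, k_w]
+    >>> y = conv2d(x, filters, strides=(1, 1, 1, 1), padding='SAME', dilations=(1, 1, 1, 1))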
+ """ + + if data_format == 'NHWC': + input = nhwc_to_nchw(input) + + output = F.conv2d(input, filters, stride=strides, padding=padding, dilation=dilations) + + if data_format == 'NHWC': + output = nchw_to_nhwc(output) + return output + + +class Conv3D(object): + + def __init__(self, strides, padding, data_format='NDHWC', dilations=None, out_channel=None, k_size=None): + self.data_format, self.padding = preprocess_3d_format(data_format, padding) + if self.data_format is 'NDHWC': + self._strides = (strides[1], strides[2], strides[3]) + self._dilations = (dilations[1], dilations[2], dilations[3]) + elif self.data_format is 'NCDHW': + self._strides = (strides[2], strides[3], strides[4]) + self._dilations = (dilations[2], dilations[3], dilations[4]) + + def __call__(self, input, filters): + if self.data_format == 'NDHWC': + input = nhwc_to_nchw(input) + + if self.padding == 'same': + out = self.conv3d_same_padding(input, weight=filters) + else: + out = F.conv3d(input, weight=filters, stride=self._strides, padding=self.padding, dilation=self._dilations) + + if self.data_format == 'NDHWC': + out = nchw_to_nhwc(out) + + return out + + def conv3d_same_padding(self, input, weight, bias=None, groups=1): + rows_odd, cols_odd, depth_odd, padding_rows, padding_cols, padding_depth = same_padding(input, weight, + self._strides, self._dilations) + if rows_odd or cols_odd or depth_odd: + input = F.pad(input, [0, int(cols_odd), 0, int(rows_odd), 0, int(depth_odd)]) + + return F.conv3d( + input, weight, bias, self._strides, padding=(padding_rows // 2, padding_cols // 2, padding_depth // 2), + dilation=self._dilations, groups=groups + ) + + +def conv3d(input, filters, strides, padding, data_format='NDHWC', dilations=None): + """ + Computes a 3-D convolution given 5-D input and filters tensors. + + Parameters + ---------- + input : tensor + Must be one of the following types: half, bfloat16, float32, float64. + Shape [batch, in_depth, in_height, in_width, in_channels]. + filters : tensor + Must have the same type as input. Shape [filter_depth, filter_height, filter_width, in_channels, out_channels]. + in_channels must match between input and filters. + strides : list of ints + A list of ints that has length >= 5. 1-D tensor of length 5. + The stride of the sliding window for each dimension of input. + Must have strides[0] = strides[4] = 1. + padding : string + A string from: "SAME", "VALID". The type of padding algorithm to use. + data_format : string + An optional string from: "NDHWC", "NCDHW". Defaults to "NDHWC". The data format of the input and output data. + With the default format "NDHWC", the data is stored in the order of: [batch, in_depth, in_height, in_width, in_channels]. + Alternatively, the format could be "NCDHW", the data storage order is: [batch, in_channels, in_depth, in_height, in_width]. + dilations : list of ints + Defaults to [1, 1, 1, 1, 1]. 1-D tensor of length 5. The dilation factor for each dimension of input. + If set to k > 1, there will be k-1 skipped cells between each filter element on that dimension. + The dimension order is determined by the value of data_format, see above for details. + Dilations in the batch and depth dimensions must be 1. + name : string + A name for the operation (optional). + + Returns + ------- + A Tensor. Has the same type as input. 
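+
+    Examples
+    ---------
+    A minimal sketch that mirrors the documented interface; the shapes and the
+    channels-first filter layout are assumptions used only for illustration.
+
+    >>> import jittor as jt
+    >>> x = jt.randn(2, 8, 32, 32, 16)       # NDHWC input
+    >>> filters = jt.randn(8, 16, 3, 3, 3)   # assumed [out_channels, in_channels, k_d, k_h, k_w]
+    >>> y = conv3d(x, filters, strides=(1, 1, 1, 1, 1), padding='SAME', dilations=(1, 1, 1, 1, 1))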
+ """ + + return Conv3D(strides=strides, padding=padding, data_format=data_format, dilations=dilations)(input, filters) + + +def local_response_norm(input: flow.Tensor, size: int, alpha: float = 1e-4, beta: float = 0.75, k: float = 1.0) -> flow.Tensor: + r"""Applies local response normalization over an input signal composed of + several input planes, where channels occupy the second dimension. + Applies normalization across channels. + + reference from torch.nn.LocalResponseNorm + """ + dim = input.dim() + if dim < 3: + raise ValueError( + "Expected 3D or higher dimensionality \ + input (got {} dimensions)".format( + dim + ) + ) + if input.numel() == 0: + return input + + div = input.mul(input).unsqueeze(1) + if dim == 3: + div = F.pad(div, (0, 0, size // 2, (size - 1) // 2)) + div = F.avg_pool2d(div, (size, 1), stride=1).squeeze(1) + else: + sizes = input.size() + div = div.view(sizes[0], 1, sizes[1], sizes[2], -1) + div = F.pad(div, (0, 0, 0, 0, size // 2, (size - 1) // 2)) + div = F.avg_pool3d(div, (size, 1, 1), stride=1).squeeze(1) + div = div.view(sizes) + div = div.mul(alpha).add(k).pow(beta) + return input / div + + +def lrn(inputs, depth_radius, bias, alpha, beta): + """ + Local Response Normalization. + + Parameters + ---------- + inputs : tensor + Must be one of the following types: half, bfloat16, float32. 4-D. + depth_radius : int + Defaults to 5. 0-D. Half-width of the 1-D normalization window. + bias : float + Defaults to 1. An offset (usually positive to avoid dividing by 0). + alpha : float + Defaults to 1. A scale factor, usually positive. + beta : float + Defaults to 0.5. An exponent. + + Returns + ------- + A Tensor. Has the same type as input. + """ + + return local_response_norm(inputs, depth_radius, alpha, beta, bias) + + +def moments(x, axes, shift=None, keepdims=False): + """ + Calculates the mean and variance of x. + + Parameters + ---------- + x : tensor + A Tensor + axes : list or ints + Axes along which to compute mean and variance. + shift : int + Not used in the current implementation. + keepdims : bool + produce moments with the same dimensionality as the input. + + Returns + ------- + Two Tensor objects: mean and variance. 
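+
+    Examples
+    ---------
+    This wrapper is currently a stub (it raises NotImplementedError). A rough sketch
+    of the equivalent computation with plain Jittor ops, here reducing over axis 0:
+
+    >>> import jittor as jt
+    >>> x = jt.randn(4, 10)
+    >>> mean = x.mean(0)
+    >>> variance = ((x - mean) ** 2).mean(0)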
+ """ + + raise NotImplementedError + + +class MaxPool1d(object): + + def __init__(self, ksize, strides, padding, return_mask, data_format=None): + self.data_format, self.padding = preprocess_1d_format(data_format=data_format, padding=padding) + self.return_mask = return_mask + self.max_pool1d = MaxPool([ksize, ], strides, padding, data_format) + + def __call__(self, inputs): + return self.max_pool1d(inputs) + + +class MaxPool(object): + + def __init__(self, ksize, strides, padding, return_mask, data_format=None): + self.ksize = ksize + self.strides = strides + self.return_mask = return_mask + if data_format in ['channels_last', 'NLC', 'NWC', 'NHWC', 'NDHWC']: + self.data_format = 'channels_last' + elif data_format in ['channels_first', 'NCL', 'NCW', 'NCHW', 'NCDHW']: + self.data_format = 'channels_first' + self.padding = padding + if self.padding in ['VALID', 'valid']: + self.padding = 0 + + def __call__(self, inputs): + if self.data_format == 'channels_last': + inputs = nhwc_to_nchw(inputs) + if len(inputs.shape) == 2 or len(inputs.shape) == 3: + if self.padding in ['SAME', 'same']: + out = self.maxpool1d_same_padding(inputs) + else: + out = F.max_pool1d(inputs, self.ksize, self.strides, padding=self.padding) + if len(inputs.shape) == 4: + if self.padding in ['SAME', 'same']: + out = self.maxpool2d_same_padding(inputs) + else: + out = F.max_pool2d(inputs, self.ksize, self.strides, padding=self.padding) + if len(inputs.shape) == 5: + if self.padding in ['SAME', 'same']: + out = self.maxpool3d_same_padding(inputs) + else: + out = F.max_pool3d(inputs, self.ksize, self.strides, padding=self.padding) + + if self.data_format == 'channels_last': + return nchw_to_nhwc(out) + else: + return out + + def maxpool1d_same_padding(self, input): + rows_odd, padding_rows = same_padding(input, self.ksize, self.strides, 1) + if rows_odd: + input = F.pad(input, [0, int(rows_odd)], 'constant', float('-inf')) + return F.max_pool1d(input, self.ksize, self.strides, padding=(padding_rows // 2)) + + def maxpool2d_same_padding(self, input): + rows_odd, cols_odd, padding_rows, padding_cols = same_padding(input, self.ksize, self.strides, (1, 1)) + if rows_odd or cols_odd: + # TODO The fill value for maxpool is -INF. + input = F.pad(input, [0, int(rows_odd), 0, int(cols_odd)], 'constant', float('-inf')) + + return F.max_pool2d(input, self.ksize, self.strides, padding=(padding_rows // 2, padding_cols // 2)) + + def maxpool3d_same_padding(self, input): + rows_odd, cols_odd, depth_odd, padding_rows, padding_cols, padding_depth = same_padding( + input, self.ksize, self.strides, (1, 1, 1) + ) + if rows_odd or cols_odd or depth_odd: + input = F.pad(input, [0, int(cols_odd), 0, int(rows_odd), 0, int(depth_odd)], 'constant', float('-inf')) + return F.max_pool3d( + input, self.ksize, self.strides, padding=(padding_rows // 2, padding_cols // 2, padding_depth // 2) + ) + + +def max_pool(input, ksize, strides, padding, data_format=None): + """ + Performs the max pooling on the input. + + Parameters + ---------- + input : tensor + Tensor of rank N+2, of shape [batch_size] + input_spatial_shape + [num_channels] if data_format does not start + with "NC" (default), or [batch_size, num_channels] + input_spatial_shape if data_format starts with "NC". + Pooling happens over the spatial dimensions only. + ksize : int or list of ints + An int or list of ints that has length 1, N or N+2. + The size of the window for each dimension of the input tensor. + strides : int or list of ints + An int or list of ints that has length 1, N or N+2. 
+ The stride of the sliding window for each dimension of the input tensor. + padding : string + 'VALID' or 'SAME'. The padding algorithm. See the "returns" section of tf.ops.convolution for details. + name : string + A name for the operation (optional). + + Returns + ------- + A Tensor of format specified by data_format. The max pooled output tensor. + """ + + maxpool_obj = MaxPool(ksize, strides, padding, data_format) + return maxpool_obj(input) + + +def max_pool1d(input, kernel_size, stride=None, padding=0, return_mask=False, data_format='NCL'): + raise NotImplementedError + + +def max_pool2d(input, kernel_size, stride=None, padding=0, return_mask=False, data_format='NCHW'): + raise NotImplementedError + + +def max_pool3d(input, kernel_size, stride=None, padding=0, return_mask=False, data_format="NCDHW"): + raise NotImplementedError + + + +class AvgPool1d(object): + + def __init__(self, ksize, strides, padding, data_format=None): + self.data_format, self.padding = preprocess_1d_format(data_format=data_format, padding=padding) + self.avg_poo1d = AvgPool([ksize, ], strides, padding, data_format) + + def __call__(self, inputs): + return self.avg_poo1d(inputs) + + +class AvgPool(object): + + def __init__(self, ksize, strides, padding, data_format=None): + self.ksize = ksize + self.strides = strides + if data_format in ['channels_last', 'NLC', 'NWC', 'NHWC', 'NDHWC']: + self.data_format = 'channels_last' + elif data_format in ['channels_first', 'NCL', 'NCW', 'NCHW', 'NCDHW']: + self.data_format = 'channels_first' + self.padding = padding + if self.padding in ['VALID', 'valid']: + self.padding = 0 + + def __call__(self, inputs): + if self.data_format == 'channels_last': + inputs = nhwc_to_nchw(inputs) + if len(inputs.shape) == 2 or len(inputs.shape) == 3: + if self.padding in ['SAME', 'same']: + out = self.avgpool1d_same_padding(inputs) + else: + out = F.avg_pool1d(inputs, self.ksize, self.strides, padding=self.padding) + if len(inputs.shape) == 4: + if self.padding in ['SAME', 'same']: + out = self.avgpool2d_same_padding(inputs) + else: + out = F.avg_pool2d(inputs, self.ksize, self.strides, padding=self.padding) + if len(inputs.shape) == 5: + if self.padding in ['SAME', 'same']: + out = self.avgpool3d_same_padding(inputs) + else: + out = F.avg_pool3d(inputs, self.ksize, self.strides, padding=self.padding) + + if self.data_format == 'channels_last': + return nchw_to_nhwc(out) + else: + return out + + def avgpool1d_same_padding(self, input): + rows_odd, padding_rows = same_padding(input, self.ksize, self.strides, 1) + if rows_odd: + input = F.pad(input, [0, int(rows_odd)], 'replicate') + return F.avg_pool1d(input, self.ksize, self.strides, padding=(padding_rows // 2)) + + def avgpool2d_same_padding(self, input): + rows_odd, cols_odd, padding_rows, padding_cols = same_padding(input, self.ksize, self.strides, (1, 1)) + if rows_odd or cols_odd: + # TODO The fill value for maxpool is -INF. 
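+            # Note: the TODO above was copied from the max-pool path; for average
+            # pooling, the same-padding fill below uses 'replicate' mode rather
+            # than a -INF constant.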
+ input = F.pad(input, [0, int(rows_odd), 0, int(cols_odd)], mode='replicate') + + return F.avg_pool2d(input, self.ksize, self.strides, padding=(padding_rows // 2, padding_cols // 2)) + + def avgpool3d_same_padding(self, input): + rows_odd, cols_odd, depth_odd, padding_rows, padding_cols, padding_depth = same_padding( + input, self.ksize, self.strides, (1, 1, 1) + ) + if rows_odd or cols_odd or depth_odd: + input = F.pad(input, [0, int(cols_odd), 0, int(rows_odd), 0, int(depth_odd)], mode='replicate') + return F.avg_pool3d( + input, self.ksize, self.strides, padding=(padding_rows // 2, padding_cols // 2, padding_depth // 2) + ) + + +def avg_pool(input, ksize, strides, padding): + """ + Performs the avg pooling on the input. + + Parameters + ---------- + input : tensor + Tensor of rank N+2, of shape [batch_size] + input_spatial_shape + [num_channels] + if data_format does not start with "NC" (default), or [batch_size, num_channels] + input_spatial_shape + if data_format starts with "NC". Pooling happens over the spatial dimensions only. + ksize : int or list of ints + An int or list of ints that has length 1, N or N+2. + The size of the window for each dimension of the input tensor. + strides : int or list of ints + An int or list of ints that has length 1, N or N+2. + The stride of the sliding window for each dimension of the input tensor. + padding : string + 'VALID' or 'SAME'. The padding algorithm. See the "returns" section of tf.ops.convolution for details. + name : string + Optional name for the operation. + + Returns + ------- + A Tensor of format specified by data_format. The average pooled output tensor. + """ + + avg_pool_obj = AvgPool(ksize, strides, padding) + return avg_pool_obj(input) + + +class MaxPool3d(object): + + def __init__(self, ksize, strides, padding, return_mask, data_format=None): + self.data_format, self.padding = preprocess_3d_format(data_format, padding) + self.return_mask = return_mask + self.max_pool3d = MaxPool(ksize, strides, padding, data_format) + + def __call__(self, inputs): + return self.max_pool3d(inputs) + + +# def max_pool3d(input, ksize, strides, padding, data_format=None): +# """ +# Performs the max pooling on the input. +# +# Parameters +# ---------- +# input : tensor +# A 5-D Tensor of the format specified by data_format. +# ksize : int or list of ints +# An int or list of ints that has length 1, 3 or 5. +# The size of the window for each dimension of the input tensor. +# strides : int or list of ints +# An int or list of ints that has length 1, 3 or 5. +# The stride of the sliding window for each dimension of the input tensor. +# padding : string +# 'VALID' or 'SAME'. The padding algorithm. See the "returns" section of tf.ops.convolution for details. +# data_format : string +# "NDHWC", "NCDHW". Defaults to "NDHWC". The data format of the input and output data. +# With the default format "NDHWC", the data is stored in the order of: [batch, in_depth, in_height, in_width, in_channels]. +# Alternatively, the format could be "NCDHW", the data storage order is: [batch, in_channels, in_depth, in_height, in_width]. +# name : string +# A name for the operation (optional). +# +# Returns +# ------- +# A Tensor of format specified by data_format. The max pooled output tensor. 
+# """ +# +# data_format, padding = preprocess_3d_format(data_format, padding) +# max_pool3d_obj = MaxPool(ksize, strides, padding, data_format) +# return max_pool3d_obj(input) + + +class AvgPool3d(object): + + def __init__(self, ksize, strides, padding, data_format=None): + self.data_format, self.padding = preprocess_3d_format(data_format, padding) + self.avg_pool3d_obj = AvgPool(ksize, strides, self.padding, self.data_format) + + def __call__(self, inputs): + return self.avg_pool3d_obj(inputs) + + +def avg_pool1d(input, kernel_size, stride=None, padding=0, data_format='NCL'): + + raise NotImplementedError + + +def avg_pool2d(input, kernel_size, stride=None, padding=0, data_format='NCHW'): + + raise NotImplementedError + + +def avg_pool3d(input, kernel_size, stride=None, padding=0, data_format='NCDHW'): + + raise NotImplementedError + + +def pool(input, window_shape, pooling_type, strides=None, padding='VALID', data_format=None, dilations=None, name=None): + """ + Performs an N-D pooling operation. + + Parameters + ---------- + input : tensor + Tensor of rank N+2, of shape [batch_size] + input_spatial_shape + [num_channels] + if data_format does not start with "NC" (default), or [batch_size, num_channels] + input_spatial_shape + if data_format starts with "NC". Pooling happens over the spatial dimensions only. + window_shape : int + Sequence of N ints >= 1. + pooling_type : string + Specifies pooling operation, must be "AVG" or "MAX". + strides : ints + Sequence of N ints >= 1. Defaults to [1]*N. If any value of strides is > 1, then all values of dilation_rate must be 1. + padding : string + The padding algorithm, must be "SAME" or "VALID". Defaults to "SAME". + See the "returns" section of tf.ops.convolution for details. + data_format : string + Specifies whether the channel dimension of the input and output is the last dimension (default, or if data_format does not start with "NC"), + or the second dimension (if data_format starts with "NC"). + For N=1, the valid values are "NWC" (default) and "NCW". For N=2, the valid values are "NHWC" (default) and "NCHW". + For N=3, the valid values are "NDHWC" (default) and "NCDHW". + dilations : list of ints + Dilation rate. List of N ints >= 1. Defaults to [1]*N. If any value of dilation_rate is > 1, then all values of strides must be 1. + name : string + Optional. Name of the op. 
+ + Returns + ------- + Tensor of rank N+2, of shape [batch_size] + output_spatial_shape + [num_channels] + """ + + if pooling_type in ["MAX", "max"]: + pool_obj = MaxPool(window_shape, strides, padding, data_format) + elif pooling_type in ["AVG", "avg"]: + pool_obj = AvgPool(window_shape, strides, padding, data_format) + else: + raise ValueError('Unsupported pool_mode: ' + str(pooling_type)) + + return pool_obj(input) + + +class DepthwiseConv2d(object): + + def __init__(self, strides, padding, data_format=None, dilations=None, ksize=None, channel_multiplier=1, in_channels=None): + self.data_format, self.padding = preprocess_2d_format(data_format, padding) + if self.data_format is 'NHWC': + self.strides = (1, strides[0], strides[1], 1) + self.dilations = (1, dilations[0], dilations[1], 1) + elif self.data_format is 'NCHW': + self.strides = (1, 1, strides[0], strides[1]) + self.dilations = (1, 1, dilations[0], dilations[1]) + self.depthwise = Conv2D(padding=self.padding, strides=self.strides, data_format=self.data_format, + dilations=self.dilations, groups=in_channels) + self.pointwise = Conv2D(strides=(1, 1, 1, 1), padding=self.padding, + data_format=self.data_format, dilations=self.dilations, k_size=1) + + def __call__(self, input, filter, point_filter=None): + depthwise_conv = self.depthwise(input, filter) + pointwise_conv = self.pointwise(depthwise_conv, point_filter) + + return pointwise_conv + + +def depthwise_conv2d(input, filter, strides, padding, data_format=None, dilations=None, name=None): + """ + Depthwise 2-D convolution. + + Parameters + ---------- + input : tensor + 4-D with shape according to data_format. + filter : tensor + 4-D with shape [filter_height, filter_width, in_channels, channel_multiplier]. + strides : list + 1-D of size 4. The stride of the sliding window for each dimension of input. + padding : string + 'VALID' or 'SAME'. The padding algorithm. See the "returns" section of tf.ops.convolution for details. + data_format : string + The data format for input. Either "NHWC" (default) or "NCHW". + dilations : list + 1-D of size 2. The dilation rate in which we sample input values across the height and width dimensions in atrous convolution. + If it is greater than 1, then all values of strides must be 1. + name : string + A name for this operation (optional). + + Returns + ------- + A 4-D Tensor with shape according to data_format. + E.g., for "NHWC" format, shape is [batch, out_height, out_width, in_channels * channel_multiplier]. 
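+
+    Examples
+    ---------
+    Depthwise convolution is a grouped convolution with groups equal to the number
+    of input channels. A rough, module-based sketch with plain Jittor (the module
+    form and the shapes are assumptions used only for illustration):
+
+    >>> import jittor as jt
+    >>> from jittor import nn
+    >>> x = jt.randn(1, 8, 32, 32)                         # NCHW input
+    >>> depthwise = nn.Conv2d(8, 8, 3, padding=1, groups=8)
+    >>> y = depthwise(x)                                   # one 3x3 filter per input channel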
+ """ + + depthwise_conv2d_obj = DepthwiseConv2d(strides, padding, data_format, dilations) + return depthwise_conv2d_obj(input, filter) + + +def same_padding_deconvolution(input, weight, strides, dilations): + # H(out) = floor((H(in) - 1)*stride[0] - 2* padding[0] + dilation[0] * (ksize[0]-1) + 1) + + if isinstance(weight, flow.Tensor): + if len(input.shape) == 3: + filter_rows = weight.size(2) + if len(input.shape) == 4: + filter_rows = weight.size(2) + filter_cols = weight.size(3) + elif len(input.shape) == 5: + filter_rows = weight.size(2) + filter_cols = weight.size(3) + filter_depth = weight.size(4) + else: + if len(input.shape) == 3: + filter_rows = weight[0] + elif len(input.shape) == 4: + filter_rows = weight[0] + filter_cols = weight[1] + elif len(input.shape) == 5: + filter_rows = weight[0] + filter_cols = weight[1] + filter_depth = weight[2] + + if len(input.shape) == 3: + input_rows = input.size(2) + out_rows = input_rows * strides - strides + 1 + padding_rows = max(0, (input_rows - 1) * strides + (filter_rows - 1) * dilations + 1 - out_rows) + rows_odd = (padding_rows % 2 != 0) + return rows_odd, padding_rows + + if len(input.shape) == 4: + input_rows = input.size(2) + input_cols = input.size(3) + + out_rows = input_rows * strides[0] - strides[0] + 1 + out_cols = input_rows * strides[1] - strides[1] + 1 + + padding_rows = max(0, (input_rows - 1) * strides[0] + (filter_rows - 1) * dilations[0] + 1 - out_rows) + padding_cols = max(0, (input_cols - 1) * strides[1] + (filter_cols - 1) * dilations[1] + 1 - out_cols) + + rows_odd = (padding_rows % 2 != 0) + cols_odd = (padding_cols % 2 != 0) + return rows_odd, cols_odd, padding_rows, padding_cols + + if len(input.shape) == 5: + input_rows = input.size(2) + input_cols = input.size(3) + input_depth = input.size(4) + + out_rows = input_rows * strides[0] - strides[0] + 1 + out_cols = input_rows * strides[1] - strides[1] + 1 + out_depth = input_rows * strides[2] - strides[2] + 1 + + padding_rows = max(0, (input_rows - 1) * strides[0] + (filter_rows - 1) * dilations[0] + 1 - out_rows) + padding_cols = max(0, (input_cols - 1) * strides[1] + (filter_cols - 1) * dilations[1] + 1 - out_cols) + padding_depth = max(0, (input_depth - 1) * strides[2] + (filter_depth - 1) * dilations[2] + 1 - out_depth) + + rows_odd = (padding_rows % 2 != 0) + cols_odd = (padding_cols % 2 != 0) + depth_odd = (padding_depth % 2 != 0) + return rows_odd, cols_odd, depth_odd, padding_rows, padding_cols, padding_depth + + +class Conv1d_transpose(object): + + def __init__( + self, stride, padding, data_format='NWC', dilations=None, out_channel=None, k_size=None, in_channels=None + ): + self.stride = stride + self.dilations = dilations + self.data_format, self.padding = preprocess_1d_format(data_format, padding) + + def __call__(self, input, filters): + if self.data_format == 'NLC': + input = nhwc_to_nchw(input) + if self.padding == 'same': + out = self.conv1d_transpose_same_padding(input, filters) + else: + out = F.conv_transpose1d( + input, + weight=filters, + padding=(0 if isinstance(self.padding, str) else self.padding), + stride=self.stride, + dilation=self.dilations + ) + if self.data_format == 'NLC': + out = nchw_to_nhwc(out) + return out + + def conv1d_transpose_same_padding(self, input, filters): + rows_odd, padding_rows = same_padding_deconvolution(input, filters, self.stride, 1) + if rows_odd: + input = F.pad(input, [0, int(rows_odd)]) + out_padding = 0 + else: + out_padding = 1 + return F.conv_transpose1d(input, weight=filters, padding=(padding_rows // 2), 
stride=self.stride, + dilation=self.dilations, output_padding=out_padding) + + +def conv1d_transpose( + input, filters, output_shape, strides, padding='SAME', data_format='NWC', dilations=None, name=None +): + """ + The transpose of conv1d. + + Parameters + ---------- + input : tensor + A 3-D Tensor of type float and shape [batch, in_width, in_channels] + for NWC data format or [batch, in_channels, in_width] for NCW data format. + filters : tensor + A 3-D Tensor with the same type as value and shape [filter_width, output_channels, in_channels]. + filter's in_channels dimension must match that of value. + output_shape : tensor + A 1-D Tensor, containing three elements, representing the output shape of the deconvolution op. + strides : list + An int or list of ints that has length 1 or 3. The number of entries by which the filter is moved right at each step. + padding : string + 'VALID' or 'SAME'. The padding algorithm. See the "returns" section of tf.ops.convolution for details. + data_format : string + 'NWC' and 'NCW' are supported. + dilations : list + An int or list of ints that has length 1 or 3 which defaults to 1. + The dilation factor for each dimension of input. If set to k > 1, + there will be k-1 skipped cells between each filter element on that dimension. + Dilations in the batch and depth dimensions must be 1. + name : string + Optional name for the returned tensor. + + Returns + ------- + A Tensor with the same type as value. + """ + + conv1d_transpose_obj = Conv1d_transpose(strides, padding, data_format, dilations) + return conv1d_transpose_obj(input, filters) + + +class Conv2d_transpose(object): + + def __init__( + self, strides, padding, data_format='NHWC', dilations=None, name=None, out_channel=None, k_size=None, + in_channels=None, groups = 1, output_padding = 0, + ): + self.strides = strides + self.dilations = dilations + self.name = name + self.data_format, self.padding = preprocess_2d_format(data_format, padding) + self.groups = groups + self.output_padding = output_padding + + def __call__(self, input, filters, output_size): + if self.data_format == 'NHWC': + input = nhwc_to_nchw(input) + if self.padding == 'same': + out = self.conv2d_transpore_same(input, filters) + else: + out = F.conv_transpose2d( + input, + weight=filters, + padding=(0 if isinstance(self.padding, str) else self.padding), + stride=self.strides, + dilation=self.dilations + ) + if self.data_format == 'NHWC': + out = nchw_to_nhwc(out) + return out + + def conv2d_transpore_same(self, input, filters): + rows_odd, cols_odd, padding_rows, padding_cols = same_padding_deconvolution( + input, filters, self.strides, (1, 1)) + if rows_odd or cols_odd: + input = F.pad(input, [0, int(rows_odd), 0, int(cols_odd)]) + out_padding = 0 + else: + out_padding = 1 + out = F.conv_transpose2d(input, weight=filters, padding=(padding_rows // 2, padding_cols // 2), stride=self.strides, + dilation=self.dilations, output_padding=out_padding) + return out + + +def conv2d_transpose( + input, filters, output_shape, strides, padding='SAME', data_format='NHWC', dilations=None, name=None +): + """ + The transpose of conv2d. + + Parameters + ---------- + input : tensor + A 4-D Tensor of type float and shape [batch, height, width, in_channels] + for NHWC data format or [batch, in_channels, height, width] for NCHW data format. + filters : tensor + A 4-D Tensor with the same type as input and shape [height, width, + output_channels, in_channels]. filter's in_channels dimension must match that of input. 
+ output_shape : tensor + A 1-D Tensor representing the output shape of the deconvolution op. + strides : list + An int or list of ints that has length 1, 2 or 4. The stride of the sliding window for each dimension of input. + If a single value is given it is replicated in the H and W dimension. + By default the N and C dimensions are set to 0. + The dimension order is determined by the value of data_format, see below for details. + padding : string + 'VALID' or 'SAME'. The padding algorithm. See the "returns" section of tf.ops.convolution for details. + data_format : string + 'NHWC' and 'NCHW' are supported. + dilations : list + An int or list of ints that has length 1, 2 or 4, defaults to 1. + name : string + Optional name for the returned tensor. + + Returns + ------- + A Tensor with the same type as input. + """ + + raise NotImplementedError + + +class Conv3d_transpose(object): + + def __init__( + self, strides, padding, data_format='NDHWC', dilations=None, name=None, out_channel=None, k_size=None, + in_channels=None + ): + self.strides = strides + self.dilations = dilations + self.name = name + self.out_channel = out_channel + self.data_format, self.padding = preprocess_3d_format(data_format, padding) + + def __call__(self, input, filters): + if self.data_format == 'NDHWC': + input = nhwc_to_nchw(input) + if self.padding == 'same': + out = self.conv3d_transpore_same(input, filters) + else: + out = F.conv_transpose3d( + input, + weight=filters, + padding=(0 if isinstance(self.padding, str) else self.padding), + stride=self.strides, + dilation=self.dilations + ) + if self.data_format == 'NDHWC': + out = nchw_to_nhwc(out) + return out + + def conv3d_transpore_same(self, input, filters): + rows_odd, cols_odd, depth_odd, padding_rows, padding_cols, padding_depth = same_padding_deconvolution( + input, filters, self.strides, (1, 1, 1)) + if rows_odd or cols_odd or depth_odd: + input = F.pad(input, [0, int(rows_odd), 0, int(cols_odd), 0, int(depth_odd)]) + out_padding = 0 + else: + out_padding = 1 + out = F.conv_transpose3d(input, weight=filters, padding=(padding_rows // 2, padding_cols // 2, padding_depth // 2), + stride=self.strides, dilation=self.dilations, output_padding=out_padding) + return out + + +def conv3d_transpose( + input, filters, output_shape, strides, padding='SAME', data_format='NDHWC', dilations=None, name=None +): + """ + The transpose of conv3d. + + Parameters + ---------- + input : tensor + A 5-D Tensor of type float and shape [batch, height, width, in_channels] for + NHWC data format or [batch, in_channels, height, width] for NCHW data format. + filters : tensor + A 5-D Tensor with the same type as value and shape [height, width, output_channels, in_channels]. + filter's in_channels dimension must match that of value. + output_shape : tensor + A 1-D Tensor representing the output shape of the deconvolution op. + strides : list + An int or list of ints that has length 1, 3 or 5. + padding : string + 'VALID' or 'SAME'. The padding algorithm. See the "returns" section of tf.ops.convolution for details. + data_format : string + 'NDHWC' and 'NCDHW' are supported. + dilations : list of ints + An int or list of ints that has length 1, 3 or 5, defaults to 1. + name : string + Optional name for the returned tensor. + + Returns + ------- + A Tensor with the same type as value. 
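+
+    Examples
+    ---------
+    A minimal sketch that mirrors the documented interface. The shapes and filter
+    layout are assumptions; note that output_shape is not used by the current wrapper.
+
+    >>> import jittor as jt
+    >>> x = jt.randn(1, 4, 8, 8, 16)          # NDHWC input
+    >>> filters = jt.randn(16, 8, 3, 3, 3)    # assumed [in_channels, out_channels, k_d, k_h, k_w]
+    >>> y = conv3d_transpose(x, filters, output_shape=None, strides=(1, 2, 2, 2, 1), padding='SAME')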
+ """ + + data_format, padding = preprocess_3d_format(data_format, padding) + conv3d_transpose_obj = Conv3d_transpose(strides, padding, data_format, dilations) + return conv3d_transpose_obj(input, filters) + + +def _to_channel_first_bias(b): + """Reshape [c] to [c, 1, 1].""" + raise NotImplementedError + + +def _bias_scale(x, b, data_format): + """The multiplication counter part of tf.nn.bias_add.""" + raise NotImplementedError + + +def _bias_add(x, b, data_format): + """Alternative implementation of tf.nn.bias_add which is compatiable with tensorRT.""" + raise NotImplementedError + + +def batch_normalization(x, mean, variance, offset, scale, variance_epsilon, data_format, name=None): + """Data Format aware version of tf.nn.batch_normalization.""" + raise NotImplementedError + + +class BatchNorm(object): + """ + The :class:`BatchNorm` is a batch normalization layer for both fully-connected and convolution outputs. + See ``tf.nn.batch_normalization`` and ``tf.nn.moments``. + + Parameters + ---------- + decay : float + A decay factor for `ExponentialMovingAverage`. + Suggest to use a large value for large dataset. + epsilon : float + Eplison. + act : activation function + The activation function of this layer. + is_train : boolean + Is being used for training or inference. + beta_init : initializer or None + The initializer for initializing beta, if None, skip beta. + Usually you should not skip beta unless you know what happened. + gamma_init : initializer or None + The initializer for initializing gamma, if None, skip gamma. + When the batch normalization layer is use instead of 'biases', or the next layer is linear, this can be + disabled since the scaling can be done by the next layer. see `Inception-ResNet-v2 `__ + moving_mean_init : initializer or None + The initializer for initializing moving mean, if None, skip moving mean. + moving_var_init : initializer or None + The initializer for initializing moving var, if None, skip moving var. + num_features: int + Number of features for input tensor. Useful to build layer if using BatchNorm1d, BatchNorm2d or BatchNorm3d, + but should be left as None if using BatchNorm. Default None. + data_format : str + channels_last 'channel_last' (default) or channels_first. + name : None or str + A unique layer name. + + Examples + --------- + With TensorLayer + + >>> net = tlx.layers.Input([None, 50, 50, 32], name='input') + >>> net = tlx.layers.BatchNorm()(net) + + Notes + ----- + The :class:`BatchNorm` is universally suitable for 3D/4D/5D input in static model, but should not be used + in dynamic model where layer is built upon class initialization. So the argument 'num_features' should only be used + for subclasses :class:`BatchNorm1d`, :class:`BatchNorm2d` and :class:`BatchNorm3d`. All the three subclasses are + suitable under all kinds of conditions. 
+ + References + ---------- + - `Source `__ + - `stackoverflow `__ + + """ + + def __init__( + self, decay=0.9, epsilon=0.00001, beta=None, gamma=None, moving_mean=None, moving_var=None, num_features=None, + data_format='channels_last', is_train=False + ): + self.decay = 1 - decay + self.epsilon = epsilon + self.data_format = data_format + self.beta = beta + self.gamma = gamma + self.moving_mean = moving_mean + self.moving_var = moving_var + self.num_features = num_features + self.is_train = is_train + self.axes = None + + if self.decay < 0.0 or 1.0 < self.decay: + raise ValueError("decay should be between 0 to 1") + + self.bn = nn.BatchNorm2d( + num_features=self.num_features, + eps=self.epsilon, + momentum=self.decay, + affine=True, + track_running_stats=True, + ) + + def __call__(self, inputs): + if self.data_format == 'channels_last': + inputs = nhwc_to_nchw(inputs) + + out = _C.normalization(inputs, + self.moving_mean, + self.moving_var, + self.gamma, + self.beta, + is_training=self.is_train, + momentum=self.decay) + if self.data_format == 'channels_last': + out = nchw_to_nhwc(out) + return out + + +class GroupConv2D(object): + + def __init__(self, strides, padding, data_format, dilations, out_channel, k_size, groups=1): + self.groups = groups + self.data_format, self.padding = preprocess_2d_format(data_format, padding) + self.conv2d = Conv2D(strides, self.padding, self.data_format, dilations, groups=self.groups) + + def __call__(self, input, filters): + return self.conv2d(input, filters) + + +class SeparableConv1D(object): + + def __init__(self, stride, padding, data_format, dilations, out_channel, k_size, in_channel, depth_multiplier): + self.data_format, self.padding = preprocess_1d_format(data_format, padding) + self.depthwise_conv = Conv1D(stride, self.padding, self.data_format, dilations, groups=in_channel) + self.pointwise_conv = Conv1D(1, self.padding, self.data_format, 1) + + def __call__(self, inputs, depthwise_filters, pointwise_filters): + depthwise_conv = self.depthwise_conv(inputs, depthwise_filters) + pointwise_conv = self.pointwise_conv(depthwise_conv, pointwise_filters) + return pointwise_conv + + +class SeparableConv2D(object): + + def __init__(self, strides, padding, data_format, dilations, out_channel, k_size, in_channel, depth_multiplier): + self.data_format, self.padding = preprocess_2d_format(data_format, padding) + self.depthwise_conv = Conv2D(strides, self.padding, self.data_format, dilations, groups=in_channel) + self.pointwise_conv = Conv2D((1, 1), self.padding, self.data_format, (1, 1)) + + def __call__(self, input, filter, point_filter=None): + depthwise_conv = self.depthwise_conv(input, filter) + pointwise_conv = self.pointwise_conv(depthwise_conv, point_filter) + return pointwise_conv + + +class AdaptiveMeanPool1D(object): + + def __init__(self, output_size, data_format): + self.data_format, _ = preprocess_1d_format(data_format, None) + self.op = nn.AdaptiveAvgPool1d(output_size) + + def __call__(self, input): + if self.data_format == 'NLC': + input = nhwc_to_nchw(input) + output = self.op(input) + if self.data_format == 'NLC': + output = nchw_to_nhwc(output) + return output + + +class AdaptiveMeanPool2D(object): + + def __init__(self, output_size, data_format): + self.data_format, _ = preprocess_2d_format(data_format, None) + self.op = nn.AdaptiveAvgPool2d(output_size=output_size) + + def __call__(self, inputs): + if self.data_format == 'NHWC': + inputs = nhwc_to_nchw(inputs) + output = self.op(inputs) + if self.data_format == 'NHWC': + output = 
nchw_to_nhwc(output) + return output + + +class AdaptiveMeanPool3D(object): + + def __init__(self, output_size, data_format): + self.data_format, _ = preprocess_3d_format(data_format, None) + self.op = nn.AdaptiveAvgPool3d(output_size=output_size) + + def __call__(self, inputs): + if self.data_format == 'NDHWC': + inputs = nhwc_to_nchw(inputs) + output = self.op(inputs) + if self.data_format == 'NDHWC': + output = nchw_to_nhwc(output) + return output + +def adaptive_avg_pool1d(input, output_size): + + return F.adaptive_avg_pool1d(input, output_size) + + +def adaptive_avg_pool2d(input, output_size): + + return F.adaptive_avg_pool2d(input, output_size) + + +def adaptive_avg_pool3d(input, output_size): + + return F.adaptive_avg_pool3d(input, output_size) + +class AdaptiveMaxPool1D(object): + + def __init__(self, output_size, data_format): + self.data_format, _ = preprocess_1d_format(data_format, None) + self.op = nn.AdaptiveMaxPool1d(output_size=output_size) + + def __call__(self, input): + if self.data_format == 'NLC': + input = nhwc_to_nchw(input) + output = self.op(input) + if self.data_format == 'NLC': + output = nchw_to_nhwc(output) + return output + + +class AdaptiveMaxPool2D(object): + + def __init__(self, output_size, data_format): + self.data_format, _ = preprocess_2d_format(data_format, None) + self.op = nn.AdaptiveMaxPool2d(output_size=output_size) + + def __call__(self, inputs): + if self.data_format == 'NHWC': + inputs = nhwc_to_nchw(inputs) + output = self.op(inputs) + if self.data_format == 'NHWC': + output = nchw_to_nhwc(output) + return output + + +class AdaptiveMaxPool3D(object): + + def __init__(self, output_size, data_format): + self.data_format, _ = preprocess_3d_format(data_format, None) + self.op = nn.AdaptiveMaxPool3d(output_size=output_size) + + def __call__(self, inputs): + if self.data_format == 'NDHWC': + inputs = nhwc_to_nchw(inputs) + output = self.op(inputs) + if self.data_format == 'NDHWC': + output = nchw_to_nhwc(output) + return output + +def adaptive_max_pool1d(input, output_size, return_indices = False): + + return F.adaptive_max_pool1d(input, output_size, return_indices) + +def adaptive_max_pool2d(input, output_size, return_indices = False): + + return F.adaptive_max_pool2d(input, output_size, return_indices) + +def adaptive_max_pool3d(input, output_size, return_indices=False): + + return F.adaptive_max_pool3d(input, output_size, return_indices) + +class BinaryConv2D(object): + + def __init__(self, strides, padding, data_format, dilations, out_channel, k_size, in_channel): + self.data_format, self.padding = preprocess_2d_format(data_format, padding) + self.strides = strides + self.dilations = dilations + + def quantize(self, x): + raise NotImplementedError + + def __call__(self, inputs, filters): + raise NotImplementedError + + +class DorefaConv2D(object): + + def __init__(self, bitW, bitA, strides, padding, data_format, dilations, out_channel, k_size, in_channel): + self.data_format, self.padding = preprocess_2d_format(data_format, padding) + self.strides = strides + self.dilations = dilations + self.bitW = bitW + self.bitA = bitA + + def _quantize_dorefa(self, x, k): + raise NotImplementedError + + def cabs(self, x): + raise NotImplementedError + + def quantize_active(self, x, bitA): + raise NotImplementedError + + def quantize_weight(self, x, bitW, force_quantization=False): + raise NotImplementedError + + def __call__(self, inputs, filters): + raise NotImplementedError + + +class rnncell(object): + + def __init__(self, weight_ih, weight_hh, bias_ih, 
bias_hh, act): + self.weight_ih = weight_ih + self.weight_hh = weight_hh + self.bias_ih = bias_ih + self.bias_hh = bias_hh + self.act = act + + def __call__(self, input, h): + if self.act == 'tanh': + h = _C.rnn_tanh_cell( + input, + h, + self.weight_ih, + self.weight_hh, + self.bias_ih, + self.bias_hh, + ) + else: + h = _C.rnn_relu_cell( + input, + h, + self.weight_ih, + self.weight_hh, + self.bias_ih, + self.bias_hh, + ) + return h, h + + +class lstmcell(object): + + def __init__(self, weight_ih, weight_hh, bias_ih, bias_hh, act=None): + self.weight_ih = weight_ih + self.weight_hh = weight_hh + self.bias_ih = bias_ih + self.bias_hh = bias_hh + + def __call__(self, input, h, c): + h = (h, c) + h, c = _C.lstm_cell( + input, + h, + self.weight_ih, + self.weight_hh, + self.bias_ih, + self.bias_hh, + ) + return h, h, c + + +class grucell(object): + + def __init__(self, weight_ih, weight_hh, bias_ih, bias_hh, act=None): + self.weight_ih = weight_ih + self.weight_hh = weight_hh + self.bias_ih = bias_ih + self.bias_hh = bias_hh + + def __call__(self, input, h): + h = _C.gru_cell( + input, + h, + self.weight_ih, + self.weight_hh, + self.bias_ih, + self.bias_hh, + ) + return h, h + + +class rnnbase(Module): + + def __init__( + self, + mode, + input_size, + hidden_size, + num_layers, + bias, + batch_first, + dropout, + bidirectional, + is_train, + w_ih, + w_hh, + b_ih, + b_hh, + ): + super(rnnbase, self).__init__() + self.mode = mode + self.input_size = input_size + self.hidden_size = hidden_size + self.num_layers = num_layers + self.bias = bias + self.batch_first = batch_first + self.dropout = float(dropout) + self.train = is_train + if not 0 <= dropout < 1: + raise ValueError("dropout should be a number in range [0, 1).") + if dropout > 0 and num_layers == 1: + raise ValueError( + "dropout option adds dropout after all but last " + "recurrent layer, so non-zero dropout expects " + "num_layers greater than 1, but got dropout={} and " + "num_layers={}".format(dropout, num_layers) + ) + self.bidirectional = bidirectional + self.num_directions = 2 if bidirectional else 1 + self.rnn_impls = { + 'RNN_TANH': _C.rnn_tanh, + 'RNN_RELU': _C.rnn_relu, + 'GRU': _C.gru, + } + self.w_ih = w_ih + self.w_hh = w_hh + self.b_ih = b_ih + self.b_hh = b_hh + + self.proj_size = 0 + self.act_fn = None + self._flat_weights_names = [] + self._all_weights = [] + cur = 0 + for layer in range(num_layers): + for direction in range(self.num_directions): + if bias: + layer_params = (w_ih[cur], w_hh[cur], b_ih[cur], b_hh[cur]) + else: + layer_params = (w_ih[cur], w_hh[cur]) + + suffix = '_reverse' if direction == 1 else '' + param_names = ['weight_ih_l{}{}', 'weight_hh_l{}{}'] + if bias: + param_names += ['bias_ih_l{}{}', 'bias_hh_l{}{}'] + param_names = [x.format(layer, suffix) for x in param_names] + + for name, param in zip(param_names, layer_params): + setattr(self, name, param) + self._flat_weights_names.extend(param_names) + self._all_weights.append(param_names) + cur += 1 + self._flat_weights = [ + (lambda wn: getattr(self, wn) if hasattr(self, wn) else None)(wn) for wn in self._flat_weights_names + ] + self.flatten_parameters() + + + def flatten_parameters(self): + if len(self._flat_weights) != len(self._flat_weights_names): + return + + for w in self._flat_weights: + if not isinstance(w, flow.Tensor): + return + first_fw = self._flat_weights[0] + dtype = first_fw.dtype + for fw in self._flat_weights: + if (not isinstance(fw.data, flow.Tensor) or not (fw.data.dtype == dtype) or not fw.data.is_cuda): + return + + def 
_apply(self, fn): + ret = super(rnnbase, self)._apply(fn) + self._flat_weights = [ + (lambda wn: getattr(self, wn) if hasattr(self, wn) else None)(wn) for wn in self._flat_weights_names + ] + self.flatten_parameters() + return ret + + def check_input(self, input_shape): + if len(input_shape) != 3: + raise ValueError("input must have 3 dimensions. But got {}.".format(len(input_shape))) + if self.input_size != input_shape[-1]: + raise ValueError( + "The last dimension of input should be equal to input_size {}.But got {}".format( + self.input_size, input_shape[-1] + ) + ) + + def check_hidden(self, h, batch_size): + expected_hidden_size = (self.num_layers * self.num_directions, batch_size, self.hidden_size) + if h.shape != expected_hidden_size: + raise ValueError('Expected hidden size {}, got {}.'.format(expected_hidden_size, h.shape)) + + def forward(self, input, states): + batch_size = input.shape[0] if self.batch_first else input.shape[1] + input_shape = input.shape + self.check_input(input_shape) + if self.mode == 'LSTM': + if states is None: + h = flow.zeros( + self.num_layers * self.num_directions, batch_size, self.hidden_size, dtype=input.dtype, + device=input.device + ) + c = flow.zeros( + self.num_layers * self.num_directions, batch_size, self.hidden_size, dtype=input.dtype, + device=input.device + ) + states = (h, c) + else: + h, c = states + self.check_hidden(h, batch_size) + self.check_hidden(c, batch_size) + result = _C.lstm( + input, states, self._flat_weights, self.bias, self.num_layers, self.dropout, self.training, + self.bidirectional, self.batch_first + ) + return result[0], result[1:] + else: + if states is None: + h = flow.zeros( + self.num_layers * self.num_directions, batch_size, self.hidden_size, dtype=input.dtype, + device=input.device + ) + states = h + else: + self.check_hidden(states, batch_size) + impl = self.rnn_impls[self.mode] + result = impl( + input, states, self._flat_weights, self.bias, self.num_layers, self.dropout, self.training, + self.bidirectional, self.batch_first + ) + return result[0], result[1] + +class layernorm(object): + + def __init__(self, normalized_shape, gamma, beta, eps, input_shape): + self.normalized_shape = normalized_shape + self.gamma = gamma + self.beta = beta + self.eps = eps + self.input_shape = input_shape + self.axis = list(range((len(input_shape) - len(normalized_shape)), len(input_shape))) + self.ndims = len(input_shape) + self.broadcast_shape = [1] * self.ndims + for dim in self.axis: + self.broadcast_shape[dim] = input_shape[dim] + + def __call__(self, input): + return F.layer_norm(input, self.normalized_shape, self.gamma, self.beta, self.eps) + +class multiheadattention(Module): + def __init__( + self, + embed_dim, + num_heads, + dropout, + batch_first, + need_weights, + q_weight, + k_weight, + v_weight, + out_weight, + q_bias, + k_bias, + v_bias, + out_bias, + train, + ): + super(multiheadattention, self).__init__() + self.embed_dim_check = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.batch_first = batch_first + self.need_weights = need_weights + self.q_weight = q_weight + self.k_weight = k_weight + self.v_weight = v_weight + self.out_weight = out_weight + self.q_bias = q_bias + self.k_bias = k_bias + self.v_bias = v_bias + self.out_bias = out_bias + self.train = train + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == self.embed_dim_check, 'embed_dim must be divisible by num_heads' + self.register_parameter('in_proj_weight', None) + + if q_bias is not None: + 
self.in_proj_bias = flow.cat((self.q_bias, self.k_bias, self.v_bias)) + else: + self.register_parameter('in_proj_bias', None) + + self.bias_k = self.bias_v = None + self.add_zero_attn = False + + def forward(self, q, k, v, attn_mask, key_padding_mask): + k = q if k is None else k + v = q if v is None else v + if self.batch_first: + q, k, v = [x.transpose(1, 0) for x in (q, k, v)] + attn_output, attn_output_weights = F.multi_head_attention_forward( + q, k, v, self.embed_dim_check, self.num_heads, self.in_proj_weight, self.in_proj_bias, self.bias_k, + self.bias_v, self.add_zero_attn, self.dropout, self.out_weight, self.out_bias, training=self.training, + key_padding_mask=key_padding_mask, need_weights=self.need_weights, attn_mask=attn_mask, + use_separate_proj_weight=True, q_proj_weight=self.q_weight, k_proj_weight=self.k_weight, + v_proj_weight=self.v_weight + ) + if self.batch_first: + return attn_output.transpose(1, 0), attn_output_weights + else: + return attn_output, attn_output_weights + +class BinaryDense(object): + + def __init__(self, weights, bias): + self.weights = weights + self.bias = bias + + def __call__(self, inputs): + raise NotImplementedError + + +class DorefaDense(object): + + def __init__(self, weights, bias, bitW, bitA): + self.weights = weights + self.bias = bias + self.bitW = bitW + self.bitA = bitA + + def __call__(self, inputs): + raise NotImplementedError + + +class TernaryDense(object): + + def __init__(self, weights, bias): + self.weights = weights + self.bias = bias + + def __call__(self, inputs): + raise NotImplementedError + + +class QuanDense(object): + + def __init__(self, weights, bias, bitW, bitA): + self.weights = weights + self.bias = bias + self.bitW = bitW + self.bitA = bitA + + def __call__(self, inputs): + raise NotImplementedError + + +class QuanDenseBn(object): + + def __init__( + self, weights, scale_para, offset_para, moving_mean, moving_variance, decay, bitW, bitA, epsilon, is_train + ): + self.weights = weights + self.scale_para = scale_para + self.offset_para = offset_para + self.moving_mean = moving_mean + self.moving_variance = moving_variance + self.decay = decay + self.bitW = bitW + self.bitA = bitA + self.epsilon = epsilon + self.is_train = is_train + + def __call__(self, inputs): + raise NotImplementedError + + +class TernaryConv(object): + + def __init__(self, weights, strides, padding, data_format, dilations): + self.weights = weights + self.strides = strides + self.dilations = dilations + self.data_format, self.padding = preprocess_2d_format(data_format, padding) + + def __call__(self, inputs): + raise NotImplementedError + + +class QuanConv(object): + + def __init__(self, weights, strides, padding, data_format, dilations, bitW, bitA): + self.weights = weights + self.strides = strides + self.dilations = dilations + self.data_format, self.padding = preprocess_2d_format(data_format, padding) + self.bitW = bitW + self.bitA = bitA + + def __call__(self, inputs): + raise NotImplementedError + + +class QuanConvBn(object): + + def __init__( + self, weights, scale_para, offset_para, moving_mean, moving_variance, strides, padding, data_format, dilations, + bitW, bitA, decay, epsilon, is_train + ): + self.weights = weights + self.strides = strides + self.dilations = dilations + self.data_format, self.padding = preprocess_2d_format(data_format, padding) + self.bitW = bitW + self.bitA = bitA + self.scale_para = scale_para + self.offset_para = offset_para + self.moving_mean = moving_mean + self.moving_variance = moving_variance + self.decay = 
decay + self.epsilon = epsilon + self.is_train = is_train + + def __call__(self, inputs): + raise NotImplementedError + + +class PReLU(object): + + def __init__(self, data_format): + + self.data_format = data_format + + def __call__(self, input, weight): + # weight = weight.to(input.device) + return F.prelu(input, weight) + + +def prelu(input, weight, data_format): + + return F.prelu(input, weight) + +def hardsigmoid(input): + + return F.hardsigmoid(input) + +def hardswish(input): + + return F.hardswish(input) + +def swish(input): + + return F.swish(input) + +def linear(input, weight, bias = None): + + return F.linear(input, weight, bias) + +def unfold(input, kernel_size, dilation = 1, padding = 0, stride = 1): + + return F.unfold(input, kernel_size, stride=stride, padding=padding, dilation=dilation) \ No newline at end of file diff --git a/tensorlayerx/backend/ops/jitter_backend.py b/tensorlayerx/backend/ops/jitter_backend.py new file mode 100644 index 0000000..e69de29 From 7b55016409530e23e8e4f8c69027daead5699b83 Mon Sep 17 00:00:00 2001 From: Hisham Date: Wed, 6 Mar 2024 15:51:29 +0800 Subject: [PATCH 03/27] removed jitter_backend.py. This file is no longer needed and was created by me --- tensorlayerx/backend/ops/jitter_backend.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tensorlayerx/backend/ops/jitter_backend.py diff --git a/tensorlayerx/backend/ops/jitter_backend.py b/tensorlayerx/backend/ops/jitter_backend.py deleted file mode 100644 index e69de29..0000000 From 33e4d356a08d76f1461f1ab980d9439a14475f8a Mon Sep 17 00:00:00 2001 From: Hisham Date: Fri, 8 Mar 2024 02:10:26 +0800 Subject: [PATCH 04/27] Integration Progress uptill line 2000 --- tensorlayerx/backend/ops/Jittor_nn.py | 898 +++++++++++++++----------- 1 file changed, 512 insertions(+), 386 deletions(-) diff --git a/tensorlayerx/backend/ops/Jittor_nn.py b/tensorlayerx/backend/ops/Jittor_nn.py index a579c9a..3b4b6d5 100644 --- a/tensorlayerx/backend/ops/Jittor_nn.py +++ b/tensorlayerx/backend/ops/Jittor_nn.py @@ -7,7 +7,8 @@ import jittor as jt import jittor.nn as nn - +import collections +from itertools import repeat def padding_format(padding): """ @@ -161,12 +162,13 @@ def nchw_to_nhwc(x): ------- channels last tensor data """ - if len(P.Shape()(x)) == 3: - x = P.Transpose()(x, (0, 2, 1)) - elif len(P.Shape()(x)) == 4: - x = P.Transpose()(x, (0, 2, 3, 1)) - elif len(P.Shape()(x)) == 5: - x = P.Transpose()(x, (0, 2, 3, 4, 1)) + shape = x.shape + if len(shape) == 3: + x = jt.transpose(x, (0, 2, 1)) + elif len(shape) == 4: + x = jt.transpose(x, (0, 2, 3, 1)) + elif len(shape) == 5: + x = jt.transpose(x, (0, 2, 3, 4, 1)) # else: # raise Exception("Unsupported dimensions") return x @@ -184,13 +186,13 @@ def nhwc_to_nchw(x): ------- channels first tensor data """ - - if len(P.Shape()(x)) == 3: - x = P.Transpose()(x, (0, 2, 1)) - elif len(P.Shape()(x)) == 4: - x = P.Transpose()(x, (0, 3, 1, 2)) - elif len(P.Shape()(x)) == 5: - x = P.Transpose()(x, (0, 4, 1, 2, 3)) + shape = x.shape + if len(shape) == 3: + x = jt.transpose(x, (0, 2, 1)) + elif len(shape) == 4: + x = jt.transpose(x, (0, 3, 1, 2)) + elif len(shape) == 5: + x = jt.transpose(x, (0, 4, 1, 2, 3)) # else: # raise Exception("Unsupported dimensions") return x @@ -462,11 +464,75 @@ def bias_add(x, bias, data_format=None): return add_obj(x, bias) + +class Conv1D(object): + + def __init__(self, stride, padding, data_format='NWC', dilations=None, out_channel=None, k_size=None, groups=1): + self.stride = stride + self.dilations = dilations + 
self.groups = groups + self.data_format, self.padding = preprocess_1d_format(data_format, padding) + # self.conv1d = nn.Conv1d() + def __call__(self, input, filters): + if self.data_format == 'NLC': + input = nhwc_to_nchw(input) + if self.padding == 'same': + out = self.conv1d_same_padding(input, filters) + else: + + out = nn.Conv1d(input, filters, stride=self.stride, padding=self.padding, + dilation=self.dilations, groups=self.groups) + if self.data_format == 'NLC': + out = nchw_to_nhwc(out) + + return out + + def conv1d_same_padding(self, input, filters): + rows_odd, padding_rows = same_padding(input, filters, self.stride, 1) + if rows_odd: + input = nn.pad(input, [0, int(rows_odd)], 'replicate') + + return nn.Conv1d(input, filters, stride=self.stride, padding=(padding_rows // 2), groups=self.groups) + + + +def conv1d(input, filters, stride, padding, data_format='NWC', dilations=None): + """ + Computes a 1-D convolution given 3-D input and filter tensors. + + Parameters + ---------- + input : tensor + A 3D Tensor. Must be of type float16, float32, or float64 + filters : tensor + A 3D Tensor. Must have the same type as input. + stride : int of list + An int or list of ints that has length 1 or 3. The number of entries by which the filter is moved right at each step. + padding : string + 'SAME' or 'VALID' + data_format : string + An optional string from "NWC", "NCW". Defaults to "NWC", the data is stored in the order of + [batch, in_width, in_channels]. The "NCW" format stores data as [batch, in_channels, in_width]. + dilations : int or list + An int or list of ints that has length 1 or 3 which defaults to 1. + The dilation factor for each dimension of input. If set to k > 1, + there will be k-1 skipped cells between each filter element on that dimension. + Dilations in the batch and depth dimensions must be 1. + name : string + A name for the operation (optional). + Returns + ------- + A Tensor. Has the same type as input. 
+ """ + + return Conv1D(stride=stride, padding=padding, data_format=data_format, dilations=dilations)(input, filters) + + def same_padding(input, weight, strides, dilations): # H(in) + 2* padding[0] - dilation[0] * (Ksize[0] - 1) - 1 # H(out) = = floor( -------------------------------------------------------------- + 1 ) # stride[0] - if isinstance(weight, flow.Tensor): + if isinstance(weight, jt.array): if len(input.shape) == 3: filter_rows = weight.size(2) if len(input.shape) == 4: @@ -534,66 +600,6 @@ def same_padding(input, weight, strides, dilations): return rows_odd, cols_odd, depth_odd, padding_rows, padding_cols, padding_depth -class Conv1D(object): - - def __init__(self, stride, padding, data_format='NWC', dilations=None, out_channel=None, k_size=None, groups=1): - self.stride = stride - self.dilations = dilations - self.groups = groups - self.data_format, self.padding = preprocess_1d_format(data_format, padding) - - def __call__(self, input, filters): - if self.data_format == 'NLC': - input = nhwc_to_nchw(input) - if self.padding == 'same': - out = self.conv1d_same_padding(input, filters) - else: - out = F.conv1d(input, filters, stride=self.stride, padding=self.padding, - dilation=self.dilations, groups=self.groups) - if self.data_format == 'NLC': - out = nchw_to_nhwc(out) - - return out - - def conv1d_same_padding(self, input, filters): - rows_odd, padding_rows = same_padding(input, filters, self.stride, 1) - if rows_odd: - input = F.pad(input, [0, int(rows_odd)], 'replicate') - return F.conv1d(input, filters, stride=self.stride, padding=(padding_rows // 2), groups=self.groups) - - -def conv1d(input, filters, stride, padding, data_format='NWC', dilations=None): - """ - Computes a 1-D convolution given 3-D input and filter tensors. - - Parameters - ---------- - input : tensor - A 3D Tensor. Must be of type float16, float32, or float64 - filters : tensor - A 3D Tensor. Must have the same type as input. - stride : int of list - An int or list of ints that has length 1 or 3. The number of entries by which the filter is moved right at each step. - padding : string - 'SAME' or 'VALID' - data_format : string - An optional string from "NWC", "NCW". Defaults to "NWC", the data is stored in the order of - [batch, in_width, in_channels]. The "NCW" format stores data as [batch, in_channels, in_width]. - dilations : int or list - An int or list of ints that has length 1 or 3 which defaults to 1. - The dilation factor for each dimension of input. If set to k > 1, - there will be k-1 skipped cells between each filter element on that dimension. - Dilations in the batch and depth dimensions must be 1. - name : string - A name for the operation (optional). - Returns - ------- - A Tensor. Has the same type as input. 
- """ - - return Conv1D(stride=stride, padding=padding, data_format=data_format, dilations=dilations)(input, filters) - - class Conv2D(object): def __init__(self, strides, padding, data_format='NHWC', dilations=None, out_channel=None, k_size=None, groups=1): @@ -613,7 +619,7 @@ def __call__(self, input, filters): if self.padding == 'same': output = self.conv2d_same_padding(input, filters) else: - output = F.conv2d(input, filters, stride=self.strides, padding=self.padding, + output = nn.conv2d(input, filters, stride=self.strides, padding=self.padding, dilation=self.dilations, groups=self.groups) if self.data_format == 'NHWC': @@ -625,7 +631,7 @@ def conv2d_same_padding(self, input, weight, bias=None): if rows_odd or cols_odd: input = F.pad(input, [0, int(cols_odd), 0, int(rows_odd)]) - return F.conv2d( + return nn.conv2d( input, weight, bias, self.strides, padding=(padding_rows // 2, padding_cols // 2), dilation=self.dilations, groups=self.groups ) @@ -662,7 +668,7 @@ def conv2d(input, filters, strides, padding, data_format='NHWC', dilations=None) if data_format == 'NHWC': input = nhwc_to_nchw(input) - output = F.conv2d(input, filters, stride=strides, padding=padding, dilation=dilations) + output = nn.conv2d(input, filters, stride=strides, padding=padding, dilation=dilations) if data_format == 'NHWC': output = nchw_to_nhwc(output) @@ -680,6 +686,7 @@ def __init__(self, strides, padding, data_format='NDHWC', dilations=None, out_ch self._strides = (strides[2], strides[3], strides[4]) self._dilations = (dilations[2], dilations[3], dilations[4]) + def __call__(self, input, filters): if self.data_format == 'NDHWC': input = nhwc_to_nchw(input) @@ -700,8 +707,8 @@ def conv3d_same_padding(self, input, weight, bias=None, groups=1): if rows_odd or cols_odd or depth_odd: input = F.pad(input, [0, int(cols_odd), 0, int(rows_odd), 0, int(depth_odd)]) - return F.conv3d( - input, weight, bias, self._strides, padding=(padding_rows // 2, padding_cols // 2, padding_depth // 2), + return nn.conv3d( + input, weight, bias, self._strides, padding=(padding_rows // 2, padding_cols // 2, padding_depth//2), dilation=self._dilations, groups=groups ) @@ -744,38 +751,6 @@ def conv3d(input, filters, strides, padding, data_format='NDHWC', dilations=None return Conv3D(strides=strides, padding=padding, data_format=data_format, dilations=dilations)(input, filters) -def local_response_norm(input: flow.Tensor, size: int, alpha: float = 1e-4, beta: float = 0.75, k: float = 1.0) -> flow.Tensor: - r"""Applies local response normalization over an input signal composed of - several input planes, where channels occupy the second dimension. - Applies normalization across channels. - - reference from torch.nn.LocalResponseNorm - """ - dim = input.dim() - if dim < 3: - raise ValueError( - "Expected 3D or higher dimensionality \ - input (got {} dimensions)".format( - dim - ) - ) - if input.numel() == 0: - return input - - div = input.mul(input).unsqueeze(1) - if dim == 3: - div = F.pad(div, (0, 0, size // 2, (size - 1) // 2)) - div = F.avg_pool2d(div, (size, 1), stride=1).squeeze(1) - else: - sizes = input.size() - div = div.view(sizes[0], 1, sizes[1], sizes[2], -1) - div = F.pad(div, (0, 0, 0, 0, size // 2, (size - 1) // 2)) - div = F.avg_pool3d(div, (size, 1, 1), stride=1).squeeze(1) - div = div.view(sizes) - div = div.mul(alpha).add(k).pow(beta) - return input / div - - def lrn(inputs, depth_radius, bias, alpha, beta): """ Local Response Normalization. @@ -798,7 +773,7 @@ def lrn(inputs, depth_radius, bias, alpha, beta): A Tensor. 
Has the same type as input. """ - return local_response_norm(inputs, depth_radius, alpha, beta, bias) + raise NotImplementedError def moments(x, axes, shift=None, keepdims=False): @@ -826,18 +801,13 @@ def moments(x, axes, shift=None, keepdims=False): class MaxPool1d(object): - def __init__(self, ksize, strides, padding, return_mask, data_format=None): - self.data_format, self.padding = preprocess_1d_format(data_format=data_format, padding=padding) - self.return_mask = return_mask - self.max_pool1d = MaxPool([ksize, ], strides, padding, data_format) - - def __call__(self, inputs): - return self.max_pool1d(inputs) + def __call__(): + return NotImplementedError class MaxPool(object): - def __init__(self, ksize, strides, padding, return_mask, data_format=None): + def __init__(self, ksize, strides, padding, return_mask = False, data_format=None): self.ksize = ksize self.strides = strides self.return_mask = return_mask @@ -853,52 +823,55 @@ def __call__(self, inputs): if self.data_format == 'channels_last': inputs = nhwc_to_nchw(inputs) if len(inputs.shape) == 2 or len(inputs.shape) == 3: - if self.padding in ['SAME', 'same']: - out = self.maxpool1d_same_padding(inputs) - else: - out = F.max_pool1d(inputs, self.ksize, self.strides, padding=self.padding) + raise NotImplementedError + if len(inputs.shape) == 4: if self.padding in ['SAME', 'same']: out = self.maxpool2d_same_padding(inputs) else: - out = F.max_pool2d(inputs, self.ksize, self.strides, padding=self.padding) + out = nn.max_pool2d(inputs, self.ksize, self.strides, padding=self.padding, + return_indices=self.return_mask) if len(inputs.shape) == 5: if self.padding in ['SAME', 'same']: out = self.maxpool3d_same_padding(inputs) else: - out = F.max_pool3d(inputs, self.ksize, self.strides, padding=self.padding) + out = nn.max_pool3d(inputs, self.ksize, self.strides, padding=self.padding, + return_indices=self.return_mask) if self.data_format == 'channels_last': - return nchw_to_nhwc(out) + if self.return_mask: + outputs = [None, None] + outputs[0] = nchw_to_nhwc(out[0]) + outputs[1] = nchw_to_nhwc(out[1]) + return outputs + else: + return nchw_to_nhwc(out) else: return out - def maxpool1d_same_padding(self, input): - rows_odd, padding_rows = same_padding(input, self.ksize, self.strides, 1) - if rows_odd: - input = F.pad(input, [0, int(rows_odd)], 'constant', float('-inf')) - return F.max_pool1d(input, self.ksize, self.strides, padding=(padding_rows // 2)) def maxpool2d_same_padding(self, input): rows_odd, cols_odd, padding_rows, padding_cols = same_padding(input, self.ksize, self.strides, (1, 1)) if rows_odd or cols_odd: # TODO The fill value for maxpool is -INF. 
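            # same_padding() reports whether the total 'SAME' padding is odd for each spatial
            # dimension. The odd element is padded on the trailing side here, with -inf so the
            # padded cells can never win the max, and the even half (padding // 2) is passed to
            # the pooling call below. The average-pooling path pads with 'replicate' instead.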
- input = F.pad(input, [0, int(rows_odd), 0, int(cols_odd)], 'constant', float('-inf')) + input = nn.pad(input, [0, int(rows_odd), 0, int(cols_odd)], 'constant', float('-inf')) - return F.max_pool2d(input, self.ksize, self.strides, padding=(padding_rows // 2, padding_cols // 2)) + return nn.max_pool2d(input, self.ksize, self.strides, padding=(padding_rows // 2, padding_cols // 2), + return_indices=self.return_mask) def maxpool3d_same_padding(self, input): rows_odd, cols_odd, depth_odd, padding_rows, padding_cols, padding_depth = same_padding( input, self.ksize, self.strides, (1, 1, 1) ) if rows_odd or cols_odd or depth_odd: - input = F.pad(input, [0, int(cols_odd), 0, int(rows_odd), 0, int(depth_odd)], 'constant', float('-inf')) - return F.max_pool3d( - input, self.ksize, self.strides, padding=(padding_rows // 2, padding_cols // 2, padding_depth // 2) + input = nn.pad(input, [0, int(cols_odd), 0, int(rows_odd), 0, int(depth_odd)], 'constant', float('-inf')) + return nn.max_pool3d( + input, self.ksize, self.strides, padding=(padding_rows // 2, padding_cols // 2, padding_depth // 2), + return_indices=self.return_mask ) -def max_pool(input, ksize, strides, padding, data_format=None): +def max_pool(input, ksize, strides, padding, return_mask, data_format=None): """ Performs the max pooling on the input. @@ -924,31 +897,27 @@ def max_pool(input, ksize, strides, padding, data_format=None): A Tensor of format specified by data_format. The max pooled output tensor. """ - maxpool_obj = MaxPool(ksize, strides, padding, data_format) + maxpool_obj = MaxPool(ksize, strides, padding, return_mask, data_format) return maxpool_obj(input) - -def max_pool1d(input, kernel_size, stride=None, padding=0, return_mask=False, data_format='NCL'): +def max_pool1d(input, kernel_size, stride=None, padding=0, return_mask=False, data_format='NCL'): raise NotImplementedError - def max_pool2d(input, kernel_size, stride=None, padding=0, return_mask=False, data_format='NCHW'): - raise NotImplementedError + maxpool_obj = MaxPool(kernel_size, stride, padding, return_mask, data_format) + return maxpool_obj(input) def max_pool3d(input, kernel_size, stride=None, padding=0, return_mask=False, data_format="NCDHW"): - raise NotImplementedError + maxpool_obj = MaxPool(kernel_size, stride, padding, return_mask, data_format) + return maxpool_obj(input) class AvgPool1d(object): - def __init__(self, ksize, strides, padding, data_format=None): - self.data_format, self.padding = preprocess_1d_format(data_format=data_format, padding=padding) - self.avg_poo1d = AvgPool([ksize, ], strides, padding, data_format) - - def __call__(self, inputs): - return self.avg_poo1d(inputs) + def __call__(inputs): + raise NotImplementedError class AvgPool(object): @@ -968,48 +937,41 @@ def __call__(self, inputs): if self.data_format == 'channels_last': inputs = nhwc_to_nchw(inputs) if len(inputs.shape) == 2 or len(inputs.shape) == 3: - if self.padding in ['SAME', 'same']: - out = self.avgpool1d_same_padding(inputs) - else: - out = F.avg_pool1d(inputs, self.ksize, self.strides, padding=self.padding) + raise NotImplementedError + if len(inputs.shape) == 4: if self.padding in ['SAME', 'same']: out = self.avgpool2d_same_padding(inputs) else: - out = F.avg_pool2d(inputs, self.ksize, self.strides, padding=self.padding) + out = nn.avg_pool2d(inputs, self.ksize, self.strides, padding=self.padding) if len(inputs.shape) == 5: if self.padding in ['SAME', 'same']: out = self.avgpool3d_same_padding(inputs) else: - out = F.avg_pool3d(inputs, self.ksize, self.strides, 
padding=self.padding) + out = nn.AvgPool2d(inputs, self.ksize, self.strides, padding=self.padding) if self.data_format == 'channels_last': return nchw_to_nhwc(out) else: return out - def avgpool1d_same_padding(self, input): - rows_odd, padding_rows = same_padding(input, self.ksize, self.strides, 1) - if rows_odd: - input = F.pad(input, [0, int(rows_odd)], 'replicate') - return F.avg_pool1d(input, self.ksize, self.strides, padding=(padding_rows // 2)) def avgpool2d_same_padding(self, input): rows_odd, cols_odd, padding_rows, padding_cols = same_padding(input, self.ksize, self.strides, (1, 1)) if rows_odd or cols_odd: # TODO The fill value for maxpool is -INF. - input = F.pad(input, [0, int(rows_odd), 0, int(cols_odd)], mode='replicate') + input = nn.pad(input, [0, int(rows_odd), 0, int(cols_odd)], mode='replicate') - return F.avg_pool2d(input, self.ksize, self.strides, padding=(padding_rows // 2, padding_cols // 2)) + return nn.avg_pool2d(input, self.ksize, self.strides, padding=(padding_rows // 2, padding_cols // 2)) def avgpool3d_same_padding(self, input): rows_odd, cols_odd, depth_odd, padding_rows, padding_cols, padding_depth = same_padding( input, self.ksize, self.strides, (1, 1, 1) ) if rows_odd or cols_odd or depth_odd: - input = F.pad(input, [0, int(cols_odd), 0, int(rows_odd), 0, int(depth_odd)], mode='replicate') - return F.avg_pool3d( - input, self.ksize, self.strides, padding=(padding_rows // 2, padding_cols // 2, padding_depth // 2) + input = nn.pad(input, [0, int(cols_odd), 0, int(rows_odd), 0, int(depth_odd)], mode='replicate') + return nn.AvgPool3d( + input, self.ksize, self.strides, padding=(padding_rows // 2, padding_cols // 2, padding_depth // 2) ) @@ -1042,13 +1004,24 @@ def avg_pool(input, ksize, strides, padding): avg_pool_obj = AvgPool(ksize, strides, padding) return avg_pool_obj(input) +def avg_pool1d(input, kernel_size, stride=None, padding=0, data_format='NCL'): + raise NotImplementedError + +def avg_pool2d(input, kernel_size, stride=None, padding=0, data_format='NCHW'): + data_format, padding = preprocess_2d_format(data_format, padding) + avg_pool_obj = AvgPool(kernel_size, stride, padding, data_format) + return avg_pool_obj(input) + +def avg_pool3d(input, kernel_size, stride=None, padding=0, data_format='NCDHW'): + data_format, padding = preprocess_3d_format(data_format, padding) + avg_pool_obj = AvgPool(kernel_size, stride, padding, data_format) + return avg_pool_obj(input) class MaxPool3d(object): def __init__(self, ksize, strides, padding, return_mask, data_format=None): self.data_format, self.padding = preprocess_3d_format(data_format, padding) - self.return_mask = return_mask - self.max_pool3d = MaxPool(ksize, strides, padding, data_format) + self.max_pool3d = MaxPool(ksize, strides, padding, return_mask, data_format) def __call__(self, inputs): return self.max_pool3d(inputs) @@ -1097,19 +1070,33 @@ def __call__(self, inputs): return self.avg_pool3d_obj(inputs) -def avg_pool1d(input, kernel_size, stride=None, padding=0, data_format='NCL'): - - raise NotImplementedError - - -def avg_pool2d(input, kernel_size, stride=None, padding=0, data_format='NCHW'): - - raise NotImplementedError - - -def avg_pool3d(input, kernel_size, stride=None, padding=0, data_format='NCDHW'): - - raise NotImplementedError +# def avg_pool3d(input, ksize, strides, padding, data_format=None): +# """ +# Performs the average pooling on the input. 
+# +# Parameters +# ---------- +# input : tensor +# A 5-D Tensor of shape [batch, height, width, channels] and type float32, float64, qint8, quint8, or qint32. +# ksize : int or list of ints +# An int or list of ints that has length 1, 3 or 5. The size of the window for each dimension of the input tensor. +# strides : int or list of ints +# An int or list of ints that has length 1, 3 or 5. +# The stride of the sliding window for each dimension of the input tensor. +# padding : string +# 'VALID' or 'SAME'. The padding algorithm. See the "returns" section of tf.ops.convolution for details. +# data_format : string +# 'NDHWC' and 'NCDHW' are supported. +# name : string +# Optional name for the operation. +# +# Returns +# ------- +# A Tensor with the same type as value. The average pooled output tensor. +# """ +# +# avg_pool_obj = AvgPool(ksize, strides, padding, data_format) +# return avg_pool_obj(input) def pool(input, window_shape, pooling_type, strides=None, padding='VALID', data_format=None, dilations=None, name=None): @@ -1168,8 +1155,7 @@ def __init__(self, strides, padding, data_format=None, dilations=None, ksize=Non self.dilations = (1, 1, dilations[0], dilations[1]) self.depthwise = Conv2D(padding=self.padding, strides=self.strides, data_format=self.data_format, dilations=self.dilations, groups=in_channels) - self.pointwise = Conv2D(strides=(1, 1, 1, 1), padding=self.padding, - data_format=self.data_format, dilations=self.dilations, k_size=1) + self.pointwise = Conv2D(strides=(1, 1, 1, 1), padding=self.padding, data_format=self.data_format, dilations=self.dilations, k_size=1) def __call__(self, input, filter, point_filter=None): depthwise_conv = self.depthwise(input, filter) @@ -1211,9 +1197,9 @@ def depthwise_conv2d(input, filter, strides, padding, data_format=None, dilation def same_padding_deconvolution(input, weight, strides, dilations): - # H(out) = floor((H(in) - 1)*stride[0] - 2* padding[0] + dilation[0] * (ksize[0]-1) + 1) + #H(out) = floor((H(in) - 1)*stride[0] - 2* padding[0] + dilation[0] * (ksize[0]-1) + 1) - if isinstance(weight, flow.Tensor): + if isinstance(weight, jt.array): if len(input.shape) == 3: filter_rows = weight.size(2) if len(input.shape) == 4: @@ -1237,7 +1223,7 @@ def same_padding_deconvolution(input, weight, strides, dilations): if len(input.shape) == 3: input_rows = input.size(2) out_rows = input_rows * strides - strides + 1 - padding_rows = max(0, (input_rows - 1) * strides + (filter_rows - 1) * dilations + 1 - out_rows) + padding_rows = max(0, (input_rows-1) * strides + (filter_rows - 1) * dilations + 1 - out_rows) rows_odd = (padding_rows % 2 != 0) return rows_odd, padding_rows @@ -1246,7 +1232,8 @@ def same_padding_deconvolution(input, weight, strides, dilations): input_cols = input.size(3) out_rows = input_rows * strides[0] - strides[0] + 1 - out_cols = input_rows * strides[1] - strides[1] + 1 + out_cols = input_cols * strides[1] - strides[1] + 1 + padding_rows = max(0, (input_rows - 1) * strides[0] + (filter_rows - 1) * dilations[0] + 1 - out_rows) padding_cols = max(0, (input_cols - 1) * strides[1] + (filter_cols - 1) * dilations[1] + 1 - out_cols) @@ -1261,8 +1248,8 @@ def same_padding_deconvolution(input, weight, strides, dilations): input_depth = input.size(4) out_rows = input_rows * strides[0] - strides[0] + 1 - out_cols = input_rows * strides[1] - strides[1] + 1 - out_depth = input_rows * strides[2] - strides[2] + 1 + out_cols = input_cols * strides[1] - strides[1] + 1 + out_depth = input_depth * strides[2] - strides[2] + 1 padding_rows = 
max(0, (input_rows - 1) * strides[0] + (filter_rows - 1) * dilations[0] + 1 - out_rows) padding_cols = max(0, (input_cols - 1) * strides[1] + (filter_cols - 1) * dilations[1] + 1 - out_cols) @@ -1276,84 +1263,97 @@ def same_padding_deconvolution(input, weight, strides, dilations): class Conv1d_transpose(object): - def __init__( - self, stride, padding, data_format='NWC', dilations=None, out_channel=None, k_size=None, in_channels=None - ): - self.stride = stride - self.dilations = dilations - self.data_format, self.padding = preprocess_1d_format(data_format, padding) + # def __init__( + # self, stride, padding, data_format='NWC', dilations=None, out_channel=None, k_size=None, in_channels=None + # ): + # self.stride = stride + # self.dilations = dilations + # self.data_format, self.padding = preprocess_1d_format(data_format, padding) def __call__(self, input, filters): - if self.data_format == 'NLC': - input = nhwc_to_nchw(input) - if self.padding == 'same': - out = self.conv1d_transpose_same_padding(input, filters) - else: - out = F.conv_transpose1d( - input, - weight=filters, - padding=(0 if isinstance(self.padding, str) else self.padding), - stride=self.stride, - dilation=self.dilations - ) - if self.data_format == 'NLC': - out = nchw_to_nhwc(out) - return out + raise NotImplementedError +# if self.data_format == 'NLC': +# input = nhwc_to_nchw(input) +# if self.padding == 'same': +# out = self.conv1d_transpose_same_padding(input, filters) +# else: +# out = F.conv_transpose1d( +# input, +# weight=filters, +# padding=(0 if isinstance(self.padding, str) else self.padding), +# stride=self.stride, +# dilation=self.dilations +# ) +# if self.data_format == 'NLC': +# out = nchw_to_nhwc(out) +# return out + +# def conv1d_transpose_same_padding(self, input, filters): +# rows_odd, padding_rows = same_padding_deconvolution(input, filters, self.stride, 1) +# if rows_odd: +# input = F.pad(input, [0, int(rows_odd)]) +# out_padding = 0 +# else: +# out_padding = 1 +# return F.conv_transpose1d(input, weight=filters, padding=(padding_rows // 2), stride=self.stride, +# dilation=self.dilations, output_padding=out_padding) + + + +# def conv1d_transpose( +# input, filters, output_shape, strides, padding='SAME', data_format='NWC', dilations=None, name=None +# ): +# """ +# The transpose of conv1d. - def conv1d_transpose_same_padding(self, input, filters): - rows_odd, padding_rows = same_padding_deconvolution(input, filters, self.stride, 1) - if rows_odd: - input = F.pad(input, [0, int(rows_odd)]) - out_padding = 0 - else: - out_padding = 1 - return F.conv_transpose1d(input, weight=filters, padding=(padding_rows // 2), stride=self.stride, - dilation=self.dilations, output_padding=out_padding) +# Parameters +# ---------- +# input : tensor +# A 3-D Tensor of type float and shape [batch, in_width, in_channels] +# for NWC data format or [batch, in_channels, in_width] for NCW data format. +# filters : tensor +# A 3-D Tensor with the same type as value and shape [filter_width, output_channels, in_channels]. +# filter's in_channels dimension must match that of value. +# output_shape : tensor +# A 1-D Tensor, containing three elements, representing the output shape of the deconvolution op. +# strides : list +# An int or list of ints that has length 1 or 3. The number of entries by which the filter is moved right at each step. +# padding : string +# 'VALID' or 'SAME'. The padding algorithm. See the "returns" section of tf.ops.convolution for details. +# data_format : string +# 'NWC' and 'NCW' are supported. 
+# dilations : list +# An int or list of ints that has length 1 or 3 which defaults to 1. +# The dilation factor for each dimension of input. If set to k > 1, +# there will be k-1 skipped cells between each filter element on that dimension. +# Dilations in the batch and depth dimensions must be 1. +# name : string +# Optional name for the returned tensor. +# Returns +# ------- +# A Tensor with the same type as value. +# """ -def conv1d_transpose( - input, filters, output_shape, strides, padding='SAME', data_format='NWC', dilations=None, name=None -): - """ - The transpose of conv1d. +# conv1d_transpose_obj = Conv1d_transpose(strides, padding, data_format, dilations) +# return conv1d_transpose_obj(input, filters) - Parameters - ---------- - input : tensor - A 3-D Tensor of type float and shape [batch, in_width, in_channels] - for NWC data format or [batch, in_channels, in_width] for NCW data format. - filters : tensor - A 3-D Tensor with the same type as value and shape [filter_width, output_channels, in_channels]. - filter's in_channels dimension must match that of value. - output_shape : tensor - A 1-D Tensor, containing three elements, representing the output shape of the deconvolution op. - strides : list - An int or list of ints that has length 1 or 3. The number of entries by which the filter is moved right at each step. - padding : string - 'VALID' or 'SAME'. The padding algorithm. See the "returns" section of tf.ops.convolution for details. - data_format : string - 'NWC' and 'NCW' are supported. - dilations : list - An int or list of ints that has length 1 or 3 which defaults to 1. - The dilation factor for each dimension of input. If set to k > 1, - there will be k-1 skipped cells between each filter element on that dimension. - Dilations in the batch and depth dimensions must be 1. - name : string - Optional name for the returned tensor. +def _ntuple(n, name="parse"): + def parse(x): + if isinstance(x, collections.abc.Iterable): + return tuple(x) + return tuple(repeat(x, n)) - Returns - ------- - A Tensor with the same type as value. 
- """ + parse.__name__ = name + return parse - conv1d_transpose_obj = Conv1d_transpose(strides, padding, data_format, dilations) - return conv1d_transpose_obj(input, filters) +_single = _ntuple(1, "_single") class Conv2d_transpose(object): def __init__( - self, strides, padding, data_format='NHWC', dilations=None, name=None, out_channel=None, k_size=None, + self, strides, padding, data_format='NHWC', dilations=None, name=None, out_channels=None, k_size=None, in_channels=None, groups = 1, output_padding = 0, ): self.strides = strides @@ -1363,39 +1363,83 @@ def __init__( self.groups = groups self.output_padding = output_padding + def _output_padding(self, input, output_size, + stride, padding, kernel_size, + num_spatial_dims, dilation = None): + if output_size is None: + ret = _single(self.output_padding) # converting to list if was not already + else: + has_batch_dim = input.dim() == num_spatial_dims + 2 + num_non_spatial_dims = 2 if has_batch_dim else 1 + if len(output_size) == num_non_spatial_dims + num_spatial_dims: + output_size = output_size[num_non_spatial_dims:] + if len(output_size) != num_spatial_dims: + raise ValueError( + "ConvTranspose{}D: for {}D input, output_size must have {} or {} elements (got {})" + .format(num_spatial_dims, input.dim(), num_spatial_dims, + num_non_spatial_dims + num_spatial_dims, len(output_size))) + + min_sizes = [] + max_sizes = [] + for d in range(num_spatial_dims): + dim_size = ((input.size(d + num_non_spatial_dims) - 1) * stride[d] - + 2 * padding[d] + + (dilation[d] if dilation is not None else 1) * (kernel_size[d] - 1) + 1) + min_sizes.append(dim_size) + max_sizes.append(min_sizes[d] + stride[d] - 1) + + for i in range(len(output_size)): + size = output_size[i] + min_size = min_sizes[i] + max_size = max_sizes[i] + if size < min_size or size > max_size: + raise ValueError(( + "requested an output size of {}, but valid sizes range " + "from {} to {} (for an input of {})").format( + output_size, min_sizes, max_sizes, input.size()[2:])) + + res = [] + for d in range(num_spatial_dims): + res.append(output_size[d] - min_sizes[d]) + + ret = res + return ret + def __call__(self, input, filters, output_size): if self.data_format == 'NHWC': input = nhwc_to_nchw(input) if self.padding == 'same': out = self.conv2d_transpore_same(input, filters) else: - out = F.conv_transpose2d( + out_padding = self._output_padding(input, output_size, self.strides, (0 if isinstance(self.padding, str) else self.padding), + filters.shape, + 2, self.dilations) + out = nn.conv_transpose2d( input, weight=filters, padding=(0 if isinstance(self.padding, str) else self.padding), stride=self.strides, - dilation=self.dilations + dilation=self.dilations, + output_padding = out_padding, + groups = self.groups ) if self.data_format == 'NHWC': out = nchw_to_nhwc(out) return out - def conv2d_transpore_same(self, input, filters): - rows_odd, cols_odd, padding_rows, padding_cols = same_padding_deconvolution( - input, filters, self.strides, (1, 1)) + def conv2d_transpore_same(self,input, filters): + rows_odd, cols_odd, padding_rows, padding_cols = same_padding_deconvolution(input, filters, self.strides, (1, 1)) if rows_odd or cols_odd: - input = F.pad(input, [0, int(rows_odd), 0, int(cols_odd)]) + input = nn.pad(input, [0, int(rows_odd), 0, int(cols_odd)]) out_padding = 0 else: out_padding = 1 - out = F.conv_transpose2d(input, weight=filters, padding=(padding_rows // 2, padding_cols // 2), stride=self.strides, - dilation=self.dilations, output_padding=out_padding) + out = 
nn.conv_transpose2d(input, weight=filters, padding=(padding_rows // 2, padding_cols // 2), stride=self.strides, + dilation=self.dilations, output_padding=out_padding, groups=self.groups) return out -def conv2d_transpose( - input, filters, output_shape, strides, padding='SAME', data_format='NHWC', dilations=None, name=None -): +def conv2d_transpose(x, weight, bias=None, stride=1, padding=0, output_padding=0, groups=1, dilation=1, data_format='NCHW', output_size=None): """ The transpose of conv2d. @@ -1427,9 +1471,71 @@ def conv2d_transpose( ------- A Tensor with the same type as input. """ + data_format, padding = preprocess_2d_format(data_format, padding) + if isinstance(padding, str): + raise ValueError("padding should be int or tuple of int.") + def _output_padding(input, output_size, + stride, padding, kernel_size, + num_spatial_dims, dilation = None): + if output_size is None: + ret = _single(output_padding) # converting to list if was not already + else: + has_batch_dim = input.dim() == num_spatial_dims + 2 + num_non_spatial_dims = 2 if has_batch_dim else 1 + if len(output_size) == num_non_spatial_dims + num_spatial_dims: + output_size = output_size[num_non_spatial_dims:] + if len(output_size) != num_spatial_dims: + raise ValueError( + "ConvTranspose{}D: for {}D input, output_size must have {} or {} elements (got {})" + .format(num_spatial_dims, input.dim(), num_spatial_dims, + num_non_spatial_dims + num_spatial_dims, len(output_size))) + + min_sizes = [] + max_sizes = [] + for d in range(num_spatial_dims): + dim_size = ((input.size(d + num_non_spatial_dims) - 1) * stride[d] - + 2 * padding[d] + + (dilation[d] if dilation is not None else 1) * (kernel_size[d] - 1) + 1) + min_sizes.append(dim_size) + max_sizes.append(min_sizes[d] + stride[d] - 1) + + for i in range(len(output_size)): + size = output_size[i] + min_size = min_sizes[i] + max_size = max_sizes[i] + if size < min_size or size > max_size: + raise ValueError(( + "requested an output size of {}, but valid sizes range " + "from {} to {} (for an input of {})").format( + output_size, min_sizes, max_sizes, input.size()[2:])) + + res = [] + for d in range(num_spatial_dims): + res.append(output_size[d] - min_sizes[d]) + + ret = res + return ret - raise NotImplementedError - + if data_format == 'NHWC': + x = nhwc_to_nchw(x) + + out_padding = _output_padding(x, output_size, stride, + padding, + weight.shape[2:], + 2, dilation) + out = nn.conv_transpose2d( + x, + weight=weight, + bias = bias, + padding=padding, + stride=stride, + dilation=dilation, + output_padding=out_padding, + groups=groups + ) + if data_format == 'NHWC': + out = nchw_to_nhwc(out) + return out class Conv3d_transpose(object): @@ -1449,7 +1555,7 @@ def __call__(self, input, filters): if self.padding == 'same': out = self.conv3d_transpore_same(input, filters) else: - out = F.conv_transpose3d( + out = nn.conv_transpose3d( input, weight=filters, padding=(0 if isinstance(self.padding, str) else self.padding), @@ -1460,7 +1566,7 @@ def __call__(self, input, filters): out = nchw_to_nhwc(out) return out - def conv3d_transpore_same(self, input, filters): + def conv3d_transpore_same(self,input, filters): rows_odd, cols_odd, depth_odd, padding_rows, padding_cols, padding_depth = same_padding_deconvolution( input, filters, self.strides, (1, 1, 1)) if rows_odd or cols_odd or depth_odd: @@ -1468,7 +1574,7 @@ def conv3d_transpore_same(self, input, filters): out_padding = 0 else: out_padding = 1 - out = F.conv_transpose3d(input, weight=filters, padding=(padding_rows // 2, 
padding_cols // 2, padding_depth // 2), + out = nn.conv_transpose3d(input, weight=filters, padding=(padding_rows // 2, padding_cols // 2, padding_depth // 2), stride=self.strides, dilation=self.dilations, output_padding=out_padding) return out @@ -1524,7 +1630,7 @@ def _bias_add(x, b, data_format): """Alternative implementation of tf.nn.bias_add which is compatiable with tensorRT.""" raise NotImplementedError - +# Batch norms exists for jittor but not added here def batch_normalization(x, mean, variance, offset, scale, variance_epsilon, data_format, name=None): """Data Format aware version of tf.nn.batch_normalization.""" raise NotImplementedError @@ -1590,7 +1696,7 @@ def __init__( self, decay=0.9, epsilon=0.00001, beta=None, gamma=None, moving_mean=None, moving_var=None, num_features=None, data_format='channels_last', is_train=False ): - self.decay = 1 - decay + self.decay = 1-decay self.epsilon = epsilon self.data_format = data_format self.beta = beta @@ -1604,25 +1710,17 @@ def __init__( if self.decay < 0.0 or 1.0 < self.decay: raise ValueError("decay should be between 0 to 1") - self.bn = nn.BatchNorm2d( - num_features=self.num_features, - eps=self.epsilon, - momentum=self.decay, - affine=True, - track_running_stats=True, - ) - def __call__(self, inputs): if self.data_format == 'channels_last': inputs = nhwc_to_nchw(inputs) - out = _C.normalization(inputs, - self.moving_mean, - self.moving_var, - self.gamma, - self.beta, - is_training=self.is_train, - momentum=self.decay) + out = nn.batch_norm(inputs, + running_mean=self.moving_mean, + running_var=self.moving_var, + weight=self.gamma, + bias=self.beta, + training=self.is_train, + momentum=self.decay) if self.data_format == 'channels_last': out = nchw_to_nhwc(out) return out @@ -1639,6 +1737,7 @@ def __call__(self, input, filters): return self.conv2d(input, filters) + class SeparableConv1D(object): def __init__(self, stride, padding, data_format, dilations, out_channel, k_size, in_channel, depth_multiplier): @@ -1646,6 +1745,7 @@ def __init__(self, stride, padding, data_format, dilations, out_channel, k_size, self.depthwise_conv = Conv1D(stride, self.padding, self.data_format, dilations, groups=in_channel) self.pointwise_conv = Conv1D(1, self.padding, self.data_format, 1) + def __call__(self, inputs, depthwise_filters, pointwise_filters): depthwise_conv = self.depthwise_conv(inputs, depthwise_filters) pointwise_conv = self.pointwise_conv(depthwise_conv, pointwise_filters) @@ -1659,6 +1759,7 @@ def __init__(self, strides, padding, data_format, dilations, out_channel, k_size self.depthwise_conv = Conv2D(strides, self.padding, self.data_format, dilations, groups=in_channel) self.pointwise_conv = Conv2D((1, 1), self.padding, self.data_format, (1, 1)) + def __call__(self, input, filter, point_filter=None): depthwise_conv = self.depthwise_conv(input, filter) pointwise_conv = self.pointwise_conv(depthwise_conv, point_filter) @@ -1667,75 +1768,81 @@ def __call__(self, input, filter, point_filter=None): class AdaptiveMeanPool1D(object): - def __init__(self, output_size, data_format): - self.data_format, _ = preprocess_1d_format(data_format, None) - self.op = nn.AdaptiveAvgPool1d(output_size) + # def __init__(self, output_size, data_format): + # self.data_format, _ = preprocess_1d_format(data_format, None) + # self.op = nn.AdaptiveAvgPool1d(output_size=output_size) - def __call__(self, input): - if self.data_format == 'NLC': - input = nhwc_to_nchw(input) - output = self.op(input) - if self.data_format == 'NLC': - output = nchw_to_nhwc(output) - 
return output + def __call__(): + raise NotImplementedError + # if self.data_format == 'NLC': + # input = nhwc_to_nchw(input) + # output = self.op(input) + # if self.data_format == 'NLC': + # output = nchw_to_nhwc(output) + # return output class AdaptiveMeanPool2D(object): - def __init__(self, output_size, data_format): - self.data_format, _ = preprocess_2d_format(data_format, None) - self.op = nn.AdaptiveAvgPool2d(output_size=output_size) + # def __init__(self, output_size, data_format): + # self.data_format, _ = preprocess_2d_format(data_format, None) + # self.op = nn.AdaptiveMeanPool2d(output_size=output_size) - def __call__(self, inputs): - if self.data_format == 'NHWC': - inputs = nhwc_to_nchw(inputs) - output = self.op(inputs) - if self.data_format == 'NHWC': - output = nchw_to_nhwc(output) - return output + def __call__(): + raise NotImplementedError + # if self.data_format == 'NHWC': + # inputs = nhwc_to_nchw(inputs) + # output = self.op(inputs) + # if self.data_format == 'NHWC': + # output = nchw_to_nhwc(output) + # return output class AdaptiveMeanPool3D(object): - def __init__(self, output_size, data_format): - self.data_format, _ = preprocess_3d_format(data_format, None) - self.op = nn.AdaptiveAvgPool3d(output_size=output_size) + # def __init__(self, output_size, data_format): + # self.data_format, _ = preprocess_3d_format(data_format, None) + # self.op = torch.nn.AdaptiveAvgPool3d(output_size=output_size) + + def __call__(): + raise NotImplementedError + # if self.data_format == 'NDHWC': + # inputs = nhwc_to_nchw(inputs) + # output = self.op(inputs) + # if self.data_format == 'NDHWC': + # output = nchw_to_nhwc(output) + # return output - def __call__(self, inputs): - if self.data_format == 'NDHWC': - inputs = nhwc_to_nchw(inputs) - output = self.op(inputs) - if self.data_format == 'NDHWC': - output = nchw_to_nhwc(output) - return output def adaptive_avg_pool1d(input, output_size): - return F.adaptive_avg_pool1d(input, output_size) + raise NotImplementedError def adaptive_avg_pool2d(input, output_size): - return F.adaptive_avg_pool2d(input, output_size) + return nn.AdaptiveAvgPool2d(input, output_size) def adaptive_avg_pool3d(input, output_size): - return F.adaptive_avg_pool3d(input, output_size) + return nn.AdaptiveAvgPool3d(input, output_size) + class AdaptiveMaxPool1D(object): - def __init__(self, output_size, data_format): - self.data_format, _ = preprocess_1d_format(data_format, None) - self.op = nn.AdaptiveMaxPool1d(output_size=output_size) + # def __init__(self, output_size, data_format): + # self.data_format, _ = preprocess_1d_format(data_format, None) + # self.op = torch.nn.AdaptiveMaxPool1d(output_size=output_size) def __call__(self, input): - if self.data_format == 'NLC': - input = nhwc_to_nchw(input) - output = self.op(input) - if self.data_format == 'NLC': - output = nchw_to_nhwc(output) - return output + raise NotImplementedError + # if self.data_format == 'NLC': + # input = nhwc_to_nchw(input) + # output = self.op(input) + # if self.data_format == 'NLC': + # output = nchw_to_nhwc(output) + # return output class AdaptiveMaxPool2D(object): @@ -1758,7 +1865,6 @@ class AdaptiveMaxPool3D(object): def __init__(self, output_size, data_format): self.data_format, _ = preprocess_3d_format(data_format, None) self.op = nn.AdaptiveMaxPool3d(output_size=output_size) - def __call__(self, inputs): if self.data_format == 'NDHWC': inputs = nhwc_to_nchw(inputs) @@ -1768,16 +1874,16 @@ def __call__(self, inputs): return output def adaptive_max_pool1d(input, output_size, 
return_indices = False): - - return F.adaptive_max_pool1d(input, output_size, return_indices) - + raise NotImplementedError + def adaptive_max_pool2d(input, output_size, return_indices = False): - return F.adaptive_max_pool2d(input, output_size, return_indices) + return nn.AdaptiveMaxPool2d(input, output_size, return_indices) def adaptive_max_pool3d(input, output_size, return_indices=False): - return F.adaptive_max_pool3d(input, output_size, return_indices) + return nn.AdaptiveMaxPool3d(input, output_size, return_indices) + class BinaryConv2D(object): @@ -1820,53 +1926,44 @@ def __call__(self, inputs, filters): class rnncell(object): - def __init__(self, weight_ih, weight_hh, bias_ih, bias_hh, act): - self.weight_ih = weight_ih - self.weight_hh = weight_hh - self.bias_ih = bias_ih - self.bias_hh = bias_hh - self.act = act + def __init__(self, input_size , hidden_size , bias = True, nonlinearity='tanh'): + self.input_size = input_size + self.hidden_size= hidden_size + self.bias = bias + self.act = nonlinearity def __call__(self, input, h): if self.act == 'tanh': - h = _C.rnn_tanh_cell( + h = nn.RNNCell( input, h, - self.weight_ih, - self.weight_hh, - self.bias_ih, - self.bias_hh, + bias=self.bias + nonlinearity='tanh' ) else: - h = _C.rnn_relu_cell( + h = nn.RNNCell( input, h, - self.weight_ih, - self.weight_hh, - self.bias_ih, - self.bias_hh, + bias=self.bias + nonlinearity='relu' ) return h, h class lstmcell(object): - def __init__(self, weight_ih, weight_hh, bias_ih, bias_hh, act=None): - self.weight_ih = weight_ih - self.weight_hh = weight_hh - self.bias_ih = bias_ih - self.bias_hh = bias_hh + def __init__(self, input_size , hidden_size , bias = True, nonlinearity='tanh'): + self.input_size = input_size + self.hidden_size= hidden_size + self.bias = bias def __call__(self, input, h, c): h = (h, c) - h, c = _C.lstm_cell( - input, - h, - self.weight_ih, - self.weight_hh, - self.bias_ih, - self.bias_hh, - ) + h, c = nn.LSTMCell( + input, + h, + bias=self.bias + ) return h, h, c @@ -1879,7 +1976,7 @@ def __init__(self, weight_ih, weight_hh, bias_ih, bias_hh, act=None): self.bias_hh = bias_hh def __call__(self, input, h): - h = _C.gru_cell( + h = _VF.gru_cell( input, h, self.weight_ih, @@ -1929,15 +2026,17 @@ def __init__( self.bidirectional = bidirectional self.num_directions = 2 if bidirectional else 1 self.rnn_impls = { - 'RNN_TANH': _C.rnn_tanh, - 'RNN_RELU': _C.rnn_relu, - 'GRU': _C.gru, + 'RNN_TANH': _VF.rnn_tanh, + 'RNN_RELU': _VF.rnn_relu, + 'GRU': _VF.gru, } self.w_ih = w_ih self.w_hh = w_hh self.b_ih = b_ih self.b_hh = b_hh + # stdv = 1.0 / np.sqrt(self.hidden_size) + # _init = tf.random_uniform_initializer(minval=-stdv, maxval=stdv) self.proj_size = 0 self.act_fn = None self._flat_weights_names = [] @@ -1966,19 +2065,34 @@ def __init__( ] self.flatten_parameters() - def flatten_parameters(self): if len(self._flat_weights) != len(self._flat_weights_names): return for w in self._flat_weights: - if not isinstance(w, flow.Tensor): + if not isinstance(w, torch.Tensor): return first_fw = self._flat_weights[0] dtype = first_fw.dtype for fw in self._flat_weights: - if (not isinstance(fw.data, flow.Tensor) or not (fw.data.dtype == dtype) or not fw.data.is_cuda): + if (not isinstance(fw.data, torch.Tensor) or not (fw.data.dtype == dtype) or not fw.data.is_cuda or + not torch.backends.cudnn.is_acceptable(fw.data)): return + unique_data_ptrs = set(p.data_ptr() for p in self._flat_weights) + if len(unique_data_ptrs) != len(self._flat_weights): + return + + with torch.cuda.device_of(first_fw): + 
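            # Packs the per-layer weights into cuDNN's flat weight buffer so the fused cuDNN
            # RNN kernels can be used. The guards above skip the packing when any weight is not
            # a CUDA tensor of a single dtype, is rejected by cudnn.is_acceptable, or aliases
            # another weight's storage. At this stage of the port the block still goes through
            # the torch cuDNN bindings; the jittor_nn.py version later in this series rebuilds
            # the packing on top of jt.cudnn (_cudnn_flatten_weights).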
import torch.backends.cudnn.rnn as rnn + with torch.no_grad(): + if torch._use_cudnn_rnn_flatten_weight(): + num_weights = 4 if self.bias else 2 + if self.proj_size > 0: + num_weights += 1 + torch._cudnn_rnn_flatten_weight( + self._flat_weights, num_weights, self.input_size, rnn.get_cudnn_mode(self.mode), + self.hidden_size, self.proj_size, self.num_layers, self.batch_first, bool(self.bidirectional) + ) def _apply(self, fn): ret = super(rnnbase, self)._apply(fn) @@ -2009,11 +2123,11 @@ def forward(self, input, states): self.check_input(input_shape) if self.mode == 'LSTM': if states is None: - h = flow.zeros( + h = torch.zeros( self.num_layers * self.num_directions, batch_size, self.hidden_size, dtype=input.dtype, device=input.device ) - c = flow.zeros( + c = torch.zeros( self.num_layers * self.num_directions, batch_size, self.hidden_size, dtype=input.dtype, device=input.device ) @@ -2022,14 +2136,14 @@ def forward(self, input, states): h, c = states self.check_hidden(h, batch_size) self.check_hidden(c, batch_size) - result = _C.lstm( + result = _VF.lstm( input, states, self._flat_weights, self.bias, self.num_layers, self.dropout, self.training, self.bidirectional, self.batch_first ) return result[0], result[1:] else: if states is None: - h = flow.zeros( + h = torch.zeros( self.num_layers * self.num_directions, batch_size, self.hidden_size, dtype=input.dtype, device=input.device ) @@ -2043,6 +2157,7 @@ def forward(self, input, states): ) return result[0], result[1] + class layernorm(object): def __init__(self, normalized_shape, gamma, beta, eps, input_shape): @@ -2060,7 +2175,9 @@ def __init__(self, normalized_shape, gamma, beta, eps, input_shape): def __call__(self, input): return F.layer_norm(input, self.normalized_shape, self.gamma, self.beta, self.eps) + class multiheadattention(Module): + def __init__( self, embed_dim, @@ -2098,7 +2215,7 @@ def __init__( self.register_parameter('in_proj_weight', None) if q_bias is not None: - self.in_proj_bias = flow.cat((self.q_bias, self.k_bias, self.v_bias)) + self.in_proj_bias = torch.cat((self.q_bias, self.k_bias, self.v_bias)) else: self.register_parameter('in_proj_bias', None) @@ -2122,6 +2239,7 @@ def forward(self, q, k, v, attn_mask, key_padding_mask): else: return attn_output, attn_output_weights + class BinaryDense(object): def __init__(self, weights, bias): @@ -2243,30 +2361,38 @@ def __init__(self, data_format): self.data_format = data_format def __call__(self, input, weight): - # weight = weight.to(input.device) - return F.prelu(input, weight) + if self.data_format == 'channels_last' : + input = nhwc_to_nchw(input) + output = torch.prelu(input, weight) + if self.data_format == 'channels_last': + output = nchw_to_nhwc(output) + return output def prelu(input, weight, data_format): - - return F.prelu(input, weight) + if data_format == 'channels_last': + input = nhwc_to_nchw(input) + output = torch.prelu(input, weight) + if data_format == 'channels_last': + output = nchw_to_nhwc(output) + return output def hardsigmoid(input): - return F.hardsigmoid(input) + return torch.nn.functional.hardsigmoid(input) def hardswish(input): - return F.hardswish(input) + return torch.nn.functional.hardswish(input) def swish(input): - return F.swish(input) + return torch.sigmoid(input) * input def linear(input, weight, bias = None): - return F.linear(input, weight, bias) + return torch.nn.functional.linear(input, weight, bias) def unfold(input, kernel_size, dilation = 1, padding = 0, stride = 1): - return F.unfold(input, kernel_size, stride=stride, 
padding=padding, dilation=dilation) \ No newline at end of file + From 157ec0bdc650bfe2dbac202a794fc9d25ce7e9a0 Mon Sep 17 00:00:00 2001 From: Hisham Date: Sat, 9 Mar 2024 03:19:57 +0800 Subject: [PATCH 05/27] Finished wirting the code for Jittor_nn.py and changed the name to jittor_nn.py --- .../ops/{Jittor_nn.py => jittor_nn.py} | 608 +++++++++++------- 1 file changed, 378 insertions(+), 230 deletions(-) rename tensorlayerx/backend/ops/{Jittor_nn.py => jittor_nn.py} (83%) diff --git a/tensorlayerx/backend/ops/Jittor_nn.py b/tensorlayerx/backend/ops/jittor_nn.py similarity index 83% rename from tensorlayerx/backend/ops/Jittor_nn.py rename to tensorlayerx/backend/ops/jittor_nn.py index 3b4b6d5..488e80d 100644 --- a/tensorlayerx/backend/ops/Jittor_nn.py +++ b/tensorlayerx/backend/ops/jittor_nn.py @@ -9,6 +9,9 @@ import jittor.nn as nn import collections from itertools import repeat +from jittor import Module, init , flatten +import math +from abc import abstractmethod def padding_format(padding): """ @@ -629,7 +632,7 @@ def __call__(self, input, filters): def conv2d_same_padding(self, input, weight, bias=None): rows_odd, cols_odd, padding_rows, padding_cols = same_padding(input, weight, self.strides, self.dilations) if rows_odd or cols_odd: - input = F.pad(input, [0, int(cols_odd), 0, int(rows_odd)]) + input = nn.pad(input, [0, int(cols_odd), 0, int(rows_odd)]) return nn.conv2d( input, weight, bias, self.strides, padding=(padding_rows // 2, padding_cols // 2), dilation=self.dilations, @@ -694,7 +697,7 @@ def __call__(self, input, filters): if self.padding == 'same': out = self.conv3d_same_padding(input, weight=filters) else: - out = F.conv3d(input, weight=filters, stride=self._strides, padding=self.padding, dilation=self._dilations) + out = nn.conv3d(input, weight=filters, stride=self._strides, padding=self.padding, dilation=self._dilations) if self.data_format == 'NDHWC': out = nchw_to_nhwc(out) @@ -705,7 +708,7 @@ def conv3d_same_padding(self, input, weight, bias=None, groups=1): rows_odd, cols_odd, depth_odd, padding_rows, padding_cols, padding_depth = same_padding(input, weight, self._strides, self._dilations) if rows_odd or cols_odd or depth_odd: - input = F.pad(input, [0, int(cols_odd), 0, int(rows_odd), 0, int(depth_odd)]) + input = nn.pad(input, [0, int(cols_odd), 0, int(rows_odd), 0, int(depth_odd)]) return nn.conv3d( input, weight, bias, self._strides, padding=(padding_rows // 2, padding_cols // 2, padding_depth//2), @@ -1570,7 +1573,7 @@ def conv3d_transpore_same(self,input, filters): rows_odd, cols_odd, depth_odd, padding_rows, padding_cols, padding_depth = same_padding_deconvolution( input, filters, self.strides, (1, 1, 1)) if rows_odd or cols_odd or depth_odd: - input = F.pad(input, [0, int(rows_odd), 0, int(cols_odd), 0, int(depth_odd)]) + input = nn.pad(input, [0, int(rows_odd), 0, int(cols_odd), 0, int(depth_odd)]) out_padding = 0 else: out_padding = 1 @@ -1617,17 +1620,17 @@ def conv3d_transpose( def _to_channel_first_bias(b): - """Reshape [c] to [c, 1, 1].""" + raise NotImplementedError def _bias_scale(x, b, data_format): - """The multiplication counter part of tf.nn.bias_add.""" + raise NotImplementedError def _bias_add(x, b, data_format): - """Alternative implementation of tf.nn.bias_add which is compatiable with tensorRT.""" + raise NotImplementedError # Batch norms exists for jittor but not added here @@ -1937,14 +1940,14 @@ def __call__(self, input, h): h = nn.RNNCell( input, h, - bias=self.bias + bias=self.bias, nonlinearity='tanh' ) else: h = nn.RNNCell( input, h, 
- bias=self.bias + bias=self.bias, nonlinearity='relu' ) return h, h @@ -1969,20 +1972,16 @@ def __call__(self, input, h, c): class grucell(object): - def __init__(self, weight_ih, weight_hh, bias_ih, bias_hh, act=None): - self.weight_ih = weight_ih - self.weight_hh = weight_hh - self.bias_ih = bias_ih - self.bias_hh = bias_hh + def __init__(self, input_size , hidden_size , bias = True, nonlinearity='tanh'): + self.input_size = input_size + self.hidden_size= hidden_size + self.bias = bias def __call__(self, input, h): - h = _VF.gru_cell( - input, - h, - self.weight_ih, - self.weight_hh, - self.bias_ih, - self.bias_hh, + h = nn.GRUCell( + input, + h, + bias=self.bias ) return h, h @@ -1991,171 +1990,211 @@ class rnnbase(Module): def __init__( self, - mode, - input_size, - hidden_size, - num_layers, - bias, - batch_first, - dropout, - bidirectional, - is_train, - w_ih, - w_hh, - b_ih, - b_hh, + mode:str, + input_size:int, + hidden_size:int, + num_layers:int= 1, + bias:bool=True, + batch_first:bool=False , + dropout: float= 0, + bidirectional:bool=False, + proj_size : int = 0 , + nonlinearity: str = None ): super(rnnbase, self).__init__() - self.mode = mode - self.input_size = input_size - self.hidden_size = hidden_size - self.num_layers = num_layers - self.bias = bias - self.batch_first = batch_first - self.dropout = float(dropout) - self.train = is_train - if not 0 <= dropout < 1: - raise ValueError("dropout should be a number in range [0, 1).") - if dropout > 0 and num_layers == 1: - raise ValueError( - "dropout option adds dropout after all but last " - "recurrent layer, so non-zero dropout expects " - "num_layers greater than 1, but got dropout={} and " - "num_layers={}".format(dropout, num_layers) - ) - self.bidirectional = bidirectional - self.num_directions = 2 if bidirectional else 1 - self.rnn_impls = { - 'RNN_TANH': _VF.rnn_tanh, - 'RNN_RELU': _VF.rnn_relu, - 'GRU': _VF.gru, - } - self.w_ih = w_ih - self.w_hh = w_hh - self.b_ih = b_ih - self.b_hh = b_hh - - # stdv = 1.0 / np.sqrt(self.hidden_size) - # _init = tf.random_uniform_initializer(minval=-stdv, maxval=stdv) - self.proj_size = 0 - self.act_fn = None - self._flat_weights_names = [] - self._all_weights = [] - cur = 0 + self.mode = mode + self.input_size = input_size + self.hidden_size = hidden_size + self.num_layers = num_layers + self.bias = bias + self.batch_first = batch_first + self.dropout = dropout + self.bidirectional = bidirectional + self.proj_size = proj_size + self.nonlinearity = nonlinearity + + if mode == 'LSTM': + gate_size = 4 * hidden_size + elif mode == 'GRU': + gate_size = 3 * hidden_size + elif mode == 'RNN': + gate_size = hidden_size + else: + raise ValueError("Unrecognized RNN mode: " + mode) + + num_directions = 1 + bidirectional + k = math.sqrt(1 / hidden_size) + + def build_unit(name, in_channels, out_channels=None): + if out_channels is not None: + shape = (in_channels, out_channels) + else: + shape = (in_channels,) + setattr(self, name, init.uniform(shape, 'float32', -k, k)) + if self.bidirectional: + setattr(self, name + '_reverse', init.uniform(shape, 'float32', -k, k)) + for layer in range(num_layers): - for direction in range(self.num_directions): - if bias: - layer_params = (w_ih[cur], w_hh[cur], b_ih[cur], b_hh[cur]) + if layer == 0: + build_unit(f'weight_ih_l{layer}', gate_size, input_size) + else: + if proj_size > 0: + build_unit(f'weight_ih_l{layer}', gate_size, num_directions * proj_size) else: - layer_params = (w_ih[cur], w_hh[cur]) - - suffix = '_reverse' if direction == 1 else '' - 
param_names = ['weight_ih_l{}{}', 'weight_hh_l{}{}'] - if bias: - param_names += ['bias_ih_l{}{}', 'bias_hh_l{}{}'] - param_names = [x.format(layer, suffix) for x in param_names] - - for name, param in zip(param_names, layer_params): - setattr(self, name, param) - self._flat_weights_names.extend(param_names) - self._all_weights.append(param_names) - cur += 1 - self._flat_weights = [ - (lambda wn: getattr(self, wn) if hasattr(self, wn) else None)(wn) for wn in self._flat_weights_names - ] - self.flatten_parameters() - - def flatten_parameters(self): - if len(self._flat_weights) != len(self._flat_weights_names): - return - - for w in self._flat_weights: - if not isinstance(w, torch.Tensor): - return - first_fw = self._flat_weights[0] - dtype = first_fw.dtype - for fw in self._flat_weights: - if (not isinstance(fw.data, torch.Tensor) or not (fw.data.dtype == dtype) or not fw.data.is_cuda or - not torch.backends.cudnn.is_acceptable(fw.data)): - return - unique_data_ptrs = set(p.data_ptr() for p in self._flat_weights) - if len(unique_data_ptrs) != len(self._flat_weights): - return - - with torch.cuda.device_of(first_fw): - import torch.backends.cudnn.rnn as rnn - with torch.no_grad(): - if torch._use_cudnn_rnn_flatten_weight(): - num_weights = 4 if self.bias else 2 - if self.proj_size > 0: - num_weights += 1 - torch._cudnn_rnn_flatten_weight( - self._flat_weights, num_weights, self.input_size, rnn.get_cudnn_mode(self.mode), - self.hidden_size, self.proj_size, self.num_layers, self.batch_first, bool(self.bidirectional) - ) - - def _apply(self, fn): - ret = super(rnnbase, self)._apply(fn) - self._flat_weights = [ - (lambda wn: getattr(self, wn) if hasattr(self, wn) else None)(wn) for wn in self._flat_weights_names - ] - self.flatten_parameters() - return ret + build_unit(f'weight_ih_l{layer}', gate_size, num_directions * hidden_size) - def check_input(self, input_shape): - if len(input_shape) != 3: - raise ValueError("input must have 3 dimensions. 
But got {}.".format(len(input_shape))) - if self.input_size != input_shape[-1]: - raise ValueError( - "The last dimension of input should be equal to input_size {}.But got {}".format( - self.input_size, input_shape[-1] + if proj_size > 0: + build_unit(f'weight_hh_l{layer}', gate_size, proj_size) + build_unit(f'weight_hr_l{layer}', proj_size, hidden_size) + else: + build_unit(f'weight_hh_l{layer}', gate_size, hidden_size) + + if bias: + build_unit(f'bias_ih_l{layer}', gate_size) + build_unit(f'bias_hh_l{layer}', gate_size) + + def _cudnn_flatten_weights(self, cudnn_mode): + def copy_to_flatten_weight(param_name, offset_idx, num_gates): + def copy_to(param_name, offset_idx, idx): + cur_offset = self._cudnn_weight_offset[offset_idx] + param = getattr(self, param_name) + param = param[self.hidden_size * idx: self.hidden_size * (idx + 1)] + ft_weight[cur_offset:cur_offset + param.numel()] = param.flatten() + + if self.bias: + for idx in range(num_gates): + copy_to('weight' + param_name, offset_idx + idx * 2, idx) + copy_to('bias' + param_name, offset_idx + idx * 2 + 1, idx) + return num_gates * 2 + else: + for idx in range(num_gates): + copy_to('weight' + param_name, offset_idx + idx, idx) + return num_gates + + if jt.flags.use_cuda and jt.cudnn and jt.compiler.is_cuda: + if getattr(self, '_cudnn_weight_size', None) is None: + offset_array = jt.cudnn.cudnn_rnn_weight_offset( + cudnn_mode, + self.input_size, + self.hidden_size, + self.num_layers, + self.proj_size, + self.bias, + self.bidirectional ) - ) + self._cudnn_weight_size = offset_array[0] + self._cudnn_weight_offset = offset_array[1:] + + num_gates = { + "RNN": 1, "LSTM": 4, "GRU": 3 + }[self.mode] + ft_weight = jt.zeros(self._cudnn_weight_size, dtype=jt.float32) + + cnt = 0 + for layer in range(self.num_layers): + suffix = '' + cnt += copy_to_flatten_weight(f'_ih_l{layer}' + suffix, cnt, num_gates) + cnt += copy_to_flatten_weight(f'_hh_l{layer}' + suffix, cnt, num_gates) + if self.bidirectional: + suffix = '_reverse' + cnt += copy_to_flatten_weight(f'_ih_l{layer}' + suffix, cnt, num_gates) + cnt += copy_to_flatten_weight(f'_hh_l{layer}' + suffix, cnt, num_gates) + return ft_weight + else: + raise RuntimeError("Not Cudnn found") + + @abstractmethod + def call_rnn_cell(self, input, hidden, suffix): + pass - def check_hidden(self, h, batch_size): - expected_hidden_size = (self.num_layers * self.num_directions, batch_size, self.hidden_size) - if h.shape != expected_hidden_size: - raise ValueError('Expected hidden size {}, got {}.'.format(expected_hidden_size, h.shape)) + def call_rnn_sequence(self, input, hidden, suffix): + if 'reverse' in suffix: + input = input[::-1] + + output = [] + for s in range(input.shape[0]): + out, hidden = self.call_rnn_cell(input[s], hidden, suffix) + output.append(out) + + if 'reverse' in suffix: + output = output[::-1] + output = jt.stack(output, dim=0) + + return output, hidden + + def _execute_cudnn_rnn(self, input, hx): + cudnn_mode = { + ('RNN', 'tanh'): 'tanh', + ('RNN', 'relu'): 'relu', + ('LSTM', None): 'lstm', + ('GRU', None): 'gru' + }[(self.mode, self.nonlinearity)] + ft_weight = self._cudnn_flatten_weights(cudnn_mode) - def forward(self, input, states): - batch_size = input.shape[0] if self.batch_first else input.shape[1] - input_shape = input.shape - self.check_input(input_shape) if self.mode == 'LSTM': - if states is None: - h = torch.zeros( - self.num_layers * self.num_directions, batch_size, self.hidden_size, dtype=input.dtype, - device=input.device - ) - c = torch.zeros( - self.num_layers * 
self.num_directions, batch_size, self.hidden_size, dtype=input.dtype, - device=input.device - ) - states = (h, c) - else: - h, c = states - self.check_hidden(h, batch_size) - self.check_hidden(c, batch_size) - result = _VF.lstm( - input, states, self._flat_weights, self.bias, self.num_layers, self.dropout, self.training, - self.bidirectional, self.batch_first + ret = jt.cudnn.ops.cudnn_rnn(input, hx[0], hx[1], ft_weight, + cudnn_mode, self.input_size, self.hidden_size, self.num_layers, 0, + self.dropout, self.bias, self.bidirectional, self.is_training() ) - return result[0], result[1:] + return ret[0], (ret[1], ret[2]) else: - if states is None: - h = torch.zeros( - self.num_layers * self.num_directions, batch_size, self.hidden_size, dtype=input.dtype, - device=input.device - ) - states = h - else: - self.check_hidden(states, batch_size) - impl = self.rnn_impls[self.mode] - result = impl( - input, states, self._flat_weights, self.bias, self.num_layers, self.dropout, self.training, - self.bidirectional, self.batch_first + ret = jt.cudnn.ops.cudnn_rnn(input, hx, ft_weight, + cudnn_mode, self.input_size, self.hidden_size, self.num_layers, 0, + self.dropout, self.bias, self.bidirectional, self.is_training() ) - return result[0], result[1] + return ret[0], ret[1] + + def execute(self, input, hx=None): + if self.batch_first: + input = input.permute(1, 0, 2) + + num_directions = 2 if self.bidirectional else 1 + + if hx is None: + if self.mode in ['RNN', 'GRU']: + hx = jt.zeros((num_directions * self.num_layers, input.shape[1], self.hidden_size), dtype=input.dtype) + elif self.mode == 'LSTM': + hx = (jt.zeros((num_directions * self.num_layers, input.shape[1], self.hidden_size), dtype=input.dtype), + jt.zeros((num_directions * self.num_layers, input.shape[1], self.hidden_size), dtype=input.dtype)) + + if jt.flags.use_cuda and jt.cudnn and self.proj_size == 0 and jt.compiler.is_cuda: + return self._execute_cudnn_rnn(input, hx) + else: + hidden_n = [] + + for l in range(self.num_layers): + output = [] + + if isinstance(hx, tuple): + hidden = [h[l * num_directions] for h in hx] + else: + hidden = hx[l * num_directions] + + output, _hidden = self.call_rnn_sequence(input, hidden, f'l{l}') + hidden_n.append(_hidden) + + if self.bidirectional: + if isinstance(hx, tuple): + hidden = [h[l * num_directions + 1] for h in hx] + else: + hidden = hx[l * num_directions + 1] + + output_b, _hidden = self.call_rnn_sequence(input, hidden, f'l{l}_reverse') + output = jt.concat([output, output_b], dim=-1) + hidden_n.append(_hidden) + + if self.dropout > 0: + input = dropout(output, p=self.dropout) + else: + input = output + + if isinstance(hx, tuple): + hidden_n = tuple(jt.stack(hn, dim=0) for hn in zip(*hidden_n)) + else: + hidden_n = jt.stack(hidden_n, dim=0) + + return output, hidden_n + class layernorm(object): @@ -2173,72 +2212,180 @@ def __init__(self, normalized_shape, gamma, beta, eps, input_shape): self.broadcast_shape[dim] = input_shape[dim] def __call__(self, input): - return F.layer_norm(input, self.normalized_shape, self.gamma, self.beta, self.eps) + return nn.layer_norm(input, self.normalized_shape, self.gamma, self.beta, self.eps) class multiheadattention(Module): - def __init__( self, embed_dim, num_heads, - dropout, - batch_first, - need_weights, - q_weight, - k_weight, - v_weight, - out_weight, - q_bias, - k_bias, - v_bias, - out_bias, - train, + kdim=None, + vdim=None, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + self_attention=False, + encoder_decoder_attention=False, + 
q_noise=0.0, + qn_block_size=8, ): - super(multiheadattention, self).__init__() - self.embed_dim_check = embed_dim + super().__init__() + self.embed_dim = embed_dim + self.kdim = kdim if kdim is not None else embed_dim + self.vdim = vdim if vdim is not None else embed_dim + self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim + self.num_heads = num_heads - self.dropout = dropout - self.batch_first = batch_first - self.need_weights = need_weights - self.q_weight = q_weight - self.k_weight = k_weight - self.v_weight = v_weight - self.out_weight = out_weight - self.q_bias = q_bias - self.k_bias = k_bias - self.v_bias = v_bias - self.out_bias = out_bias - self.train = train + assert dropout==0, "TODO: dropout>0" + self.head_dim = embed_dim // num_heads - assert self.head_dim * num_heads == self.embed_dim_check, 'embed_dim must be divisible by num_heads' - self.register_parameter('in_proj_weight', None) + assert (self.head_dim * num_heads == self.embed_dim), "embed_dim must be divisible by num_heads" + self.scaling = self.head_dim ** -0.5 - if q_bias is not None: - self.in_proj_bias = torch.cat((self.q_bias, self.k_bias, self.v_bias)) - else: - self.register_parameter('in_proj_bias', None) + self.self_attention = self_attention + self.encoder_decoder_attention = encoder_decoder_attention + + assert not self.self_attention or self.qkv_same_dim, ("Self-attention requires query, key and " "value to be of the same size") + + #TODO: quant_noise + self.k_proj = nn.Linear(self.kdim, embed_dim, bias=bias) + self.v_proj = nn.Linear(self.vdim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + assert not add_bias_kv, "TODO: add_bias_kv=True" self.bias_k = self.bias_v = None - self.add_zero_attn = False - def forward(self, q, k, v, attn_mask, key_padding_mask): - k = q if k is None else k - v = q if v is None else v - if self.batch_first: - q, k, v = [x.transpose(1, 0) for x in (q, k, v)] - attn_output, attn_output_weights = F.multi_head_attention_forward( - q, k, v, self.embed_dim_check, self.num_heads, self.in_proj_weight, self.in_proj_bias, self.bias_k, - self.bias_v, self.add_zero_attn, self.dropout, self.out_weight, self.out_bias, training=self.training, - key_padding_mask=key_padding_mask, need_weights=self.need_weights, attn_mask=attn_mask, - use_separate_proj_weight=True, q_proj_weight=self.q_weight, k_proj_weight=self.k_weight, - v_proj_weight=self.v_weight - ) - if self.batch_first: - return attn_output.transpose(1, 0), attn_output_weights + self.add_zero_attn = add_zero_attn + + self.reset_parameters() + + self.onnx_trace = False + self.tpu = False + + def reset_parameters(self): + ''' + 初始化参数 + + 代码示例: + >>> multihead_attn = jt.attention.MultiheadAttention(embed_dim, num_heads) + >>> multihead_attn.reset_parameters() + + + ''' + if self.qkv_same_dim: + # Empirically observed the convergence to be much better with + # the scaled initialization + init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2)) + init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2)) + init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2)) + else: + init.xavier_uniform_(self.k_proj.weight) + init.xavier_uniform_(self.v_proj.weight) + init.xavier_uniform_(self.q_proj.weight) + + # init.xavier_uniform_(self.out_proj.weight) + if self.out_proj.bias is not None: + init.constant_(self.out_proj.bias, 0.) 
+ if self.bias_k is not None: + init.xavier_normal_(self.bias_k) + if self.bias_v is not None: + init.xavier_normal_(self.bias_v) + + + + def execute( + self, + query, + key = None, + value = None, + key_padding_mask = None, + incremental_state = None, + need_weights = True, + static_kv = False, + attn_mask = None, + before_softmax = False, + need_head_weights = False, + ): + if need_head_weights: + need_weights = True + + tgt_len, bsz, embed_dim = query.shape + assert embed_dim == self.embed_dim + assert list(query.shape) == [tgt_len, bsz, embed_dim] + + assert incremental_state is None, "TODO: incremental_state is not None" + saved_state = None + + if self.self_attention: + q = self.q_proj(query) + k = self.k_proj(query) + v = self.v_proj(query) + elif self.encoder_decoder_attention: + # encoder-decoder attention + q = self.q_proj(query) + if key is None: + assert value is None + k = v = None + else: + k = self.k_proj(key) + v = self.v_proj(key) + else: + assert key is not None and value is not None + q = self.q_proj(query) + k = self.k_proj(key) + v = self.v_proj(value) + q = q*self.scaling + + assert self.bias_k is None, "TODO: self.bias_k is not None:" + + q = q.view(tgt_len, bsz * self.num_heads, self.head_dim).transpose(1, 0, 2) + if k is not None: + k = k.view(-1, bsz * self.num_heads, self.head_dim).transpose(1, 0, 2) + if v is not None: + v = v.view(-1, bsz * self.num_heads, self.head_dim).transpose(1, 0, 2) + + assert saved_state is None, "TODO: saved_state is not None" + assert k is not None + src_len = k.shape[1] + + assert key_padding_mask is None, "TODO: key_padding_mask is not None" + assert not self.add_zero_attn, "TODO: self.add_zero_attn=True" + + attn_weights = nn.bmm(q, k.transpose(0, 2, 1)) + + assert list(attn_weights.shape) == [bsz * self.num_heads, tgt_len, src_len] + + assert attn_mask is None, "TODO: attn_mask is not None" + assert key_padding_mask is None, "TODO: key_padding_mask is not None" + + if before_softmax: + return attn_weights, v + + attn_weights_float = nn.softmax(attn_weights, dim=-1) + attn_weights = attn_weights_float.type_as(attn_weights) + + assert v is not None + attn = nn.bmm(attn_weights, v) + assert list(attn.shape) == [bsz * self.num_heads, tgt_len, self.head_dim] + if self.onnx_trace and attn.shape[1] == 1: + # when ONNX tracing a single decoder step (sequence length == 1) + # the transpose is a no-op copy before view, thus unnecessary + attn = attn.view(tgt_len, bsz, embed_dim) else: - return attn_output, attn_output_weights + attn = attn.transpose(1, 0, 2).view(tgt_len, bsz, embed_dim) + attn = self.out_proj(attn) + attn_weights = None + if need_weights: + attn_weights = attn_weights_float.view(bsz, self.num_heads, tgt_len, src_len).transpose(1, 0, 2, 3) + if not need_head_weights: + # average attention weights over heads + attn_weights = attn_weights.mean(dims=[0]) + return attn, attn_weights class BinaryDense(object): @@ -2363,7 +2510,7 @@ def __init__(self, data_format): def __call__(self, input, weight): if self.data_format == 'channels_last' : input = nhwc_to_nchw(input) - output = torch.prelu(input, weight) + output = nn.PReLU(input, weight) if self.data_format == 'channels_last': output = nchw_to_nhwc(output) return output @@ -2372,27 +2519,28 @@ def __call__(self, input, weight): def prelu(input, weight, data_format): if data_format == 'channels_last': input = nhwc_to_nchw(input) - output = torch.prelu(input, weight) + output = nn.PReLU(input, weight) if data_format == 'channels_last': output = nchw_to_nhwc(output) return output 
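For reference, nn.PReLU in Jittor is a Module that holds its own slope parameter rather than a functional op, so applying an externally managed weight tensor (as the wrapper above does) can also be written directly from relu. The snippet below is a minimal sketch under that assumption: it presumes the input has already been converted to NCHW and that weight is a jt.Var holding either a single slope or one slope per channel. It is an illustration only, not the backend's final implementation.

import jittor as jt
from jittor import nn

def prelu_functional(x, weight):
    # PReLU(x) = max(x, 0) + weight * min(x, 0); note min(x, 0) == x - relu(x)
    pos = nn.relu(x)
    neg = x - pos
    if len(weight.shape) == 1 and weight.shape[0] > 1:
        # per-channel slopes: reshape to (1, C, 1, ..., 1) so they broadcast over NCHW input
        weight = weight.reshape([1, weight.shape[0]] + [1] * (len(x.shape) - 2))
    return pos + weight * neg

A scalar slope (shape (1,)) broadcasts as-is, so the reshape is only needed for the per-channel case.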
def hardsigmoid(input): - return torch.nn.functional.hardsigmoid(input) + return NotImplementedError def hardswish(input): - return torch.nn.functional.hardswish(input) + return NotImplementedError def swish(input): - return torch.sigmoid(input) * input + return NotImplementedError def linear(input, weight, bias = None): - return torch.nn.functional.linear(input, weight, bias) + return nn.linear(input, weight, bias) def unfold(input, kernel_size, dilation = 1, padding = 0, stride = 1): + return nn.unfold(input, kernel_size, stride=stride, padding=padding, dilation=dilation) From 5d946bd251318ef98d2f3c14ccdc3a714d4ab6a0 Mon Sep 17 00:00:00 2001 From: Hisham Date: Sat, 9 Mar 2024 04:06:30 +0800 Subject: [PATCH 06/27] Added jittor loss 'jittor_cost.py' --- tensorlayerx/losses/jittor_cost.py | 572 +++++++++++++++++++++++++++++ 1 file changed, 572 insertions(+) create mode 100644 tensorlayerx/losses/jittor_cost.py diff --git a/tensorlayerx/losses/jittor_cost.py b/tensorlayerx/losses/jittor_cost.py new file mode 100644 index 0000000..694c3fe --- /dev/null +++ b/tensorlayerx/losses/jittor_cost.py @@ -0,0 +1,572 @@ +#! /usr/bin/python +# -*- coding: utf-8 -*- + +import jittor as jt +import jittor.nn as nn + + +__all__ = [ + 'softmax_cross_entropy_with_logits', + 'sigmoid_cross_entropy', + 'binary_cross_entropy', + 'mean_squared_error', + 'normalized_mean_square_error', + 'absolute_difference_error', + 'dice_coe', + 'dice_hard_coe', + 'iou_coe', + 'cross_entropy_seq', + 'cross_entropy_seq_with_mask', + 'cosine_similarity', + 'li_regularizer', + 'lo_regularizer', + 'maxnorm_regularizer', + 'maxnorm_o_regularizer', + 'maxnorm_i_regularizer', + 'L1Loss' +] + + +def softmax_cross_entropy_with_logits(output, target, reduction='mean'): + """Softmax cross-entropy operation, returns the TensorFlow expression of cross-entropy for two distributions, + it implements softmax internally. See ``tf.ops.sparse_softmax_cross_entropy_with_logits``. + + Parameters + ---------- + output : Tensor + A batch of distribution with shape: [batch_size, num of classes]. + target : Tensor + A batch of index with shape: [batch_size, ]. + + Examples + -------- + >>> import tensorlayerx as tl + >>> ce = tlx.losses.softmax_cross_entropy_with_logits(y_logits, y_target_logits) + + References + ----------- + - About cross-entropy: ``__. + - The code is borrowed from: ``__. + + """ + + return nn.CrossEntropyLoss(reduction=reduction)(output, target) + + +def sigmoid_cross_entropy(output, target, reduction='mean'): + """Sigmoid cross-entropy operation, see ``tf.ops.sigmoid_cross_entropy_with_logits``. + + Parameters + ---------- + output : Tensor + A batch of distribution with shape: [batch_size, num of classes]. + target : Tensor + same shape as the input. + reduction : str + The optional values are “mean”, “sum”, and “none”. If “none”, do not perform reduction. + + """ + + return nn.BCEWithLogitsLoss(reduction=reduction)(output, target) + + +def binary_cross_entropy(output, target, reduction='mean'): + """Binary cross entropy operation. + + Parameters + ---------- + output : Tensor + Tensor with type of `float32` or `float64`. + target : Tensor + The target distribution, format the same with `output`. + + References + ----------- + - `ericjang-DRAW `__ + + """ + + return nn.BCELoss(reduction=reduction)(output, target) + + +def mean_squared_error(output, target, reduction='mean'): + """Return the TensorFlow expression of mean-square-error (L2) of two batch of data. 
+ + Parameters + ---------- + output : Tensor + 2D, 3D or 4D tensor i.e. [batch_size, n_feature], [batch_size, height, width] or [batch_size, height, width, channel]. + target : Tensor + The target distribution, format the same with `output`. + + References + ------------ + - `Wiki Mean Squared Error `__ + + """ + + return nn.MSELoss(reduction=reduction)(output, target) + + +def normalized_mean_square_error(output, target, reduction='mean'): + """Return the TensorFlow expression of normalized mean-square-error of two distributions. + + Parameters + ---------- + output : Tensor + 2D, 3D or 4D tensor i.e. [batch_size, n_feature], [batch_size, height, width] or [batch_size, height, width, channel]. + target : Tensor + The target distribution, format the same with `output`. + + """ + + nmse_a = jt.Var.sqrt(jt.Var.sum((output - target)**2, dim=-1)) + nmse_b = jt.Var.sqrt(jt.Var.sum(target**2, dim=-1)) + + if reduction == 'mean': + nmse = jt.Var.mean(nmse_a / nmse_b) + elif reduction == 'sum': + nmse = jt.Var.sum(nmse_a / nmse_b) + elif reduction == 'none': + nmse = nmse_a / nmse_b + else: + raise Exception("The reduction values are 'mean', 'sum', and 'none'.") + return nmse + + +def absolute_difference_error(output, target, reduction='mean'): + """Return the TensorFlow expression of absolute difference error (L1) of two batch of data. + + Parameters + ---------- + output : Tensor + 2D, 3D or 4D tensor i.e. [batch_size, n_feature], [batch_size, height, width] or [batch_size, height, width, channel]. + target : Tensor + The target distribution, format the same with `output`. + + """ + + if reduction == 'mean': + loss = jt.Var.mean(jt.Var.abs(output - target)) + elif reduction == 'sum': + loss = jt.Var.sum(jt.Var.abs(output - target)) + elif reduction == 'none': + loss = jt.Var.abs(output - target) + else: + raise Exception("The reduction values are 'mean', 'sum', and 'none'.") + return loss + + +def dice_coe(output, target, loss_type='jaccard', axis=(1, 2, 3), smooth=1e-5): + """Soft dice (Sørensen or Jaccard) coefficient for comparing the similarity + of two batch of data, usually be used for binary image segmentation + i.e. labels are binary. The coefficient between 0 to 1, 1 means totally match. + + Parameters + ----------- + output : Tensor + A distribution with shape: [batch_size, ....], (any dimensions). + target : Tensor + The target distribution, format the same with `output`. + loss_type : str + ``jaccard`` or ``sorensen``, default is ``jaccard``. + axis : tuple of int + All dimensions are reduced, default ``[1,2,3]``. + smooth : float + This small value will be added to the numerator and denominator. + - If both output and target are empty, it makes sure dice is 1. + - If either output or target are empty (all pixels are background), dice = ```smooth/(small_value + smooth)``, then if smooth is very small, dice close to 0 (even the image values lower than the threshold), so in this case, higher smooth can have a higher dice. + + Examples + --------- + >>> import tensorlayerx as tl + >>> outputs = tlx.act.pixel_wise_softmax(outputs) + >>> dice_loss = 1 - tlx.losses.dice_coe(outputs, y_) + + References + ----------- + - `Wiki-Dice `__ + + """ + + inse = jt.Var.sum(output * target, dim=axis) + if loss_type == 'jaccard': + l = jt.Var.sum(output * output, dim=axis) + r = jt.Var.sum(target * target, dim=axis) + elif loss_type == 'sorensen': + l = jt.Var.sum(output, dim=axis) + r = jt.Var.sum(target, dim=axis) + else: + raise Exception("Unknow loss_type") + dice = (2. 
* inse + smooth) / (l + r + smooth) + dice = jt.Var.mean(dice) + return dice + + +def dice_hard_coe(output, target, threshold=0.5, axis=(1, 2, 3), smooth=1e-5): + """Non-differentiable Sørensen–Dice coefficient for comparing the similarity + of two batch of data, usually be used for binary image segmentation i.e. labels are binary. + The coefficient between 0 to 1, 1 if totally match. + + Parameters + ----------- + output : tensor + A distribution with shape: [batch_size, ....], (any dimensions). + target : tensor + The target distribution, format the same with `output`. + threshold : float + The threshold value to be true. + axis : tuple of integer + All dimensions are reduced, default ``(1,2,3)``. + smooth : float + This small value will be added to the numerator and denominator, see ``dice_coe``. + + References + ----------- + - `Wiki-Dice `__ + + """ + + output = _cast(output, threshold) + target = _cast(target, threshold) + inse = jt.Var.sum(jt.Var.multiply(output, target), dim=axis) + l = jt.Var.sum(output, dim=axis) + r = jt.Var.sum(target, dim=axis) + hard_dice = (2. * inse + smooth) / (l + r + smooth) + hard_dice = jt.Var.mean(hard_dice) + return hard_dice + + +def iou_coe(output, target, threshold=0.5, axis=(1, 2, 3), smooth=1e-5): + """Non-differentiable Intersection over Union (IoU) for comparing the + similarity of two batch of data, usually be used for evaluating binary image segmentation. + The coefficient between 0 to 1, and 1 means totally match. + + Parameters + ----------- + output : tensor + A batch of distribution with shape: [batch_size, ....], (any dimensions). + target : tensor + The target distribution, format the same with `output`. + threshold : float + The threshold value to be true. + axis : tuple of integer + All dimensions are reduced, default ``(1,2,3)``. + smooth : float + This small value will be added to the numerator and denominator, see ``dice_coe``. + + Notes + ------ + - IoU cannot be used as training loss, people usually use dice coefficient for training, IoU and hard-dice for evaluating. + + """ + + pre = _cast(output, threshold) + truth = _cast(target, threshold) + inse = jt.Var.sum(jt.Var.multiply(pre, truth), dim=axis) + union = jt.Var.sum(_cast(jt.Var.add(pre, truth) , 1.0, flag=True), dim=axis) + batch_iou = (inse + smooth) / (union + smooth) + iou = jt.Var.mean(batch_iou) + return iou + + +def sequence_loss_by_example( + logits, targets, weights, average_across_timesteps=True, softmax_loss_function=None, name=None +): + """Weighted cross-entropy loss for a sequence of logits (per example). see original tensorflow code : + + + Parameters + ---------- + logits: List + List of 2D Tensors of shape [batch_size x num_decoder_symbols]. + targets: List + List of 1D batch-sized int32 Tensors of the same length as logits. + weights: List + List of 1D batch-sized float-Tensors of the same length as logits. + average_across_timesteps: Boolean + If set, divide the returned losses by the total label weight. + softmax_loss_function: None or Function + Function (labels, logits) -> loss-batch to be used instead of the standard softmax (the default if this is None). + **Note that to avoid confusion, it is required for the function to accept named arguments.** + name: None or str + Optional name for this operation, default: "sequence_loss_by_example". + + Returns + ------- + 1D batch-sized float Tensor: The log-perplexity for each sequence. + + Raises + ------ + ValueError: If len(logits) is different from len(targets) or len(weights). 
+ + """ + + raise NotImplementedError("Not Implemented.") + + +def cross_entropy_seq(logits, target_seqs, batch_size=None): + """Returns the expression of cross-entropy of two sequences, implement + softmax internally. Normally be used for fixed length RNN outputs, see `PTB example `__. + + Parameters + ---------- + logits : Tensor + 2D tensor with shape of `[batch_size * n_steps, n_classes]`. + target_seqs : Tensor + The target sequence, 2D tensor `[batch_size, n_steps]`, if the number of step is dynamic, please use ``tlx.losses.cross_entropy_seq_with_mask`` instead. + batch_size : None or int. + Whether to divide the losses by batch size. + - If integer, the return losses will be divided by `batch_size`. + - If None (default), the return losses will not be divided by anything. + + Examples + -------- + >>> import tensorlayerx as tl + >>> # see `PTB example `__.for more details + >>> # outputs shape : (batch_size * n_steps, n_classes) + >>> # targets shape : (batch_size, n_steps) + >>> losses = tlx.losses.cross_entropy_seq(outputs, targets) + + """ + + raise NotImplementedError("Not Implemented.") + + +def cross_entropy_seq_with_mask(logits, target_seqs, input_mask, return_details=False, name=None): + """Returns the expression of cross-entropy of two sequences, implement + softmax internally. Normally be used for Dynamic RNN with Synced sequence input and output. + + Parameters + ----------- + logits : Tensor + 2D tensor with shape of [batch_size * ?, n_classes], `?` means dynamic IDs for each example. + - Can be get from `DynamicRNNLayer` by setting ``return_seq_2d`` to `True`. + target_seqs : Tensor + int of tensor, like word ID. [batch_size, ?], `?` means dynamic IDs for each example. + input_mask : Tensor + The mask to compute loss, it has the same size with `target_seqs`, normally 0 or 1. + return_details : boolean + Whether to return detailed losses. + - If False (default), only returns the loss. + - If True, returns the loss, losses, weights and targets (see source code). + + Examples + -------- + >>> import tensorlayerx as tl + >>> import tensorflow as tf + >>> import numpy as np + >>> batch_size = 64 + >>> vocab_size = 10000 + >>> embedding_size = 256 + >>> ni = tlx.layers.Input([batch_size, None], dtype=tf.int64) + >>> net = tlx.layers.Embedding( + ... vocabulary_size = vocab_size, + ... embedding_size = embedding_size, + ... name = 'seq_embedding')(ni) + >>> net = tlx.layers.RNN( + ... cell =tf.keras.layers.LSTMCell(units=embedding_size, dropout=0.1), + ... return_seq_2d = True, + ... name = 'dynamicrnn')(net) + >>> net = tlx.layers.Linear(out_features=vocab_size, name="output")(net) + >>> model = tlx.model.Model(inputs=ni, outputs=net) + >>> input_seqs = np.random.randint(0, 10, size=(batch_size, 10), dtype=np.int64) + >>> target_seqs = np.random.randint(0, 10, size=(batch_size, 10), dtype=np.int64) + >>> input_mask = np.random.randint(0, 2, size=(batch_size, 10), dtype=np.int64) + >>> outputs = model(input_seqs, is_train=True) + >>> loss = tlx.losses.cross_entropy_seq_with_mask(outputs, target_seqs, input_mask) + + """ + + raise NotImplementedError("Not Implemented.") + + +def cosine_similarity(v1, v2): + """Cosine similarity [-1, 1]. + + Parameters + ---------- + v1, v2 : Tensor + Tensor with the same shape [batch_size, n_feature]. + + References + ---------- + - `Wiki `__. 
+ + """ + + return jt.Var.sum(jt.Var.multiply(v1, v2), 1) / \ + (jt.Var.sqrt(jt.Var.sum(jt.Var.multiply(v1, v1), 1)) * + jt.Var.sqrt(jt.Var.sum(jt.Var.multiply(v2, v2), 1))) + + +# Regularization Functions +def li_regularizer(scale, scope=None): + """Li regularization removes the neurons of previous layer. The `i` represents `inputs`. + Returns a function that can be used to apply group li regularization to weights. + The implementation follows `TensorFlow contrib `__. + + Parameters + ---------- + scale : float + A scalar multiplier `Tensor`. 0.0 disables the regularizer. + scope: str + An optional scope name for this function. + + Returns + -------- + A function with signature `li(weights, name=None)` that apply Li regularization. + + Raises + ------ + ValueError : if scale is outside of the range [0.0, 1.0] or if scale is not a float. + + """ + + raise NotImplementedError("Not Implemented.") + + +def lo_regularizer(scale): + """Lo regularization removes the neurons of current layer. The `o` represents `outputs` + Returns a function that can be used to apply group lo regularization to weights. + The implementation follows `TensorFlow contrib `__. + + Parameters + ---------- + scale : float + A scalar multiplier `Tensor`. 0.0 disables the regularizer. + + Returns + ------- + A function with signature `lo(weights, name=None)` that apply Lo regularization. + + Raises + ------ + ValueError : If scale is outside of the range [0.0, 1.0] or if scale is not a float. + + """ + + raise NotImplementedError("Not Implemented.") + + +def maxnorm_regularizer(scale=1.0): + """Max-norm regularization returns a function that can be used to apply max-norm regularization to weights. + + More about max-norm, see `wiki-max norm `_. + The implementation follows `TensorFlow contrib `__. + + Parameters + ---------- + scale : float + A scalar multiplier `Tensor`. 0.0 disables the regularizer. + + Returns + --------- + A function with signature `mn(weights, name=None)` that apply Lo regularization. + + Raises + -------- + ValueError : If scale is outside of the range [0.0, 1.0] or if scale is not a float. + + """ + + raise NotImplementedError("Not Implemented.") + + +def maxnorm_o_regularizer(scale): + """Max-norm output regularization removes the neurons of current layer. + Returns a function that can be used to apply max-norm regularization to each column of weight matrix. + The implementation follows `TensorFlow contrib `__. + + Parameters + ---------- + scale : float + A scalar multiplier `Tensor`. 0.0 disables the regularizer. + + Returns + --------- + A function with signature `mn_o(weights, name=None)` that apply Lo regularization. + + Raises + --------- + ValueError : If scale is outside of the range [0.0, 1.0] or if scale is not a float. + + """ + + raise NotImplementedError("Not Implemented.") + + +def maxnorm_i_regularizer(scale): + """Max-norm input regularization removes the neurons of previous layer. + Returns a function that can be used to apply max-norm regularization to each row of weight matrix. + The implementation follows `TensorFlow contrib `__. + + Parameters + ---------- + scale : float + A scalar multiplier `Tensor`. 0.0 disables the regularizer. + + Returns + --------- + A function with signature `mn_i(weights, name=None)` that apply Lo regularization. + + Raises + --------- + ValueError : If scale is outside of the range [0.0, 1.0] or if scale is not a float. 
+ + """ + + raise NotImplementedError("Not Implemented.") + + +def huber_loss( + output, target, is_mean=True, delta=1.0, dynamichuber=False, reverse=False, axis=-1, epsilon=0.00001, name=None +): + """Huber Loss operation, see ``https://en.wikipedia.org/wiki/Huber_loss`` . + Reverse Huber Loss operation, see ''https://statweb.stanford.edu/~owen/reports/hhu.pdf''. + Dynamic Reverse Huber Loss operation, see ''https://arxiv.org/pdf/1606.00373.pdf''. + + Parameters + ---------- + output : Tensor + A distribution with shape: [batch_size, ....], (any dimensions). + target : Tensor + The target distribution, format the same with `output`. + is_mean : boolean + Whether compute the mean or sum for each example. + - If True, use ``tf.reduce_mean`` to compute the loss between one target and predict data (default). + - If False, use ``tf.reduce_sum``. + delta: float + The point where the huber loss function changes from a quadratic to linear. + dynamichuber: boolean + Whether compute the coefficient c for each batch. + - If True, c is 20% of the maximal per-batch error. + - If False, c is delta. + reverse: boolean + Whether compute the reverse huber loss. + axis : int or list of int + The dimensions to reduce. + epsilon: + Eplison. + name : string + Name of this loss. + + """ + + raise NotImplementedError("Not Implemented.") + +def _cast(a, threshold, flag=False): + zero = jt.zeros_like(a) + one = jt.ones_like(a) + if flag == False: + a = jt.where(a > threshold, one, a) + a = jt.where(a <= threshold, zero, a) + else: + a = jt.where(a >= threshold, one, a) + a = jt.where(a < threshold, zero, a) + return a + + +def L1Loss(input, target, reduction='mean'): + + return nn.l1_loss(input, target, reduction=reduction) From be15628b3594bb9fdad88c33ff4638d650961c31 Mon Sep 17 00:00:00 2001 From: Hisham Date: Sat, 9 Mar 2024 04:13:58 +0800 Subject: [PATCH 07/27] Added jittor metrics jittor_metrics.py --- tensorlayerx/metrics/jittor_metric.py | 217 ++++++++++++++++++++++++++ 1 file changed, 217 insertions(+) create mode 100644 tensorlayerx/metrics/jittor_metric.py diff --git a/tensorlayerx/metrics/jittor_metric.py b/tensorlayerx/metrics/jittor_metric.py new file mode 100644 index 0000000..77680c9 --- /dev/null +++ b/tensorlayerx/metrics/jittor_metric.py @@ -0,0 +1,217 @@ +#! 
/usr/bin/python +# -*- coding: utf-8 -*- + +import jittor as jt +import six +import abc +import numpy as np + +__all__ = [ + 'Accuracy', + 'Auc', + 'Precision', + 'Recall', + 'acc', +] + + +@six.add_metaclass(abc.ABCMeta) +class Metric(object): + + def __init__(self): + pass + + @abc.abstractmethod + def update(self, *args): + raise NotImplementedError("function 'update' not implemented in {}.".format(self.__class__.__name__)) + + @abc.abstractmethod + def result(self): + raise NotImplementedError("function 'reset' not implemented in {}.".format(self.__class__.__name__)) + + @abc.abstractmethod + def reset(self): + raise NotImplementedError("function 'reset' not implemented in {}.".format(self.__class__.__name__)) + + +class Accuracy(Metric): + + def __init__(self, topk=1): + super(Accuracy, self).__init__() + self.topk = topk + self.reset() + + def update(self, y_pred, y_true): + + y_pred = jt.argsort(y_pred, dim=-1, descending=True) + y_pred = y_pred[:, :self.topk] + if (len(y_true.shape) == 1) or (len(y_true.shape) == 2 and y_true.shape[-1] == 1): + y_true = jt.reshape(y_true, (-1, 1)) + elif y_true.shape[-1] != 1: + y_true = jt.argmax(y_true, dim=-1, keepdim=True) + correct = y_pred == y_true + correct = correct.to(jt.float32) + correct = correct.cpu().numpy() + num_samples = np.prod(np.array(correct.shape[:-1])) + num_corrects = correct[..., :self.topk].sum() + self.total += num_corrects + self.count += num_samples + + def result(self): + return float(self.total) / self.count if self.count > 0 else 0. + + def reset(self): + self.total = 0.0 + self.count = 0.0 + + +class Auc(object): + + def __init__( + self, + curve='ROC', + num_thresholds=4095, + ): + self.curve = curve + self.num_thresholds = num_thresholds + self.reset() + + def update(self, y_pred, y_true): + if isinstance(y_true, jt.array()): + y_true = y_true.cpu().numpy() + elif not isinstance(y_pred, np.ndarray): + raise TypeError("The y_true must be a numpy array or Tensor.") + + if isinstance(y_pred, jt.array): + y_pred = y_pred.cpu().numpy() + elif not isinstance(y_pred, np.ndarray): + raise TypeError("The y_pred must be a numpy array or Tensor.") + + for i, label in enumerate(y_true): + value = y_pred[i, 1] # positive probability + bin_idx = int(value * self.num_thresholds) + assert bin_idx <= self.num_thresholds + if label: + self._stat_pos[bin_idx] += 1.0 + else: + self._stat_neg[bin_idx] += 1.0 + + @staticmethod + def trapezoid_area(x1, x2, y1, y2): + return abs(x1 - x2) * (y1 + y2) / 2.0 + + def result(self): + tot_pos = 0.0 + tot_neg = 0.0 + auc = 0.0 + idx = self.num_thresholds + while idx > 0: + tot_pos_prev = tot_pos + tot_neg_prev = tot_neg + tot_pos += self._stat_pos[idx] + tot_neg += self._stat_neg[idx] + auc += self.trapezoid_area(tot_neg, tot_neg_prev, tot_pos, tot_pos_prev) + idx -= 1 + + return auc / tot_pos / tot_neg if tot_pos > 0.0 and tot_neg > 0.0 else 0.0 + + def reset(self): + """ + Reset states and result + """ + _num_pred_buckets = self.num_thresholds + 1 + self._stat_pos = np.zeros(_num_pred_buckets) + self._stat_neg = np.zeros(_num_pred_buckets) + + +class Precision(object): + + def __init__(self): + self.reset() + + def update(self, y_pred, y_true): + if isinstance(y_true, jt.array): + y_true = y_true.cpu().numpy() + elif not isinstance(y_pred, np.ndarray): + raise TypeError("The y_true must be a numpy array or Tensor.") + + if isinstance(y_pred, jt.array): + y_pred = y_pred.cpu().numpy() + elif not isinstance(y_pred, np.ndarray): + raise TypeError("The y_pred must be a numpy array or Tensor.") + 
+ sample_num = y_true.shape[0] + y_pred = np.rint(y_pred).astype('int32') + + for i in range(sample_num): + pred = y_pred[i] + label = y_true[i] + if pred == 1: + if pred == label: + self.tp += 1 + else: + self.fp += 1 + + def result(self): + + ap = self.tp + self.fp + return float(self.tp) / ap if ap != 0 else .0 + + def reset(self): + self.tp = 0 + self.fp = 0 + + +class Recall(object): + + def __init__(self): + self.reset() + + def update(self, y_pred, y_true): + if isinstance(y_true, jt.array): + y_true = y_true.cpu().numpy() + elif not isinstance(y_pred, np.ndarray): + raise TypeError("The y_true must be a numpy array or Tensor.") + + if isinstance(y_pred, jt.array): + y_pred = y_pred.cpu().numpy() + elif not isinstance(y_pred, np.ndarray): + raise TypeError("The y_pred must be a numpy array or Tensor.") + + sample_num = y_true.shape[0] + y_pred = np.rint(y_pred).astype('int32') + + for i in range(sample_num): + pred = y_pred[i] + label = y_true[i] + if label == 1: + if pred == label: + self.tp += 1 + else: + self.fn += 1 + + def result(self): + + recall = self.tp + self.fn + return float(self.tp) / recall if recall != 0 else .0 + + def reset(self): + self.tp = 0 + self.fn = 0 + + +def acc(predicts, labels, topk=1): + y_pred = jt.argsort(predicts, dim=-1, descending=True) + y_pred = y_pred[:, :topk] + if (len(labels.shape) == 1) or (len(labels.shape) == 2 and labels.shape[-1] == 1): + y_true = jt.reshape(labels, (-1, 1)) + elif labels.shape[-1] != 1: + y_true = jt.argmax(labels, dim=-1, keepdim=True) + correct = y_pred == y_true + correct = correct.to(jt.float32) + correct = correct.cpu().numpy() + num_samples = np.prod(np.array(correct.shape[:-1])) + num_corrects = correct[..., :topk].sum() + total = num_corrects + count = num_samples + return float(total) / count if count > 0 else 0. From b9f4bb5a594407501604948e1d9f81f390066bba Mon Sep 17 00:00:00 2001 From: Hisham Date: Mon, 11 Mar 2024 00:18:52 +0800 Subject: [PATCH 08/27] Changed desc on jittor_nn.py --- tensorlayerx/backend/ops/jittor_nn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorlayerx/backend/ops/jittor_nn.py b/tensorlayerx/backend/ops/jittor_nn.py index 488e80d..480fc5e 100644 --- a/tensorlayerx/backend/ops/jittor_nn.py +++ b/tensorlayerx/backend/ops/jittor_nn.py @@ -3,7 +3,7 @@ # Unified nn API for TensorLayerX, using Jittor as backend. 
-# Similar to file ./mindspore_nn.py and ./oneflow_nn.py +# Similar to file ./torch_nn.py and ./oneflow_nn.py import jittor as jt import jittor.nn as nn From 49f95c811ebbf3fa782cae632e73b8d5d9154dc7 Mon Sep 17 00:00:00 2001 From: Hisham Date: Mon, 11 Mar 2024 00:42:45 +0800 Subject: [PATCH 09/27] Added jittor to load_backend.py --- tensorlayerx/backend/ops/load_backend.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tensorlayerx/backend/ops/load_backend.py b/tensorlayerx/backend/ops/load_backend.py index 155d136..3fd2d0c 100644 --- a/tensorlayerx/backend/ops/load_backend.py +++ b/tensorlayerx/backend/ops/load_backend.py @@ -5,10 +5,11 @@ import os import sys -BACKEND = 'tensorflow' +# BACKEND = 'tensorflow' # BACKEND = 'mindspore' # BACKEND = 'paddle' # BACKEND = 'torch' +BACKEND = 'jittor' # BACKEND = 'oneflow' # Check for backend.json files @@ -88,5 +89,13 @@ BACKEND_VERSION = flow.__version__ sys.stderr.write('Using OneFlow backend.\n') + +elif BACKEND == 'jittor': + from .jittor_nn import * + from .jittor_backend import * + import jittor as jt + BACKEND_VERSION = jt.__version__ + sys.stderr.write('Using jittor backend.\n') + else: raise NotImplementedError("This backend is not supported") From 00647a39a297961934b73d4f5cc74125b9411b56 Mon Sep 17 00:00:00 2001 From: Hisham Date: Mon, 11 Mar 2024 01:27:31 +0800 Subject: [PATCH 10/27] modified __init__.py and added jittor version --- tensorlayerx/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorlayerx/__init__.py b/tensorlayerx/__init__.py index 1970a1a..481c043 100644 --- a/tensorlayerx/__init__.py +++ b/tensorlayerx/__init__.py @@ -39,6 +39,7 @@ 'mindspore': '1.8.1', 'paddle': '2.2.0', 'torch': '1.10.0', + 'jittor': '1.3.8.5', } if BACKEND_VERSION != backend_v[BACKEND]: From 863c509f23448e5e89ac11ce7633e327b90f1abe Mon Sep 17 00:00:00 2001 From: Hisham Date: Mon, 11 Mar 2024 01:29:10 +0800 Subject: [PATCH 11/27] updated jittor_backend.py and removed complex64 and complex128 --- tensorlayerx/backend/ops/jittor_backend.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tensorlayerx/backend/ops/jittor_backend.py b/tensorlayerx/backend/ops/jittor_backend.py index 2b13457..f40b22a 100644 --- a/tensorlayerx/backend/ops/jittor_backend.py +++ b/tensorlayerx/backend/ops/jittor_backend.py @@ -23,8 +23,6 @@ 'uint32': None, 'uint64': None, 'bool': jt.bool, - 'complex64': jt.complex64, - 'complex128': jt.complex128 } DType = None From 92b06138347a6d998f1358c2fe8a4c7ff3c825f1 Mon Sep 17 00:00:00 2001 From: Hisham Date: Mon, 11 Mar 2024 01:30:06 +0800 Subject: [PATCH 12/27] jittor_backend updated --- tensorlayerx/backend/ops/jittor_backend.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorlayerx/backend/ops/jittor_backend.py b/tensorlayerx/backend/ops/jittor_backend.py index f40b22a..31139e3 100644 --- a/tensorlayerx/backend/ops/jittor_backend.py +++ b/tensorlayerx/backend/ops/jittor_backend.py @@ -23,6 +23,7 @@ 'uint32': None, 'uint64': None, 'bool': jt.bool, + } DType = None From 4cead40e0495ec4cc42fa859512f115e4ad066b2 Mon Sep 17 00:00:00 2001 From: Hisham Date: Mon, 11 Mar 2024 02:41:38 +0800 Subject: [PATCH 13/27] Added core_jittor.py and jittor_initializers.py --- tensorlayerx/nn/core/core_jittor.py | 787 ++++++++++++++++++ .../nn/initializers/jittor_initializers.py | 382 +++++++++ 2 files changed, 1169 insertions(+) create mode 100644 tensorlayerx/nn/core/core_jittor.py create mode 100644 tensorlayerx/nn/initializers/jittor_initializers.py diff --git 
a/tensorlayerx/nn/core/core_jittor.py b/tensorlayerx/nn/core/core_jittor.py new file mode 100644 index 0000000..0bcd68e --- /dev/null +++ b/tensorlayerx/nn/core/core_jittor.py @@ -0,0 +1,787 @@ +#! /usr/bin/python +# -*- coding: utf-8 -*- + +from jittor import Module as T_Module +from .common import check_parameter, processing_act, str2init, tolist, construct_graph, ModuleNode, select_attrs +from .common import _save_weights, _load_weights, _save_standard_weights_dict, _load_standard_weights_dict +from jittor.nn import Parameter +from typing import Any, Callable +import jittor as jt +import operator +from itertools import islice +from collections import OrderedDict, abc as container_abcs +import warnings +import tensorlayerx as tlx + +_global_layer_name_dict = {} +_global_layer_node = [] + +__all__ = ['Module', 'Sequential', 'ModuleList', 'ModuleDict', 'Parameter', 'ParameterList', 'ParameterDict'] + + +class Module(T_Module): + + def __init__(self, name=None, act=None, *args, **kwargs): + super().__init__(*args, **kwargs) + + global _global_layer_name_dict + if name is None: + prefix = self.__class__.__name__.lower() + + if _global_layer_name_dict.get(prefix) is not None: + _global_layer_name_dict[prefix] += 1 + name = prefix + '_' + str(_global_layer_name_dict[prefix]) + else: + _global_layer_name_dict[prefix] = 0 + name = prefix + while True: + if _global_layer_name_dict.get(name) is None: + break + _global_layer_name_dict[prefix] += 1 + name = prefix + '_' + str(_global_layer_name_dict[prefix]) + else: + if _global_layer_name_dict.get(name) is not None: + pass + else: + _global_layer_name_dict[name] = 0 + + self.name = name + + self.act = processing_act(act) + + # Layer building state + self._built = False + + # Layer nodes state + self._nodes_fixed = False + self._build_graph = False + + # Layer weight state + self._all_weights = None + self._trainable_weights = None + self._nontrainable_weights = None + + # Layer training state + self.is_train = True + + # layer forward state + self._forward_state = False + + # weights check state + self._check = False + + + def set_train(self, mode=True): + if not isinstance(mode, bool): + raise ValueError("training mode is expected to be boolean") + self.is_train = mode + for module in self.children(): + module.set_train(mode) + return self + + def set_eval(self): + self.set_train(False) + + def build(self, inputs_shape): + raise Exception("The build(self, inputs_shape) method must be implemented by inherited class") + + def forward(self, *inputs, **kwargs): + raise Exception("The forward method must be implemented by inherited class") + + def _get_weights(self, var_name, shape, init=None, trainable=True, transposed=None, order=False): + if order: + w_tmp = Parameter(init(shape), requires_grad=trainable) + return w_tmp + + if len(shape) == 3: + shape = shape[::-1] + if len(shape) == 4: + if transposed: + shape = (shape[3], shape[0], shape[1], shape[2]) + else: + shape = (shape[3], shape[2], shape[0], shape[1]) + if len(shape) == 5: + shape = (shape[4], shape[3], shape[0], shape[1], shape[2]) + # TODO paramters name should be add + _param = init(shape) + param = Parameter(_param, requires_grad=trainable) + self.var_name = var_name + return param + + def _call_impl_tlx(self, *input, **kwargs): + if self._check == False: + _param_name = [] + for name, param in self.named_parameters(recurse=True): + if name not in _param_name: + _param_name.append(name) + else: + raise Exception("parameter name [{}] have be been used. 
" + "In training, the name of layer can't be same." + "Please check the layers name".format(name)) + self._check = True + + result = self._call_impl(*input, **kwargs) + return result + # TODO RNN enabled after repair + __call__: Callable[..., Any] = _call_impl_tlx + + def _named_members(self, get_members_fn, prefix='', recurse=True): + r"""Helper method for yielding various names + members of modules.""" + memo = set() + modules = self.named_modules(prefix=prefix) if recurse else [(prefix, self)] + for module_prefix, module in modules: + members = get_members_fn(module) + for k, v in members: + if v is None or v in memo: + continue + memo.add(v) + name = module.name + '/' + k + yield name, v + + + @property + def all_weights(self): + if self._all_weights is not None and len(self._all_weights) > 0: + # self._all_weights already extracted, so do nothing + pass + else: + self._all_weights = [] + for name, param in self.named_parameters(recurse=True): + self._all_weights.append(param) + return self._all_weights + + @property + def trainable_weights(self): + if self._trainable_weights is not None and len(self._trainable_weights) > 0: + # self._trainable_weights already extracted, so do nothing + pass + else: + self._trainable_weights = [] + for name, param in self.named_parameters(recurse=True): + if param.requires_grad ==True: + self._trainable_weights.append(param) + return self._trainable_weights + + @property + def nontrainable_weights(self): + """ + Returns all untrainable weights. + Returns a list of all untrainable weights. + + """ + + if self._nontrainable_weights is not None and len(self._nontrainable_weights) > 0: + # self._nontrainable_weights already extracted, so do nothing + pass + else: + self._nontrainable_weights = [] + for name, param in self.named_parameters(recurse=True): + if param.requires_grad == False: + self._nontrainable_weights.append(param) + return self._nontrainable_weights + + def save_weights(self, file_path, format=None): + _save_weights(net=self, file_path=file_path, format=format) + + def load_weights(self, file_path, format=None, in_order=True, skip=False): + """Load model weights from a given file, which should be previously saved by self.save_weights().""" + _load_weights(net=self, file_path=file_path, format=format, in_order=in_order, skip=skip) + + def save_standard_weights(self, file_path): + _save_standard_weights_dict(self, file_path) + + def load_standard_weights(self, file_path, weights_from, weights_to, skip=False): + _load_standard_weights_dict(self, file_path, skip=skip, weights_from=weights_from, weights_to=weights_to) + + def str_to_init(self, initializer): + return str2init(initializer) + + def check_param(self, param, dim='2d'): + return check_parameter(param, dim) + + def init_build(self, *inputs, **kwargs): + """ + (1) This method must be called when the Layer has no input in_channels. + (2) Automatic shape inference when the user does not enter inchannels. + """ + + self.forward(*inputs, **kwargs) + + def set_build_graph(self): + for layer_name, layer in self._modules.items(): + if isinstance(layer, Module): + if len(layer._modules) > 1: + layer.set_build_graph() + layer._build_graph = True + + def build_graph(self, *inputs, **kwargs): + # Add nodes only when the composition is needed. 
+ self.set_build_graph() + self.set_eval() + + outputs = self.forward(*inputs, **kwargs) + self.inputs = inputs + self.outputs = outputs + self._node_by_depth, self._all_layers = construct_graph(self.inputs, self.outputs) + return self._node_by_depth, self._all_layers + + def _add_node(self, input_tensors, output_tensors): + """Add a ModuleNode for this layer given input_tensors, output_tensors. + + This function should not be called from outside, it should only be called + in __call__ when building static model. + + Parameters + ---------- + input_tensors : Tensor or a list of tensors + Input tensors to this layer. + output_tensors : Tensor or a list of tensors + Output tensors to this layer. + + """ + + inputs_list = tolist(input_tensors) + outputs_list = tolist(output_tensors) + if self.__class__.__name__ in tlx.layers.inputs.__all__: + # for InputLayer, there should be no in_nodes + in_nodes = [] + in_tensor_idxes = [0] + else: + in_nodes = [tensor._info[0] for tensor in inputs_list] + in_tensor_idxes = [tensor._info[1] for tensor in inputs_list] + node_index = len(_global_layer_node) + + new_node = ModuleNode( + self, node_index, in_nodes, inputs_list, outputs_list, in_tensor_idxes, select_attrs(self) + ) + _global_layer_node.append(new_node) + for idx, tensor in enumerate(outputs_list): + tensor._info = (new_node, idx) + + +class Sequential(Module): + """ + The class :class:`Sequential` is a linear stack of layers. + The :class:`Sequential` can be created by passing a list of layer instances. + The given layer instances will be automatically connected one by one. + Parameters + ---------- + layers: list of Layer + A list of layers. + name : str or None + A unique layer name. If None, a unique name will be automatically assigned. + Methods + --------- + __init__() + Initializing the ModuleList. + weights() + A collection of weights of all the layer instances. + build() + Build the ModuleList. The layer instances will be connected automatically one by one. + forward() + Forward the computation. The computation will go through all layer instances. 
+ + Examples + --------- + >>> conv = tlx.layers.Conv2d(3, 2, 3, pad_mode='valid') + >>> bn = tlx.layers.BatchNorm2d(2) + >>> seq = tlx.nn.Sequential([conv, bn]) + >>> x = tlx.layers.Input((1, 3, 4, 4)) + >>> seq(x) + """ + + def __init__(self, *args): + super(Sequential, self).__init__() + self._built = True + if len(args) == 1: + layers = args[0] + if isinstance(layers, list): + for index, layer in enumerate(layers): + self.add_module(str(index), layer) + elif isinstance(layers, OrderedDict): + for name, layer in layers.items(): + self.add_module(name, layer) + else: + raise TypeError('Layers must be list or orderedDict') + else: + for index, layer in enumerate(args): + self.add_module(str(index), layer) + self.layer_list = list(self._modules.values()) + + def _get_item_by_idx(self, iterator, idx): + """Get the idx-th item of the iterator""" + size = len(self) + idx = operator.index(idx) + if not -size <= idx < size: + raise IndexError('index {} is out of range'.format(idx)) + idx %= size + return next(islice(iterator, idx, None)) + + def __getitem__(self, idx): + if isinstance(idx, slice): + return self.__class__(OrderedDict(list(self._modules.items())[idx])) + index = _valid_index(len(self), idx) + return list(self._modules.values())[index] + + def __setitem__(self, index, layer): + if _valid_module(layer): + index = _valid_index(len(self), index) + key = list(self._modules.keys())[index] + self._modules[key] = layer + self.layer_list = list(self._modules.values()) + + def __delitem__(self, index): + if isinstance(index, int): + index = _valid_index(len(self), index) + key = list(self._modules.keys())[index] + del self._modules[key] + elif isinstance(index, slice): + keys = list(self._modules.keys())[index] + for key in keys: + del self._modules[key] + else: + raise TypeError('Index {} is not int type or slice type'.format(index)) + self.layer_list = list(self._modules.values()) + + def __len__(self): + return len(self._modules) + + def __dir__(self): + keys = super(Sequential, self).__dir__() + keys = [key for key in keys if not key.isdigit()] + return keys + + def __iter__(self): + return iter(self._modules.values()) + + def append(self, layer): + if _valid_module(layer): + self._modules[str(len(self))] = layer + self.layer_list = list(self._modules.values()) + return self + + def build(self, inputs_shape): + pass + + def forward(self, input_data): + for layer in self.layer_list: + input_data = layer(input_data) + return input_data + + # def _add_seq_node(self, input_tensors, output_tensors, layer): + # inputs_list = tolist(input_tensors) + # outputs_list = tolist(output_tensors) + # if layer.__class__.__name__ in tlx.layers.inputs.__all__: + # in_nodes = [] + # in_tensor_idxes = [0] + # else: + # in_nodes = [tensor._info[0] for tensor in inputs_list] + # in_tensor_idxes = [tensor._info[1] for tensor in inputs_list] + # node_index = len(_global_layer_node) + # + # new_node = ModuleNode( + # layer, node_index, in_nodes, inputs_list, outputs_list, in_tensor_idxes, select_attrs(layer) + # ) + # _global_layer_node.append(new_node) + # for idx, tensor in enumerate(outputs_list): + # tensor._info = (new_node, idx) + + +class ModuleList(Module): + """ + Holds Modules in a list. + + ModuleList can be used like a regular Python list, support + '__getitem__', '__setitem__', '__delitem__', '__len__', '__iter__' and '__iadd__', + but module it contains are properly registered, and will be visible by all Modules methods. + + Parameters + ---------- + args : list + List of subclass of Module. 
+ Methods + --------- + __init__() + Initializing the Layer. + insert() + Inserts a given layer before a given index in the list. + extend() + Appends layers from a Python iterable to the end of the list. + append() + Appends a given layer to the end of the list. + + Examples + --------- + >>> from tensorlayerx.nn import Module, ModuleList, Linear + >>> import tensorlayerx as tlx + >>> d1 = Linear(out_features=800, act=tlx.ReLU, in_features=784, name='linear1') + >>> d2 = Linear(out_features=800, act=tlx.ReLU, in_features=800, name='linear2') + >>> d3 = Linear(out_features=10, act=tlx.ReLU, in_features=800, name='linear3') + >>> layer_list = ModuleList([d1, d2]) + >>> # Inserts a given d2 before a given index in the list + >>> layer_list.insert(1, d2) + >>> layer_list.insert(2, d2) + >>> # Appends d2 from a Python iterable to the end of the list. + >>> layer_list.extend([d2]) + >>> # Appends a given d3 to the end of the list. + >>> layer_list.append(d3) + """ + + def __init__(self, modules=None): + super(ModuleList, self).__init__() + if modules is not None: + self.extend(modules) + + def __getitem__(self, index): + if isinstance(index, slice): + return self.__class__(list(self._modules.values())[index]) + if isinstance(index, int): + index = _valid_index(len(self), index) + return self._modules[str(index)] + raise TypeError('Index {} is not int type or slice type'.format(index)) + + def __setitem__(self, index, layer): + if not isinstance(index, int) and _valid_module(layer): + raise TypeError('Index {} is not int type'.format(index)) + index = _valid_index(len(self), index) + self._modules[str(index)] = layer + + def __delitem__(self, index): + if isinstance(index, int): + index = _valid_index(len(self), index) + del self._modules[str(index)] + elif isinstance(index, slice): + keys = list(self._modules.keys())[index] + for key in keys: + del self._modules[key] + else: + raise TypeError('Index {} is not int type or slice type'.format(index)) + temp_dict = OrderedDict() + for idx, layer in enumerate(self._modules.values()): + temp_dict[str(idx)] = layer + self._modules = temp_dict + + def __len__(self): + return len(self._modules) + + def __iter__(self): + return iter(self._modules.values()) + + def __iadd__(self, layers): + self.extend(layers) + return self + + def insert(self, index, layer): + """ + Inserts a given layer before a given index in the list. + + """ + idx = _valid_index(len(self), index) + _valid_module(layer) + length = len(self) + while length > idx: + self._modules[str(length)] = self._modules[str(length - 1)] + length -= 1 + self._modules[str(idx)] = layer + + def extend(self, layers): + """ + Appends layers from a Python iterable to the end of the list. + + """ + + if not isinstance(layers, list): + raise TypeError('Modules {} should be list of sublayers'.format(layers)) + for layer in layers: + if _valid_module(layer): + self._modules[str(len(self))] = layer + return self + + def append(self, layer): + """ + Appends a given layer to the end of the list. 
+ + """ + + if _valid_module(layer): + self._modules[str(len(self))] = layer + + def forward(self, *inputs): + raise NotImplementedError + + +class ModuleDict(Module): + + def __init__(self, modules=None): + super(ModuleDict, self).__init__() + if modules is not None: + self.update(modules) + + def __getitem__(self, key): + + return self._modules[key] + + def __setitem__(self, key, module): + + self.add_module(key, module) + + def __delitem__(self, key): + + del self._modules[key] + + def __len__(self): + + return len(self._modules) + + def __iter__(self): + + return iter(self._modules) + + def __contains__(self, key): + + return key in self._modules + + def clear(self): + + self._modules.clear() + + def pop(self, key): + + v = self[key] + del self[key] + return v + + def keys(self): + + return self._modules.keys() + + def items(self): + + return self._modules.items() + + def values(self): + + return self._modules.values() + + def update(self, modules): + if not isinstance(modules, container_abcs.Iterable): + raise TypeError( + "ModuleDict.update should be called with an " + "iterable of key/value pairs, but got " + type(modules).__name__ + ) + + if isinstance(modules, (OrderedDict, ModuleDict, container_abcs.Mapping)): + for key, module in modules.items(): + self[key] = module + else: + for j, m in enumerate(modules): + if not isinstance(m, container_abcs.Iterable): + raise TypeError( + "ModuleDict update sequence element " + "#" + str(j) + " should be Iterable; is" + type(m).__name__ + ) + if not len(m) == 2: + raise ValueError( + "ModuleDict update sequence element " + "#" + str(j) + " has length " + str(len(m)) + "; 2 is required" + ) + self[m[0]] = m[1] + + +class ParameterList(Module): + + def __init__(self, parameters=None): + super(ParameterList, self).__init__() + self._initialized = True + if parameters is not None: + self += parameters + + def __setstate__(self, state): + state['_initialized'] = False + super(ParameterList, self).__setstate__(state) + self._initialized = True + + def _get_abs_string_index(self, idx): + idx = operator.index(idx) + if not (-len(self) <= idx < len(self)): + raise IndexError('index {} is out of range'.format(idx)) + if idx < 0: + idx += len(self) + return str(idx) + + def __getitem__(self, idx): + if isinstance(idx, slice): + return self.__class__(list(self._parameters.values())[idx]) + else: + idx = self._get_abs_string_index(idx) + return self._parameters[str(idx)] + + def __setitem__(self, idx, param): + idx = self._get_abs_string_index(idx) + return self.register_parameter(str(idx), param) + + def __setattr__(self, key, value): + if getattr(self, "_initialized", False): + if not hasattr(self, key) and not isinstance(value, jt.nn.Parameter): + warnings.warn("Setting attributes on ParameterList is not supported.") + super(ParameterList, self).__setattr__(key, value) + + def __len__(self): + return len(self._parameters) + + def __iter__(self): + return iter(self._parameters.values()) + + def __iadd__(self, parameters): + return self.extend(parameters) + + def __dir__(self): + keys = super(ParameterList, self).__dir__() + keys = [key for key in keys if not key.isdigit()] + return keys + + def append(self, parameter): + + self.register_parameter(str(len(self)), parameter) + return self + + def extend(self, parameters): + if not isinstance(parameters, container_abcs.Iterable): + raise TypeError( + "ParameterList.extend should be called with an " + "iterable, but got " + type(parameters).__name__ + ) + offset = len(self) + for i, param in 
enumerate(parameters): + self.register_parameter(str(offset + i), param) + return self + + def __call__(self, input): + raise RuntimeError('ParameterList should not be called.') + + +class ParameterDict(Module): + + def __init__(self, parameters=None): + super(ParameterDict, self).__init__() + self._initialized = True + if parameters is not None: + self.update(parameters) + + def __setstate__(self, state): + state['_initialized'] = False + super(ParameterDict, self).__setstate__(state) + self._initialized = True + + def __getitem__(self, key): + return self._parameters[key] + + def __setitem__(self, key, parameter): + self.register_parameter(key, parameter) + + def __delitem__(self, key): + del self._parameters[key] + + def __setattr__(self, key, value): + if getattr(self, "_initialized", False): + if not hasattr(self, key) and not isinstance(value, jt.nn.Parameter): + warnings.warn("Setting attributes on ParameterDict is not supported.") + super(ParameterDict, self).__setattr__(key, value) + + def __len__(self): + return len(self._parameters) + + def __iter__(self): + return iter(self._parameters.keys()) + + def __reversed__(self): + return reversed(list(self._parameters.keys())) + + def copy(self): + + return ParameterDict(self._parameters.copy()) + + def __contains__(self, key): + return key in self._parameters + + def setdefault(self, key, default=None): + if key in self._parameters: + return self._parameters[key] + self[key] = default + return self._parameters[key] + + def clear(self): + self._parameters.clear() + + def pop(self, key): + v = self[key] + del self[key] + return v + + def popitem(self): + return self._parameters.popitem() + + def get(self, key, default=None): + + return self._parameters.get(key, default) + + def fromkeys(self, keys, default=None): + + return ParameterDict(self._parameters.fromkeys(keys, default)) # type: ignore[arg-type] + + def keys(self): + + return self._parameters.keys() + + def items(self): + + return self._parameters.items() + + def values(self): + + return self._parameters.values() + + def update(self, parameters): + if not isinstance(parameters, container_abcs.Iterable): + raise TypeError( + "ParametersDict.update should be called with an " + "iterable of key/value pairs, but got " + type(parameters).__name__ + ) + + if isinstance(parameters, (OrderedDict, ParameterDict)): + for key, parameter in parameters.items(): + self[key] = parameter + elif isinstance(parameters, container_abcs.Mapping): + for key, parameter in sorted(parameters.items()): + self[key] = parameter + else: + for j, p in enumerate(parameters): + if not isinstance(p, container_abcs.Iterable): + raise TypeError( + "ParameterDict update sequence element " + "#" + str(j) + " should be Iterable; is" + type(p).__name__ + ) + if not len(p) == 2: + raise ValueError( + "ParameterDict update sequence element " + "#" + str(j) + " has length " + str(len(p)) + "; 2 is required" + ) + # parameters as length-2 list too cumbersome to type, see ModuleDict.update comment + self[p[0]] = p[1] # type: ignore[assignment] + + def __call__(self, input): + raise RuntimeError('ParameterDict should not be called.') + + +def _valid_index(layer_num, index): + if not isinstance(index, int): + raise TypeError("Index {} is not int type") + if not -layer_num <= index < layer_num: + raise IndexError("Index should be a number in range [{}, {}), but got {}".format(-layer_num, layer_num, index)) + return index % layer_num + + +def _valid_module(layer): + if issubclass(layer.__class__, Module): + return True + 
raise TypeError('Module {} is not subclass of Module'.format(layer)) diff --git a/tensorlayerx/nn/initializers/jittor_initializers.py b/tensorlayerx/nn/initializers/jittor_initializers.py new file mode 100644 index 0000000..0a791ab --- /dev/null +++ b/tensorlayerx/nn/initializers/jittor_initializers.py @@ -0,0 +1,382 @@ +#! /usr/bin/python +# -*- coding: utf-8 -*- + +import jittor as jt +import jittor.transform +import tensorlayerx as tlx +import numpy as np + +__all__ = [ + 'Initializer', + 'Zeros', + 'Ones', + 'Constant', + 'RandomUniform', + 'RandomNormal', + 'TruncatedNormal', + 'deconv2d_bilinear_upsampling_initializer', + 'HeNormal', + 'HeUniform', + 'XavierNormal', + 'XavierUniform', +] + + +class Initializer(object): + """Initializer base class: all initializers inherit from this class. + """ + + def __call__(self, shape, dtype=None): + """Returns a tensor object initialized as specified by the initializer. + + Parameters + ---------- + shape : tuple of int. + The shape of the tensor. + dtype : Optional dtype of the tensor. + If not provided will return tensor of `tlx.float32`. + + Returns + ------- + + """ + raise NotImplementedError + + def get_config(self): + """Returns the configuration of the initializer as a JSON-serializable dict. + + Returns + ------- + A JSON-serializable Python dict. + """ + return {} + + @classmethod + def from_config(cls, config): + """Instantiates an initializer from a configuration dictionary. + + Parameters + ---------- + config : A python dictionary. + It will typically be the output of `get_config`. + + Returns + ------- + An Initializer instance. + """ + if 'dtype' in config: + config.pop('dtype') + return cls(**config) + + +class Zeros(Initializer): + """Initializer that generates tensors initialized to 0. + + Examples + -------- + + >>> import tensorlayerx as tlx + >>> init = tlx.initializers.zeros() + >>> print(init(shape=(5, 10), dtype=tlx.float32)) + + """ + + def __call__(self, shape, dtype=tlx.float32): + _tensor = jt.empty(size=shape, dtype=dtype) + return jt.zeros(_tensor) + + +class Ones(Initializer): + """Initializer that generates tensors initialized to 1. + + Examples + -------- + + >>> import tensorlayerx as tlx + >>> init = tlx.initializers.ones() + >>> print(init(shape=(5, 10), dtype=tlx.float32)) + + """ + + def __call__(self, shape, dtype=tlx.float32): + _tensor = jt.empty(size=shape, dtype=dtype) + return jt.ones(_tensor) + + +class Constant(Initializer): + """Initializer that generates tensors initialized to a constant value. + + Parameters + ---------- + value : A python scalar or a numpy array. + The assigned value. + + Examples + -------- + + >>> import tensorlayerx as tlx + >>> init = tlx.initializers.constant(value=10) + >>> print(init(shape=(5, 10), dtype=tlx.float32)) + + """ + + def __init__(self, value=0): + self.value = value + + def __call__(self, shape, dtype=tlx.float32): + _tensor = jt.empty(size=shape, dtype=dtype) + if isinstance(self.value, (int, float)): + return jt.init.constant_(_tensor, val=self.value) + elif isinstance(self.value, (jt.array, list, np.ndarray)): + _tensor.data = jittor.transform.ToTensor(self.value) + return _tensor + + def get_config(self): + return {"value": self.value} + + +class RandomUniform(Initializer): + """Initializer that generates tensors with a uniform distribution. + + Parameters + ---------- + minval : A python scalar or a scalar tensor. + Lower bound of the range of random values to generate. + maxval : A python scalar or a scalar tensor. 
+ Upper bound of the range of random values to generate. + seed : A Python integer. + Used to seed the random generator. + + Examples + -------- + + >>> import tensorlayerx as tlx + >>> init = tlx.initializers.random_uniform(minval=-0.05, maxval=0.05) + >>> print(init(shape=(5, 10), dtype=tlx.float32)) + + """ + + def __init__(self, minval=-0.05, maxval=0.05, seed=None): + self.minval = minval + self.maxval = maxval + self.seed = seed + + def __call__(self, shape, dtype=tlx.float32): + _tensor = jt.empty(size=shape, dtype=dtype) + return jt.nn.init.uniform_(_tensor, a=self.minval, b=self.maxval) + + def get_config(self): + return {"minval": self.minval, "maxval": self.maxval, "seed": self.seed} + + +class RandomNormal(Initializer): + """Initializer that generates tensors with a normal distribution. + + Parameters + ---------- + mean : A python scalar or a scalar tensor. + Mean of the random values to generate. + stddev : A python scalar or a scalar tensor. + Standard deviation of the random values to generate. + seed : A Python integer. + Used to seed the random generator. + + minval=-0.05, maxval=0.05 + + Examples + -------- + + >>> import tensorlayerx as tlx + >>> init = tlx.initializers.random_normal(mean=0.0, stddev=0.05) + >>> print(init(shape=(5, 10), dtype=tlx.float32)) + + """ + + def __init__(self, mean=0.0, stddev=0.05, seed=None): + self.mean = mean + self.stddev = stddev + self.seed = seed + + def __call__(self, shape, dtype=tlx.float32): + _tensor = jt.empty(size=shape) + return jt.normal(_tensor, mean=self.mean, std=self.stddev) + + def get_config(self): + return {"mean": self.mean, "stddev": self.stddev, "seed": self.seed} + + +class TruncatedNormal(Initializer): + """Initializer that generates a truncated normal distribution. + + These values are similar to values from a `RandomNormal` + except that values more than two standard deviations from the mean + are discarded and re-drawn. This is the recommended initializer for + neural network weights and filters. + + + Parameters + ---------- + mean : A python scalar or a scalar tensor. + Mean of the random values to generate. + stddev : A python scalar or a scalar tensor. + Standard deviation of the andom values to generate. + seed : A Python integer. + Used to seed the random generator. + + Examples + -------- + + >>> import tensorlayerx as tlx + >>> init = tlx.initializers.truncated_normal(mean=0.0, stddev=0.05) + >>> print(init(shape=(5, 10), dtype=tlx.float32)) + + """ + + def __init__(self, mean=0.0, stddev=0.05, seed=None): + self.mean = mean + self.stddev = stddev + self.seed = seed + + def __call__(self, shape, dtype=tlx.float32): + _tensor = jt.empty(size=shape) + return self._truncated_normal(_tensor, self.mean, self.stddev) + + def _truncated_normal(self, tensor, mean=0, std=0.09): + with jt.no_grad(): + size = tensor.shape + tmp = tensor.new_empty(size + (4, )).normal_() + valid = (tmp < 2) & (tmp > -2) + ind = valid.max(-1, keepdim=True)[1] + tensor.data.copy_(tmp.gather(-1, ind).squeeze(-1)) + tensor.data.mul_(std).add_(mean) + return tensor + + def get_config(self): + return {"mean": self.mean, "stddev": self.stddev, "seed": self.seed} + + +class HeNormal(Initializer): + """He normal initializer. + + Parameters + ---------- + seed : A Python integer. + Used to seed the random generator. 
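+    a : float
+        Negative slope of the rectifier that follows the layer; only used when
+        ``nonlinearity`` is ``'leaky_relu'``. Default: 0.
+    mode : str
+        Either ``'fan_in'`` (default) or ``'fan_out'``.
+    nonlinearity : str
+        Name of the non-linearity, e.g. ``'relu'`` or ``'leaky_relu'`` (default).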
+ + Examples + -------- + + >>> import tensorlayerx as tlx + >>> init = tlx.initializers.he_normal() + >>> print(init(shape=(5, 10), dtype=tlx.float32)) + + """ + + def __init__(self, a=0, mode='fan_in', nonlinearity='leaky_relu', seed=None): + self.a = a + self.mode = mode + self.nonlinearity = nonlinearity + self.seed = seed + + def __call__(self, shape, dtype=tlx.float32): + return tlx.ops.he_normal(shape=shape, a=self.a, mode=self.mode, nonlinearity=self.nonlinearity, dtype=dtype) + + def get_config(self): + return {"a": self.a, "mode ": self.mode, "nonlinearity": self.nonlinearity} + + +class HeUniform(Initializer): + """He uniform initializer. + + Parameters + ---------- + seed : A Python integer. + Used to seed the random generator. + + Examples + -------- + + >>> import tensorlayerx as tlx + >>> init = tlx.initializers.he_normal() + >>> print(init(shape=(5, 10), dtype=tlx.float32)) + + """ + + def __init__(self, a=0, mode='fan_in', nonlinearity='leaky_relu', seed=None): + self.a = a + self.mode = mode + self.nonlinearity = nonlinearity + self.seed = seed + + def __call__(self, shape, dtype=tlx.float32): + return tlx.ops.he_uniform( + shape=shape, a=self.a, mode=self.mode, nonlinearity=self.nonlinearity, dtype=dtype, seed=self.seed + ) + + def get_config(self): + return {"a": self.a, "mode ": self.mode, "nonlinearity": self.nonlinearity} + + +def deconv2d_bilinear_upsampling_initializer(shape): + """Returns the initializer that can be passed to DeConv2dLayer for initializing the + weights in correspondence to channel-wise bilinear up-sampling. + Used in segmentation approaches such as [FCN](https://arxiv.org/abs/1605.06211) + + Parameters + ---------- + shape : tuple of int + The shape of the filters, [height, width, output_channels, in_channels]. + It must match the shape passed to DeConv2dLayer. + + Returns + ------- + ``tf.constant_initializer`` + A constant initializer with weights set to correspond to per channel bilinear upsampling + when passed as W_int in DeConv2dLayer + + """ + raise NotImplementedError + + +class XavierNormal(Initializer): + """This class implements the Xavier weight initializer from the paper + by Xavier Glorot and Yoshua Bengio.using a normal distribution. + + Parameters + ---------- + seed : A Python integer. + Used to seed the random generator. + + """ + + def __init__(self, gain=1.0, seed=None): + self.gain = gain + self.seed = seed + + def __call__(self, shape, dtype=tlx.float32): + return tlx.ops.xavier_normal(shape=shape, gain=self.gain, dtype=dtype, seed=self.seed) + + def get_config(self): + return {"gain": self.gain} + + +class XavierUniform(Initializer): + """This class implements the Xavier weight initializer from the paper + by Xavier Glorot and Yoshua Bengio.using a uniform distribution. + + Parameters + ---------- + seed : A Python integer. + Used to seed the random generator. 
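+    gain : float
+        Scaling factor applied to the initializer. Default: 1.0.
+
+    Examples
+    --------
+
+    >>> # usage sketch mirroring the other initializers in this module
+    >>> import tensorlayerx as tlx
+    >>> init = tlx.initializers.XavierUniform(gain=1.0)
+    >>> print(init(shape=(5, 10), dtype=tlx.float32))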
+ + """ + + def __init__(self, gain=1.0, seed=None): + self.gain = gain + self.seed = seed + + def __call__(self, shape, dtype=tlx.float32): + return tlx.ops.xavier_uniform(shape=shape, gain=self.gain, dtype=dtype, seed=self.seed) + + def get_config(self): + return {"gain": self.gain} From 278ce4b48c50c7f661130e8c5b6054c6283c2c91 Mon Sep 17 00:00:00 2001 From: Hisham Date: Mon, 11 Mar 2024 02:43:02 +0800 Subject: [PATCH 14/27] updated load_initializers.py --- tensorlayerx/nn/initializers/load_initializers_backend.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorlayerx/nn/initializers/load_initializers_backend.py b/tensorlayerx/nn/initializers/load_initializers_backend.py index 90bbf0b..8b44a0c 100644 --- a/tensorlayerx/nn/initializers/load_initializers_backend.py +++ b/tensorlayerx/nn/initializers/load_initializers_backend.py @@ -14,5 +14,7 @@ from .torch_initializers import * elif BACKEND == 'oneflow': from .oneflow_initializers import * +elif BACKEND == 'jittor': + from .jittor_initializers import * else: raise NotImplementedError("This backend is not supported") From 7407a51fcc835893b6567bda04e6d80c795e04d5 Mon Sep 17 00:00:00 2001 From: Hisham Date: Mon, 11 Mar 2024 02:51:04 +0800 Subject: [PATCH 15/27] Updated core/__init__py --- tensorlayerx/nn/core/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorlayerx/nn/core/__init__.py b/tensorlayerx/nn/core/__init__.py index be955e7..00b37ed 100644 --- a/tensorlayerx/nn/core/__init__.py +++ b/tensorlayerx/nn/core/__init__.py @@ -13,5 +13,7 @@ from .core_torch import * elif BACKEND == 'oneflow': from .core_oneflow import * +elif BACKEND == 'jittor': + from .core_jittor import * else: raise ("Unsupported backend:", BACKEND) From 0a66e51992d631f1cd37aaab119c1d011cfa1c2e Mon Sep 17 00:00:00 2001 From: Hisham Date: Mon, 11 Mar 2024 03:11:14 +0800 Subject: [PATCH 16/27] Added /vision/ops/jittor_ops.py --- tensorlayerx/vision/ops/jittor_ops.py | 63 +++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 tensorlayerx/vision/ops/jittor_ops.py diff --git a/tensorlayerx/vision/ops/jittor_ops.py b/tensorlayerx/vision/ops/jittor_ops.py new file mode 100644 index 0000000..2846fdb --- /dev/null +++ b/tensorlayerx/vision/ops/jittor_ops.py @@ -0,0 +1,63 @@ +import jittor as jt +import jittor.transform +import numpy as np +__all__ = [ + 'box_iou', + 'nms', + 'box_area', +] + +def box_area(boxes): + if len(boxes.shape) == 1: + boxes = boxes[None, :] + return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + +def box_iou(boxes1, boxes2): + """ + Return intersection-over-union (Jaccard index) of boxes. 
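+
+    Boxes are expected in ``(x1, y1, x2, y2)`` format; for every pair the result
+    is ``inter / (area1 + area2 - inter)``, i.e. the standard Jaccard index.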
+ + Parameters + ---------- + boxes1 : Tensor + Tensor[N, 4] + boxes2 : Tensor + Tensor[M, 4] + + Returns + ------- + iou: Tensor + Tensor[N, M],the NxM matrix containing the pairwise IoU values for every element in boxes1 and boxes2 + + """ + area1 = box_area(boxes1) + area2 = box_area(boxes2) + + tl = jt.maximum(boxes1[:, None, :2], boxes2[:, :2]) + rb = jt.minimum(boxes1[:, None, 2:], boxes2[:, 2:]) + + wh = np.clip(rb - tl, 0, None) + inter = wh[:, :, 0] * wh[:, :, 1] + iou = inter / (area1[:, None] + area2 - inter) + return iou + + +def nms(boxes, scores, iou_threshold): + keep = [] + idx = jt.argsort(scores) + while idx.size > 0: + if idx.size == 1: + i = idx[0] + keep.append(i) + break + else: + max_score_index = idx[-1] + max_score_box = boxes[max_score_index][None, :] + keep.append(max_score_index) + idx = idx[:-1] + other_boxes = boxes[idx] + ious = box_iou(max_score_box, other_boxes) + idx = idx[ious[0] <= iou_threshold] + + keep = jittor.transform.to_tensor(keep) + keep = jt.flatten(keep) + return keep From 066c392357695482394cf7e34cc487e3b69c15b8 Mon Sep 17 00:00:00 2001 From: Hisham Date: Mon, 11 Mar 2024 03:33:31 +0800 Subject: [PATCH 17/27] Updated __init__py files for losses, metrics, model core, model utils, vision/ops --- tensorlayerx/losses/__init__.py | 2 ++ tensorlayerx/metrics/__init__.py | 3 +++ tensorlayerx/model/core.py | 3 ++- tensorlayerx/model/utils.py | 3 ++- tensorlayerx/vision/ops/__init__.py | 4 +++- 5 files changed, 12 insertions(+), 3 deletions(-) diff --git a/tensorlayerx/losses/__init__.py b/tensorlayerx/losses/__init__.py index 403ccea..887a53d 100644 --- a/tensorlayerx/losses/__init__.py +++ b/tensorlayerx/losses/__init__.py @@ -13,5 +13,7 @@ from .torch_cost import * elif BACKEND == 'oneflow': from .oneflow_cost import * +elif BACKEND == 'jittor': + from .jittor_cost import * else: raise NotImplementedError("This backend is not supported") diff --git a/tensorlayerx/metrics/__init__.py b/tensorlayerx/metrics/__init__.py index fb699e8..bcb0854 100644 --- a/tensorlayerx/metrics/__init__.py +++ b/tensorlayerx/metrics/__init__.py @@ -13,5 +13,8 @@ from .torch_metric import * elif BACKEND == 'oneflow': from .oneflow_metric import * +elif BACKEND == 'jittor': + from .jittor_metric import * + else: raise NotImplementedError("This backend is not supported") diff --git a/tensorlayerx/model/core.py b/tensorlayerx/model/core.py index 83a34dc..6e3e9ae 100644 --- a/tensorlayerx/model/core.py +++ b/tensorlayerx/model/core.py @@ -23,7 +23,8 @@ import paddle as pd if tlx.BACKEND == 'torch': import torch - +if tlx.BACKEND == 'jittor': + import torch __all__ = ['Model', 'WithLoss', 'WithGrad', 'TrainOneStep', 'TrainOneStepWithGradientClipping'] diff --git a/tensorlayerx/model/utils.py b/tensorlayerx/model/utils.py index 96722b6..5ab3570 100644 --- a/tensorlayerx/model/utils.py +++ b/tensorlayerx/model/utils.py @@ -13,7 +13,8 @@ import paddle as pd if tlx.BACKEND == 'torch': import torch - +if tlx.BACKEND == 'jittor': + import jittor class WithLoss(Module): """ High-Level API for Training or Testing. 
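
A minimal usage sketch for the jittor vision ops added above (an illustration,
assuming the jittor backend is available; it imports the backend module directly
and the box/score values are made up):

    import jittor as jt
    from tensorlayerx.vision.ops.jittor_ops import box_iou, nms

    boxes = jt.array([[0., 0., 10., 10.],
                      [1., 1., 11., 11.],
                      [50., 50., 60., 60.]])
    scores = jt.array([0.9, 0.8, 0.7])
    iou = box_iou(boxes, boxes)      # 3x3 pairwise IoU matrix
    keep = nms(boxes, scores, 0.5)   # indices of the boxes kept after suppression
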
diff --git a/tensorlayerx/vision/ops/__init__.py b/tensorlayerx/vision/ops/__init__.py index c06b24d..de4db27 100644 --- a/tensorlayerx/vision/ops/__init__.py +++ b/tensorlayerx/vision/ops/__init__.py @@ -12,6 +12,8 @@ elif BACKEND == 'torch': from .torch_ops import * elif BACKEND == 'oneflow': - from .torch_ops import *#TODO + from .torch_ops import * +elif BACKEND == 'jittor': + from .jittor_ops import *#TODO else: raise NotImplementedError("This backend is not supported") \ No newline at end of file From 7e73b2455455462f422ea999b83f36196fce4dd6 Mon Sep 17 00:00:00 2001 From: Hisham Date: Sun, 14 Apr 2024 12:04:26 +0800 Subject: [PATCH 18/27] Updating files to fix minor bugs --- tensorlayerx/backend/ops/jittor_backend.py | 2 +- tensorlayerx/backend/ops/jittor_nn.py | 2 +- tensorlayerx/model/core.py | 15 + tensorlayerx/nn/core/core_jittor.py | 2 +- .../nn/initializers/jittor_initializers.py | 33 +- tensorlayerx/optimizers/jittor_optimizers.py | 450 ++++++++++++++++++ .../optimizers/load_optimizers_backend.py | 2 + tensorlayerx/optimizers/lr/__init__.py | 2 + tensorlayerx/optimizers/lr/jittor_lr.py | 391 +++++++++++++++ 9 files changed, 876 insertions(+), 23 deletions(-) create mode 100644 tensorlayerx/optimizers/jittor_optimizers.py create mode 100644 tensorlayerx/optimizers/lr/jittor_lr.py diff --git a/tensorlayerx/backend/ops/jittor_backend.py b/tensorlayerx/backend/ops/jittor_backend.py index 31139e3..d4da98f 100644 --- a/tensorlayerx/backend/ops/jittor_backend.py +++ b/tensorlayerx/backend/ops/jittor_backend.py @@ -48,7 +48,7 @@ def set_context(**kwargs): def get_tensor_shape(x): - return list(x.shape()) + return list(x.shape) # initializers diff --git a/tensorlayerx/backend/ops/jittor_nn.py b/tensorlayerx/backend/ops/jittor_nn.py index 480fc5e..7dd6f6b 100644 --- a/tensorlayerx/backend/ops/jittor_nn.py +++ b/tensorlayerx/backend/ops/jittor_nn.py @@ -320,7 +320,7 @@ class Tanh(object): # jittor.nn.hardtanh(x, min_val=-1, max_val=1) def __init__(self): super(Tanh, self).__init__() - self.tanh = nn.Sigmoid() + self.tanh = nn.Tanh() def __call__(self, x): return self.tanh(x) diff --git a/tensorlayerx/model/core.py b/tensorlayerx/model/core.py index 6e3e9ae..ba5b3ce 100644 --- a/tensorlayerx/model/core.py +++ b/tensorlayerx/model/core.py @@ -128,6 +128,14 @@ def train(self, n_epoch, train_dataset=None, test_dataset=False, print_train_bat print_train_batch=print_train_batch, print_freq=print_freq, test_dataset=test_dataset, ) + elif tlx.BACKEND == "jittor": + self.of_train( + n_epoch=n_epoch, train_dataset=train_dataset, network=self.network, loss_fn=self.loss_fn, + train_weights=self.train_weights, optimizer=self.optimizer, metrics=self.metrics, + print_train_batch=print_train_batch, print_freq=print_freq, test_dataset=test_dataset, + ) + + def eval(self, test_dataset): self.network.set_eval() test_loss, test_acc, n_iter = 0, 0, 0 @@ -660,6 +668,8 @@ def __init__(self, network, loss_fn=None, optimizer=None): self.net_with_grad = WithGradMS(network, loss_fn, optimizer) elif tlx.BACKEND == 'paddle': self.net_with_grad = WithGradPD(network, loss_fn, optimizer) + elif tlx.BACKEND == 'jittor': + self.net_with_grad = WithGradPD(network, loss_fn, optimizer) else: raise NotImplementedError("This backend is not supported") @@ -706,6 +716,8 @@ def __init__(self, net_with_loss, optimizer, train_weights): self.net_with_train = TrainOneStepWithPD(net_with_loss, optimizer, train_weights) elif tlx.BACKEND == 'torch': self.net_with_train = TrainOneStepWithTH(net_with_loss, optimizer, train_weights) + 
elif tlx.BACKEND == 'jittor': + self.net_with_train = TrainOneStepWithTH(net_with_loss, optimizer, train_weights) else: raise NotImplementedError("This backend is not supported") @@ -759,6 +771,9 @@ def __init__(self, net_with_loss, optimizer, train_weights, gradient_clipping=tl elif tlx.BACKEND == 'torch': self.net_weith_train = TrainOneStepWithGradientClippingTH( net_with_loss, optimizer, train_weights, gradient_clipping) + elif tlx.BACKEND == 'jittor': + self.net_weith_train = TrainOneStepWithGradientClippingTH( + net_with_loss, optimizer, train_weights, gradient_clipping) else: raise NotImplementedError("This backend is not supported") diff --git a/tensorlayerx/nn/core/core_jittor.py b/tensorlayerx/nn/core/core_jittor.py index 0bcd68e..7f9668e 100644 --- a/tensorlayerx/nn/core/core_jittor.py +++ b/tensorlayerx/nn/core/core_jittor.py @@ -120,7 +120,7 @@ def _call_impl_tlx(self, *input, **kwargs): "Please check the layers name".format(name)) self._check = True - result = self._call_impl(*input, **kwargs) + result = super().__call__(*input, **kwargs) return result # TODO RNN enabled after repair __call__: Callable[..., Any] = _call_impl_tlx diff --git a/tensorlayerx/nn/initializers/jittor_initializers.py b/tensorlayerx/nn/initializers/jittor_initializers.py index 0a791ab..ca4664d 100644 --- a/tensorlayerx/nn/initializers/jittor_initializers.py +++ b/tensorlayerx/nn/initializers/jittor_initializers.py @@ -82,8 +82,8 @@ class Zeros(Initializer): """ def __call__(self, shape, dtype=tlx.float32): - _tensor = jt.empty(size=shape, dtype=dtype) - return jt.zeros(_tensor) + _tensor = jt.empty(shape, dtype=dtype) + return jt.zeros_like(_tensor) class Ones(Initializer): @@ -99,8 +99,7 @@ class Ones(Initializer): """ def __call__(self, shape, dtype=tlx.float32): - _tensor = jt.empty(size=shape, dtype=dtype) - return jt.ones(_tensor) + return jt.ones(shape, dtype=dtype) class Constant(Initializer): @@ -124,9 +123,9 @@ def __init__(self, value=0): self.value = value def __call__(self, shape, dtype=tlx.float32): - _tensor = jt.empty(size=shape, dtype=dtype) + _tensor = jt.empty(shape, dtype=dtype) if isinstance(self.value, (int, float)): - return jt.init.constant_(_tensor, val=self.value) + return jt.init.constant_(_tensor, value=self.value) elif isinstance(self.value, (jt.array, list, np.ndarray)): _tensor.data = jittor.transform.ToTensor(self.value) return _tensor @@ -162,7 +161,7 @@ def __init__(self, minval=-0.05, maxval=0.05, seed=None): self.seed = seed def __call__(self, shape, dtype=tlx.float32): - _tensor = jt.empty(size=shape, dtype=dtype) + _tensor = jt.empty(shape, dtype=dtype) return jt.nn.init.uniform_(_tensor, a=self.minval, b=self.maxval) def get_config(self): @@ -198,8 +197,9 @@ def __init__(self, mean=0.0, stddev=0.05, seed=None): self.seed = seed def __call__(self, shape, dtype=tlx.float32): - _tensor = jt.empty(size=shape) - return jt.normal(_tensor, mean=self.mean, std=self.stddev) + _tensor = jt.empty(shape) + return jt.init.gauss(_tensor.shape, _tensor.dtype, mean=self.mean, std=self.stddev) + def get_config(self): return {"mean": self.mean, "stddev": self.stddev, "seed": self.seed} @@ -238,18 +238,11 @@ def __init__(self, mean=0.0, stddev=0.05, seed=None): self.seed = seed def __call__(self, shape, dtype=tlx.float32): - _tensor = jt.empty(size=shape) - return self._truncated_normal(_tensor, self.mean, self.stddev) - - def _truncated_normal(self, tensor, mean=0, std=0.09): with jt.no_grad(): - size = tensor.shape - tmp = tensor.new_empty(size + (4, )).normal_() - valid = (tmp < 
2) & (tmp > -2) - ind = valid.max(-1, keepdim=True)[1] - tensor.data.copy_(tmp.gather(-1, ind).squeeze(-1)) - tensor.data.mul_(std).add_(mean) - return tensor + tensor = jt.randn(shape, dtype=dtype) + tensor *= self.stddev + tensor += self.mean + return tensor def get_config(self): return {"mean": self.mean, "stddev": self.stddev, "seed": self.seed} diff --git a/tensorlayerx/optimizers/jittor_optimizers.py b/tensorlayerx/optimizers/jittor_optimizers.py new file mode 100644 index 0000000..f53bb48 --- /dev/null +++ b/tensorlayerx/optimizers/jittor_optimizers.py @@ -0,0 +1,450 @@ +#! /usr/bin/python +# -*- coding: utf-8 -*- + +from __future__ import absolute_import, division, print_function +import jittor.optim as optimizer +import jittor as jt +from tensorlayerx.optimizers.lr import LRScheduler + +__all__ = ['Adadelta', 'Adagrad', 'Adam', 'Adamax', 'Ftrl', 'Nadam', 'RMSprop', 'SGD', 'Momentum', 'Lamb', 'LARS'] + + +class Adadelta(object): + + def __init__(self): + pass + + def app_gradients(self): + raise Exception('Adadelta optimizer function not implemented') + + +class Adagrad(object): + + def __init__(self): + pass + + def app_gradients(self): + raise Exception('Adagrad optimizer function not implemented') + +class Adam(object): + + def __init__( + self, + lr=0.001, + beta_1=0.9, + beta_2=0.999, + eps=1e-8, + weight_decay=0.0, + grad_clip=None, + ): + self.lr = lr + self.beta_1 = beta_1 + self.beta_2 = beta_2 + self.eps = eps + self.init_optim = False + self.weight_decay = weight_decay + self.grad_clip = grad_clip + + @jt.no_grad() + def apply_gradients(self, grads_and_vars=None, closure=None): + if not self.init_optim: + raise AttributeError("Can not apply gradients before zero_grad call.") + loss = None + if closure is not None: + with jt.enable_grad(): + loss = closure() + + for group in self.optimizer_adam.param_groups: + params_with_grad = [] + grads = [] + exp_avgs = [] + exp_avg_sqs = [] + max_exp_avg_sqs = [] + state_steps = [] + beta1, beta2 = group['betas'] + + for p in group['params']: + if p.grad is not None: + params_with_grad.append(p) + if p.grad.is_sparse: + raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') + grads.append(p.grad) + + state = self.optimizer_adam.state[p] + # Lazy state initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = jt.zeros_like(p) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = jt.zeros_like(p) + if group['amsgrad']: + # Maintains max of all exp. moving avg. of sq. grad. 
values + state['max_exp_avg_sq'] = jt.zeros_like(p, memory_format=jt.preserve_format) + + exp_avgs.append(state['exp_avg']) + exp_avg_sqs.append(state['exp_avg_sq']) + + if group['amsgrad']: + max_exp_avg_sqs.append(state['max_exp_avg_sq']) + + # update the steps for each param group update + state['step'] += 1 + # record the step after step update + state_steps.append(state['step']) + + optimizer.Adam(params_with_grad, + grads, + exp_avgs, + exp_avg_sqs, + max_exp_avg_sqs, + state_steps, + amsgrad=group['amsgrad'], + beta1=beta1, + beta2=beta2, + lr=get_lr(self.lr), + weight_decay=group['weight_decay'], + eps=group['eps']) + return loss + + def gradient(self, loss, weights=None, return_grad=True): + if weights is None: + raise AttributeError("Parameter train_weights must be entered.") + if not self.init_optim: + self.optimizer_adam = optimizer.Adam( + params=weights, lr=get_lr(self.lr), betas=(self.beta_1, self.beta_2), eps=self.eps, + weight_decay=self.weight_decay + ) + self.init_optim = True + self.optimizer_adam.zero_grad() + loss.backward() + + if self.grad_clip is not None: + self.grad_clip(weights) + + if return_grad ==True: + return _grads(weights) + else: + return None + + +class Adamax(object): + + def __init__(self): + pass + + def apply_gradients(self): + raise Exception('Adamax optimizer function not implemented') + + + +class Ftrl(object): + + def __init__(self): + raise NotImplementedError("Ftrl optimizer is not implemented") + + def apply_gradients(self): + pass + + def gradient(self, train_weights=None): + pass + + +class Nadam(object): + + def __init__(self): + raise NotImplementedError("Nadam optimizer is not implemented") + + def apply_gradients(self): + pass + + def gradient(self, train_weights=None): + pass + + +class RMSprop(object): + + def __init__( + self, + lr=0.001, + rho=0.99, + momentum=0.0, + eps=1e-08, + centered=False, + weight_decay=0.0, + grad_clip=None, + ): + self.lr = lr + self.rho = rho + self.momentum = momentum + self.eps = eps + self.centered = centered + self.init_optim = False + self.weight_decay = weight_decay + self.grad_clip = grad_clip + + @jt.no_grad() + def apply_gradients(self, grads_and_vars=None, closure=None): + if not self.init_optim: + raise AttributeError("Can not apply gradients before zero_grad call.") + + loss = None + if closure is not None: + with jt.enable_grad(): + loss = closure() + + for group in self.optimizer_rmsprop.param_groups: + params_with_grad = [] + grads = [] + square_avgs = [] + grad_avgs = [] + momentum_buffer_list = [] + + for p in group['params']: + if p.grad is None: + continue + params_with_grad.append(p) + + if p.grad.is_sparse: + raise RuntimeError('RMSprop does not support sparse gradients') + grads.append(p.grad) + + state = self.optimizer_rmsprop.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + state['square_avg'] = jt.zeros_like(p) + if group['momentum'] > 0: + state['momentum_buffer'] = jt.zeros_like(p) + if group['centered']: + state['grad_avg'] = jt.zeros_like(p) + + square_avgs.append(state['square_avg']) + + if group['momentum'] > 0: + momentum_buffer_list.append(state['momentum_buffer']) + if group['centered']: + grad_avgs.append(state['grad_avg']) + + state['step'] += 1 + + optimizer.RMSprop(params_with_grad, + grads, + square_avgs, + grad_avgs, + momentum_buffer_list, + lr=get_lr(self.lr), + alpha=group['alpha'], + eps=group['eps'], + weight_decay=group['weight_decay'], + momentum=group['momentum'], + centered=group['centered']) + + return loss + + def 
gradient(self, loss, weights=None, return_grad=True): + if weights is None: + raise AttributeError("Parameter train_weights must be entered.") + if not self.init_optim: + self.optimizer_rmsprop = optimizer.RMSprop( + params=weights, lr=get_lr(self.lr), alpha=self.rho, eps=self.eps, momentum=self.momentum, + centered=self.centered, weight_decay=self.weight_decay + ) + self.init_optim = True + self.optimizer_rmsprop.zero_grad() + loss.backward() + + if self.grad_clip is not None: + self.grad_clip(weights) + + if return_grad ==True: + return _grads(weights) + else: + return None + + +class SGD(object): + + def __init__( + self, + lr=0.001, + momentum=0, + weight_decay=0.0, + grad_clip=None, + ): + self.lr = lr + self.momentum = momentum + self.init_optim = False + self.weight_decay = weight_decay + self.grad_clip = grad_clip + + @jt.no_grad() + def apply_gradients(self, grads_and_vars=None, closure=None): + if not self.init_optim: + raise AttributeError("Can not apply gradients before zero_grad call.") + + loss = None + if closure is not None: + with jt.enable_grad(): + loss = closure() + + for group in self.optimizer_sgd.param_groups: + params_with_grad = [] + d_p_list = [] + momentum_buffer_list = [] + weight_decay = group['weight_decay'] + momentum = group['momentum'] + dampening = group['dampening'] + nesterov = group['nesterov'] + lr = get_lr(self.lr) + + for p in group['params']: + if p.grad is not None: + params_with_grad.append(p) + d_p_list.append(p.grad) + + state = self.optimizer_sgd.state[p] + if 'momentum_buffer' not in state: + momentum_buffer_list.append(None) + else: + momentum_buffer_list.append(state['momentum_buffer']) + + optimizer.SGD(params_with_grad, + d_p_list, + momentum_buffer_list, + weight_decay=weight_decay, + momentum=momentum, + lr=lr, + dampening=dampening, + nesterov=nesterov) + + # update momentum_buffers in state + for p, momentum_buffer in zip(params_with_grad, momentum_buffer_list): + state = self.optimizer_sgd.state[p] + state['momentum_buffer'] = momentum_buffer + + return loss + + def gradient(self, loss, weights=None, return_grad=True): + if weights is None: + raise AttributeError("Parameter train_weights must be entered.") + if not self.init_optim: + self.optimizer_sgd = optimizer.SGD( + params=weights, lr=get_lr(self.lr), momentum=self.momentum, weight_decay=self.weight_decay + ) + self.init_optim = True + self.optimizer_sgd.zero_grad() + loss.backward() + + if self.grad_clip is not None: + self.grad_clip(weights) + + if return_grad ==True: + return _grads(weights) + else: + return None + + +class Momentum(object): + + def __init__( + self, + lr=0.001, + momentum=0, + weight_decay=0.0, + nesterov=False, + grad_clip=None, + ): + self.lr = lr + self.momentum = momentum + self.init_optim = False + self.weight_decay = weight_decay + self.nesterov = nesterov + self.grad_clip = grad_clip + + @jt.no_grad() + def apply_gradients(self, grads_and_vars=None, closure=None): + if not self.init_optim: + raise AttributeError("Can not apply gradients before zero_grad call.") + + loss = None + if closure is not None: + with jt.enable_grad(): + loss = closure() + + for group in self.optimizer_momentum.param_groups: + params_with_grad = [] + d_p_list = [] + momentum_buffer_list = [] + weight_decay = group['weight_decay'] + momentum = group['momentum'] + dampening = group['dampening'] + nesterov = group['nesterov'] + lr = get_lr(self.lr) + + for p in group['params']: + if p.grad is not None: + params_with_grad.append(p) + d_p_list.append(p.grad) + + state = 
self.optimizer_momentum.state[p] + if 'momentum_buffer' not in state: + momentum_buffer_list.append(None) + else: + momentum_buffer_list.append(state['momentum_buffer']) + + optimizer.SGD(params_with_grad, + d_p_list, + momentum_buffer_list, + weight_decay=weight_decay, + momentum=momentum, + lr=lr, + dampening=dampening, + nesterov=nesterov) + + # update momentum_buffers in state + for p, momentum_buffer in zip(params_with_grad, momentum_buffer_list): + state = self.optimizer_momentum.state[p] + state['momentum_buffer'] = momentum_buffer + + return loss + + def gradient(self, loss, weights=None, return_grad=True): + if weights is None: + raise AttributeError("Parameter train_weights must be entered.") + if not self.init_optim: + self.optimizer_momentum = optimizer.SGD( + params=weights, lr=get_lr(self.lr), momentum=self.momentum, weight_decay=self.weight_decay, nesterov=self.nesterov + ) + self.init_optim = True + self.optimizer_momentum.zero_grad() + loss.backward() + + if self.grad_clip is not None: + self.grad_clip(weights) + + if return_grad ==True: + return _grads(weights) + else: + return None + + +def Lamb(**kwargs): + raise Exception('Lamb optimizer function not implemented') + + +def LARS(**kwargs): + raise Exception('LARS optimizer function not implemented') + + +def _grads(weights): + grads = [] + for w in weights: + grads.append(w.grad) + return grads + +def get_lr(lr): + if isinstance(lr, LRScheduler): + return lr() + return lr diff --git a/tensorlayerx/optimizers/load_optimizers_backend.py b/tensorlayerx/optimizers/load_optimizers_backend.py index 2b60dd2..71b3a37 100644 --- a/tensorlayerx/optimizers/load_optimizers_backend.py +++ b/tensorlayerx/optimizers/load_optimizers_backend.py @@ -14,5 +14,7 @@ from .torch_optimizers import * elif BACKEND == 'oneflow': from .oneflow_optimizers import * +elif BACKEND == 'jittor': + from .jittor_optimizers import * else: raise NotImplementedError("This backend is not supported") diff --git a/tensorlayerx/optimizers/lr/__init__.py b/tensorlayerx/optimizers/lr/__init__.py index 286f5c3..f2d15fa 100644 --- a/tensorlayerx/optimizers/lr/__init__.py +++ b/tensorlayerx/optimizers/lr/__init__.py @@ -14,5 +14,7 @@ from .torch_lr import * elif BACKEND == 'oneflow': from .oneflow_lr import * +elif BACKEND == 'jittor': + from .jittor_lr import * else: raise NotImplementedError("This backend is not supported") diff --git a/tensorlayerx/optimizers/lr/jittor_lr.py b/tensorlayerx/optimizers/lr/jittor_lr.py new file mode 100644 index 0000000..aa52a38 --- /dev/null +++ b/tensorlayerx/optimizers/lr/jittor_lr.py @@ -0,0 +1,391 @@ +#! /usr/bin/python +# -*- coding: utf-8 -*- +from __future__ import absolute_import, division, print_function +import jittor as jt +import math +import numpy as np + +__all__ = [ + 'LRScheduler', 'NoamDecay', 'PiecewiseDecay', 'NaturalExpDecay', 'InverseTimeDecay', 'PolynomialDecay', + 'LinearWarmup', 'ExponentialDecay', 'MultiStepDecay', 'StepDecay', 'LambdaDecay', 'ReduceOnPlateau', + 'CosineAnnealingDecay' +] + + +class LRScheduler(object): + """ + LRScheduler Base class. Define the common interface of a learning rate scheduler. + + User can import it by ``from tlx.optimizer.lr import LRScheduler`` , + + then overload it for your subclass and have a custom implementation of ``get_lr()`` . + + References + ---------- + - https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/optimizer/lr/LRScheduler_cn.html + + Parameters + ---------- + learning_rate : A floating point value + The learning rate. Defaults to 0.1. 
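+        The value is stored as a jittor variable (``jt.array``) so optimizers can
+        read the current rate by calling the scheduler object, e.g. ``scheduler()``.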
+ last_epoch : int + The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + verbose : bool + If ``True``, prints a message to stdout for each update. Default: ``False`` . + + + Examples + -------- + With TensorLayerX + + >>> #Here is an example of a simple ``StepDecay`` implementation. + >>> import tensorlayerx as tlx + >>> from tensorlayerx.optimizers.lr import LRScheduler + >>> class StepDecay(LRScheduler): + >>> def __init__(self, learning_rate, step_size, gamma = 0.1, last_epoch = -1, verbose=False): + >>> if not isinstance(step_size, int): + >>> raise TypeError("The type of 'step_size' must be 'int', but received %s." %type(step_size)) + >>> if gamma >= 1.0 : + >>> raise ValueError('gamma should be < 1.0.') + >>> self.step_size = step_size + >>> self.gamma = gamma + >>> super(StepDecay, self).__init__(learning_rate, last_epoch, verbose) + >>> def get_lr(self): + >>> i = self.last_epoch // self.step_size + >>> return self.base_lr * (self.gamma**i) + + """ + + def __init__(self, learning_rate=0.1, last_epoch=-1, verbose=False): + if not isinstance(learning_rate, (float, int)): + raise TypeError("The type of learning rate must be float, but received {}".format(type(learning_rate))) + self.base_lr = jt.array(float(learning_rate)) + self.last_lr = jt.array(float(learning_rate)) + self.last_epoch = last_epoch + self.verbose = verbose + + self.step() + + def __call__(self): + + return self.last_lr + + def step(self, epoch=None): + if epoch is None: + self.last_epoch += 1 + new_lr = self.get_lr() + else: + self.last_epoch = epoch + if hasattr(self, "_get_closed_form_lr"): + new_lr = self._get_closed_form_lr() + else: + new_lr = self.get_lr() + self.last_lr.fill_(new_lr) + if self.verbose: + print( + 'Epoch {}: {} set learning rate to {}.'.format(self.last_epoch, self.__class__.__name__, self.last_lr) + ) + + def get_lr(self): + + raise NotImplementedError + + +class StepDecay(LRScheduler): + + def __init__(self, learning_rate, step_size, gamma=0.1, last_epoch=-1, verbose=False): + if not isinstance(step_size, int): + raise TypeError("The type of 'step_size' must be 'int', but received %s." % type(step_size)) + if gamma >= 1.0: + raise ValueError('gamma should be < 1.0.') + self.step_size = step_size + self.gamma = gamma + super(StepDecay, self).__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + i = self.last_epoch // self.step_size + return self.base_lr * (self.gamma**i) + + +class CosineAnnealingDecay(LRScheduler): + + def __init__(self, learning_rate, T_max, eta_min=0, last_epoch=-1, verbose=False): + if not isinstance(T_max, int): + raise TypeError( + "The type of 'T_max' in 'CosineAnnealingDecay' must be 'int', but received %s." % type(T_max) + ) + if not isinstance(eta_min, (float, int)): + raise TypeError( + "The type of 'eta_min' in 'CosineAnnealingDecay' must be 'float, int', but received %s." 
% + type(eta_min) + ) + self.T_max = T_max + self.eta_min = float(eta_min) + super(CosineAnnealingDecay, self).__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + if self.last_epoch == 0: + return self.base_lr + elif (self.last_epoch - 1 - self.T_max) % (2 * self.T_max) == 0: + return self.last_lr + (self.base_lr - self.eta_min) * (1 - math.cos(math.pi / self.T_max)) / 2 + + return (1 + math.cos(math.pi * self.last_epoch / self.T_max) + ) / (1 + math.cos(math.pi * + (self.last_epoch - 1) / self.T_max)) * (self.last_lr - self.eta_min) + self.eta_min + + def _get_closed_form_lr(self): + return self.eta_min + (self.base_lr - self.eta_min) * (1 + math.cos(math.pi * self.last_epoch / self.T_max)) / 2 + + +class NoamDecay(LRScheduler): + + def __init__(self, d_model, warmup_steps, learning_rate=1.0, last_epoch=-1, verbose=False): + self.d_model = d_model + self.warmup_steps = warmup_steps + super(NoamDecay, self).__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + if self.last_epoch == 0: + a = 1 + else: + a = self.last_epoch**-0.5 + b = self.warmup_steps**-1.5 * self.last_epoch + return self.base_lr * (self.d_model**-0.5) * min(a, b) + + +class PiecewiseDecay(LRScheduler): + + def __init__(self, boundaries, values, last_epoch=-1, verbose=False): + self.boundaries = boundaries + self.values = values + super(PiecewiseDecay, self).__init__(last_epoch=last_epoch, verbose=verbose) + + def get_lr(self): + for i in range(len(self.boundaries)): + if self.last_epoch < self.boundaries[i]: + return self.values[i] + return self.values[len(self.values) - 1] + + + +class NaturalExpDecay(LRScheduler): + + def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False): + self.gamma = gamma + super(NaturalExpDecay, self).__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + return self.base_lr * math.exp(-1 * self.gamma * self.last_epoch) + + +class InverseTimeDecay(LRScheduler): + + def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False): + self.gamma = gamma + super(InverseTimeDecay, self).__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + return self.base_lr / (1 + self.gamma * self.last_epoch) + + +class PolynomialDecay(LRScheduler): + def __init__(self, learning_rate, decay_steps, end_lr=0.0001, power=1.0, cycle=False, last_epoch=-1, verbose=False): + self.decay_steps = decay_steps + self.end_lr = end_lr + self.power = power + self.cycle = cycle + super(PolynomialDecay, self).__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + tmp_epoch_num = self.last_epoch + tmp_decay_steps = self.decay_steps + if self.cycle: + div_res = math.ceil(float(self.last_epoch) / float(self.decay_steps)) + if self.last_epoch == 0: + div_res = 1 + tmp_decay_steps = self.decay_steps * div_res + else: + tmp_epoch_num = min(self.last_epoch, self.decay_steps) + + return (self.base_lr - + self.end_lr) * ((1 - float(tmp_epoch_num) / float(tmp_decay_steps))**self.power) + self.end_lr + + +class LinearWarmup(LRScheduler): + + def __init__(self, learning_rate, warmup_steps, start_lr, end_lr, last_epoch=-1, verbose=False): + type_check = isinstance(learning_rate, float) or isinstance(learning_rate, + int) or isinstance(learning_rate, LRScheduler) + if not type_check: + raise TypeError( + "the type of learning_rate should be [int, float or LRScheduler], the current type is {}". 
+ format(learning_rate) + ) + self.learning_rate = learning_rate + self.warmup_steps = warmup_steps + self.start_lr = start_lr + self.end_lr = end_lr + assert end_lr > start_lr, "end_lr {} must be greater than start_lr {}".format(end_lr, start_lr) + super(LinearWarmup, self).__init__(start_lr, last_epoch, verbose) + + def get_lr(self): + if self.last_epoch < self.warmup_steps: + return (self.end_lr - self.start_lr) * float(self.last_epoch) / float(self.warmup_steps) + self.start_lr + else: + if isinstance(self.learning_rate, LRScheduler): + lr_value = self.learning_rate() + self.learning_rate.step() + return lr_value + + return self.learning_rate + + +class ExponentialDecay(LRScheduler): + + def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False): + self.gamma = gamma + super(ExponentialDecay, self).__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + return self.base_lr * (self.gamma**self.last_epoch) + + +class MultiStepDecay(LRScheduler): + + def __init__(self, learning_rate, milestones, gamma=0.1, last_epoch=-1, verbose=False): + if not isinstance(milestones, (tuple, list)): + raise TypeError( + "The type of 'milestones' in 'MultiStepDecay' must be 'tuple, list', but received %s." % + type(milestones) + ) + + if not all([milestones[i] < milestones[i + 1] for i in range(len(milestones) - 1)]): + raise ValueError('The elements of milestones must be incremented') + if gamma >= 1.0: + raise ValueError('gamma should be < 1.0.') + + self.milestones = milestones + self.gamma = gamma + super(MultiStepDecay, self).__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + for i in range(len(self.milestones)): + if self.last_epoch < self.milestones[i]: + return self.base_lr * (self.gamma**i) + return self.base_lr * (self.gamma**len(self.milestones)) + + + +class LambdaDecay(LRScheduler): + + def __init__(self, learning_rate, lr_lambda, last_epoch=-1, verbose=False): + if not callable(lr_lambda): + raise TypeError( + "The type of 'lr_lambda' in 'LambdaDecay' must be 'function', but received %s." % type(lr_lambda) + ) + + self.lr_lambda = lr_lambda + super(LambdaDecay, self).__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + return self.base_lr * self.lr_lambda(self.last_epoch) + + +class ReduceOnPlateau(LRScheduler): + + def __init__( + self, learning_rate, mode='min', factor=0.1, patience=10, threshold=1e-4, threshold_mode='rel', cooldown=0, + min_lr=0, epsilon=1e-8, verbose=False + ): + mode = mode.lower() + if mode not in ['min', 'max']: + raise ValueError('mode: ' + mode + ' is unknown!') + self.mode = mode + + if factor >= 1.0: + raise ValueError('new_lr = origin_lr * gamma and gamma should be < 1.0.') + self.factor = factor + + threshold_mode = threshold_mode.lower() + if threshold_mode not in ['rel', 'abs']: + raise ValueError('threshold mode: ' + threshold_mode + ' is unknown!') + self.threshold_mode = threshold_mode + if not isinstance(learning_rate, (float, int)): + raise TypeError( + "The type of 'learning_rate' in 'ReduceOnPlateau' must be 'float', but received %s." % + type(learning_rate) + ) + + self.patience = patience + self.threshold = threshold + self.threshold_mode = threshold_mode + self.cooldown = cooldown + self.min_lr = min_lr + self.epsilon = epsilon + + self.cooldown_counter = 0 + self.best = None + self.num_bad_epochs = 0 + + # Can not call Parent __init__, so implement here. 
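+        # ReduceOnPlateau steps on a metric value rather than an epoch counter, so
+        # the attributes normally created by LRScheduler.__init__ are set up by hand.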
+ self.base_lr = jt.array(float(learning_rate)) + self.last_lr = jt.array(float(learning_rate)) + self.last_epoch = 0 + self.verbose = verbose + self._var_name = None + + # "cooldown_counter / best / num_bad_epochs / last_epoch / last_lr" will be stored. + def step(self, metrics, epoch=None): + + if epoch is None: + self.last_epoch = self.last_epoch + 1 + else: + self.last_epoch = epoch + + # loss must be float, numpy.ndarray or 1-D Tensor with shape [1] + if isinstance(metrics, (jt.array, np.ndarray)): + assert len(metrics.shape) == 1 and metrics.shape[0] == 1, "the metrics.shape " \ + "should be (1L,), but the current metrics.shape is {}. Maybe that " \ + "you should call tlx.reudce_mean to process it first.".format( + metrics.shape) + elif not isinstance(metrics, (int, float, np.float32, np.float64)): + raise TypeError( + "metrics must be 'int', 'float', 'np.float', 'numpy.ndarray', but receive {}".format( + type(metrics) + ) + ) + + if self.cooldown_counter > 0: + self.cooldown_counter -= 1 + else: + if self.best is None or self._is_better(metrics, self.best): + self.best = metrics + self.num_bad_epochs = 0 + else: + self.num_bad_epochs += 1 + + if self.num_bad_epochs > self.patience: + self.cooldown_counter = self.cooldown + self.num_bad_epochs = 0 + new_lr = max(self.last_lr * self.factor, self.min_lr) + if self.last_lr - new_lr > self.epsilon: + self.last_lr.fill_(new_lr) + if self.verbose: + print( + 'Epoch {}: {} set learning rate to {}.'.format( + self.last_epoch, self.__class__.__name__, self.last_lr + ) + ) + + def _is_better(self, current, best): + if self.mode == 'min' and self.threshold_mode == 'rel': + return current < best - best * self.threshold + + elif self.mode == 'min' and self.threshold_mode == 'abs': + return current < best - self.threshold + + elif self.mode == 'max' and self.threshold_mode == 'rel': + return current > best + best * self.threshold + + else: + return current > best + self.threshold \ No newline at end of file From 2e1832de08f15d35eab0233891a409bd5fb0442a Mon Sep 17 00:00:00 2001 From: Hisham Date: Tue, 25 Jun 2024 22:35:46 +0800 Subject: [PATCH 19/27] Updated Core_Jittor.py and enabled Activation Functions to function as expected --- tensorlayerx/nn/core/core_jittor.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tensorlayerx/nn/core/core_jittor.py b/tensorlayerx/nn/core/core_jittor.py index 7f9668e..b59124b 100644 --- a/tensorlayerx/nn/core/core_jittor.py +++ b/tensorlayerx/nn/core/core_jittor.py @@ -108,6 +108,15 @@ def _get_weights(self, var_name, shape, init=None, trainable=True, transposed=No self.var_name = var_name return param + def execute(self, *args, **kw): + ''' Executes the module computation. + + Raises NotImplementedError if the subclass does not override the method. 
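+        In this backend the call is simply forwarded to ``forward(*args, **kw)``.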
+ ''' + return self.forward(*args, **kw) + # raise NotImplementedError("Please implement 'execute' method of "+str(type(self))) + + def _call_impl_tlx(self, *input, **kwargs): if self._check == False: _param_name = [] From 0211bdf17c4342a9ec7ba5216521d0aeddb496cd Mon Sep 17 00:00:00 2001 From: Hisham Date: Tue, 25 Jun 2024 22:37:47 +0800 Subject: [PATCH 20/27] Updated dataflow/utils.py and added jittor --- tensorlayerx/dataflow/utils.py | 54 ++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/tensorlayerx/dataflow/utils.py b/tensorlayerx/dataflow/utils.py index a6c095b..45a152d 100644 --- a/tensorlayerx/dataflow/utils.py +++ b/tensorlayerx/dataflow/utils.py @@ -157,6 +157,58 @@ def default_collate_torch(batch): raise TypeError(default_collate_err_msg_format.format(elem_type)) +def default_collate_jittor(batch): + import jittor + import jittor.transform + elem = batch[0] + elem_type = type(elem) + if isinstance(elem, jittor.Var): + out = None + + return jittor.stack(batch, 0, out=out) + elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \ + and elem_type.__name__ != 'string_': + if elem_type.__name__ == 'ndarray' or elem_type.__name__ == 'memmap': + # array of string classes and object + if np_str_obj_array_pattern.search(elem.dtype.str) is not None: + raise TypeError(default_collate_err_msg_format.format(elem.dtype)) + + return default_collate([jittor.transform.to_tensor(b) for b in batch]) + elif elem.shape == (): # scalars + return jittor.transform.to_tensor(batch) + elif isinstance(elem, float): + return jittor.transform.to_tensor(batch, dtype=jittor.float64) + elif isinstance(elem, int): + return jittor.transform.to_tensor(batch) + elif isinstance(elem, string_classes): + return batch + elif isinstance(elem, collections.abc.Mapping): + try: + return elem_type({key: default_collate([d[key] for d in batch]) for key in elem}) + except TypeError: + # The mapping type may not support `__init__(iterable)`. + return {key: default_collate([d[key] for d in batch]) for key in elem} + elif isinstance(elem, tuple) and hasattr(elem, '_fields'): # namedtuple + return elem_type(*(default_collate(samples) for samples in zip(*batch))) + elif isinstance(elem, collections.abc.Sequence): + # check to make sure that the elements in batch have consistent size + it = iter(batch) + elem_size = len(next(it)) + if not all(len(elem) == elem_size for elem in it): + raise RuntimeError('each element in list of batch should be of equal size') + transposed = list(zip(*batch)) # It may be accessed twice, so we use a list. + + if isinstance(elem, tuple): + return [default_collate(samples) for samples in transposed] # Backwards compatibility. + else: + try: + return elem_type([default_collate(samples) for samples in transposed]) + except TypeError: + # The sequence type may not support `__init__(iterable)` (e.g., `range`). 
+ return [default_collate(samples) for samples in transposed] + + raise TypeError(default_collate_err_msg_format.format(elem_type)) + def default_collate_paddle(batch): import paddle elem = batch[0] @@ -310,6 +362,8 @@ def default_collate(batch): return default_collate_ms(batch) elif BACKEND == 'oneflow': return default_collate_flow(batch) + elif BACKEND == 'jittor': + return default_collate_jittor(batch) class _DatasetKind(object): From d70b8fb784c1c4f6934da3fec1a77b8a8c59bc8c Mon Sep 17 00:00:00 2001 From: Hisham Date: Wed, 26 Jun 2024 01:52:19 +0800 Subject: [PATCH 21/27] Updated Jittor_nn, dataflow/Utils and jittor_Cost --- tensorlayerx/backend/ops/jittor_nn.py | 4 +-- tensorlayerx/dataflow/utils.py | 13 ++++------ tensorlayerx/losses/jittor_cost.py | 36 +++++++++++++++++++++------ 3 files changed, 36 insertions(+), 17 deletions(-) diff --git a/tensorlayerx/backend/ops/jittor_nn.py b/tensorlayerx/backend/ops/jittor_nn.py index 7dd6f6b..d1548d3 100644 --- a/tensorlayerx/backend/ops/jittor_nn.py +++ b/tensorlayerx/backend/ops/jittor_nn.py @@ -535,7 +535,7 @@ def same_padding(input, weight, strides, dilations): # H(in) + 2* padding[0] - dilation[0] * (Ksize[0] - 1) - 1 # H(out) = = floor( -------------------------------------------------------------- + 1 ) # stride[0] - if isinstance(weight, jt.array): + if isinstance(weight, jt.Var): if len(input.shape) == 3: filter_rows = weight.size(2) if len(input.shape) == 4: @@ -1202,7 +1202,7 @@ def depthwise_conv2d(input, filter, strides, padding, data_format=None, dilation def same_padding_deconvolution(input, weight, strides, dilations): #H(out) = floor((H(in) - 1)*stride[0] - 2* padding[0] + dilation[0] * (ksize[0]-1) + 1) - if isinstance(weight, jt.array): + if isinstance(weight, jt.Var): if len(input.shape) == 3: filter_rows = weight.size(2) if len(input.shape) == 4: diff --git a/tensorlayerx/dataflow/utils.py b/tensorlayerx/dataflow/utils.py index 45a152d..7bd133a 100644 --- a/tensorlayerx/dataflow/utils.py +++ b/tensorlayerx/dataflow/utils.py @@ -159,13 +159,10 @@ def default_collate_torch(batch): def default_collate_jittor(batch): import jittor - import jittor.transform elem = batch[0] elem_type = type(elem) if isinstance(elem, jittor.Var): - out = None - - return jittor.stack(batch, 0, out=out) + return jittor.stack(batch, 0) elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \ and elem_type.__name__ != 'string_': if elem_type.__name__ == 'ndarray' or elem_type.__name__ == 'memmap': @@ -173,13 +170,13 @@ def default_collate_jittor(batch): if np_str_obj_array_pattern.search(elem.dtype.str) is not None: raise TypeError(default_collate_err_msg_format.format(elem.dtype)) - return default_collate([jittor.transform.to_tensor(b) for b in batch]) + return default_collate([jittor.array(b) for b in batch]) elif elem.shape == (): # scalars - return jittor.transform.to_tensor(batch) + return jittor.array(batch) elif isinstance(elem, float): - return jittor.transform.to_tensor(batch, dtype=jittor.float64) + return jittor.array(batch, dtype=jittor.float64) elif isinstance(elem, int): - return jittor.transform.to_tensor(batch) + return jittor.array(batch) elif isinstance(elem, string_classes): return batch elif isinstance(elem, collections.abc.Mapping): diff --git a/tensorlayerx/losses/jittor_cost.py b/tensorlayerx/losses/jittor_cost.py index 694c3fe..b2c1f59 100644 --- a/tensorlayerx/losses/jittor_cost.py +++ b/tensorlayerx/losses/jittor_cost.py @@ -27,6 +27,7 @@ ] + def softmax_cross_entropy_with_logits(output, target, 
reduction='mean'): """Softmax cross-entropy operation, returns the TensorFlow expression of cross-entropy for two distributions, it implements softmax internally. See ``tf.ops.sparse_softmax_cross_entropy_with_logits``. @@ -49,8 +50,14 @@ def softmax_cross_entropy_with_logits(output, target, reduction='mean'): - The code is borrowed from: ``__. """ + loss = jt.nn.cross_entropy_loss(output, target) # Use Jittor's cross-entropy loss function - return nn.CrossEntropyLoss(reduction=reduction)(output, target) + if reduction == 'mean': + return jt.mean(loss) + elif reduction == 'sum': + return jt.sum(loss) + else: + return loss def sigmoid_cross_entropy(output, target, reduction='mean'): @@ -67,8 +74,13 @@ def sigmoid_cross_entropy(output, target, reduction='mean'): """ - return nn.BCEWithLogitsLoss(reduction=reduction)(output, target) - + loss = nn.BCEWithLogitsLoss()(output, target) + if reduction == 'mean': + return jt.mean(loss) + elif reduction == 'sum': + return jt.sum(loss) + else: + return loss def binary_cross_entropy(output, target, reduction='mean'): """Binary cross entropy operation. @@ -86,8 +98,13 @@ def binary_cross_entropy(output, target, reduction='mean'): """ - return nn.BCELoss(reduction=reduction)(output, target) - + loss = nn.BCELoss()(output, target) + if reduction == 'mean': + return jt.mean(loss) + elif reduction == 'sum': + return jt.sum(loss) + else: + return loss def mean_squared_error(output, target, reduction='mean'): """Return the TensorFlow expression of mean-square-error (L2) of two batch of data. @@ -104,8 +121,13 @@ def mean_squared_error(output, target, reduction='mean'): - `Wiki Mean Squared Error `__ """ - - return nn.MSELoss(reduction=reduction)(output, target) + loss = nn.MSELoss()(output, target) + if reduction == 'mean': + return jt.mean(loss) + elif reduction == 'sum': + return jt.sum(loss) + else: + return loss def normalized_mean_square_error(output, target, reduction='mean'): From 480645d65049d26f96abf5a5e4c7d591e0a9caf1 Mon Sep 17 00:00:00 2001 From: Hisham Date: Sat, 29 Jun 2024 14:03:08 +0800 Subject: [PATCH 22/27] updated Jittor with WithGrad, TrainOneStep and TrainOneStepWithGradientClipping --- tensorlayerx/model/core.py | 12 ++++---- tensorlayerx/model/utils.py | 58 +++++++++++++++++++++++++++++++++++-- 2 files changed, 61 insertions(+), 9 deletions(-) diff --git a/tensorlayerx/model/core.py b/tensorlayerx/model/core.py index ba5b3ce..717a3b7 100644 --- a/tensorlayerx/model/core.py +++ b/tensorlayerx/model/core.py @@ -5,9 +5,9 @@ from tensorlayerx.nn.core.common import _save_weights, _load_weights, \ _save_standard_weights_dict, _load_standard_weights_dict -from .utils import WithLoss, WithGradPD, WithGradMS, WithGradTF, TrainOneStepWithPD, \ - TrainOneStepWithMS, TrainOneStepWithTH, TrainOneStepWithTF, GradWrap, \ - TrainOneStepWithGradientClippingTF, TrainOneStepWithGradientClippingPD, TrainOneStepWithGradientClippingTH +from .utils import WithLoss, WithGradPD, WithGradMS, WithGradTF,WithGradJT, TrainOneStepWithPD, \ + TrainOneStepWithMS, TrainOneStepWithTH,TrainOneStepWithJT, TrainOneStepWithTF, GradWrap, \ + TrainOneStepWithGradientClippingTF,TrainOneStepWithGradientClippingJT, TrainOneStepWithGradientClippingPD, TrainOneStepWithGradientClippingTH import tensorlayerx as tlx from tensorlayerx.nn import Module import numpy as np @@ -669,7 +669,7 @@ def __init__(self, network, loss_fn=None, optimizer=None): elif tlx.BACKEND == 'paddle': self.net_with_grad = WithGradPD(network, loss_fn, optimizer) elif tlx.BACKEND == 'jittor': - 
self.net_with_grad = WithGradPD(network, loss_fn, optimizer) + self.net_with_grad = WithGradJT(network, loss_fn, optimizer) else: raise NotImplementedError("This backend is not supported") @@ -717,7 +717,7 @@ def __init__(self, net_with_loss, optimizer, train_weights): elif tlx.BACKEND == 'torch': self.net_with_train = TrainOneStepWithTH(net_with_loss, optimizer, train_weights) elif tlx.BACKEND == 'jittor': - self.net_with_train = TrainOneStepWithTH(net_with_loss, optimizer, train_weights) + self.net_with_train = TrainOneStepWithJT(net_with_loss, optimizer, train_weights) else: raise NotImplementedError("This backend is not supported") @@ -772,7 +772,7 @@ def __init__(self, net_with_loss, optimizer, train_weights, gradient_clipping=tl self.net_weith_train = TrainOneStepWithGradientClippingTH( net_with_loss, optimizer, train_weights, gradient_clipping) elif tlx.BACKEND == 'jittor': - self.net_weith_train = TrainOneStepWithGradientClippingTH( + self.net_weith_train = TrainOneStepWithGradientClippingJT( net_with_loss, optimizer, train_weights, gradient_clipping) else: raise NotImplementedError("This backend is not supported") diff --git a/tensorlayerx/model/utils.py b/tensorlayerx/model/utils.py index 5ab3570..541b6a3 100644 --- a/tensorlayerx/model/utils.py +++ b/tensorlayerx/model/utils.py @@ -6,6 +6,10 @@ if tlx.BACKEND == 'tensorflow': import tensorflow as tf + +if tlx.BACKEND == 'jittor': + import jittor + if tlx.BACKEND == 'mindspore': from mindspore.ops import composite from mindspore.common import ParameterTuple @@ -13,8 +17,7 @@ import paddle as pd if tlx.BACKEND == 'torch': import torch -if tlx.BACKEND == 'jittor': - import jittor + class WithLoss(Module): """ High-Level API for Training or Testing. @@ -128,6 +131,25 @@ def __call__(self, inputs, label): grads = self.optimizer.gradient(loss, self.train_weights) return grads +class WithGradJT(object): + def __init__(self, network, loss_fn=None, optimizer=None): + self.network = network + self.loss_fn = loss_fn + self.train_weights = self.network.trainable_weights + self.optimizer = optimizer + if loss_fn is None: + self.network_with_loss = network + else: + self.network_with_loss = WithLoss(self.network, self.loss_fn) + self.network.set_train() + + def __call__(self, inputs, label): + loss = self.network_with_loss(inputs, label) + grads = self.optimizer.gradient(loss, self.train_weights) + return grads + + + class TrainOneStepWithTF(object): @@ -197,6 +219,21 @@ def __call__(self, data, label, *args, **kwargs): self.optimizer.apply_gradients(zip(grads, self.train_weights)) return loss.cpu().detach().numpy() + + +class TrainOneStepWithJT(object): + def __init__(self, net_with_loss, optimizer, train_weights): + self.net_with_loss = net_with_loss + self.optimizer = optimizer + self.train_weights = train_weights + + def __call__(self, data, label, *args, **kwargs): + loss = self.net_with_loss(data, label, *args, **kwargs) + grads = self.optimizer.gradient(loss, self.train_weights) + self.optimizer.apply_gradients(zip(grads, self.train_weights)) + return loss.numpy() + + class TrainOneStepWithGradientClippingTF(object): def __init__(self, net_with_loss, optimizer, train_weights, gradient_clipping): self.net_with_loss = net_with_loss @@ -254,4 +291,19 @@ def __call__(self, data, label): loss = self.net_with_loss(data, label) grads = self.optimizer.gradient(loss, self.train_weights, grad_clip=self.gradient_clipping) self.optimizer.apply_gradients(zip(grads, self.train_weights)) - return loss \ No newline at end of file + return loss + + 
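+# Jittor variant of one-step training with gradient clipping: the clipping object is
+# passed to ``optimizer.gradient`` as ``grad_clip`` so gradients are clipped before
+# ``apply_gradients`` updates the weights.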
+class TrainOneStepWithGradientClippingJT(object): + def __init__(self, net_with_loss, optimizer, train_weights, gradient_clipping): + self.net_with_loss = net_with_loss + self.optimizer = optimizer + self.train_weights = train_weights + self.gradient_clipping = gradient_clipping + + def __call__(self, data, label): + loss = self.net_with_loss(data, label) + grads = self.optimizer.gradient(loss, self.train_weights, grad_clip=self.gradient_clipping) + self.optimizer.apply_gradients(zip(grads, self.train_weights)) + return loss.numpy() + From 8946a63e0facda54ef175e0b07209416480c1589 Mon Sep 17 00:00:00 2001 From: Hisham Date: Sat, 29 Jun 2024 19:40:47 +0800 Subject: [PATCH 23/27] Updating Jittor Optimizer to intergrate Adam --- tensorlayerx/optimizers/jittor_optimizers.py | 107 ++----------------- 1 file changed, 9 insertions(+), 98 deletions(-) diff --git a/tensorlayerx/optimizers/jittor_optimizers.py b/tensorlayerx/optimizers/jittor_optimizers.py index f53bb48..173d77e 100644 --- a/tensorlayerx/optimizers/jittor_optimizers.py +++ b/tensorlayerx/optimizers/jittor_optimizers.py @@ -4,6 +4,7 @@ from __future__ import absolute_import, division, print_function import jittor.optim as optimizer import jittor as jt +import jittor.nn as nn from tensorlayerx.optimizers.lr import LRScheduler __all__ = ['Adadelta', 'Adagrad', 'Adam', 'Adamax', 'Ftrl', 'Nadam', 'RMSprop', 'SGD', 'Momentum', 'Lamb', 'LARS'] @@ -26,107 +27,16 @@ def __init__(self): def app_gradients(self): raise Exception('Adagrad optimizer function not implemented') -class Adam(object): - - def __init__( - self, - lr=0.001, - beta_1=0.9, - beta_2=0.999, - eps=1e-8, - weight_decay=0.0, - grad_clip=None, - ): - self.lr = lr - self.beta_1 = beta_1 - self.beta_2 = beta_2 - self.eps = eps - self.init_optim = False - self.weight_decay = weight_decay - self.grad_clip = grad_clip - - @jt.no_grad() - def apply_gradients(self, grads_and_vars=None, closure=None): - if not self.init_optim: - raise AttributeError("Can not apply gradients before zero_grad call.") - loss = None - if closure is not None: - with jt.enable_grad(): - loss = closure() - - for group in self.optimizer_adam.param_groups: - params_with_grad = [] - grads = [] - exp_avgs = [] - exp_avg_sqs = [] - max_exp_avg_sqs = [] - state_steps = [] - beta1, beta2 = group['betas'] - - for p in group['params']: - if p.grad is not None: - params_with_grad.append(p) - if p.grad.is_sparse: - raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') - grads.append(p.grad) - - state = self.optimizer_adam.state[p] - # Lazy state initialization - if len(state) == 0: - state['step'] = 0 - # Exponential moving average of gradient values - state['exp_avg'] = jt.zeros_like(p) - # Exponential moving average of squared gradient values - state['exp_avg_sq'] = jt.zeros_like(p) - if group['amsgrad']: - # Maintains max of all exp. moving avg. of sq. grad. 
values - state['max_exp_avg_sq'] = jt.zeros_like(p, memory_format=jt.preserve_format) - - exp_avgs.append(state['exp_avg']) - exp_avg_sqs.append(state['exp_avg_sq']) - - if group['amsgrad']: - max_exp_avg_sqs.append(state['max_exp_avg_sq']) - - # update the steps for each param group update - state['step'] += 1 - # record the step after step update - state_steps.append(state['step']) - - optimizer.Adam(params_with_grad, - grads, - exp_avgs, - exp_avg_sqs, - max_exp_avg_sqs, - state_steps, - amsgrad=group['amsgrad'], - beta1=beta1, - beta2=beta2, - lr=get_lr(self.lr), - weight_decay=group['weight_decay'], - eps=group['eps']) - return loss - def gradient(self, loss, weights=None, return_grad=True): - if weights is None: - raise AttributeError("Parameter train_weights must be entered.") - if not self.init_optim: - self.optimizer_adam = optimizer.Adam( - params=weights, lr=get_lr(self.lr), betas=(self.beta_1, self.beta_2), eps=self.eps, - weight_decay=self.weight_decay - ) - self.init_optim = True - self.optimizer_adam.zero_grad() - loss.backward() +class Adam(object): + def __init__(self, params, lr=0.001, beta_1=0.9, beta_2=0.999, eps=1e-8, weight_decay=0.0): + self.optimizer = optimizer.Adam(params, lr=lr, eps=eps, betas=(beta_1, beta_2), weight_decay=weight_decay) - if self.grad_clip is not None: - self.grad_clip(weights) - - if return_grad ==True: - return _grads(weights) - else: - return None + def step(self, loss=None): + self.optimizer.step(loss) + def zero_grad(self): + self.optimizer.zero_grad() class Adamax(object): @@ -346,6 +256,7 @@ def gradient(self, loss, weights=None, return_grad=True): return None + class Momentum(object): def __init__( From c8b6108aa562199ecd3937ae0bd0ba085cbe3513 Mon Sep 17 00:00:00 2001 From: Hisham Date: Thu, 4 Jul 2024 22:47:05 +0800 Subject: [PATCH 24/27] modfied core.py to integrate jittor optmizer. updated jittor metrics. 
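
Note: the reworked Adam wrapper above follows Jittor's optimizer convention of passing the loss straight to `step()`, which computes and applies the gradients internally, so no separate backward or `apply_gradients` call is needed. A minimal usage sketch under that assumption is shown below; the `TinyNet` module and the dummy batch are illustrative only and are not part of this patch.

    import os
    os.environ['TL_BACKEND'] = 'jittor'   # select the Jittor backend before importing tensorlayerx

    import numpy as np
    import tensorlayerx as tlx
    from tensorlayerx.nn import Module, Linear
    from tensorlayerx.optimizers import Adam

    class TinyNet(Module):
        # illustrative two-layer classifier
        def __init__(self):
            super(TinyNet, self).__init__()
            self.fc1 = Linear(out_features=64, act=tlx.nn.ReLU, in_features=784)
            self.fc2 = Linear(out_features=10, act=None, in_features=64)

        def forward(self, x):
            return self.fc2(self.fc1(x))

    net = TinyNet()
    optimizer = Adam(net.trainable_weights, lr=0.001)
    loss_fn = tlx.losses.softmax_cross_entropy_with_logits

    # dummy batch standing in for a DataLoader
    x = tlx.convert_to_tensor(np.random.rand(32, 784).astype('float32'))
    y = tlx.convert_to_tensor(np.random.randint(0, 10, (32,)).astype('int64'))

    logits = net(x)
    loss = loss_fn(logits, y)
    optimizer.zero_grad()    # clear any previously accumulated gradients
    optimizer.step(loss)     # Jittor derives and applies the gradients from the loss

The same `zero_grad()` / `step(loss)` pattern is what the updated CIFAR10 example and the `jt_train` loop in this patch rely on.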
--- examples/basic_tutorials/cifar10_cnn.py | 408 +++++++++++------- examples/basic_tutorials/cifar10_cnn_train.py | 10 +- tensorlayerx/backend/ops/jittor_nn.py | 28 +- tensorlayerx/metrics/jittor_metric.py | 15 +- tensorlayerx/model/core.py | 76 +++- tensorlayerx/nn/core/core_jittor.py | 4 +- tensorlayerx/optimizers/jittor_optimizers.py | 22 + 7 files changed, 393 insertions(+), 170 deletions(-) diff --git a/examples/basic_tutorials/cifar10_cnn.py b/examples/basic_tutorials/cifar10_cnn.py index c7cc15d..569a7bb 100644 --- a/examples/basic_tutorials/cifar10_cnn.py +++ b/examples/basic_tutorials/cifar10_cnn.py @@ -2,32 +2,30 @@ # -*- coding: utf-8 -*- import os -# os.environ['TL_BACKEND'] = 'paddle' -# os.environ['TL_BACKEND'] = 'tensorflow' -# os.environ['TL_BACKEND'] = 'mindspore' -os.environ['TL_BACKEND'] = 'torch' - import time +import numpy as np +import tensorlayerx as tlx from tensorlayerx.dataflow import Dataset, DataLoader from tensorlayerx.vision.transforms import ( Compose, Resize, RandomFlipHorizontal, RandomContrast, RandomBrightness, StandardizePerImage, RandomCrop ) -from tensorlayerx.model import TrainOneStep -from tensorlayerx.nn import Module -import tensorlayerx as tlx -from tensorlayerx.nn import (Conv2d, Linear, Flatten, MaxPool2d, BatchNorm2d) -# enable debug logging +from tensorlayerx.nn import Conv2d, Linear, Flatten, Module +from tensorlayerx.optimizers import Adam +from tqdm import tqdm + +# Enable debug logging tlx.logging.set_verbosity(tlx.logging.DEBUG) -# ################## Download and prepare the CIFAR10 dataset ################## -# This is just some way of getting the CIFAR10 dataset from an online location -# and loading it into numpy arrays with shape [32,32,3] -X_train, y_train, X_test, y_test = tlx.files.load_cifar10_dataset(shape=(-1, 32, 32, 3), plotable=False) +os.environ['TL_BACKEND'] = 'jittor' + -# ################## CIFAR10 dataset ################## -# We define a Dataset class for Loading CIFAR10 images and labels. -class make_dataset(Dataset): +# Download and prepare the CIFAR10 dataset with progress bar +print("Downloading CIFAR10 dataset...") +X_train, y_train, X_test, y_test = tlx.files.load_cifar10_dataset(shape=(-1, 32, 32, 3), plotable=False) + +# Define the CIFAR10 dataset +class CIFAR10Dataset(Dataset): def __init__(self, data, label, transforms): self.data = data self.label = label @@ -37,131 +35,247 @@ def __getitem__(self, idx): x = self.data[idx].astype('uint8') y = self.label[idx].astype('int64') x = self.transforms(x) - return x, y def __len__(self): - return len(self.label) -# We define the CIFAR10 iamges preprocessing pipeline. 
-train_transforms = Compose( # Combining multiple operations sequentially - [ - RandomCrop(size=[24, 24]), #random crop from images to shape [24, 24] - RandomFlipHorizontal(), # random invert each image horizontally by probability - RandomBrightness(brightness_factor=(0.5, 1.5)), # Within the range of values (0.5, 1.5), adjust brightness randomly - RandomContrast(contrast_factor=(0.5, 1.5)), # Within the range of values (0.5, 1.5), adjust contrast randomly - StandardizePerImage() #Normalize the values of each image to [-1, 1] - ] -) +# Define the CIFAR10 images preprocessing pipeline +train_transforms = Compose([ + RandomCrop(size=[24, 24]), + RandomFlipHorizontal(), + RandomBrightness(brightness_factor=(0.5, 1.5)), + RandomContrast(contrast_factor=(0.5, 1.5)), + StandardizePerImage() +]) test_transforms = Compose([Resize(size=(24, 24)), StandardizePerImage()]) -# We use DataLoader to batch and shuffle data, and make data into iterators. -train_dataset = make_dataset(data=X_train, label=y_train, transforms=train_transforms) -test_dataset = make_dataset(data=X_test, label=y_test, transforms=test_transforms) - -train_dataset = DataLoader(train_dataset, batch_size=128, shuffle=True) -test_dataset = DataLoader(test_dataset, batch_size=128) +# Create DataLoaders for training and testing +print("Processing CIFAR10 dataset...") +train_dataset = CIFAR10Dataset(data=X_train, label=y_train, transforms=train_transforms) +test_dataset = CIFAR10Dataset(data=X_test, label=y_test, transforms=test_transforms) -# ################## CNN network ################## -class CNN(Module): +train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True) +test_dataloader = DataLoader(test_dataset, batch_size=128) +# Define a simple CNN model +class SimpleCNN(Module): def __init__(self): - super(CNN, self).__init__() - # Parameter initialization method - W_init = tlx.nn.initializers.truncated_normal(stddev=5e-2) - W_init2 = tlx.nn.initializers.truncated_normal(stddev=0.04) - b_init2 = tlx.nn.initializers.constant(value=0.1) - - # 2D Convolutional Neural Network, Set padding method "SAME", convolutional kernel size [5,5], stride [1,1], in channels, out channels - self.conv1 = Conv2d(64, (5, 5), (1, 1), padding='SAME', W_init=W_init, b_init=None, name='conv1', in_channels=3) - # Add 2D BatchNormalize, using ReLU for output. - self.bn = BatchNorm2d(num_features=64, act=tlx.nn.ReLU) - # Add 2D Max pooling layer. - self.maxpool1 = MaxPool2d((3, 3), (2, 2), padding='SAME', name='pool1') - - self.conv2 = Conv2d( - 64, (5, 5), (1, 1), padding='SAME', act=tlx.nn.ReLU, W_init=W_init, name='conv2', in_channels=64 - ) - self.maxpool2 = MaxPool2d((3, 3), (2, 2), padding='SAME', name='pool2') - # Flatten 2D data to 1D data - self.flatten = Flatten(name='flatten') - # Linear layer with 384 units, using ReLU for output. - self.linear1 = Linear(384, act=tlx.nn.ReLU, W_init=W_init2, b_init=b_init2, name='linear1relu', in_features=2304) - self.linear2 = Linear(192, act=tlx.nn.ReLU, W_init=W_init2, b_init=b_init2, name='linear2relu', in_features=384) - self.linear3 = Linear(10, act=None, W_init=W_init2, name='output', in_features=192) - - # We define the forward computation process. 
+ super(SimpleCNN, self).__init__() + self.conv1 = Conv2d(16, (3, 3), (1, 1), padding='SAME', act=tlx.nn.ReLU, in_channels=3) + self.flatten = Flatten() + self.fc1 = Linear(out_features=64, act=tlx.nn.ReLU, in_features=16 * 24 * 24) + self.fc2 = Linear(out_features=10, act=None, in_features=64) + def forward(self, x): z = self.conv1(x) - z = self.bn(z) - z = self.maxpool1(z) - z = self.conv2(z) - z = self.maxpool2(z) z = self.flatten(z) - z = self.linear1(z) - z = self.linear2(z) - z = self.linear3(z) + z = self.fc1(z) + z = self.fc2(z) return z +# Instantiate the model +model = SimpleCNN() -# get the network -net = CNN() +# Define the optimizer +optimizer = Adam(model.trainable_weights, lr=0.001) -# training settings -n_epoch = 500 -learning_rate = 0.0001 -print_freq = 5 -n_step_epoch = int(len(y_train) / 128) -n_step = n_epoch * n_step_epoch -shuffle_buffer_size = 128 -# Get training parameters -train_weights = net.trainable_weights -# Define the optimizer, use the Adam optimizer. -optimizer = tlx.optimizers.Adam(learning_rate) -# Define evaluation metrics. -metrics = tlx.metrics.Accuracy() +# Define the loss function +loss_fn = tlx.losses.softmax_cross_entropy_with_logits -# Define the loss calculation process -class WithLoss(Module): +# Training loop +n_epoch = 2 +for epoch in range(n_epoch): + start_time = time.time() + model.set_train() + train_loss, n_iter = 0, 0 - def __init__(self, net, loss_fn): - super(WithLoss, self).__init__() - self._net = net - self._loss_fn = loss_fn + with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch + 1}/{n_epoch}", unit="batch") as pbar: + for X_batch, y_batch in train_dataloader: + X_batch = tlx.convert_to_tensor(X_batch) + y_batch = tlx.convert_to_tensor(y_batch) + _logits = model(X_batch) + loss = loss_fn(_logits, y_batch) + + optimizer.zero_grad() + optimizer.step(loss) + + train_loss += loss.item() + n_iter += 1 + pbar.update(1) - def forward(self, data, label): - out = self._net(data) - loss = self._loss_fn(out, label) - return loss + print(f"Epoch {epoch + 1} of {n_epoch} took {time.time() - start_time:.2f}s") + print(f" train loss: {train_loss / n_iter:.4f}") -net_with_loss = WithLoss(net, loss_fn=tlx.losses.softmax_cross_entropy_with_logits) -# Initialize one-step training -net_with_train = TrainOneStep(net_with_loss, optimizer, train_weights) -# Custom training loops -for epoch in range(n_epoch): - start_time = time.time() - # Set the network to training state - net.set_train() - train_loss, train_acc, n_iter = 0, 0, 0 - # Get training data and labels - for X_batch, y_batch in train_dataset: - # Calculate the loss value, and automatically complete the gradient update - _loss_ce = net_with_train(X_batch, y_batch) - train_loss += _loss_ce - - n_iter += 1 - _logits = net(X_batch) - # Calculate accuracy - metrics.update(_logits, y_batch) - train_acc += metrics.result() - metrics.reset() - print("Epoch {} of {} took {}".format(epoch + 1, n_epoch, time.time() - start_time)) - print(" train loss: {}".format(train_loss / n_iter)) - print(" train acc: {}".format(train_acc / n_iter)) +################################ TensorLayerX and Jittor can be mixed programming. 
################################# + + + +# import os +# # os.environ['TL_BACKEND'] = 'paddle' +# # os.environ['TL_BACKEND'] = 'tensorflow' +# # os.environ['TL_BACKEND'] = 'mindspore' +# os.environ['TL_BACKEND'] = 'torch' + + +# import time +# from tensorlayerx.dataflow import Dataset, DataLoader +# from tensorlayerx.vision.transforms import ( +# Compose, Resize, RandomFlipHorizontal, RandomContrast, RandomBrightness, StandardizePerImage, RandomCrop +# ) +# from tensorlayerx.model import TrainOneStep +# from tensorlayerx.nn import Module +# import tensorlayerx as tlx +# from tensorlayerx.nn import (Conv2d, Linear, Flatten, MaxPool2d, BatchNorm2d) +# # enable debug logging +# tlx.logging.set_verbosity(tlx.logging.DEBUG) + +# # ################## Download and prepare the CIFAR10 dataset ################## +# # This is just some way of getting the CIFAR10 dataset from an online location +# # and loading it into numpy arrays with shape [32,32,3] +# X_train, y_train, X_test, y_test = tlx.files.load_cifar10_dataset(shape=(-1, 32, 32, 3), plotable=False) + +# # ################## CIFAR10 dataset ################## +# # We define a Dataset class for Loading CIFAR10 images and labels. +# class make_dataset(Dataset): + +# def __init__(self, data, label, transforms): +# self.data = data +# self.label = label +# self.transforms = transforms + +# def __getitem__(self, idx): +# x = self.data[idx].astype('uint8') +# y = self.label[idx].astype('int64') +# x = self.transforms(x) + +# return x, y + +# def __len__(self): + +# return len(self.label) + +# # We define the CIFAR10 iamges preprocessing pipeline. +# train_transforms = Compose( # Combining multiple operations sequentially +# [ +# RandomCrop(size=[24, 24]), #random crop from images to shape [24, 24] +# RandomFlipHorizontal(), # random invert each image horizontally by probability +# RandomBrightness(brightness_factor=(0.5, 1.5)), # Within the range of values (0.5, 1.5), adjust brightness randomly +# RandomContrast(contrast_factor=(0.5, 1.5)), # Within the range of values (0.5, 1.5), adjust contrast randomly +# StandardizePerImage() #Normalize the values of each image to [-1, 1] +# ] +# ) + +# test_transforms = Compose([Resize(size=(24, 24)), StandardizePerImage()]) + +# # We use DataLoader to batch and shuffle data, and make data into iterators. +# train_dataset = make_dataset(data=X_train, label=y_train, transforms=train_transforms) +# test_dataset = make_dataset(data=X_test, label=y_test, transforms=test_transforms) + +# train_dataset = DataLoader(train_dataset, batch_size=128, shuffle=True) +# test_dataset = DataLoader(test_dataset, batch_size=128) + +# # ################## CNN network ################## +# class CNN(Module): + +# def __init__(self): +# super(CNN, self).__init__() +# # Parameter initialization method +# W_init = tlx.nn.initializers.truncated_normal(stddev=5e-2) +# W_init2 = tlx.nn.initializers.truncated_normal(stddev=0.04) +# b_init2 = tlx.nn.initializers.constant(value=0.1) + +# # 2D Convolutional Neural Network, Set padding method "SAME", convolutional kernel size [5,5], stride [1,1], in channels, out channels +# self.conv1 = Conv2d(64, (5, 5), (1, 1), padding='SAME', W_init=W_init, b_init=None, name='conv1', in_channels=3) +# # Add 2D BatchNormalize, using ReLU for output. +# self.bn = BatchNorm2d(num_features=64, act=tlx.nn.ReLU) +# # Add 2D Max pooling layer. 
+# self.maxpool1 = MaxPool2d((3, 3), (2, 2), padding='SAME', name='pool1') + +# self.conv2 = Conv2d( +# 64, (5, 5), (1, 1), padding='SAME', act=tlx.nn.ReLU, W_init=W_init, name='conv2', in_channels=64 +# ) +# self.maxpool2 = MaxPool2d((3, 3), (2, 2), padding='SAME', name='pool2') +# # Flatten 2D data to 1D data +# self.flatten = Flatten(name='flatten') +# # Linear layer with 384 units, using ReLU for output. +# self.linear1 = Linear(384, act=tlx.nn.ReLU, W_init=W_init2, b_init=b_init2, name='linear1relu', in_features=2304) +# self.linear2 = Linear(192, act=tlx.nn.ReLU, W_init=W_init2, b_init=b_init2, name='linear2relu', in_features=384) +# self.linear3 = Linear(10, act=None, W_init=W_init2, name='output', in_features=192) + +# # We define the forward computation process. +# def forward(self, x): +# z = self.conv1(x) +# z = self.bn(z) +# z = self.maxpool1(z) +# z = self.conv2(z) +# z = self.maxpool2(z) +# z = self.flatten(z) +# z = self.linear1(z) +# z = self.linear2(z) +# z = self.linear3(z) +# return z + + +# # get the network +# net = CNN() + +# # training settings +# n_epoch = 500 +# learning_rate = 0.0001 +# print_freq = 5 +# n_step_epoch = int(len(y_train) / 128) +# n_step = n_epoch * n_step_epoch +# shuffle_buffer_size = 128 +# # Get training parameters +# train_weights = net.trainable_weights +# # Define the optimizer, use the Adam optimizer. +# optimizer = tlx.optimizers.Adam(learning_rate) +# # Define evaluation metrics. +# metrics = tlx.metrics.Accuracy() + +# # Define the loss calculation process +# class WithLoss(Module): + +# def __init__(self, net, loss_fn): +# super(WithLoss, self).__init__() +# self._net = net +# self._loss_fn = loss_fn + +# def forward(self, data, label): +# out = self._net(data) +# loss = self._loss_fn(out, label) +# return loss + + +# net_with_loss = WithLoss(net, loss_fn=tlx.losses.softmax_cross_entropy_with_logits) +# # Initialize one-step training +# net_with_train = TrainOneStep(net_with_loss, optimizer, train_weights) + +# # Custom training loops +# for epoch in range(n_epoch): +# start_time = time.time() +# # Set the network to training state +# net.set_train() +# train_loss, train_acc, n_iter = 0, 0, 0 +# # Get training data and labels +# for X_batch, y_batch in train_dataset: +# # Calculate the loss value, and automatically complete the gradient update +# _loss_ce = net_with_train(X_batch, y_batch) +# train_loss += _loss_ce + +# n_iter += 1 +# _logits = net(X_batch) +# # Calculate accuracy +# metrics.update(_logits, y_batch) +# train_acc += metrics.result() +# metrics.reset() +# print("Epoch {} of {} took {}".format(epoch + 1, n_epoch, time.time() - start_time)) +# print(" train loss: {}".format(train_loss / n_iter)) +# print(" train acc: {}".format(train_acc / n_iter)) ################################ TensorLayerX and TensorFlow can be mixed programming. ################################# @@ -497,7 +611,7 @@ def forward(self, data, label): ################################### TensorLayerX and Paddle can be mixed programming. 
################################## # import os # os.environ['TL_BACKEND'] = 'paddle' -# + # import time # import paddle as pd # from tensorlayerx.nn import Module @@ -509,35 +623,35 @@ def forward(self, data, label): # ) # # enable debug logging # tlx.logging.set_verbosity(tlx.logging.DEBUG) -# + # # prepare cifar10 data # X_train, y_train, X_test, y_test = tlx.files.load_cifar10_dataset(shape=(-1, 32, 32, 3), plotable=False) -# -# + + # class CNN(Module): -# + # def __init__(self): # super(CNN, self).__init__() # # weights init # W_init = tlx.nn.initializers.truncated_normal(stddev=5e-2) # W_init2 = tlx.nn.initializers.truncated_normal(stddev=0.04) # b_init2 = tlx.nn.initializers.constant(value=0.1) -# + # self.conv1 = Conv2d(64, (5, 5), (1, 1), padding='SAME', W_init=W_init, b_init=None, name='conv1', in_channels=3) # self.bn1 = BatchNorm2d(num_features=64, act=tlx.nn.ReLU) # self.maxpool1 = MaxPool2d((3, 3), (2, 2), padding='SAME', name='pool1') -# + # self.conv2 = Conv2d( # 64, (5, 5), (1, 1), padding='SAME', W_init=W_init, b_init=None, name='conv2', in_channels=64 # ) # self.bn2 = BatchNorm2d(num_features=64, act=tlx.nn.ReLU) # self.maxpool2 = MaxPool2d((3, 3), (2, 2), padding='SAME', name='pool2') -# + # self.flatten = Flatten(name='flatten') # self.linear1 = Linear(384, act=tlx.nn.ReLU, W_init=W_init2, b_init=b_init2, name='linear1relu', in_channels=2304) # self.linear2 = Linear(192, act=tlx.nn.ReLU, W_init=W_init2, b_init=b_init2, name='linear2relu', in_channels=384) # self.linear3 = Linear(10, act=None, W_init=W_init2, name='output', in_channels=192) -# + # def forward(self, x): # z = self.conv1(x) # z = self.bn1(z) @@ -550,11 +664,11 @@ def forward(self, data, label): # z = self.linear2(z) # z = self.linear3(z) # return z -# -# + + # # get the network # net = CNN() -# + # # training settings # batch_size = 128 # n_epoch = 500 @@ -562,31 +676,31 @@ def forward(self, data, label): # print_freq = 5 # shuffle_buffer_size = 128 # metrics = tlx.metrics.Accuracy() -# + # train_weights = net.trainable_weights # optimizer = tlx.optimizers.Adam(learning_rate) # # looking for decay learning rate? 
see https://github.com/tensorlayer/srgan/blob/master/train.py -# -# + + # class make_dataset(Dataset): -# + # def __init__(self, data, label, transforms): # self.data = data # self.label = label # self.transforms = transforms -# + # def __getitem__(self, idx): # x = self.data[idx].astype('uint8') # y = self.label[idx].astype('int64') # x = self.transforms(x) -# + # return x, y -# + # def __len__(self): -# + # return len(self.label) -# -# + + # train_transforms = Compose( # [ # RandomCrop(size=[24, 24]), @@ -596,15 +710,15 @@ def forward(self, data, label): # StandardizePerImage() # ] # ) -# + # test_transforms = Compose([Resize(size=(24, 24)), StandardizePerImage()]) -# + # train_dataset = make_dataset(data=X_train, label=y_train, transforms=train_transforms) # test_dataset = make_dataset(data=X_test, label=y_test, transforms=test_transforms) -# + # train_dataset = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) # test_dataset = DataLoader(test_dataset, batch_size=batch_size) -# + # for epoch in range(n_epoch): # train_loss, train_acc, n_iter = 0, 0, 0 # start_time = time.time() @@ -615,15 +729,15 @@ def forward(self, data, label): # loss_ce = loss.numpy() # grads = optimizer.gradient(loss, train_weights) # optimizer.apply_gradients(zip(grads, train_weights)) -# + # train_loss += loss_ce -# + # if metrics: # metrics.update(output, y_batch) # train_acc += metrics.result() # metrics.reset() # n_iter += 1 -# + # print("Epoch {} of {} took {}".format(epoch + 1, n_epoch, time.time() - start_time)) # print(" train loss: {}".format(train_loss / n_iter)) # print(" train acc: {}".format(train_acc / n_iter)) diff --git a/examples/basic_tutorials/cifar10_cnn_train.py b/examples/basic_tutorials/cifar10_cnn_train.py index 9fd9a56..2661ce5 100644 --- a/examples/basic_tutorials/cifar10_cnn_train.py +++ b/examples/basic_tutorials/cifar10_cnn_train.py @@ -4,11 +4,16 @@ # TensorlayerX目前支持包括TensorFlow、Pytorch、PaddlePaddle、MindSpore作为计算后端,指定计算后端的方法也非常简单,只需要设置环境变量即可 import os -os.environ['TL_BACKEND'] = 'paddle' +# os.environ['TL_BACKEND'] = 'paddle' + +os.environ['TL_BACKEND'] = 'jittor' # os.environ['TL_BACKEND'] = 'tensorflow' # os.environ['TL_BACKEND'] = 'mindspore' + # os.environ['TL_BACKEND'] = 'torch' + + import tensorlayerx as tlx from tensorlayerx.nn import Module from tensorlayerx.nn import (Conv2d, Linear, Flatten, MaxPool2d, BatchNorm2d) @@ -54,6 +59,7 @@ def forward(self, x): z = self.linear1(z) z = self.linear2(z) return z + # get the network @@ -70,7 +76,7 @@ def forward(self, x): # 定义损失函数、优化器等 loss_fn=tlx.losses.softmax_cross_entropy_with_logits -optimizer = tlx.optimizers.Adam(learning_rate) +optimizer = tlx.optimizers.Adam(net.trainable_weights, lr=learning_rate) metrics = tlx.metrics.Accuracy() diff --git a/tensorlayerx/backend/ops/jittor_nn.py b/tensorlayerx/backend/ops/jittor_nn.py index d1548d3..fd69da8 100644 --- a/tensorlayerx/backend/ops/jittor_nn.py +++ b/tensorlayerx/backend/ops/jittor_nn.py @@ -1695,11 +1695,16 @@ class BatchNorm(object): """ +class BatchNorm: + """ + The BatchNorm class for Jittor, supporting both training and inference modes. 
+ """ + def __init__( self, decay=0.9, epsilon=0.00001, beta=None, gamma=None, moving_mean=None, moving_var=None, num_features=None, data_format='channels_last', is_train=False ): - self.decay = 1-decay + self.decay = 1 - decay self.epsilon = epsilon self.data_format = data_format self.beta = beta @@ -1708,27 +1713,28 @@ def __init__( self.moving_var = moving_var self.num_features = num_features self.is_train = is_train - self.axes = None if self.decay < 0.0 or 1.0 < self.decay: raise ValueError("decay should be between 0 to 1") + def set_train(self): + self.is_train = True + + def set_eval(self): + self.is_train = False + def __call__(self, inputs): if self.data_format == 'channels_last': inputs = nhwc_to_nchw(inputs) - out = nn.batch_norm(inputs, - running_mean=self.moving_mean, - running_var=self.moving_var, - weight=self.gamma, - bias=self.beta, - training=self.is_train, - momentum=self.decay) + if self.is_train: + out = nn.BatchNorm(self.num_features, eps=self.epsilon, momentum=self.decay, affine=True, is_train=True)(inputs) + else: + out = nn.BatchNorm(self.num_features, eps=self.epsilon, momentum=self.decay, affine=True, is_train=False)(inputs) + if self.data_format == 'channels_last': out = nchw_to_nhwc(out) return out - - class GroupConv2D(object): def __init__(self, strides, padding, data_format, dilations, out_channel, k_size, groups=1): diff --git a/tensorlayerx/metrics/jittor_metric.py b/tensorlayerx/metrics/jittor_metric.py index 77680c9..d5a163c 100644 --- a/tensorlayerx/metrics/jittor_metric.py +++ b/tensorlayerx/metrics/jittor_metric.py @@ -34,26 +34,28 @@ def reset(self): raise NotImplementedError("function 'reset' not implemented in {}.".format(self.__class__.__name__)) -class Accuracy(Metric): + + +class Accuracy(Metric): def __init__(self, topk=1): super(Accuracy, self).__init__() - self.topk = topk + self.topk = int(topk) # Ensure topk is an integer self.reset() def update(self, y_pred, y_true): + y_pred = jt.argsort(y_pred, dim=-1, descending=True)[0] - y_pred = jt.argsort(y_pred, dim=-1, descending=True) - y_pred = y_pred[:, :self.topk] if (len(y_true.shape) == 1) or (len(y_true.shape) == 2 and y_true.shape[-1] == 1): y_true = jt.reshape(y_true, (-1, 1)) elif y_true.shape[-1] != 1: y_true = jt.argmax(y_true, dim=-1, keepdim=True) + correct = y_pred == y_true correct = correct.to(jt.float32) - correct = correct.cpu().numpy() + correct = correct.numpy() num_samples = np.prod(np.array(correct.shape[:-1])) - num_corrects = correct[..., :self.topk].sum() + num_corrects = correct.sum() self.total += num_corrects self.count += num_samples @@ -64,7 +66,6 @@ def reset(self): self.total = 0.0 self.count = 0.0 - class Auc(object): def __init__( diff --git a/tensorlayerx/model/core.py b/tensorlayerx/model/core.py index 717a3b7..218d40e 100644 --- a/tensorlayerx/model/core.py +++ b/tensorlayerx/model/core.py @@ -129,7 +129,7 @@ def train(self, n_epoch, train_dataset=None, test_dataset=False, print_train_bat ) elif tlx.BACKEND == "jittor": - self.of_train( + self.jt_train( n_epoch=n_epoch, train_dataset=train_dataset, network=self.network, loss_fn=self.loss_fn, train_weights=self.train_weights, optimizer=self.optimizer, metrics=self.metrics, print_train_batch=print_train_batch, print_freq=print_freq, test_dataset=test_dataset, @@ -637,6 +637,80 @@ def of_train( progress.advance(epoch_tqdm, advance=1) progress.reset(batch_tqdm) + + def jt_train( + self, n_epoch, train_dataset, network, loss_fn, train_weights, optimizer, metrics, print_train_batch, + print_freq, test_dataset + ): + 
# device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') + # network = network.to(device) + with Progress(TextColumn("[progress.description]{task.description}"), + BarColumn(), + TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), + TimeRemainingColumn(), + TimeElapsedColumn()) as progress: + + n_batch = len(train_dataset) + epoch_tqdm = progress.add_task(description="[red]Epoch progress 0/{}".format(n_epoch), total=n_epoch) + batch_tqdm = progress.add_task(description="[green]Batch progress 0/{}".format(n_batch), total=n_batch) + + for epoch in range(n_epoch): + start_time = time.time() + + train_loss, train_acc, n_iter = 0, 0, 0 + for batch, (X_batch, y_batch) in enumerate(train_dataset): + network.set_train() + output = network(X_batch) + loss = loss_fn(output, y_batch) + # grads = optimizer.gradient(loss, train_weights) + # optimizer.apply_gradients(zip(grads, train_weights)) + optimizer.zero_grad() + optimizer.step(loss) + train_loss += loss + if metrics: + metrics.update(output, y_batch) + train_acc += metrics.result() + metrics.reset() + else: + train_acc += (output.argmax(1) == y_batch).type(torch.float).mean().item() + n_iter += 1 + + if print_train_batch: + print("Epoch {} of {} took {}".format(epoch + 1, n_epoch, time.time() - start_time)) + print(" train loss: {}".format(train_loss / n_iter)) + print(" train acc: {}".format(train_acc / n_iter)) + progress.advance(batch_tqdm, advance=1) + progress.update(batch_tqdm, description="[green]Batch progress {}/{}".format(batch + 1, n_batch)) + + if epoch + 1 == 1 or (epoch + 1) % print_freq == 0: + + print("Epoch {} of {} took {}".format(epoch + 1, n_epoch, time.time() - start_time)) + print(" train loss: {}".format(train_loss / n_iter)) + print(" train acc: {}".format(train_acc / n_iter)) + + if test_dataset: + # use training and evaluation sets to evaluate the model every print_freq epoch + if epoch + 1 == 1 or (epoch + 1) % print_freq == 0: + network.set_eval() + val_loss, val_acc, n_iter = 0, 0, 0 + for X_batch, y_batch in test_dataset: + _logits = network(X_batch) # is_train=False, disable dropout + val_loss += loss_fn(_logits, y_batch) + if metrics: + metrics.update(_logits, y_batch) + val_acc += metrics.result() + metrics.reset() + else: + val_acc += (_logits.argmax(1) == y_batch).type(torch.float).mean().item() + n_iter += 1 + print(" val loss: {}".format(val_loss / n_iter)) + print(" val acc: {}".format(val_acc / n_iter)) + progress.update(epoch_tqdm, description="[red]Epoch progress {}/{}".format(epoch + 1, n_epoch)) + progress.advance(epoch_tqdm, advance=1) + progress.reset(batch_tqdm) + + + class WithGrad(object): """Module that returns the gradients. diff --git a/tensorlayerx/nn/core/core_jittor.py b/tensorlayerx/nn/core/core_jittor.py index b59124b..8bd8942 100644 --- a/tensorlayerx/nn/core/core_jittor.py +++ b/tensorlayerx/nn/core/core_jittor.py @@ -108,12 +108,12 @@ def _get_weights(self, var_name, shape, init=None, trainable=True, transposed=No self.var_name = var_name return param - def execute(self, *args, **kw): + def execute(self, *inputs, **kwargs): ''' Executes the module computation. Raises NotImplementedError if the subclass does not override the method. 
''' - return self.forward(*args, **kw) + return self.forward(*inputs, **kwargs) # raise NotImplementedError("Please implement 'execute' method of "+str(type(self))) diff --git a/tensorlayerx/optimizers/jittor_optimizers.py b/tensorlayerx/optimizers/jittor_optimizers.py index 173d77e..29ce336 100644 --- a/tensorlayerx/optimizers/jittor_optimizers.py +++ b/tensorlayerx/optimizers/jittor_optimizers.py @@ -38,6 +38,28 @@ def step(self, loss=None): def zero_grad(self): self.optimizer.zero_grad() +class AdamW(object): + def __init__(self, params, lr=0.001, beta_1=0.9, beta_2=0.999, eps=1e-8, weight_decay=0.0): + self.optimizer = optimizer.AdamW(params, lr=lr, eps=eps, betas=(beta_1, beta_2), weight_decay=weight_decay) + + def step(self, loss=None): + self.optimizer.step(loss) + + def zero_grad(self): + self.optimizer.zero_grad() + + +class Adan(object): + def __init__(self, params, lr=0.001, beta_1=0.9, beta_2=0.999, eps=1e-8, weight_decay=0.0): + self.optimizer = optimizer.Adan(params, lr=lr, eps=eps, betas=(beta_1, beta_2), weight_decay=weight_decay) + + def step(self, loss=None): + self.optimizer.step(loss) + + def zero_grad(self): + self.optimizer.zero_grad() + + class Adamax(object): def __init__(self): From b907adf4f9fa2f0c5577a558f78a5a6339f21aed Mon Sep 17 00:00:00 2001 From: Hisham Date: Wed, 24 Jul 2024 23:01:18 +0300 Subject: [PATCH 25/27] Patching bugs (post Utilities test) Updated Jittor_backend, jittor_nn, core_jittor, jittor_initializers, seperable_conv.py, shape.py --- tensorlayerx/backend/ops/jittor_backend.py | 254 +++-- tensorlayerx/backend/ops/jittor_nn.py | 1004 ++++++++++------- tensorlayerx/nn/core/core_jittor.py | 42 +- .../nn/initializers/jittor_initializers.py | 2 +- .../nn/layers/convolution/separable_conv.py | 4 +- tensorlayerx/nn/layers/shape.py | 2 +- 6 files changed, 808 insertions(+), 500 deletions(-) diff --git a/tensorlayerx/backend/ops/jittor_backend.py b/tensorlayerx/backend/ops/jittor_backend.py index d4da98f..774cd05 100644 --- a/tensorlayerx/backend/ops/jittor_backend.py +++ b/tensorlayerx/backend/ops/jittor_backend.py @@ -7,6 +7,7 @@ # import jittor.nn.functional as F import numpy as np import random +import jittor.nn as nn _dtypeDict = { @@ -71,7 +72,7 @@ def zeros(shape, dtype=None, device = None): if device == 'gpu': jt.flags.use_cuda = 1 - return jt.zeros(size=shape, dtype=dtype) + return jt.zeros(shape=shape, dtype=dtype) def ones(shape, dtype=None, device = None): @@ -93,8 +94,16 @@ def ones(shape, dtype=None, device = None): if device == 'gpu': jt.flags.use_cuda = 1 - return jt.ones(size=shape, dtype=dtype) - + # Check if dtype is None, if so, set it to 'float32' by default + if dtype is None: + dtype = 'float32' + + # Ensure shape is passed as a tuple + if isinstance(shape, list): + shape = tuple(shape) + + # Call Jittor's ones function + return jt.ones(shape, dtype=dtype) def constant(value, dtype=None, shape=None, device =None): """ @@ -116,7 +125,7 @@ def constant(value, dtype=None, shape=None, device =None): """ if device == 'gpu': jt.flags.use_cuda = 1 - w = jt.empty(size=shape, dtype=dtype) + w = jt.empty(shape, dtype=dtype) return jt.nn.init.constant_(w, value) @@ -174,12 +183,10 @@ def random_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None): """ - if seed is None: - jt.random.seed() - else: - jt.random.manual_seed(seed) - w = jt.randn(size=shape, dtype=dtype) - out = w.normal_(mean=mean, std=stddev) + if seed is not None: + jt.set_global_seed(seed) + w = jt.randn(shape, dtype) + out = w * stddev + mean return out @@ -206,7 +213,10 
@@ def truncated_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None): """ - tensor = jt.empty(size=shape, dtype=dtype) + if dtype is None: + dtype = jt.float32 + tensor = jt.empty(shape, dtype=dtype) + out = jt.nn.init.trunc_normal_(tensor, mean=mean, std=stddev) return out @@ -229,23 +239,25 @@ def he_normal(shape, a = 0, mode = 'fan_in', nonlinearity='leaky_relu', dtype=No A tensor of the specified shape filled with he normal values. """ - tensor = jt.empty(size=shape, dtype=dtype) + tensor = jt.empty(shape, dtype=dtype) out = jt.nn.init.kaiming_normal_(tensor, a=a, mode = mode, nonlinearity = nonlinearity) return out def he_uniform(shape, a = 0, mode = 'fan_in', nonlinearity='leaky_relu', dtype=None, seed=None): - tensor = jt.empty(size=shape, dtype=dtype) + tensor = jt.empty(shape, dtype=dtype) out = jt.nn.init.kaiming_uniform_(tensor, a=a, mode = mode, nonlinearity = nonlinearity) return out -def xavier_normal(shape, gain = 1.0, dtype=None, seed=None): - _tensor = jt.empty(size=shape, dtype=dtype) - return jt.nn.init.xavier_normal_(_tensor, gain) +def xavier_normal(shape, gain=1.0, dtype='float32', seed=None): + if seed is not None: + jt.set_global_seed(seed) + stddev = gain * np.sqrt(2.0 / (shape[-2] + shape[-1])) + return jt.init.gauss(std=stddev, shape=shape, dtype=dtype) def xavier_uniform(shape, gain = 1.0, dtype=None, seed=None): - _tensor = jt.empty(size=shape, dtype=dtype) + _tensor = jt.empty(shape, dtype=dtype) return jt.nn.init.xavier_uniform_(_tensor, gain) @@ -387,6 +399,11 @@ def __init__(self, shape): self.shape = shape def __call__(self, tensor): + if not self.shape: + raise ValueError("The target shape of reshape can't be empty.") + if -1 in self.shape and len([s for s in self.shape if s == -1]) > 1: + raise ValueError("Only one dimension can be inferred when using -1 in reshape.") + return jt.reshape(tensor, self.shape) @@ -415,7 +432,7 @@ def __init__(self, axis=0): self.axis = axis def __call__(self, values): - return jt.concat(tensors=values, dim=self.axis) + return jt.concat(values, dim=self.axis) def concat(values, axis=0): @@ -489,9 +506,9 @@ def __init__(self, axis=None, keepdims=False): def __call__(self, input): if self.axis is not None: - return jt.sum(input=input, dim=self.axis) + return jt.sum(input, dim=self.axis) else: - return jt.sum(input=input) + return jt.sum(input) class ReduceMean(object): @@ -502,7 +519,7 @@ def __init__(self, axis=None, keepdims=False): def __call__(self, inputs): if self.axis is not None: - return jt.mean(input=inputs, dim=self.axis, keepdim=self.keepdims) + return jt.mean(inputs, self.axis, self.keepdims) else: return jt.mean(inputs) @@ -525,33 +542,35 @@ def reduce_mean(input_tensor, axis=None, keepdims=False): ------- The reduced tensor. 
""" - if axis is not None: - return jt.mean(input_tensor, dim=axis, keepdim=keepdims) + if isinstance(axis, (tuple, list)): + axis = tuple(axis) + return jt.mean(input_tensor, dims=axis, keepdims=keepdims) else: return jt.mean(input_tensor) + class ReduceMax(object): def __init__(self, axis=None, keepdims=False): self.axis = axis self.keepdims = keepdims - def __call__(self, inputs): if self.axis is not None: if isinstance(self.axis, (list, tuple)): out = inputs for dim in self.axis[::-1]: - out = jt.max(out, dim=dim, keepdim=self.keepdims).values + out = jt.max(out, dim=dim, keepdims=self.keepdims) return out else: - return jt.max(inputs, dim=self.axis, keepdim=self.keepdims).values + return jt.max(inputs, dim=self.axis, keepdims=self.keepdims) else: return jt.max(inputs) + def reduce_max(input_tensor, axis=None, keepdims=False): """ Computes the maximum of elements across dimensions of a tensor. @@ -572,7 +591,7 @@ def reduce_max(input_tensor, axis=None, keepdims=False): """ if axis is not None: - return jt.max(input_tensor, dim=axis, keepdim=keepdims).values + return jt.max(input_tensor, dim=axis, keepdims=keepdims) else: return jt.max(input_tensor) @@ -597,7 +616,7 @@ def reduce_min(input_tensor, axis=None, keepdims=False): """ if axis is not None: - return jt.min(input_tensor, dim=axis, keepdim=keepdims).values + return jt.min(input_tensor, dim=axis, keepdims=keepdims) else: return jt.min(input_tensor) @@ -642,7 +661,7 @@ def __call__(self, x): else: raise NotImplementedError("Only constant padding is implemented for arbitrary dimensions.") - out = jt.nn.functional.pad(x, self.paddings, mode=self.mode, value=self.constant_values) + out = jt.nn.pad(x, self.paddings, mode=self.mode, value=self.constant_values) if self.mode in ['symmetric', 'reflect']: if len(x.shape) == 3: @@ -787,7 +806,7 @@ def __init__(self, axis=0): self.axis = axis def __call__(self, input): - return jt.unsqueeze(input=input, dim=self.axis) + return jt.unsqueeze(input, dim=self.axis) def expand_dims(input, axis): @@ -816,8 +835,19 @@ def __init__(self): pass def __call__(self, input, multiples): - return jt.tile(input, dims=multiples) + input_shape = list(input.shape) + reps = multiples + # Ensure reps is the same length as input shape + while len(reps) < len(input_shape): + reps.insert(0, 1) + + # Repeat the input tensor along each dimension + tiled_tensor = input + for axis, rep in enumerate(reps): + if rep != 1: + tiled_tensor = jt.concat([tiled_tensor] * rep, dim=axis) + return tiled_tensor def tile(input, multiples): """ @@ -835,8 +865,8 @@ def tile(input, multiples): ------- A Tensor. Has the same type as input. """ - - return jt.tile(input, multiples) + tile_op = Tile() + return tile_op(input, multiples) class Cast(object): @@ -845,7 +875,7 @@ def __init__(self, dtype=None): self.dtype = dtype def __call__(self, x): - return x.type(self.dtype) + return x.cast(self.dtype) def cast(x, dtype=None): @@ -865,7 +895,7 @@ def cast(x, dtype=None): A Tensor or SparseTensor or IndexedSlices with same shape as x and same type as dtype. """ - return x.type(dtype) + return x.cast(dtype) class Transpose(object): @@ -880,36 +910,36 @@ def __call__(self, a): def transpose(a, perm=None, conjugate=False): """ - Transposes a. + Transposes a tensor. Parameters ---------- - a : tensor - A Tensor. - perm : list / int + a : jt.Var + A Jittor tensor. + perm : list / int, optional A permutation of the dimensions of a. - conjugate : bool - Setting it to True is mathematically equivalent to tf.math.conj(tf.transpose(input)). 
Returns ------- - A transposed Tensor. - """ - if perm == None: - if len(a.shape) <= 2: - return jt.t(a) - if len(a.shape) == 3: - perm = [2, 1, 0] - if len(a.shape) == 4: - perm = [3, 2, 1, 0] - if len(a.shape) == 5: - perm = [4, 3, 2, 1, 0] - out = jt.permute(a, perm) + jt.Var + A transposed Jittor tensor. + """ + if not isinstance(a, jt.Var): + raise TypeError("Input must be a Jittor tensor.") + + if perm is None: + perm = list(range(len(a.shape)))[::-1] + elif not isinstance(perm, jt.NanoVector): + perm = jt.NanoVector(perm) + + + out = jt.transpose(a, perm) + if conjugate: - out = jt.conj_physical(out) + raise NotImplementedError("Conjugate transpose is not supported in Jittor.") + return out - def gather_nd(params, indices, batch_dims=0): """ Gather slices from params into a Tensor with shape specified by indices. @@ -927,19 +957,19 @@ def gather_nd(params, indices, batch_dims=0): ------- A Tensor. Has the same type as params. """ - - out_shape = indices.shape[:-1] - indices = indices.unsqueeze(0).transpose(0, -1) - ndim = indices.shape[0] - indices = indices.long() - idx = jt.zeros_like(indices[0], device=indices.device).long() - m = 1 - - for i in range(ndim)[::-1]: - idx += indices[i] * m - m *= params.size(i) - out = jt.take(params, idx) - return out.view(out_shape) + raise NotImplementedError + # out_shape = indices.shape[:-1] + # indices = indices.unsqueeze(0).transpose(0, -1) + # ndim = indices.shape[0] + # indices = indices.long() + # idx = jt.zeros_like(indices[0], device=indices.device).long() + # m = 1 + + # for i in range(ndim)[::-1]: + # idx += indices[i] * m + # m *= params.size(i) + # out = jt.take(params, idx) + # return out.view(out_shape) def scatter_nd(indices, updates, shape): raise NotImplementedError @@ -951,7 +981,8 @@ def __init__(self, clip_min=-1, clip_max=1): self.max = clip_max def __call__(self, inputs): - jt.nn.utils.clip_grad_value_(inputs, clip_value=self.max) + raise NotImplementedError + # jt.nn.utils.clip_grad_value_(inputs, clip_value=self.max) class ClipGradByNorm(object): @@ -959,7 +990,8 @@ def __init__(self, clip_norm=0.1): self.clip_norm = clip_norm def __call__(self, inputs): - jt.nn.utils.clip_grad_norm_(inputs, max_norm=self.clip_norm, norm_type=2) + raise NotImplementedError + # jt.nn.utils.clip_grad_norm_(inputs, max_norm=self.clip_norm, norm_type=2) class ClipByGlobalNorm(object): @@ -1088,13 +1120,13 @@ def __init__(self, depth=-1, on_value=None, off_value=None, axis=None, dtype=Non def __call__(self, inputs): if [self.on_value, self.off_value] == [None, None]: - return jt.nn.functional.one_hot(inputs, self.depth) + return jt.nn.one_hot(inputs, self.depth) else: - out = jt.nn.functional.one_hot(inputs, self.depth) + out = jt.nn.one_hot(inputs, self.depth) out = cast(out, jt.float64) out = jt.where(out == 1, self.on_value, out) out = jt.where(out == 0, self.off_value, out) - out = cast(out, jt.int) + out = cast(out, jt.Var.int64) return out @@ -1106,7 +1138,7 @@ def __init__(self, axis=None, epsilon=1e-12): def __call__(self, input, *args, **kwargs): - return jt.nn.functional.normalize(input, p = 2, dim=self.axis, eps=self.epsilon) + return jt.misc.normalize(input, p = 2, dim=self.axis, eps=self.epsilon) @@ -1126,14 +1158,13 @@ def __call__(self, params, ids): class NCELoss(object): - def __init__(self, num_true=1, sampled_values=None, remove_accidental_hits=False): self.num_true = num_true self.sampled_values = sampled_values self.remove_accidental_hits = remove_accidental_hits def __call__(self, weights, biases, labels, inputs, 
num_sampled, num_classes): - raise NotImplementedError + raise NotImplementedError("NCELoss is not implemented for the Jittor backend") class NotEqual(object): @@ -1142,7 +1173,8 @@ def __init__(self): pass def __call__(self, x, y): - return jt.ne(x, y) + return jt.Var.not_equal(x, y) + class CountNonzero(object): @@ -1152,12 +1184,14 @@ def __init__(self, keepdims=None, dtype=None): self.dtype = dtype def __call__(self, input, axis=None): + # Calculate the count of non-zero elements along the specified axis + if axis is None: + return (input != 0).sum().cast(self.dtype) + else: + return (input != 0).sum(dim=axis, keepdims=self.keepdims).cast(self.dtype) - return jt.count_nonzero(input, dim=axis) - - -class Resize: +class Resize(object): def __init__(self, scale, method, antialias=False, data_format='channels_last'): self.method = method self.antialias = antialias @@ -1167,12 +1201,26 @@ def __init__(self, scale, method, antialias=False, data_format='channels_last'): def __call__(self, inputs): if self.data_format == "channels_last": inputs = nhwc_to_nchw(inputs) - outputs = jt.nn.interpolate(inputs, scale_factor=self.scale, mode=self.method, align_corners=self.antialias) + + # Ensure scale is handled correctly + if isinstance(self.scale, (tuple, list)): + if len(self.scale) == 1: + scale_factor = self.scale[0] + else: + scale_factor = self.scale + else: + scale_factor = self.scale + + # Convert scale_factor to a single value if it's a tuple/list of same values + if isinstance(scale_factor, (tuple, list)) and len(set(scale_factor)) == 1: + scale_factor = scale_factor[0] + + outputs = jt.nn.interpolate(inputs, scale_factor=scale_factor, mode=self.method, align_corners=self.antialias) + if self.data_format == "channels_last": outputs = nchw_to_nhwc(outputs) return outputs - def resize(inputs, output_size, method, antialias): return jt.nn.interpolate(inputs, size=output_size, mode=method, align_corners=antialias) @@ -1228,7 +1276,7 @@ def __init__(self): pass def __call__(self, x): - return jt.sign(x) + return jt.nn.sign(x) class Ceil(object): @@ -1301,7 +1349,8 @@ def acosh(x): def angle(x): - return jt.angle(x) + raise NotImplementedError + # return jt.angle(x) def argmax(x, axis=None, keepdim=False, dtype='int64'): @@ -1337,7 +1386,8 @@ def cosh(x): def count_nonzero(x, axis=None, keepdims=None, dtype="int64"): - return jt.count_nonzero(x, dim=axis) + raise NotImplementedError + # return jt.count_nonzero(x, dim=axis) def cumprod(x, axis=0, exclusive=False, reverse=False): @@ -1361,7 +1411,8 @@ def floordiv(x, y): def floormod(x, y): - return jt.fmod(x, y) + raise NotImplementedError + # return jt.fmod(x, y) def greater(x, y): @@ -1492,7 +1543,7 @@ def sigmoid(x): def sign(x): - return jt.sign(x) + return jt.nn.sign(x) def sin(x): @@ -1522,11 +1573,11 @@ def softplus(x): def square(x): - return jt.square(x) + return jt.sqr(x) def squared_difference(x, y): - return jt.square(x-y) + return jt.sqr(x-y) def subtract(x, y): @@ -1703,7 +1754,8 @@ def mask_select(x, mask, axis = 0): if axis < 0: axis = len(x.shape) + axis if x.shape == mask.shape: - return jt.masked_select(x, mask) + raise NotImplementedError + # return jt.masked_select(x, mask) if axis == 0: return x[mask] elif axis == 1: @@ -1729,25 +1781,27 @@ def __init__(self, equation): self.equation = equation def __call__(self, *args): - return jt.einsum(self.equation, *args) + return jt.linalg.einsum(self.equation, *args) def set_device(device = 'GPU', id = 0): if device == 'GPU': jt.flags.use_cuda = 1 def 
distributed_init(backend="cncl"): - jt.distributed.init_process_group(backend=backend) + raise NotImplementedError + # jt.distributed.init_process_group(backend=backend) def distributed_model(module, device_ids=None, output_device=None, dim=0, broadcast_buffers=True, process_group=None, bucket_cap_mb=25, find_unused_parameters=False, check_reduction=False, gradient_as_bucket_view=False): - return jt.nn.parallel.DistributedDataParallel(module, device_ids=device_ids, - output_device=output_device, - dim=dim, broadcast_buffers=broadcast_buffers, - process_group=process_group, bucket_cap_mb=bucket_cap_mb, - find_unused_parameters=find_unused_parameters, - check_reduction=check_reduction, - gradient_as_bucket_view=gradient_as_bucket_view) + return NotImplementedError + # return jt.nn.parallel.DistributedDataParallel(module, device_ids=device_ids, + # output_device=output_device, + # dim=dim, broadcast_buffers=broadcast_buffers, + # process_group=process_group, bucket_cap_mb=bucket_cap_mb, + # find_unused_parameters=find_unused_parameters, + # check_reduction=check_reduction, + # gradient_as_bucket_view=gradient_as_bucket_view) def scatter_update(tensor, indices, updates): tensor = jt.array(tensor) diff --git a/tensorlayerx/backend/ops/jittor_nn.py b/tensorlayerx/backend/ops/jittor_nn.py index fd69da8..5ebd1a2 100644 --- a/tensorlayerx/backend/ops/jittor_nn.py +++ b/tensorlayerx/backend/ops/jittor_nn.py @@ -405,6 +405,7 @@ def __init__(self, p=0.5, seed=0 , is_train=False): def __call__(self, inputs): return nn.dropout(inputs, p=self.p, is_train=self.is_train) + def dropout(x, p=0.5, is_train=False): return nn.dropout(x , p=p, is_train=is_train) @@ -468,6 +469,7 @@ def bias_add(x, bias, data_format=None): + class Conv1D(object): def __init__(self, stride, padding, data_format='NWC', dilations=None, out_channel=None, k_size=None, groups=1): @@ -475,16 +477,24 @@ def __init__(self, stride, padding, data_format='NWC', dilations=None, out_chann self.dilations = dilations self.groups = groups self.data_format, self.padding = preprocess_1d_format(data_format, padding) - # self.conv1d = nn.Conv1d() + # Initialize Conv1d layer here + self.conv1d = nn.Conv1d + def __call__(self, input, filters): if self.data_format == 'NLC': input = nhwc_to_nchw(input) if self.padding == 'same': out = self.conv1d_same_padding(input, filters) else: - - out = nn.Conv1d(input, filters, stride=self.stride, padding=self.padding, - dilation=self.dilations, groups=self.groups) + out = self.conv1d( + in_channels=input.shape[1], + out_channels=filters.shape[0], + kernel_size=filters.shape[2], + stride=self.stride, + padding=self.padding, + dilation=self.dilations, + groups=self.groups + )(input) if self.data_format == 'NLC': out = nchw_to_nhwc(out) @@ -494,8 +504,14 @@ def conv1d_same_padding(self, input, filters): rows_odd, padding_rows = same_padding(input, filters, self.stride, 1) if rows_odd: input = nn.pad(input, [0, int(rows_odd)], 'replicate') - - return nn.Conv1d(input, filters, stride=self.stride, padding=(padding_rows // 2), groups=self.groups) + return nn.Conv1d( + in_channels=input.shape[1], + out_channels=filters.shape[0], + kernel_size=filters.shape[2], + stride=self.stride, + padding=(padding_rows // 2), + groups=self.groups + )(input) @@ -535,10 +551,12 @@ def same_padding(input, weight, strides, dilations): # H(in) + 2* padding[0] - dilation[0] * (Ksize[0] - 1) - 1 # H(out) = = floor( -------------------------------------------------------------- + 1 ) # stride[0] + + print(type(weight)) if isinstance(weight, 
jt.Var): if len(input.shape) == 3: filter_rows = weight.size(2) - if len(input.shape) == 4: + elif len(input.shape) == 4: filter_rows = weight.size(2) filter_cols = weight.size(3) elif len(input.shape) == 5: @@ -557,7 +575,7 @@ def same_padding(input, weight, strides, dilations): filter_depth = weight[2] if len(input.shape) == 3: - input_rows = input.size(2) + input_rows = input.size(1) out_rows = (input_rows + strides - 1) // strides padding_rows = max(0, (out_rows - 1) * strides + (filter_rows - 1) * dilations + 1 - input_rows) rows_odd = (padding_rows % 2 != 0) @@ -566,18 +584,37 @@ def same_padding(input, weight, strides, dilations): if len(input.shape) == 4: input_rows = input.size(2) input_cols = input.size(3) - - # filter_rows = weight.size(2) - # filter_cols = weight.size(3) - + if isinstance(weight, jt.Var): + filter_rows = weight.shape[2] # Changed from weight.size(2) + filter_cols = weight.shape[3] # Changed from weight.size(3) + else: + filter_rows = weight[0] # Changed from weight.size(2) + filter_cols = weight[1] # Changed from weight.size(3) out_rows = (input_rows + strides[0] - 1) // strides[0] out_cols = (input_cols + strides[1] - 1) // strides[1] + + # print(f"4D output rows: {out_rows}, output cols: {out_cols}") + # print(f"4D dilations: {dilations}") + + padding_rows = max(0, (out_rows - 1) * strides[0] + (filter_rows - 1) * dilations[0] + 1 - input_rows) padding_cols = max(0, (out_cols - 1) * strides[1] + (filter_cols - 1) * dilations[1] + 1 - input_cols) rows_odd = (padding_rows % 2 != 0) cols_odd = (padding_cols % 2 != 0) + # if rows_odd: + # padding_rows += 1 + + # if cols_odd: + # padding_cols += 1 + + # print(f"Filter Rows: {filter_rows}, Filter Cols: {filter_cols}") + # print(f"Input Rows: {input_rows}, Input Cols: {input_cols}") + # print(f"Output Rows: {out_rows}, Output Cols: {out_cols}") + # print(f"Padding Rows: {padding_rows}, Padding Cols: {padding_cols}") + # print(f"Rows Odd: {rows_odd}, Cols Odd: {cols_odd}") + return rows_odd, cols_odd, padding_rows, padding_cols if len(input.shape) == 5: @@ -593,6 +630,8 @@ def same_padding(input, weight, strides, dilations): out_cols = (input_cols + strides[1] - 1) // strides[1] out_depth = (input_depth + strides[2] - 1) // strides[2] + + padding_rows = max(0, (out_rows - 1) * strides[0] + (filter_rows - 1) * dilations[0] + 1 - input_rows) padding_cols = max(0, (out_cols - 1) * strides[1] + (filter_cols - 1) * dilations[1] + 1 - input_cols) padding_depth = max(0, (out_depth - 1) * strides[2] + (filter_depth - 1) * dilations[2] + 1 - input_depth) @@ -607,22 +646,30 @@ class Conv2D(object): def __init__(self, strides, padding, data_format='NHWC', dilations=None, out_channel=None, k_size=None, groups=1): self.data_format, self.padding = preprocess_2d_format(data_format, padding) - if self.data_format is 'NHWC': + if self.data_format == 'NHWC': + self.strides = (strides[1], strides[2]) + self.dilations = (dilations[0], dilations[1]) + elif self.data_format == 'NCHW': self.strides = (strides[1], strides[2]) self.dilations = (dilations[1], dilations[2]) - elif self.data_format is 'NCHW': - self.strides = (strides[2], strides[3]) - self.dilations = (dilations[2], dilations[3]) self.groups = groups + # print(f"strides = {strides}") def __call__(self, input, filters): + # print(f"Conv2D_Input shape: {input.shape}") + # print(f"Conv2D_Filters shape: {filters.shape}") + # print(f"Conv2D_Strides: {self.strides}") + # print(f"Conv2D_Padding: {self.padding}") + # print(f"Conv2D_Dilations: {self.dilations}") + # 
print(f"Conv2D_Groups: {self.groups}") + if self.data_format == 'NHWC': input = nhwc_to_nchw(input) if self.padding == 'same': output = self.conv2d_same_padding(input, filters) else: - output = nn.conv2d(input, filters, stride=self.strides, padding=self.padding, + output = nn.conv2d(input, filters, stride=self.strides, padding=(0 if isinstance(self.padding, str) else self.padding), dilation=self.dilations, groups=self.groups) if self.data_format == 'NHWC': @@ -631,6 +678,7 @@ def __call__(self, input, filters): def conv2d_same_padding(self, input, weight, bias=None): rows_odd, cols_odd, padding_rows, padding_cols = same_padding(input, weight, self.strides, self.dilations) + # print(f"Padding rows: {padding_rows}, Padding cols: {padding_cols}") if rows_odd or cols_odd: input = nn.pad(input, [0, int(cols_odd), 0, int(rows_odd)]) @@ -682,10 +730,10 @@ class Conv3D(object): def __init__(self, strides, padding, data_format='NDHWC', dilations=None, out_channel=None, k_size=None): self.data_format, self.padding = preprocess_3d_format(data_format, padding) - if self.data_format is 'NDHWC': + if self.data_format == 'NDHWC': self._strides = (strides[1], strides[2], strides[3]) self._dilations = (dilations[1], dilations[2], dilations[3]) - elif self.data_format is 'NCDHW': + elif self.data_format == 'NCDHW': self._strides = (strides[2], strides[3], strides[4]) self._dilations = (dilations[2], dilations[3], dilations[4]) @@ -804,8 +852,12 @@ def moments(x, axes, shift=None, keepdims=False): class MaxPool1d(object): - def __call__(): - return NotImplementedError + + def __init__(self, *args, **kwargs): + pass + + def __call__(self, *args, **kwargs): + raise NotImplementedError("MaxPool1d is not implemented in Jittor backend") class MaxPool(object): @@ -919,8 +971,13 @@ def max_pool3d(input, kernel_size, stride=None, padding=0, return_mask=False, da class AvgPool1d(object): - def __call__(inputs): - raise NotImplementedError + + def __init__(self, *args, **kwargs): + pass + + def __call__(self, *args, **kwargs): + raise NotImplementedError("AvgPool1d is not implemented in Jittor backend") + class AvgPool(object): @@ -973,9 +1030,14 @@ def avgpool3d_same_padding(self, input): ) if rows_odd or cols_odd or depth_odd: input = nn.pad(input, [0, int(cols_odd), 0, int(rows_odd), 0, int(depth_odd)], mode='replicate') - return nn.AvgPool3d( - input, self.ksize, self.strides, padding=(padding_rows // 2, padding_cols // 2, padding_depth // 2) - ) + + out = nn.AvgPool3d(kernel_size=self.ksize, stride=self.strides, padding=(padding_rows // 2, padding_cols // 2, padding_depth // 2))(input) + return nchw_to_nhwc(out) + + # return nn.AvgPool3d( + # input, self.ksize, self.strides, (padding_rows // 2, padding_cols // 2, padding_depth // 2) + # ) + def avg_pool(input, ksize, strides, padding): @@ -1030,37 +1092,37 @@ def __call__(self, inputs): return self.max_pool3d(inputs) -# def max_pool3d(input, ksize, strides, padding, data_format=None): -# """ -# Performs the max pooling on the input. -# -# Parameters -# ---------- -# input : tensor -# A 5-D Tensor of the format specified by data_format. -# ksize : int or list of ints -# An int or list of ints that has length 1, 3 or 5. -# The size of the window for each dimension of the input tensor. -# strides : int or list of ints -# An int or list of ints that has length 1, 3 or 5. -# The stride of the sliding window for each dimension of the input tensor. -# padding : string -# 'VALID' or 'SAME'. The padding algorithm. 
See the "returns" section of tf.ops.convolution for details. -# data_format : string -# "NDHWC", "NCDHW". Defaults to "NDHWC". The data format of the input and output data. -# With the default format "NDHWC", the data is stored in the order of: [batch, in_depth, in_height, in_width, in_channels]. -# Alternatively, the format could be "NCDHW", the data storage order is: [batch, in_channels, in_depth, in_height, in_width]. -# name : string -# A name for the operation (optional). -# -# Returns -# ------- -# A Tensor of format specified by data_format. The max pooled output tensor. -# """ -# -# data_format, padding = preprocess_3d_format(data_format, padding) -# max_pool3d_obj = MaxPool(ksize, strides, padding, data_format) -# return max_pool3d_obj(input) + # def max_pool3d(input, ksize, strides, padding, data_format=None): + # """ + # Performs the max pooling on the input. + # + # Parameters + # ---------- + # input : tensor + # A 5-D Tensor of the format specified by data_format. + # ksize : int or list of ints + # An int or list of ints that has length 1, 3 or 5. + # The size of the window for each dimension of the input tensor. + # strides : int or list of ints + # An int or list of ints that has length 1, 3 or 5. + # The stride of the sliding window for each dimension of the input tensor. + # padding : string + # 'VALID' or 'SAME'. The padding algorithm. See the "returns" section of tf.ops.convolution for details. + # data_format : string + # "NDHWC", "NCDHW". Defaults to "NDHWC". The data format of the input and output data. + # With the default format "NDHWC", the data is stored in the order of: [batch, in_depth, in_height, in_width, in_channels]. + # Alternatively, the format could be "NCDHW", the data storage order is: [batch, in_channels, in_depth, in_height, in_width]. + # name : string + # A name for the operation (optional). + # + # Returns + # ------- + # A Tensor of format specified by data_format. The max pooled output tensor. + # """ + # + # data_format, padding = preprocess_3d_format(data_format, padding) + # max_pool3d_obj = MaxPool(ksize, strides, padding, data_format) + # return max_pool3d_obj(input) class AvgPool3d(object): @@ -1073,33 +1135,33 @@ def __call__(self, inputs): return self.avg_pool3d_obj(inputs) -# def avg_pool3d(input, ksize, strides, padding, data_format=None): -# """ -# Performs the average pooling on the input. -# -# Parameters -# ---------- -# input : tensor -# A 5-D Tensor of shape [batch, height, width, channels] and type float32, float64, qint8, quint8, or qint32. -# ksize : int or list of ints -# An int or list of ints that has length 1, 3 or 5. The size of the window for each dimension of the input tensor. -# strides : int or list of ints -# An int or list of ints that has length 1, 3 or 5. -# The stride of the sliding window for each dimension of the input tensor. -# padding : string -# 'VALID' or 'SAME'. The padding algorithm. See the "returns" section of tf.ops.convolution for details. -# data_format : string -# 'NDHWC' and 'NCDHW' are supported. -# name : string -# Optional name for the operation. -# -# Returns -# ------- -# A Tensor with the same type as value. The average pooled output tensor. -# """ -# -# avg_pool_obj = AvgPool(ksize, strides, padding, data_format) -# return avg_pool_obj(input) + # def avg_pool3d(input, ksize, strides, padding, data_format=None): + # """ + # Performs the average pooling on the input. 
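# A minimal usage sketch, assuming the constructor order used by the commented
# wrapper below, ``AvgPool(ksize, strides, padding, data_format)``; the input
# shape and the ksize/strides values here are illustrative assumptions only.
# >>> import jittor as jt
# >>> x = jt.random((2, 8, 8, 8, 16))             # NDHWC: batch, depth, height, width, channels
# >>> avg_pool_obj = AvgPool(2, 2, 'SAME', 'NDHWC')
# >>> y = avg_pool_obj(x)                         # expected shape (2, 4, 4, 4, 16) under SAME padding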
+ # + # Parameters + # ---------- + # input : tensor + # A 5-D Tensor of shape [batch, height, width, channels] and type float32, float64, qint8, quint8, or qint32. + # ksize : int or list of ints + # An int or list of ints that has length 1, 3 or 5. The size of the window for each dimension of the input tensor. + # strides : int or list of ints + # An int or list of ints that has length 1, 3 or 5. + # The stride of the sliding window for each dimension of the input tensor. + # padding : string + # 'VALID' or 'SAME'. The padding algorithm. See the "returns" section of tf.ops.convolution for details. + # data_format : string + # 'NDHWC' and 'NCDHW' are supported. + # name : string + # Optional name for the operation. + # + # Returns + # ------- + # A Tensor with the same type as value. The average pooled output tensor. + # """ + # + # avg_pool_obj = AvgPool(ksize, strides, padding, data_format) + # return avg_pool_obj(input) def pool(input, window_shape, pooling_type, strides=None, padding='VALID', data_format=None, dilations=None, name=None): @@ -1150,10 +1212,10 @@ class DepthwiseConv2d(object): def __init__(self, strides, padding, data_format=None, dilations=None, ksize=None, channel_multiplier=1, in_channels=None): self.data_format, self.padding = preprocess_2d_format(data_format, padding) - if self.data_format is 'NHWC': + if self.data_format == 'NHWC': self.strides = (1, strides[0], strides[1], 1) self.dilations = (1, dilations[0], dilations[1], 1) - elif self.data_format is 'NCHW': + elif self.data_format == 'NCHW': self.strides = (1, 1, strides[0], strides[1]) self.dilations = (1, 1, dilations[0], dilations[1]) self.depthwise = Conv2D(padding=self.padding, strides=self.strides, data_format=self.data_format, @@ -1200,8 +1262,7 @@ def depthwise_conv2d(input, filter, strides, padding, data_format=None, dilation def same_padding_deconvolution(input, weight, strides, dilations): - #H(out) = floor((H(in) - 1)*stride[0] - 2* padding[0] + dilation[0] * (ksize[0]-1) + 1) - + # Calculate the dimensions of the filter if isinstance(weight, jt.Var): if len(input.shape) == 3: filter_rows = weight.size(2) @@ -1226,7 +1287,7 @@ def same_padding_deconvolution(input, weight, strides, dilations): if len(input.shape) == 3: input_rows = input.size(2) out_rows = input_rows * strides - strides + 1 - padding_rows = max(0, (input_rows-1) * strides + (filter_rows - 1) * dilations + 1 - out_rows) + padding_rows = max(0, (input_rows - 1) * strides + (filter_rows - 1) * dilations + 1 - out_rows) rows_odd = (padding_rows % 2 != 0) return rows_odd, padding_rows @@ -1237,12 +1298,12 @@ def same_padding_deconvolution(input, weight, strides, dilations): out_rows = input_rows * strides[0] - strides[0] + 1 out_cols = input_cols * strides[1] - strides[1] + 1 - padding_rows = max(0, (input_rows - 1) * strides[0] + (filter_rows - 1) * dilations[0] + 1 - out_rows) padding_cols = max(0, (input_cols - 1) * strides[1] + (filter_cols - 1) * dilations[1] + 1 - out_cols) rows_odd = (padding_rows % 2 != 0) cols_odd = (padding_cols % 2 != 0) + return rows_odd, cols_odd, padding_rows, padding_cols if len(input.shape) == 5: @@ -1250,9 +1311,16 @@ def same_padding_deconvolution(input, weight, strides, dilations): input_cols = input.size(3) input_depth = input.size(4) - out_rows = input_rows * strides[0] - strides[0] + 1 - out_cols = input_cols * strides[1] - strides[1] + 1 - out_depth = input_depth * strides[2] - strides[2] + 1 + + out_rows = (input_rows - 1) * strides[0] + filter_rows + out_cols = (input_cols - 1) * strides[1] + 
filter_cols + out_depth = (input_depth - 1) * strides[2] + filter_depth + + # print(f"SAME_PADDING_Stride : {strides}") + # print(f"out_rows = {input_rows} * {strides[0]} - {strides[0]} + 1") + # print(f"out_cols = {input_cols} * {strides[1]} - {strides[1]} + 1") + # print(f"out_depth = {input_depth} * {strides[2]} - {strides[2]} + 1") + padding_rows = max(0, (input_rows - 1) * strides[0] + (filter_rows - 1) * dilations[0] + 1 - out_rows) padding_cols = max(0, (input_cols - 1) * strides[1] + (filter_cols - 1) * dilations[1] + 1 - out_cols) @@ -1261,9 +1329,18 @@ def same_padding_deconvolution(input, weight, strides, dilations): rows_odd = (padding_rows % 2 != 0) cols_odd = (padding_cols % 2 != 0) depth_odd = (padding_depth % 2 != 0) + + # print(f"SAME_PADDING_Filter: {filter_rows}, {filter_cols}, {filter_depth if 'filter_depth' in locals() else 'N/A'}") + # print(f"SAME_PADDING_Input : {input_rows}, {input_cols}, {input_depth}") + # print(f"SAME_PADDING_Output : {out_rows}, {out_cols}, {out_depth}") + + # print(f"SAME_PADDING_Padding: {padding_rows}, {padding_cols}, {padding_depth}") + # print(f"SAME_PADDING_Rows Odd: {rows_odd}, Cols Odd: {cols_odd}, Depth Odd: {depth_odd}") + return rows_odd, cols_odd, depth_odd, padding_rows, padding_cols, padding_depth + class Conv1d_transpose(object): # def __init__( @@ -1273,73 +1350,77 @@ class Conv1d_transpose(object): # self.dilations = dilations # self.data_format, self.padding = preprocess_1d_format(data_format, padding) - def __call__(self, input, filters): - raise NotImplementedError -# if self.data_format == 'NLC': -# input = nhwc_to_nchw(input) -# if self.padding == 'same': -# out = self.conv1d_transpose_same_padding(input, filters) -# else: -# out = F.conv_transpose1d( -# input, -# weight=filters, -# padding=(0 if isinstance(self.padding, str) else self.padding), -# stride=self.stride, -# dilation=self.dilations -# ) -# if self.data_format == 'NLC': -# out = nchw_to_nhwc(out) -# return out - -# def conv1d_transpose_same_padding(self, input, filters): -# rows_odd, padding_rows = same_padding_deconvolution(input, filters, self.stride, 1) -# if rows_odd: -# input = F.pad(input, [0, int(rows_odd)]) -# out_padding = 0 -# else: -# out_padding = 1 -# return F.conv_transpose1d(input, weight=filters, padding=(padding_rows // 2), stride=self.stride, -# dilation=self.dilations, output_padding=out_padding) - - - -# def conv1d_transpose( -# input, filters, output_shape, strides, padding='SAME', data_format='NWC', dilations=None, name=None -# ): -# """ -# The transpose of conv1d. - -# Parameters -# ---------- -# input : tensor -# A 3-D Tensor of type float and shape [batch, in_width, in_channels] -# for NWC data format or [batch, in_channels, in_width] for NCW data format. -# filters : tensor -# A 3-D Tensor with the same type as value and shape [filter_width, output_channels, in_channels]. -# filter's in_channels dimension must match that of value. -# output_shape : tensor -# A 1-D Tensor, containing three elements, representing the output shape of the deconvolution op. -# strides : list -# An int or list of ints that has length 1 or 3. The number of entries by which the filter is moved right at each step. -# padding : string -# 'VALID' or 'SAME'. The padding algorithm. See the "returns" section of tf.ops.convolution for details. -# data_format : string -# 'NWC' and 'NCW' are supported. -# dilations : list -# An int or list of ints that has length 1 or 3 which defaults to 1. -# The dilation factor for each dimension of input. 
If set to k > 1, -# there will be k-1 skipped cells between each filter element on that dimension. -# Dilations in the batch and depth dimensions must be 1. -# name : string -# Optional name for the returned tensor. - -# Returns -# ------- -# A Tensor with the same type as value. -# """ - -# conv1d_transpose_obj = Conv1d_transpose(strides, padding, data_format, dilations) -# return conv1d_transpose_obj(input, filters) + + def __init__(self, *args, **kwargs): + pass + + def __call__(self, *args, **kwargs): + raise NotImplementedError(" Conv1d_transpose is not implemented in Jittor backend") + # if self.data_format == 'NLC': + # input = nhwc_to_nchw(input) + # if self.padding == 'same': + # out = self.conv1d_transpose_same_padding(input, filters) + # else: + # out = F.conv_transpose1d( + # input, + # weight=filters, + # padding=(0 if isinstance(self.padding, str) else self.padding), + # stride=self.stride, + # dilation=self.dilations + # ) + # if self.data_format == 'NLC': + # out = nchw_to_nhwc(out) + # return out + + # def conv1d_transpose_same_padding(self, input, filters): + # rows_odd, padding_rows = same_padding_deconvolution(input, filters, self.stride, 1) + # if rows_odd: + # input = F.pad(input, [0, int(rows_odd)]) + # out_padding = 0 + # else: + # out_padding = 1 + # return F.conv_transpose1d(input, weight=filters, padding=(padding_rows // 2), stride=self.stride, + # dilation=self.dilations, output_padding=out_padding) + + + + # def conv1d_transpose( + # input, filters, output_shape, strides, padding='SAME', data_format='NWC', dilations=None, name=None + # ): + # """ + # The transpose of conv1d. + + # Parameters + # ---------- + # input : tensor + # A 3-D Tensor of type float and shape [batch, in_width, in_channels] + # for NWC data format or [batch, in_channels, in_width] for NCW data format. + # filters : tensor + # A 3-D Tensor with the same type as value and shape [filter_width, output_channels, in_channels]. + # filter's in_channels dimension must match that of value. + # output_shape : tensor + # A 1-D Tensor, containing three elements, representing the output shape of the deconvolution op. + # strides : list + # An int or list of ints that has length 1 or 3. The number of entries by which the filter is moved right at each step. + # padding : string + # 'VALID' or 'SAME'. The padding algorithm. See the "returns" section of tf.ops.convolution for details. + # data_format : string + # 'NWC' and 'NCW' are supported. + # dilations : list + # An int or list of ints that has length 1 or 3 which defaults to 1. + # The dilation factor for each dimension of input. If set to k > 1, + # there will be k-1 skipped cells between each filter element on that dimension. + # Dilations in the batch and depth dimensions must be 1. + # name : string + # Optional name for the returned tensor. + + # Returns + # ------- + # A Tensor with the same type as value. 
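# A short worked example of the SAME-padding arithmetic that
# same_padding_deconvolution above applies per spatial dimension; the numbers
# are illustrative, and the formulas mirror its branch for 3-D (N, C, L) inputs.
# >>> in_rows, stride, ksize, dilation = 8, 2, 3, 1
# >>> out_rows = in_rows * stride - stride + 1
# >>> padding_rows = max(0, (in_rows - 1) * stride + (ksize - 1) * dilation + 1 - out_rows)
# >>> rows_odd = (padding_rows % 2 != 0)
# >>> out_rows, padding_rows, rows_odd
# (15, 2, False)
# When rows_odd is True the callers pad the input by one extra element before
# the transposed convolution; padding_rows // 2 is then passed as the padding.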
+ # """ + + # conv1d_transpose_obj = Conv1d_transpose(strides, padding, data_format, dilations) + # return conv1d_transpose_obj(input, filters) def _ntuple(n, name="parse"): def parse(x): @@ -1551,10 +1632,15 @@ def __init__( self.name = name self.out_channel = out_channel self.data_format, self.padding = preprocess_3d_format(data_format, padding) + + # print(f'__init__Conv3D_TRANSPOSE_Stride = {self.strides}' ) + # print(f'__init__SAME_PADDING_Dialation = {self.dilations}' ) def __call__(self, input, filters): + # print(f"conv3D_Transpose_Call: input shape={input.shape}, filters shape={filters.shape}") if self.data_format == 'NDHWC': input = nhwc_to_nchw(input) + if self.padding == 'same': out = self.conv3d_transpore_same(input, filters) else: @@ -1565,20 +1651,28 @@ def __call__(self, input, filters): stride=self.strides, dilation=self.dilations ) + if self.data_format == 'NDHWC': out = nchw_to_nhwc(out) return out - def conv3d_transpore_same(self,input, filters): + def conv3d_transpore_same(self, input, filters): + + # print(f'conv3d_transpore_same_Conv3D_TRANSPOSE_Stride = {self.strides}' ) + # print(f'conv3d_transpore_same_SAME_PADDING_Dialation = {self.dilations}' ) + rows_odd, cols_odd, depth_odd, padding_rows, padding_cols, padding_depth = same_padding_deconvolution( input, filters, self.strides, (1, 1, 1)) - if rows_odd or cols_odd or depth_odd: - input = nn.pad(input, [0, int(rows_odd), 0, int(cols_odd), 0, int(depth_odd)]) - out_padding = 0 - else: - out_padding = 1 + + + # if rows_odd or cols_odd or depth_odd: + # input = nn.pad(input, [0, int(rows_odd), 0, int(cols_odd), 0, int(depth_odd)]) + # out_padding = 0 + # else: + # out_padding = 1 + out = nn.conv_transpose3d(input, weight=filters, padding=(padding_rows // 2, padding_cols // 2, padding_depth // 2), - stride=self.strides, dilation=self.dilations, output_padding=out_padding) + stride=self.strides, dilation=self.dilations, output_padding=0) return out @@ -1735,6 +1829,8 @@ def __call__(self, inputs): if self.data_format == 'channels_last': out = nchw_to_nhwc(out) return out + + class GroupConv2D(object): def __init__(self, strides, padding, data_format, dilations, out_channel, k_size, groups=1): @@ -1765,13 +1861,21 @@ class SeparableConv2D(object): def __init__(self, strides, padding, data_format, dilations, out_channel, k_size, in_channel, depth_multiplier): self.data_format, self.padding = preprocess_2d_format(data_format, padding) + # print(f"SeparableConv2D-_strides = {strides}") + dilations = dilations[1:] + [dilations[0]] + + # print(f"SeparableConv2D-_dilations = {dilations}") self.depthwise_conv = Conv2D(strides, self.padding, self.data_format, dilations, groups=in_channel) - self.pointwise_conv = Conv2D((1, 1), self.padding, self.data_format, (1, 1)) + self.strides = (0,1,1,0) + self.dialations = (1,1) + self.pointwise_conv = Conv2D(self.strides, self.padding, self.data_format, self.dialations) def __call__(self, input, filter, point_filter=None): + depthwise_conv = self.depthwise_conv(input, filter) pointwise_conv = self.pointwise_conv(depthwise_conv, point_filter) + # print(f'pointwise_conv = {pointwise_conv.shape}' ) return pointwise_conv @@ -1780,9 +1884,11 @@ class AdaptiveMeanPool1D(object): # def __init__(self, output_size, data_format): # self.data_format, _ = preprocess_1d_format(data_format, None) # self.op = nn.AdaptiveAvgPool1d(output_size=output_size) + def __init__(self, *args, **kwargs): + pass - def __call__(): - raise NotImplementedError + def __call__(self, *args, **kwargs): + raise 
NotImplementedError(" AdaptiveMeanPool1D is not implemented in Jittor backend") # if self.data_format == 'NLC': # input = nhwc_to_nchw(input) # output = self.op(input) @@ -1797,8 +1903,11 @@ class AdaptiveMeanPool2D(object): # self.data_format, _ = preprocess_2d_format(data_format, None) # self.op = nn.AdaptiveMeanPool2d(output_size=output_size) - def __call__(): - raise NotImplementedError + def __init__(self, *args, **kwargs): + pass + + def __call__(self, *args, **kwargs): + raise NotImplementedError("AdaptiveMeanPool2D is not implemented in Jittor backend") # if self.data_format == 'NHWC': # inputs = nhwc_to_nchw(inputs) # output = self.op(inputs) @@ -1809,18 +1918,13 @@ def __call__(): class AdaptiveMeanPool3D(object): - # def __init__(self, output_size, data_format): - # self.data_format, _ = preprocess_3d_format(data_format, None) - # self.op = torch.nn.AdaptiveAvgPool3d(output_size=output_size) - def __call__(): - raise NotImplementedError - # if self.data_format == 'NDHWC': - # inputs = nhwc_to_nchw(inputs) - # output = self.op(inputs) - # if self.data_format == 'NDHWC': - # output = nchw_to_nhwc(output) - # return output + + def __init__(self, *args, **kwargs): + pass + + def __call__(self, *args, **kwargs): + raise NotImplementedError(" AdaptiveMeanPool3D is not implemented in Jittor backend") def adaptive_avg_pool1d(input, output_size): @@ -1844,8 +1948,12 @@ class AdaptiveMaxPool1D(object): # self.data_format, _ = preprocess_1d_format(data_format, None) # self.op = torch.nn.AdaptiveMaxPool1d(output_size=output_size) - def __call__(self, input): - raise NotImplementedError + + def __init__(self, *args, **kwargs): + pass + + def __call__(self, *args, **kwargs): + raise NotImplementedError(" AdaptiveMaxPool1D is not implemented in Jittor backend") # if self.data_format == 'NLC': # input = nhwc_to_nchw(input) # output = self.op(input) @@ -1875,12 +1983,25 @@ def __init__(self, output_size, data_format): self.data_format, _ = preprocess_3d_format(data_format, None) self.op = nn.AdaptiveMaxPool3d(output_size=output_size) def __call__(self, inputs): - if self.data_format == 'NDHWC': - inputs = nhwc_to_nchw(inputs) - output = self.op(inputs) - if self.data_format == 'NDHWC': - output = nchw_to_nhwc(output) - return output + + raise NotImplementedError + # if self.data_format == 'NDHWC': + # inputs = nhwc_to_nchw(inputs) + + # # Debugging print statements + # print(f"Input shape before pooling: {inputs.shape}") + # print(f"Input type before pooling: {type(inputs)}") + + # output = self.op(inputs) + + # # Debugging print statements + # print(f"Output shape after pooling: {output.shape}") + # print(f"Output type after pooling: {type(output)}") + + # if self.data_format == 'NDHWC': + # output = nchw_to_nhwc(output) + # return output + def adaptive_max_pool1d(input, output_size, return_indices = False): raise NotImplementedError @@ -1896,118 +2017,138 @@ def adaptive_max_pool3d(input, output_size, return_indices=False): class BinaryConv2D(object): - def __init__(self, strides, padding, data_format, dilations, out_channel, k_size, in_channel): - self.data_format, self.padding = preprocess_2d_format(data_format, padding) - self.strides = strides - self.dilations = dilations + def __init__(self, *args, **kwargs): + pass - def quantize(self, x): - raise NotImplementedError + def __call__(self, *args, **kwargs): + raise NotImplementedError(" BinaryConv2D is not implemented in Jittor backend") - def __call__(self, inputs, filters): - raise NotImplementedError class DorefaConv2D(object): - def 
__init__(self, bitW, bitA, strides, padding, data_format, dilations, out_channel, k_size, in_channel): - self.data_format, self.padding = preprocess_2d_format(data_format, padding) - self.strides = strides - self.dilations = dilations - self.bitW = bitW - self.bitA = bitA - - def _quantize_dorefa(self, x, k): - raise NotImplementedError - - def cabs(self, x): - raise NotImplementedError - - def quantize_active(self, x, bitA): - raise NotImplementedError - - def quantize_weight(self, x, bitW, force_quantization=False): - raise NotImplementedError - - def __call__(self, inputs, filters): - raise NotImplementedError + def __init__(self, *args, **kwargs): + pass + def __call__(self, *args, **kwargs): + raise NotImplementedError(" DorefaConv2D is not implemented in Jittor backend") + class rnncell(object): + def __init__(self, weight_ih, weight_hh, bias_ih, bias_hh, act): + self.weight_ih = weight_ih + self.weight_hh = weight_hh + self.bias_ih = bias_ih + self.bias_hh = bias_hh + self.act = act - def __init__(self, input_size , hidden_size , bias = True, nonlinearity='tanh'): - self.input_size = input_size - self.hidden_size= hidden_size - self.bias = bias - self.act = nonlinearity - - def __call__(self, input, h): + def execute(self, input, hx=None): + if hx is None: + hx = jt.zeros((input.shape[0], self.weight_hh.shape[0]), dtype=input.dtype) + + y = jt.matmul(input, self.weight_ih.transpose()) + jt.matmul(hx, self.weight_hh.transpose()) + + if self.bias_ih is not None: + y += self.bias_ih + self.bias_hh + if self.act == 'tanh': - h = nn.RNNCell( - input, - h, - bias=self.bias, - nonlinearity='tanh' - ) + y = jt.tanh(y) + elif self.act == 'relu': + y = jt.relu(y) else: - h = nn.RNNCell( - input, - h, - bias=self.bias, - nonlinearity='relu' - ) - return h, h + raise RuntimeError("Unknown nonlinearity: {}".format(self.act)) + + return y, y + + def __call__(self, input, hx=None): + return self.execute(input, hx) + class lstmcell(object): - - def __init__(self, input_size , hidden_size , bias = True, nonlinearity='tanh'): - self.input_size = input_size - self.hidden_size= hidden_size - self.bias = bias + def __init__(self, weight_ih, weight_hh, bias_ih, bias_hh): + self.weight_ih = weight_ih + self.weight_hh = weight_hh + self.bias_ih = bias_ih + self.bias_hh = bias_hh def __call__(self, input, h, c): - h = (h, c) - h, c = nn.LSTMCell( - input, - h, - bias=self.bias - ) - return h, h, c + gates = jt.matmul(input, jt.transpose(self.weight_ih)) + jt.matmul(h, jt.transpose(self.weight_hh)) + if self.bias_ih is not None: + gates += self.bias_ih + self.bias_hh + + i, f, g, o = jt.chunk(gates, 4, dim=1) + i = jt.sigmoid(i) + f = jt.sigmoid(f) + g = jt.tanh(g) + o = jt.sigmoid(o) + + c_new = f * c + i * g + h_new = o * jt.tanh(c_new) + return h_new, h_new, c_new + + +class grucell(Module): + def __init__(self, weight_ih, weight_hh, bias_ih=None, bias_hh=None): + super(grucell, self).__init__() + self.weight_ih = weight_ih + self.weight_hh = weight_hh + self.bias_ih = bias_ih + self.bias_hh = bias_hh + self.hidden_size = weight_hh.shape[1] + + def execute(self, inputs, states): + hx = states[0] if isinstance(states, (tuple, list)) else states + gates = jt.matmul(inputs, self.weight_ih.t()) + jt.matmul(hx, self.weight_hh.t()) + if self.bias_ih is not None and self.bias_hh is not None: + gates += self.bias_ih + self.bias_hh + + # Separate the gates + r, z, n = jt.chunk(gates, 3, dim=1) + + r = jt.sigmoid(r) + z = jt.sigmoid(z) + n = jt.tanh(n + r * (jt.matmul(hx, self.weight_hh[2 * self.hidden_size:].t()) 
+ (self.bias_hh[2 * self.hidden_size:] if self.bias_hh is not None else 0))) + hy = (1 - z) * n + z * hx + + return hy, hy -class grucell(object): - def __init__(self, input_size , hidden_size , bias = True, nonlinearity='tanh'): - self.input_size = input_size - self.hidden_size= hidden_size - self.bias = bias - def __call__(self, input, h): - h = nn.GRUCell( - input, - h, - bias=self.bias - ) - return h, h class rnnbase(Module): def __init__( self, - mode:str, - input_size:int, - hidden_size:int, - num_layers:int= 1, - bias:bool=True, - batch_first:bool=False , - dropout: float= 0, - bidirectional:bool=False, - proj_size : int = 0 , - nonlinearity: str = None + mode: str, + input_size: int, + hidden_size: int, + num_layers: int = 1, + bias: bool = True, + batch_first: bool = False, + dropout: float = 0, + bidirectional: bool = False, + proj_size: int = 0, + nonlinearity: str = None, + is_train: bool = True, # Additional parameter + w_ih=None, # Additional parameter + w_hh=None, # Additional parameter + b_ih=None, # Additional parameter + b_hh=None # Additional parameter ): super(rnnbase, self).__init__() + + if mode == 'RNN_TANH': + mode = 'RNN' + self.nonlinearity = 'tanh' + elif mode == 'RNN_RELU': + mode = 'RNN' + self.nonlinearity = 'relu' + else: + self.nonlinearity = nonlinearity + self.mode = mode self.input_size = input_size self.hidden_size = hidden_size @@ -2016,8 +2157,13 @@ def __init__( self.batch_first = batch_first self.dropout = dropout self.bidirectional = bidirectional - self.proj_size = proj_size - self.nonlinearity = nonlinearity + self.proj_size = proj_size + + self.is_train = is_train + self.w_ih = w_ih + self.w_hh = w_hh + self.b_ih = b_ih + self.b_hh = b_hh if mode == 'LSTM': gate_size = 4 * hidden_size @@ -2042,18 +2188,18 @@ def build_unit(name, in_channels, out_channels=None): for layer in range(num_layers): if layer == 0: - build_unit(f'weight_ih_l{layer}', gate_size, input_size) + build_unit(f'weight_ih_l{layer}', input_size, gate_size) else: if proj_size > 0: - build_unit(f'weight_ih_l{layer}', gate_size, num_directions * proj_size) + build_unit(f'weight_ih_l{layer}', num_directions * proj_size, gate_size) else: - build_unit(f'weight_ih_l{layer}', gate_size, num_directions * hidden_size) + build_unit(f'weight_ih_l{layer}', num_directions * hidden_size, gate_size) if proj_size > 0: - build_unit(f'weight_hh_l{layer}', gate_size, proj_size) + build_unit(f'weight_hh_l{layer}', proj_size, gate_size) build_unit(f'weight_hr_l{layer}', proj_size, hidden_size) else: - build_unit(f'weight_hh_l{layer}', gate_size, hidden_size) + build_unit(f'weight_hh_l{layer}', hidden_size, gate_size) if bias: build_unit(f'bias_ih_l{layer}', gate_size) @@ -2109,9 +2255,80 @@ def copy_to(param_name, offset_idx, idx): else: raise RuntimeError("Not Cudnn found") - @abstractmethod def call_rnn_cell(self, input, hidden, suffix): - pass + if self.mode == 'RNN': + weight_ih = getattr(self, f'weight_ih_{suffix}') + weight_hh = getattr(self, f'weight_hh_{suffix}') + bias_ih = getattr(self, f'bias_ih_{suffix}', None) + bias_hh = getattr(self, f'bias_hh_{suffix}', None) + + + preact = jt.matmul(input, weight_ih) + jt.matmul(hidden, weight_hh) + if self.bias: + preact += bias_ih + bias_hh + + if self.nonlinearity == 'tanh': + hidden_new = jt.tanh(preact) + elif self.nonlinearity == 'relu': + hidden_new = jt.relu(preact) + return hidden_new, hidden_new + + elif self.mode == 'LSTM': + weight_ih = getattr(self, f'weight_ih_{suffix}') + weight_hh = getattr(self, f'weight_hh_{suffix}') + bias_ih = 
getattr(self, f'bias_ih_{suffix}', None) + bias_hh = getattr(self, f'bias_hh_{suffix}', None) + + + # Adjust the dimensions to match the expected shapes + gates_input = jt.matmul(input, weight_ih) + + gates_hidden = jt.matmul(hidden[0], weight_hh) + + gates = gates_input + gates_hidden + + if self.bias: + gates += bias_ih + bias_hh + + i, f, g, o = jt.chunk(gates, 4, dim=1) + i = jt.sigmoid(i) + f = jt.sigmoid(f) + g = jt.tanh(g) + o = jt.sigmoid(o) + + + c_new = f * hidden[1] + i * g + h_new = o * jt.tanh(c_new) + + + return h_new, (h_new, c_new) + + elif self.mode == 'GRU': + weight_ih = getattr(self, f'weight_ih_{suffix}') + weight_hh = getattr(self, f'weight_hh_{suffix}') + bias_ih = getattr(self, f'bias_ih_{suffix}', None) + bias_hh = getattr(self, f'bias_hh_{suffix}', None) + + gates_input = jt.matmul(input, weight_ih) + + gates_hidden = jt.matmul(hidden, weight_hh) + + gates = gates_input + gates_hidden + + if self.bias: + gates += bias_ih + bias_hh + + r, z, n = jt.chunk(gates, 3, dim=1) + r = jt.sigmoid(r) + z = jt.sigmoid(z) + n = jt.tanh(n + r * (jt.matmul(hidden, weight_hh[:, 2 * self.hidden_size:]))) + + h_new = (1 - z) * n + z * hidden + return h_new, h_new + + + else: + raise ValueError("Unrecognized RNN mode: " + self.mode) def call_rnn_sequence(self, input, hidden, suffix): if 'reverse' in suffix: @@ -2221,6 +2438,7 @@ def __call__(self, input): return nn.layer_norm(input, self.normalized_shape, self.gamma, self.beta, self.eps) + class multiheadattention(Module): def __init__( self, @@ -2230,10 +2448,21 @@ def __init__( vdim=None, dropout=0.0, bias=True, + batch_first=False, + need_weights=True, add_bias_kv=False, add_zero_attn=False, self_attention=False, encoder_decoder_attention=False, + q_weight=None, + k_weight=None, + v_weight=None, + out_weight=None, + q_bias=None, + k_bias=None, + v_bias=None, + out_bias=None, + train=True, q_noise=0.0, qn_block_size=8, ): @@ -2244,7 +2473,7 @@ def __init__( self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim self.num_heads = num_heads - assert dropout==0, "TODO: dropout>0" + self.dropout = dropout self.head_dim = embed_dim // num_heads assert (self.head_dim * num_heads == self.embed_dim), "embed_dim must be divisible by num_heads" @@ -2252,17 +2481,31 @@ def __init__( self.self_attention = self_attention self.encoder_decoder_attention = encoder_decoder_attention + self.batch_first = batch_first + self.need_weights = need_weights + self.is_train = train + + assert not self.self_attention or self.qkv_same_dim, ( + "Self-attention requires query, key and value to be of the same size") - assert not self.self_attention or self.qkv_same_dim, ("Self-attention requires query, key and " "value to be of the same size") + self.q_weight = q_weight + self.k_weight = k_weight + self.v_weight = v_weight + self.out_weight = out_weight + self.q_bias = q_bias + self.k_bias = k_bias + self.v_bias = v_bias + self.out_bias = out_bias - #TODO: quant_noise - self.k_proj = nn.Linear(self.kdim, embed_dim, bias=bias) - self.v_proj = nn.Linear(self.vdim, embed_dim, bias=bias) - self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.k_proj = nn.Linear(self.kdim, embed_dim, bias=bias) if k_weight is None else None + self.v_proj = nn.Linear(self.vdim, embed_dim, bias=bias) if v_weight is None else None + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) if q_weight is None else None + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) if out_weight is None else None - self.out_proj = nn.Linear(embed_dim, embed_dim, 
bias=bias) + self.dropout_layer = nn.Dropout(dropout) assert not add_bias_kv, "TODO: add_bias_kv=True" + self.bias_k = self.bias_v = None self.add_zero_attn = add_zero_attn @@ -2273,52 +2516,51 @@ def __init__( self.tpu = False def reset_parameters(self): - ''' - 初始化参数 - - 代码示例: - >>> multihead_attn = jt.attention.MultiheadAttention(embed_dim, num_heads) - >>> multihead_attn.reset_parameters() - - - ''' if self.qkv_same_dim: - # Empirically observed the convergence to be much better with - # the scaled initialization - init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2)) - init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2)) - init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2)) + if self.q_proj is not None: + init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2)) + if self.k_proj is not None: + init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2)) + if self.v_proj is not None: + init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2)) else: - init.xavier_uniform_(self.k_proj.weight) - init.xavier_uniform_(self.v_proj.weight) - init.xavier_uniform_(self.q_proj.weight) - - # init.xavier_uniform_(self.out_proj.weight) - if self.out_proj.bias is not None: + if self.q_proj is not None: + init.xavier_uniform_(self.q_proj.weight) + if self.k_proj is not None: + init.xavier_uniform_(self.k_proj.weight) + if self.v_proj is not None: + init.xavier_uniform_(self.v_proj.weight) + + if self.out_proj is not None and self.out_proj.bias is not None: init.constant_(self.out_proj.bias, 0.) if self.bias_k is not None: init.xavier_normal_(self.bias_k) if self.bias_v is not None: init.xavier_normal_(self.bias_v) - - def execute( self, query, - key = None, - value = None, - key_padding_mask = None, - incremental_state = None, - need_weights = True, - static_kv = False, - attn_mask = None, - before_softmax = False, - need_head_weights = False, + key=None, + value=None, + key_padding_mask=None, + incremental_state=None, + need_weights=True, + static_kv=False, + attn_mask=None, + before_softmax=False, + need_head_weights=False, ): if need_head_weights: need_weights = True + if self.batch_first: + query = query.transpose(1, 0, 2) + if key is not None: + key = key.transpose(1, 0, 2) + if value is not None: + value = value.transpose(1, 0, 2) + tgt_len, bsz, embed_dim = query.shape assert embed_dim == self.embed_dim assert list(query.shape) == [tgt_len, bsz, embed_dim] @@ -2326,25 +2568,11 @@ def execute( assert incremental_state is None, "TODO: incremental_state is not None" saved_state = None - if self.self_attention: - q = self.q_proj(query) - k = self.k_proj(query) - v = self.v_proj(query) - elif self.encoder_decoder_attention: - # encoder-decoder attention - q = self.q_proj(query) - if key is None: - assert value is None - k = v = None - else: - k = self.k_proj(key) - v = self.v_proj(key) - else: - assert key is not None and value is not None - q = self.q_proj(query) - k = self.k_proj(key) - v = self.v_proj(value) - q = q*self.scaling + q = jt.matmul(query, self.q_weight) + self.q_bias if self.q_weight is not None else self.q_proj(query) + k = jt.matmul(query, self.k_weight) + self.k_bias if self.k_weight is not None else self.k_proj(query) + v = jt.matmul(query, self.v_weight) + self.v_bias if self.v_weight is not None else self.v_proj(query) + + q = q * self.scaling assert self.bias_k is None, "TODO: self.bias_k is not None:" @@ -2358,84 +2586,85 @@ def execute( assert k is not None src_len = k.shape[1] - assert key_padding_mask is None, "TODO: 
key_padding_mask is not None" assert not self.add_zero_attn, "TODO: self.add_zero_attn=True" attn_weights = nn.bmm(q, k.transpose(0, 2, 1)) assert list(attn_weights.shape) == [bsz * self.num_heads, tgt_len, src_len] - assert attn_mask is None, "TODO: attn_mask is not None" - assert key_padding_mask is None, "TODO: key_padding_mask is not None" + # Apply the attention mask if provided + if attn_mask is not None: + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.masked_fill(attn_mask.unsqueeze(1).unsqueeze(2), float('-inf')) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + # Apply key padding mask + if key_padding_mask is not None: + key_padding_mask = key_padding_mask[:bsz, :] + key_padding_mask = key_padding_mask.unsqueeze(1).unsqueeze(2) + key_padding_mask = key_padding_mask.expand(bsz, self.num_heads, 1, src_len) + key_padding_mask = key_padding_mask.reshape(bsz * self.num_heads, 1, src_len) + attn_weights = attn_weights.masked_fill(key_padding_mask, float('-inf')) + if before_softmax: return attn_weights, v - + attn_weights_float = nn.softmax(attn_weights, dim=-1) attn_weights = attn_weights_float.type_as(attn_weights) assert v is not None attn = nn.bmm(attn_weights, v) assert list(attn.shape) == [bsz * self.num_heads, tgt_len, self.head_dim] + if self.dropout > 0.0: + attn = self.dropout_layer(attn) if self.onnx_trace and attn.shape[1] == 1: - # when ONNX tracing a single decoder step (sequence length == 1) - # the transpose is a no-op copy before view, thus unnecessary attn = attn.view(tgt_len, bsz, embed_dim) else: attn = attn.transpose(1, 0, 2).view(tgt_len, bsz, embed_dim) - attn = self.out_proj(attn) + attn = jt.matmul(attn, self.out_weight) + self.out_bias if self.out_weight is not None else self.out_proj(attn) attn_weights = None if need_weights: attn_weights = attn_weights_float.view(bsz, self.num_heads, tgt_len, src_len).transpose(1, 0, 2, 3) if not need_head_weights: - # average attention weights over heads attn_weights = attn_weights.mean(dims=[0]) return attn, attn_weights + class BinaryDense(object): - def __init__(self, weights, bias): - self.weights = weights - self.bias = bias + def __init__(self, *args, **kwargs): + pass - def __call__(self, inputs): - raise NotImplementedError + def __call__(self, *args, **kwargs): + raise NotImplementedError(" BinaryDense is not implemented in Jittor backend") class DorefaDense(object): - def __init__(self, weights, bias, bitW, bitA): - self.weights = weights - self.bias = bias - self.bitW = bitW - self.bitA = bitA + def __init__(self, *args, **kwargs): + pass - def __call__(self, inputs): - raise NotImplementedError + def __call__(self, *args, **kwargs): + raise NotImplementedError(" DorefaDense is not implemented in Jittor backend") class TernaryDense(object): - def __init__(self, weights, bias): - self.weights = weights - self.bias = bias + def __init__(self, *args, **kwargs): + pass - def __call__(self, inputs): - raise NotImplementedError + def __call__(self, *args, **kwargs): + raise NotImplementedError(" TernaryDense is not implemented in Jittor backend") class QuanDense(object): - def __init__(self, weights, bias, bitW, bitA): - self.weights = weights - self.bias = bias - self.bitW = bitW - self.bitA = bitA - - def __call__(self, inputs): - raise NotImplementedError + def __init__(self, *args, **kwargs): + pass + def __call__(self, *args, **kwargs): + raise NotImplementedError(" QuanDense is not implemented in Jittor backend") class 
QuanDenseBn(object): @@ -2507,25 +2736,42 @@ def __call__(self, inputs): raise NotImplementedError -class PReLU(object): +class Swish(object): - def __init__(self, data_format): + def __init__(self, *args, **kwargs): + pass + def __call__(self, *args, **kwargs): + raise NotImplementedError(" Swish is not implemented in Jittor backend") + + +class PReLU(object): + def __init__(self, data_format='channels_last'): self.data_format = data_format def __call__(self, input, weight): - if self.data_format == 'channels_last' : + if self.data_format == 'channels_last': input = nhwc_to_nchw(input) - output = nn.PReLU(input, weight) + + if weight.shape[0] > 1: + raise NotImplementedError("num_parameters > 1 is not yet functional in Jittor. Only use this function if num_parameters are < 2") + + else: + prelu_layer = nn.PReLU(num_parameters=weight.shape[0]) + prelu_layer.weight = weight + output = prelu_layer(input) + if self.data_format == 'channels_last': output = nchw_to_nhwc(output) + return output def prelu(input, weight, data_format): if data_format == 'channels_last': - input = nhwc_to_nchw(input) - output = nn.PReLU(input, weight) + input = nhwc_to_nchw(input) + num_parameters = weight.shape[0] + output = nn.PReLU(input, num_parameters) if data_format == 'channels_last': output = nchw_to_nhwc(output) return output diff --git a/tensorlayerx/nn/core/core_jittor.py b/tensorlayerx/nn/core/core_jittor.py index 8bd8942..635549d 100644 --- a/tensorlayerx/nn/core/core_jittor.py +++ b/tensorlayerx/nn/core/core_jittor.py @@ -12,6 +12,7 @@ from collections import OrderedDict, abc as container_abcs import warnings import tensorlayerx as tlx +from tensorlayerx.nn.initializers import xavier_uniform _global_layer_name_dict = {} _global_layer_node = [] @@ -88,10 +89,16 @@ def build(self, inputs_shape): def forward(self, *inputs, **kwargs): raise Exception("The forward method must be implemented by inherited class") - def _get_weights(self, var_name, shape, init=None, trainable=True, transposed=None, order=False): + def _get_weights(self, var_name, shape, init=xavier_uniform(), trainable=True, transposed=None, order=False): + + if isinstance(shape, int): + shape = (shape,) + + if order: - w_tmp = Parameter(init(shape), requires_grad=trainable) + w_tmp = jt.nn.Parameter(init(shape), requires_grad=trainable) return w_tmp + if len(shape) == 3: shape = shape[::-1] @@ -104,7 +111,7 @@ def _get_weights(self, var_name, shape, init=None, trainable=True, transposed=No shape = (shape[4], shape[3], shape[0], shape[1], shape[2]) # TODO paramters name should be add _param = init(shape) - param = Parameter(_param, requires_grad=trainable) + param = jt.nn.Parameter(_param, requires_grad=trainable) self.var_name = var_name return param @@ -598,12 +605,13 @@ def update(self, modules): class ParameterList(Module): - def __init__(self, parameters=None): super(ParameterList, self).__init__() + self._initialized = False + self._param_list = [] # Use a different internal attribute name self._initialized = True if parameters is not None: - self += parameters + self.extend(parameters) def __setstate__(self, state): state['_initialized'] = False @@ -616,30 +624,30 @@ def _get_abs_string_index(self, idx): raise IndexError('index {} is out of range'.format(idx)) if idx < 0: idx += len(self) - return str(idx) + return idx def __getitem__(self, idx): if isinstance(idx, slice): - return self.__class__(list(self._parameters.values())[idx]) + return self.__class__(self._param_list[idx]) else: idx = self._get_abs_string_index(idx) - return 
self._parameters[str(idx)] + return self._param_list[idx] def __setitem__(self, idx, param): idx = self._get_abs_string_index(idx) - return self.register_parameter(str(idx), param) + self._param_list[idx] = param def __setattr__(self, key, value): if getattr(self, "_initialized", False): - if not hasattr(self, key) and not isinstance(value, jt.nn.Parameter): + if not hasattr(self, key) and not isinstance(value, jt.Var): warnings.warn("Setting attributes on ParameterList is not supported.") super(ParameterList, self).__setattr__(key, value) def __len__(self): - return len(self._parameters) + return len(self._param_list) def __iter__(self): - return iter(self._parameters.values()) + return iter(self._param_list) def __iadd__(self, parameters): return self.extend(parameters) @@ -650,8 +658,7 @@ def __dir__(self): return keys def append(self, parameter): - - self.register_parameter(str(len(self)), parameter) + self._param_list.append(parameter) return self def extend(self, parameters): @@ -660,15 +667,16 @@ def extend(self, parameters): "ParameterList.extend should be called with an " "iterable, but got " + type(parameters).__name__ ) - offset = len(self) - for i, param in enumerate(parameters): - self.register_parameter(str(offset + i), param) + for param in parameters: + self.append(param) return self def __call__(self, input): raise RuntimeError('ParameterList should not be called.') + + class ParameterDict(Module): def __init__(self, parameters=None): diff --git a/tensorlayerx/nn/initializers/jittor_initializers.py b/tensorlayerx/nn/initializers/jittor_initializers.py index ca4664d..4e556d3 100644 --- a/tensorlayerx/nn/initializers/jittor_initializers.py +++ b/tensorlayerx/nn/initializers/jittor_initializers.py @@ -162,7 +162,7 @@ def __init__(self, minval=-0.05, maxval=0.05, seed=None): def __call__(self, shape, dtype=tlx.float32): _tensor = jt.empty(shape, dtype=dtype) - return jt.nn.init.uniform_(_tensor, a=self.minval, b=self.maxval) + return jt.nn.init.uniform_(_tensor, low=self.minval, high=self.maxval) def get_config(self): return {"minval": self.minval, "maxval": self.maxval, "seed": self.seed} diff --git a/tensorlayerx/nn/layers/convolution/separable_conv.py b/tensorlayerx/nn/layers/convolution/separable_conv.py index ce15177..c77db3d 100644 --- a/tensorlayerx/nn/layers/convolution/separable_conv.py +++ b/tensorlayerx/nn/layers/convolution/separable_conv.py @@ -113,7 +113,7 @@ def build(self, inputs_shape): if BACKEND == 'tensorflow': self.depthwise_filter_shape = (self.kernel_size, self.in_channels, self.depth_multiplier) - elif BACKEND in ['mindspore', 'paddle', 'torch']: + elif BACKEND in ['mindspore', 'paddle', 'torch','jittor']: self.depthwise_filter_shape = (self.kernel_size, 1, self.depth_multiplier * self.in_channels) self.pointwise_filter_shape = (1, self.depth_multiplier * self.in_channels, self.out_channels) @@ -272,7 +272,7 @@ def build(self, inputs_shape): ) self.pointwise_filter_shape = (1, 1, self.depth_multiplier * self.in_channels, self.out_channels) - elif BACKEND in ['mindspore' , 'paddle', 'torch']: + elif BACKEND in ['mindspore' , 'paddle', 'torch', 'jittor']: self.depthwise_filter_shape = ( self.kernel_size[0], self.kernel_size[1], 1, self.depth_multiplier * self.in_channels ) diff --git a/tensorlayerx/nn/layers/shape.py b/tensorlayerx/nn/layers/shape.py index 2291bd1..4424c0f 100644 --- a/tensorlayerx/nn/layers/shape.py +++ b/tensorlayerx/nn/layers/shape.py @@ -209,7 +209,7 @@ def build(self, inputs_shape=None): self.reshape2 = tlx.ops.Reshape([-1, h, w, 
in_channel]) def forward(self, inputs): - if tlx.BACKEND in ['tensorflow', 'paddle', 'torch']: + if tlx.BACKEND in ['tensorflow', 'paddle', 'torch', 'jittor']: in_shape = tlx.get_tensor_shape(inputs) h, w, in_channel = in_shape[1:] reshape1 = tlx.ops.Reshape([-1, h, w, in_channel // self.group, self.group]) From 772af7ad37d5659ea4de4ec4c143ae9f6f15f206 Mon Sep 17 00:00:00 2001 From: Hisham Date: Wed, 31 Jul 2024 13:31:55 +0300 Subject: [PATCH 26/27] updated jittor to load save weights of model --- examples/basic_tutorials/jiitor_tutorial.py | 537 +++++++++++++++++++ tensorlayerx/files/utils.py | 33 +- tensorlayerx/model/utils.py | 9 +- tensorlayerx/nn/core/core_jittor.py | 5 +- tensorlayerx/optimizers/jittor_optimizers.py | 173 +++++- 5 files changed, 723 insertions(+), 34 deletions(-) create mode 100644 examples/basic_tutorials/jiitor_tutorial.py diff --git a/examples/basic_tutorials/jiitor_tutorial.py b/examples/basic_tutorials/jiitor_tutorial.py new file mode 100644 index 0000000..654835f --- /dev/null +++ b/examples/basic_tutorials/jiitor_tutorial.py @@ -0,0 +1,537 @@ + + +# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +# +++++++++++++++++++++++++++++++++++++ Jittor CNN ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +# import os +# import time +# import tensorlayerx as tlx +# from tensorlayerx.dataflow import Dataset, DataLoader +# from tensorlayerx.vision.transforms import ( +# Compose, Resize, RandomFlipHorizontal, RandomContrast, RandomBrightness, StandardizePerImage, RandomCrop +# ) +# from tensorlayerx.nn import Conv2d, Linear, Flatten, Module, MaxPool2d, BatchNorm2d +# from tensorlayerx.optimizers import Adam +# from tqdm import tqdm + +# # Enable debug logging +# tlx.logging.set_verbosity(tlx.logging.DEBUG) + +# os.environ['TL_BACKEND'] = 'jittor' + +# # Download and prepare the CIFAR10 dataset +# print("Downloading CIFAR10 dataset...") +# X_train, y_train, X_test, y_test = tlx.files.load_cifar10_dataset(shape=(-1, 32, 32, 3), plotable=False) + +# # Define the CIFAR10 dataset +# class CIFAR10Dataset(Dataset): +# def __init__(self, data, label, transforms): +# self.data = data +# self.label = label +# self.transforms = transforms + +# def __getitem__(self, idx): +# x = self.data[idx].astype('uint8') +# y = self.label[idx].astype('int64') +# x = self.transforms(x) +# return x, y + +# def __len__(self): +# return len(self.label) + +# # Define the CIFAR10 images preprocessing pipeline +# train_transforms = Compose([ +# RandomCrop(size=[24, 24]), +# RandomFlipHorizontal(), +# RandomBrightness(brightness_factor=(0.5, 1.5)), +# RandomContrast(contrast_factor=(0.5, 1.5)), +# StandardizePerImage() +# ]) + +# test_transforms = Compose([Resize(size=(24, 24)), StandardizePerImage()]) + +# # Create DataLoaders for training and testing +# print("Processing CIFAR10 dataset...") +# train_dataset = CIFAR10Dataset(data=X_train, label=y_train, transforms=train_transforms) +# test_dataset = CIFAR10Dataset(data=X_test, label=y_test, transforms=test_transforms) + +# train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True) +# test_dataloader = DataLoader(test_dataset, batch_size=128) + + +# class SimpleCNN(Module): +# def __init__(self): +# super(SimpleCNN, self).__init__() +# self.conv1 = Conv2d(16, (3, 3), (1, 1), padding='SAME', act=tlx.nn.ReLU, in_channels=3) +# self.conv2 = Conv2d(32, (3, 3), (1, 1), padding='SAME', act=tlx.nn.ReLU, in_channels=16) +# self.maxpool1 = MaxPool2d((2, 2), (2, 
2), padding='SAME') +# self.conv3 = Conv2d(64, (3, 3), (1, 1), padding='SAME', act=tlx.nn.ReLU, in_channels=32) +# self.bn1 = BatchNorm2d(num_features=64, act=tlx.nn.ReLU) +# self.conv4 = Conv2d(128, (3, 3), (1, 1), padding='SAME', act=tlx.nn.ReLU, in_channels=64) +# self.maxpool2 = MaxPool2d((2, 2), (2, 2), padding='SAME') +# self.flatten = Flatten() +# self.fc1 = Linear(out_features=128, act=tlx.nn.ReLU, in_features=128 * 6 * 6) +# self.fc2 = Linear(out_features=64, act=tlx.nn.ReLU, in_features=128) +# self.fc3 = Linear(out_features=10, act=None, in_features=64) + +# def forward(self, x): +# z = self.conv1(x) +# z = self.conv2(z) +# z = self.maxpool1(z) +# z = self.conv3(z) +# z = self.bn1(z) +# z = self.conv4(z) +# z = self.maxpool2(z) +# z = self.flatten(z) +# z = self.fc1(z) +# z = self.fc2(z) +# z = self.fc3(z) +# return z + + + + +# # Instantiate the model +# model = SimpleCNN() + +# # Define the optimizer +# optimizer = Adam(model.trainable_weights, lr=0.001) + +# # Define the loss function +# loss_fn = tlx.losses.softmax_cross_entropy_with_logits + +# # Training loop +# n_epoch = 2 +# for epoch in range(n_epoch): +# start_time = time.time() +# model.set_train() +# train_loss, n_iter = 0, 0 + +# with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch + 1}/{n_epoch}", unit="batch") as pbar: +# for X_batch, y_batch in train_dataloader: + +# X_batch = tlx.convert_to_tensor(X_batch) +# y_batch = tlx.convert_to_tensor(y_batch) +# _logits = model(X_batch) +# loss = loss_fn(_logits, y_batch) +# optimizer.zero_grad() +# optimizer.step(loss) + +# train_loss += loss.item() # Using .item() to get the scalar value +# n_iter += 1 +# pbar.update(1) + +# print(f"Epoch {epoch + 1} of {n_epoch} took {time.time() - start_time:.2f}s") +# print(f" train loss: {train_loss / n_iter:.4f}") + + + + +# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +# +++++++++++++++++++++++++++++++++++++ Jittor LSTM ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +# import os +# import sys +# import tensorlayerx as tlx +# from tensorlayerx.nn import Module, Linear, LSTM, Embedding +# from tensorlayerx.dataflow import Dataset +# from keras.datasets import imdb +# from keras.preprocessing import sequence +# import numpy as np +# os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0' +# os.environ['TL_BACKEND'] = 'jittor' +# sys.setrecursionlimit(10000) # Increase recursion limit + +# # Set parameters +# max_features = 20000 +# maxlen = 200 + +# prev_h = np.random.random([1, 200, 64]).astype(np.float32) +# prev_h = tlx.convert_to_tensor(prev_h) + +# # Load and preprocess the IMDB dataset +# (X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=max_features) +# X_train = sequence.pad_sequences(X_train, maxlen=maxlen) +# X_test = sequence.pad_sequences(X_test, maxlen=maxlen) + +# vocab_size = max_features +# seq_Len = 200 + + +# class ImdbDataset(Dataset): + +# def __init__(self, X, y): +# self.X = X +# self.y = y + +# def __getitem__(self, index): +# data = self.X[index] +# data = np.concatenate([data[:seq_Len], [0] * (seq_Len - len(data))]).astype('int64') # set +# label = self.y[index].astype('int64') +# return data, label + +# def __len__(self): +# return len(self.y) + + +# class ImdbNet(Module): + +# def __init__(self): +# super(ImdbNet, self).__init__() +# self.embedding = Embedding(num_embeddings=vocab_size, embedding_dim=64) +# self.lstm = LSTM(input_size=64, hidden_size=64) +# self.linear1 = Linear(in_features=64, out_features=64, 
act=tlx.nn.ReLU) +# self.linear2 = Linear(in_features=64, out_features=2) +# def forward(self, x): +# x = self.embedding(x) +# x, _ = self.lstm(x) +# x = tlx.reduce_mean(x, axis=1) +# x = self.linear1(x) +# x = self.linear2(x) +# return x + +# def __repr__(self): +# return "ImdbNet(embedding_dim=64, hidden_size=64, num_classes=2)" + +# def __str__(self): +# return self.__repr__() + +# # Training settings +# n_epoch = 1 +# batch_size = 64 +# print_freq = 2 + +# # Create DataLoader +# train_dataset = ImdbDataset(X=X_train, y=y_train) +# train_loader = tlx.dataflow.DataLoader(train_dataset, batch_size=batch_size, shuffle=True) + +# # Initialize the network +# net = ImdbNet() +# print(net) + +# # Define optimizer, metric, and loss function using TLX functions +# optimizer = tlx.optimizers.Adam(lr=1e-3, params=net.trainable_weights) +# metric = tlx.metrics.Accuracy() +# loss_fn = tlx.losses.softmax_cross_entropy_with_logits + +# # Create and train the model +# model = tlx.model.Model(network=net, loss_fn=loss_fn, optimizer=optimizer, metrics=metric) +# model.train(n_epoch=n_epoch, train_dataset=train_loader, print_freq=print_freq, print_train_batch=True) + + + + +# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +# +++++++++++++++++++++++++++++++++++++ Jittor MLP ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +# # ! /usr/bin/python +# # -*- coding: utf-8 -*- + +# # The same set of code can switch the backend with one line +# import os +# os.environ['TL_BACKEND'] = 'jittor' +# import tensorlayerx as tlx +# from tensorlayerx.nn import Module +# from tensorlayerx.nn import Linear, Dropout +# from tensorlayerx.dataflow import Dataset, DataLoader + +# # ################## Download and prepare the MNIST dataset ################## +# # This is just some way of getting the MNIST dataset from an online location and loading it into numpy arrays +# X_train, y_train, X_val, y_val, X_test, y_test = tlx.files.load_mnist_dataset(shape=(-1, 784)) + +# # ################## MNIST dataset ################## +# # We define a Dataset class for Loading MNIST images and labels. +# class mnistdataset(Dataset): + +# def __init__(self, data=X_train, label=y_train): +# self.data = data +# self.label = label + +# def __getitem__(self, index): +# data = self.data[index].astype('float32') +# label = self.label[index].astype('int64') +# return data, label + +# def __len__(self): +# return len(self.data) + +# # We use DataLoader to batch and shuffle data, and make data into iterators. +# train_dataset = mnistdataset(data=X_train, label=y_train) +# train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True) + +# # ##################### Build the neural network model ####################### +# # This creates an MLP of two hidden Linear layers of 800 units each, followed by a Linear output layer of 10 units. + +# class CustomModel(Module): + +# def __init__(self): +# super(CustomModel, self).__init__() +# # It applies 20% dropout to each Linear layer. +# self.dropout1 = Dropout(p=0.2) +# # Linear layer with 800 units, using ReLU for output. +# self.linear1 = Linear(out_features=800, act=tlx.nn.ReLU, in_features=784) +# self.dropout2 = Dropout(p=0.2) +# # Linear layer with 800 units, using ReLU for output. +# self.linear2 = Linear(out_features=800, act=tlx.nn.ReLU, in_features=800) +# self.dropout3 = Dropout(p=0.2) +# # Linear layer with 10 units, using ReLU for output. 
+# self.linear3 = Linear(out_features=10, act=tlx.nn.ReLU, in_features=800) + +# # We define the forward computation process. +# def forward(self, x): +# z = self.dropout1(x) +# z = self.linear1(z) +# z = self.dropout2(z) +# z = self.linear2(z) +# z = self.dropout3(z) +# out = self.linear3(z) +# return out + +# # We initialize the network +# MLP = CustomModel() +# # Set the number of training cycles +# n_epoch = 50 +# # set print frequency. +# print_freq = 2 + +# # Get training parameters +# train_weights = MLP.trainable_weights +# # Define the optimizer, use the Momentum optimizer, and set the learning rate to 0.05, momentum to 0.9 +# optimizer = tlx.optimizers.Momentum(lr=0.05, momentum= 0.9, params = train_weights ) +# # Define evaluation metrics. +# metric = tlx.metrics.Accuracy() +# # Define loss function, this operator implements the cross entropy loss function with softmax. This function +# # combines the calculation of the softmax operation and the cross entropy loss function +# # to provide a more numerically stable computing. +# loss_fn = tlx.losses.softmax_cross_entropy_with_logits + +# # Using a simple training method without custom trianing loops. +# model = tlx.model.Model(network=MLP, loss_fn=loss_fn, optimizer=optimizer, metrics=metric) +# model.train(n_epoch=n_epoch, train_dataset=train_loader, print_freq=print_freq, print_train_batch=False) + +# # Optionally, you could now dump the network weights to a file like this: +# # model.save_weights('./model.npz', format='npz_dict') +# # model.load_weights('./model.npz', format='npz_dict') + + +# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +# +++++++++++++++++++++++++++++++++++++ Jittor MNIST Sequential ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +#! 
/usr/bin/python +# -*- coding: utf-8 -*- +# import os +# os.environ['TL_BACKEND'] = 'jittor' + + +# from tensorlayerx.nn import Sequential +# from tensorlayerx.nn import Linear +# import tensorlayerx as tlx +# from tensorlayerx.dataflow import Dataset + +# layer_list = [] +# layer_list.append(Linear(out_features=800, act=tlx.nn.ReLU, in_features=784, name='linear1')) +# layer_list.append(Linear(out_features=800, act=tlx.nn.ReLU, in_features=800, name='linear2')) +# layer_list.append(Linear(out_features=10, act=tlx.nn.ReLU, in_features=800, name='linear3')) +# MLP = Sequential(layer_list) + +# X_train, y_train, X_val, y_val, X_test, y_test = tlx.files.load_mnist_dataset(shape=(-1, 784)) + + +# class mnistdataset(Dataset): + +# def __init__(self, data=X_train, label=y_train): +# self.data = data +# self.label = label + +# def __getitem__(self, index): +# data = self.data[index].astype('float32') +# label = self.label[index].astype('int64') + +# return data, label + +# def __len__(self): + +# return len(self.data) + + +# n_epoch = 1 +# batch_size = 128 +# print_freq = 2 +# shuffle_buffer_size = 128 + +# train_weights = MLP.trainable_weights +# optimizer = tlx.optimizers.Momentum(lr=0.05,momentum= 0.9, params=train_weights) +# train_dataset = mnistdataset(data=X_train, label=y_train) +# train_loader = tlx.dataflow.DataLoader(train_dataset, batch_size=batch_size, shuffle=True) +# metric = tlx.metrics.Accuracy() +# model = tlx.model.Model( +# network=MLP, loss_fn=tlx.losses.softmax_cross_entropy_with_logits, optimizer=optimizer, metrics=metric +# ) +# model.train(n_epoch=n_epoch, train_dataset=train_loader, print_freq=print_freq, print_train_batch=False) +# model.save_weights('./model.npz', format='npz_dict') +# model.load_weights('./model.npz', format='npz_dict') + + +# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +# +++++++++++++++++++++++++++++++++++++ Jittor MNIST GAN ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +# #! /usr/bin/python +# # -*- coding: utf-8 -*- + +# import os +# os.environ['TL_BACKEND'] = 'jittor' + +# import time +# import numpy as np +# import tensorlayerx as tlx +# from tensorlayerx.nn import Module, Linear +# from tensorlayerx.dataflow import Dataset +# from tensorlayerx.model import TrainOneStep + +# # ################## Download and prepare the MNIST dataset ################## +# # This is just some way of getting the MNIST dataset from an online location and loading it into numpy arrays +# X_train, y_train, X_val, y_val, X_test, y_test = tlx.files.load_mnist_dataset(shape=(-1, 784)) + +# # ################## MNIST dataset ################## +# # We define a Dataset class for Loading MNIST images and labels. +# class mnistdataset(Dataset): + +# def __init__(self, data=X_train, label=y_train): +# self.data = data +# self.label = label + +# def __getitem__(self, index): +# data = self.data[index].astype('float32') +# label = self.label[index].astype('int64') +# return data, label + +# def __len__(self): +# return len(self.data) + +# # We use DataLoader to batch and shuffle data, and make data into iterators. +# batch_size = 128 +# train_dataset = mnistdataset(data=X_train, label=y_train) +# train_loader = tlx.dataflow.DataLoader(train_dataset, batch_size=batch_size, shuffle=True) + +# # We define generator network. +# class generator(Module): + +# def __init__(self): +# super(generator, self).__init__() +# # Linear layer with 256 units, using ReLU for output. 
+# self.g_fc1 = Linear(out_features=256, in_features=100, act=tlx.nn.ReLU) +# self.g_fc2 = Linear(out_features=256, in_features=256, act=tlx.nn.ReLU) +# self.g_fc3 = Linear(out_features=784, in_features=256, act=tlx.nn.Tanh) + +# def forward(self, x): +# out = self.g_fc1(x) +# out = self.g_fc2(out) +# out = self.g_fc3(out) +# return out + +# # We define discriminator network. +# class discriminator(Module): + +# def __init__(self): +# super(discriminator, self).__init__() +# # Linear layer with 256 units, using ReLU for output. +# self.d_fc1 = Linear(out_features=256, in_features=784, act=tlx.LeakyReLU) +# self.d_fc2 = Linear(out_features=256, in_features=256, act=tlx.LeakyReLU) +# self.d_fc3 = Linear(out_features=1, in_features=256, act=tlx.Sigmoid) + +# def forward(self, x): +# out = self.d_fc1(x) +# out = self.d_fc2(out) +# out = self.d_fc3(out) +# return out + + +# G = generator() +# D = discriminator() + +# # Define the generator network loss calculation process +# class WithLossG(Module): + +# def __init__(self, G, D, loss_fn): +# super(WithLossG, self).__init__() +# self.g_net = G +# self.d_net = D +# self.loss_fn = loss_fn + +# def forward(self, g_data, label): +# fake_image = self.g_net(g_data) +# logits_fake = self.d_net(fake_image) +# valid = tlx.convert_to_tensor(np.ones(logits_fake.shape), dtype=tlx.float32) +# loss = self.loss_fn(logits_fake, valid) +# return loss + +# # Define the discriminator network loss calculation process +# class WithLossD(Module): + +# def __init__(self, G, D, loss_fn): +# super(WithLossD, self).__init__() +# self.g_net = G +# self.d_net = D +# self.loss_fn = loss_fn + +# def forward(self, real_data, g_data): +# logits_real = self.d_net(real_data) +# fake_image = self.g_net(g_data) +# logits_fake = self.d_net(fake_image) + +# valid = tlx.convert_to_tensor(np.ones(logits_real.shape), dtype=tlx.float32) +# fake = tlx.convert_to_tensor(np.zeros(logits_fake.shape), dtype=tlx.float32) + +# loss = self.loss_fn(logits_real, valid) + self.loss_fn(logits_fake, fake) +# return loss + + +# # loss_fn = tlx.losses.sigmoid_cross_entropy +# # optimizer = tlx.optimizers.Momentum(learning_rate=5e-4, momentum=0.5) +# loss_fn = tlx.losses.mean_squared_error + +# # Get training parameters +# g_weights = G.trainable_weights +# d_weights = D.trainable_weights + +# net_with_loss_G = WithLossG(G, D, loss_fn) +# net_with_loss_D = WithLossD(G, D, loss_fn) + +# # Define the optimizers, use the Adam optimizer. 
+# optimizer_g = tlx.optimizers.Adam(lr=3e-4, beta_1=0.5, beta_2=0.999, params= g_weights) +# optimizer_d = tlx.optimizers.Adam(lr=3e-4, params= d_weights) + +# # Initialize one-step training +# train_one_step_g = TrainOneStep(net_with_loss_G, optimizer_g, g_weights) +# train_one_step_d = TrainOneStep(net_with_loss_D, optimizer_d, d_weights) +# n_epoch = 50 + + +# def plot_fake_image(fake_image, num): +# fake_image = tlx.reshape(fake_image, shape=(num, 28, 28)) +# fake_image = tlx.convert_to_numpy(fake_image) +# import matplotlib.pylab as plt +# for i in range(num): +# plt.subplot(int(np.sqrt(num)), int(np.sqrt(num)), i + 1) +# plt.imshow(fake_image[i]) +# plt.show() + +# # Custom training loops +# for epoch in range(n_epoch): +# d_loss, g_loss = 0.0, 0.0 +# n_iter = 0 +# start_time = time.time() +# # Get training data and labels +# for data, label in train_loader: +# noise = tlx.convert_to_tensor(np.random.random(size=(batch_size, 100)), dtype=tlx.float32) +# # Calculate the loss value, and automatically complete the gradient update for discriminator +# _loss_d = train_one_step_d(data, noise) +# # Calculate the loss value, and automatically complete the gradient update for generator +# _loss_g = train_one_step_g(noise, label) +# d_loss += _loss_d +# g_loss += _loss_g + +# n_iter += 1 +# print("Epoch {} of {} took {}".format(epoch + 1, n_epoch, time.time() - start_time)) +# print(" d loss: {}".format(d_loss / n_iter)) +# print(" g loss: {}".format(g_loss / n_iter)) +# fake_image = G(tlx.convert_to_tensor(np.random.random(size=(36, 100)), dtype=tlx.float32)) +# plot_fake_image(fake_image, 36) diff --git a/tensorlayerx/files/utils.py b/tensorlayerx/files/utils.py index 7d0b94e..2dbf3e1 100644 --- a/tensorlayerx/files/utils.py +++ b/tensorlayerx/files/utils.py @@ -75,6 +75,10 @@ def load_keras_model(model_config): if tlx.BACKEND == 'torch': import torch +if tlx.BACKEND == 'jittor': + import jittor + + if sys.version_info[0] == 2: from urllib import urlretrieve else: @@ -1805,6 +1809,8 @@ def save_npz(save_list=None, name='model.npz'): save_list_var = ms_variables_to_numpy(save_list) elif tlx.BACKEND == 'paddle': save_list_var = pd_variables_to_numpy(save_list) + elif tlx.BACKEND == 'jittor': + save_list_var = jt_variables_to_numpy(save_list) elif tlx.BACKEND == 'torch': save_list_var = th_variables_to_numpy(save_list) else: @@ -1904,6 +1910,10 @@ def construct(self, x): elif tlx.BACKEND == 'torch': for idx, param in enumerate(weights): assign_th_variable(network.all_weights[idx], param) + elif tlx.BACKEND == 'jittor': + for idx, param in enumerate(weights): + assign_jt_variable(network.all_weights[idx], param) + else: raise NotImplementedError("This backend is not supported") @@ -1959,6 +1969,8 @@ def save_npz_dict(save_list=None, name='model.npz'): save_list_var = tf_variables_to_numpy(save_list) elif tlx.BACKEND == 'mindspore': save_list_var = ms_variables_to_numpy(save_list) + elif tlx.BACKEND == 'jittor': + save_list_var = jt_variables_to_numpy(save_list) elif tlx.BACKEND == 'paddle': save_list_var = pd_variables_to_numpy(save_list) elif tlx.BACKEND == 'torch': @@ -2493,6 +2505,16 @@ def th_variables_to_numpy(variables): return results +def jt_variables_to_numpy(variables): + if not isinstance(variables, list): + var_list = [variables] + else: + var_list = variables + results = [v.cpu().detach().numpy() for v in var_list] + return results + + + def assign_tf_variable(variable, value): """Assign value to a TF variable""" variable.assign(value) @@ -2522,6 +2544,10 @@ def 
assign_th_variable(variable, value): variable.data = torch.as_tensor(value) +def assign_jt_variable(variable, value): + variable.set_value(value) + + def _save_weights_to_hdf5_group(f, save_list): """ Save layer/model weights into hdf5 group recursively. @@ -2546,6 +2572,9 @@ def _save_weights_to_hdf5_group(f, save_list): save_list_var = ms_variables_to_numpy(save_list) elif tlx.BACKEND == 'paddle': save_list_var = pd_variables_to_numpy(save_list) + elif tlx.BACKEND == 'jittor': + save_list_var = jt_variables_to_numpy(save_list) + elif tlx.BACKEND == 'torch': save_list_names = [] save_list_var = [] @@ -2716,7 +2745,9 @@ def load_hdf5_to_weights_in_order(filepath, network, skip=False): assign_param = Tensor(weights[key], dtype=ms.float32) assign_ms_variable(network.all_weights[net_weights_name.index(key_t)], assign_param) elif tlx.BACKEND == 'paddle': - assign_pd_variable(network.all_weights[net_weights_name.index(key_t)], weights[key]) + assign_pd_variable(network.all_weights[net_weights_name.index(key_t)], weights[key]) + elif tlx.BACKEND == 'jittor': + assign_jt_variable(network.all_weights[net_weights_name.index(key_t)], weights[key]) elif tlx.BACKEND == 'torch': assign_th_variable(torch_weights_dict[key_t], weights[key]) else: diff --git a/tensorlayerx/model/utils.py b/tensorlayerx/model/utils.py index 541b6a3..d7d9c2a 100644 --- a/tensorlayerx/model/utils.py +++ b/tensorlayerx/model/utils.py @@ -228,10 +228,11 @@ def __init__(self, net_with_loss, optimizer, train_weights): self.train_weights = train_weights def __call__(self, data, label, *args, **kwargs): - loss = self.net_with_loss(data, label, *args, **kwargs) - grads = self.optimizer.gradient(loss, self.train_weights) - self.optimizer.apply_gradients(zip(grads, self.train_weights)) - return loss.numpy() + # loss = self.net_with_loss(data, label, *args, **kwargs) + # grads = self.optimizer.gradient(loss, self.train_weights) + # self.optimizer.apply_gradients(zip(grads, self.train_weights)) + # return loss.numpy() + return NotImplementedError('TrainOneStep With jittor is not Implemented') class TrainOneStepWithGradientClippingTF(object): diff --git a/tensorlayerx/nn/core/core_jittor.py b/tensorlayerx/nn/core/core_jittor.py index 635549d..f69d712 100644 --- a/tensorlayerx/nn/core/core_jittor.py +++ b/tensorlayerx/nn/core/core_jittor.py @@ -77,7 +77,10 @@ def set_train(self, mode=True): raise ValueError("training mode is expected to be boolean") self.is_train = mode for module in self.children(): - module.set_train(mode) + if hasattr(module, 'set_train'): + module.set_train(mode) + else: + module.is_train = mode return self def set_eval(self): diff --git a/tensorlayerx/optimizers/jittor_optimizers.py b/tensorlayerx/optimizers/jittor_optimizers.py index 29ce336..4905940 100644 --- a/tensorlayerx/optimizers/jittor_optimizers.py +++ b/tensorlayerx/optimizers/jittor_optimizers.py @@ -29,8 +29,117 @@ def app_gradients(self): class Adam(object): - def __init__(self, params, lr=0.001, beta_1=0.9, beta_2=0.999, eps=1e-8, weight_decay=0.0): - self.optimizer = optimizer.Adam(params, lr=lr, eps=eps, betas=(beta_1, beta_2), weight_decay=weight_decay) + def __init__( + self, + params, + lr=0.001, + beta_1=0.9, + beta_2=0.999, + eps=1e-8, + weight_decay=0.0, + momentum = 0.0, + grad_clip=None + + ): + + self.optimizer = optimizer.Adam( + params, + lr=lr, + eps=eps, + betas=(beta_1, beta_2), + weight_decay=weight_decay) + + self.lr = lr + self.beta_1 = beta_1 + self.beta_2 = beta_2 + self.eps = eps + self.init_optim = False + self.weight_decay = 
weight_decay + self.grad_clip = grad_clip + + + @jt.no_grad() + def apply_gradients(self, grads_and_vars=None, closure=None): + if not self.init_optim: + raise AttributeError("Can not apply gradients before zero_grad call.") + loss = None + if closure is not None: + with jt.enable_grad(): + loss = closure() + + for group in self.optimizer_adam.param_groups: + params_with_grad = [] + grads = [] + exp_avgs = [] + exp_avg_sqs = [] + max_exp_avg_sqs = [] + state_steps = [] + beta1, beta2 = group['betas'] + + for p in group['params']: + if p.grad is not None: + params_with_grad.append(p) + if p.grad.is_sparse: + raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') + grads.append(p.grad) + + state = self.optimizer_adam.state[p] + # Lazy state initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = jt.zeros_like(p) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = jt.zeros_like(p) + if group['amsgrad']: + # Maintains max of all exp. moving avg. of sq. grad. values + state['max_exp_avg_sq'] = jt.zeros_like(p) + + exp_avgs.append(state['exp_avg']) + exp_avg_sqs.append(state['exp_avg_sq']) + + if group['amsgrad']: + max_exp_avg_sqs.append(state['max_exp_avg_sq']) + + # update the steps for each param group update + state['step'] += 1 + # record the step after step update + state_steps.append(state['step']) + + jt.optim.Adam(params_with_grad, + grads, + exp_avgs, + exp_avg_sqs, + max_exp_avg_sqs, + state_steps, + amsgrad=group['amsgrad'], + beta1=beta1, + beta2=beta2, + lr=get_lr(self.lr), + weight_decay=group['weight_decay'], + eps=group['eps']) + return loss + + def gradient(self, loss, weights=None, return_grad=True): + if weights is None: + raise AttributeError("Parameter train_weights must be entered.") + if not self.init_optim: + self.optimizer_adam = optimizer.Adam( + params=weights, lr=get_lr(self.lr), betas=(self.beta_1, self.beta_2), eps=self.eps, + weight_decay=self.weight_decay + ) + self.init_optim = True + self.optimizer_adam.zero_grad() + self.optimizer_adam.step(loss) + + if self.grad_clip is not None: + self.grad_clip(weights) + + if return_grad ==True: + return _grads(weights) + else: + return None + def step(self, loss=None): self.optimizer.step(loss) @@ -283,62 +392,62 @@ class Momentum(object): def __init__( self, + params, # Add params to the constructor lr=0.001, - momentum=0, + momentum=0.9, weight_decay=0.0, nesterov=False, grad_clip=None, ): self.lr = lr self.momentum = momentum - self.init_optim = False self.weight_decay = weight_decay self.nesterov = nesterov self.grad_clip = grad_clip + self.init_optim = False + + self.optimizer = optimizer.SGD( # Initialize the Jittor SGD optimizer + params, lr=lr, momentum=momentum, weight_decay=weight_decay, nesterov=nesterov + ) @jt.no_grad() def apply_gradients(self, grads_and_vars=None, closure=None): if not self.init_optim: - raise AttributeError("Can not apply gradients before zero_grad call.") + raise AttributeError("Cannot apply gradients before zero_grad call.") loss = None if closure is not None: with jt.enable_grad(): loss = closure() - for group in self.optimizer_momentum.param_groups: + for group in self.optimizer.param_groups: params_with_grad = [] d_p_list = [] momentum_buffer_list = [] - weight_decay = group['weight_decay'] - momentum = group['momentum'] - dampening = group['dampening'] - nesterov = group['nesterov'] - lr = get_lr(self.lr) for p in group['params']: if p.grad 
is not None: params_with_grad.append(p) d_p_list.append(p.grad) - state = self.optimizer_momentum.state[p] + state = self.optimizer.state[p] if 'momentum_buffer' not in state: momentum_buffer_list.append(None) else: momentum_buffer_list.append(state['momentum_buffer']) optimizer.SGD(params_with_grad, - d_p_list, - momentum_buffer_list, - weight_decay=weight_decay, - momentum=momentum, - lr=lr, - dampening=dampening, - nesterov=nesterov) - - # update momentum_buffers in state + d_p_list, + momentum_buffer_list, + weight_decay=group['weight_decay'], + momentum=group['momentum'], + lr=self.lr, + dampening=group['dampening'], + nesterov=group['nesterov']) + + # Update momentum_buffers in state for p, momentum_buffer in zip(params_with_grad, momentum_buffer_list): - state = self.optimizer_momentum.state[p] + state = self.optimizer.state[p] state['momentum_buffer'] = momentum_buffer return loss @@ -347,21 +456,28 @@ def gradient(self, loss, weights=None, return_grad=True): if weights is None: raise AttributeError("Parameter train_weights must be entered.") if not self.init_optim: - self.optimizer_momentum = optimizer.SGD( - params=weights, lr=get_lr(self.lr), momentum=self.momentum, weight_decay=self.weight_decay, nesterov=self.nesterov + self.optimizer = optimizer.SGD( + params=weights, lr=self.lr, momentum=self.momentum, weight_decay=self.weight_decay, nesterov=self.nesterov ) self.init_optim = True - self.optimizer_momentum.zero_grad() + self.optimizer.zero_grad() loss.backward() if self.grad_clip is not None: self.grad_clip(weights) - if return_grad ==True: + if return_grad: return _grads(weights) else: return None + def step(self, loss=None): + self.optimizer.step(loss) + + def zero_grad(self): + self.optimizer.zero_grad() + + def Lamb(**kwargs): raise Exception('Lamb optimizer function not implemented') @@ -371,12 +487,13 @@ def LARS(**kwargs): raise Exception('LARS optimizer function not implemented') -def _grads(weights): +def _grads(weights, optimizer_adam): grads = [] for w in weights: - grads.append(w.grad) + grads.append(w.opt_grad(optimizer_adam)) return grads + def get_lr(lr): if isinstance(lr, LRScheduler): return lr() From 80f3f6c1e4685dcebca38eddb6e2e567818b686a Mon Sep 17 00:00:00 2001 From: Hisham Date: Mon, 12 Aug 2024 03:24:49 +0300 Subject: [PATCH 27/27] add_Jittor: Passing model tests, Parameter and Module Container test add_Jittor: Passing model tests, Parameter and Module Container test. Additional functionality: 1- TrainOneStep integration. 2- Updated core/train_jt so that accuracy can be measured. 3- Updated the Jittor optimizer: replaced the gradient and apply_gradients functions with Jittor's default zero_grad() and step() functions, and included a new Set() function to set the trainable_weights parameters on the optimizer. 4- Updated the Jittor metrics for Accuracy, Recall, Precision and AUC. 5- Created the Jittor model tutorial file jiitor_models_tutorial.py. 6- Module Container and Parameter Container: updated core_jittor ModuleList and ParameterDict to enable OrderedDict initialization, which was previously unavailable because the parent class (Jittor Module) initializes a Dict by default and caused integration issues; this was handled by updating these functions and excluding the parent Module for them. Areas to optimize: enabling the Jittor integration to run large model training, as it is currently limited in the complexity of NN layers. 
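A minimal usage sketch of the parameter-free optimizer wiring described in item 3, following the updated cifar10_cnn.py example below; the TinyMLP network, its layer sizes, and the commented-out train_loader are illustrative placeholders rather than part of this patch:

    import os
    os.environ['TL_BACKEND'] = 'jittor'

    import tensorlayerx as tlx
    from tensorlayerx.nn import Module, Linear

    class TinyMLP(Module):
        # Hypothetical two-layer network, only here to show the optimizer/Model wiring.
        def __init__(self):
            super(TinyMLP, self).__init__()
            self.fc1 = Linear(out_features=64, act=tlx.nn.ReLU, in_features=784)
            self.fc2 = Linear(out_features=10, in_features=64)

        def forward(self, x):
            return self.fc2(self.fc1(x))

    net = TinyMLP()
    # The optimizer is constructed without a params argument; the trainable weights
    # are bound later (presumably through the new Set() hook noted in item 3) when
    # tlx.model.Model drives training.
    optimizer = tlx.optimizers.Adam(lr=1e-3)
    loss_fn = tlx.losses.softmax_cross_entropy_with_logits
    metric = tlx.metrics.Accuracy()
    tlx_model = tlx.model.Model(network=net, loss_fn=loss_fn, optimizer=optimizer, metrics=metric)
    # tlx_model.train(n_epoch=1, train_dataset=train_loader, print_freq=1)  # train_loader: any tensorlayerx DataLoader

The same change is mirrored in the examples below (cifar10_cnn.py, cifar10_cnn_train.py and the MLP/Sequential tutorials), which drop the params= argument from their optimizer constructors.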
--- examples/basic_tutorials/cifar10_cnn.py | 67 ++- examples/basic_tutorials/cifar10_cnn_dist.py | 3 +- examples/basic_tutorials/cifar10_cnn_train.py | 3 +- .../gradient_clip_mixed_tensorflow.py | 3 +- ..._tutorial.py => jiitor_models_tutorial.py} | 370 +++++++------ examples/basic_tutorials/mnist_dataflow.py | 4 +- examples/basic_tutorials/mnist_gan.py | 3 +- .../basic_tutorials/mnist_mlp_custom_train.py | 3 +- .../mnist_mlp_mix_programming.py | 237 ++++++--- .../basic_tutorials/mnist_mlp_simple_train.py | 3 +- examples/basic_tutorials/mnist_sequential.py | 5 +- examples/basic_tutorials/module_container.py | 3 +- .../basic_tutorials/parameter_container.py | 3 +- examples/basic_tutorials/quick_start.py | 31 +- .../basic_tutorials/tensorlayerx_graph.py | 34 +- .../tensorlayerx_model_load.py | 11 +- ...ts.out.tfevents.1722986988.LAPTOP-48J7839G | Bin 0 -> 40 bytes tensorlayerx/backend/ops/jittor_backend.py | 4 +- tensorlayerx/backend/ops/jittor_nn.py | 52 +- tensorlayerx/files/utils.py | 21 +- tensorlayerx/metrics/jittor_metric.py | 196 ++++--- tensorlayerx/model/core.py | 16 +- tensorlayerx/model/utils.py | 35 +- tensorlayerx/nn/core/core_jittor.py | 95 ++-- .../nn/layers/convolution/deformable_conv.py | 11 +- tensorlayerx/optimizers/jittor_optimizers.py | 492 ++++++------------ 26 files changed, 853 insertions(+), 852 deletions(-) rename examples/basic_tutorials/{jiitor_tutorial.py => jiitor_models_tutorial.py} (67%) create mode 100644 runs/mlp/events.out.tfevents.1722986988.LAPTOP-48J7839G diff --git a/examples/basic_tutorials/cifar10_cnn.py b/examples/basic_tutorials/cifar10_cnn.py index 569a7bb..35061da 100644 --- a/examples/basic_tutorials/cifar10_cnn.py +++ b/examples/basic_tutorials/cifar10_cnn.py @@ -1,15 +1,16 @@ #! /usr/bin/python # -*- coding: utf-8 -*- +################################ TensorLayerX and Jittor. 
################################# + import os import time -import numpy as np import tensorlayerx as tlx from tensorlayerx.dataflow import Dataset, DataLoader from tensorlayerx.vision.transforms import ( Compose, Resize, RandomFlipHorizontal, RandomContrast, RandomBrightness, StandardizePerImage, RandomCrop ) -from tensorlayerx.nn import Conv2d, Linear, Flatten, Module +from tensorlayerx.nn import Conv2d, Linear, Flatten, Module, MaxPool2d, BatchNorm2d from tensorlayerx.optimizers import Adam from tqdm import tqdm @@ -18,9 +19,7 @@ os.environ['TL_BACKEND'] = 'jittor' - - -# Download and prepare the CIFAR10 dataset with progress bar +# Download and prepare the CIFAR10 dataset print("Downloading CIFAR10 dataset...") X_train, y_train, X_test, y_test = tlx.files.load_cifar10_dataset(shape=(-1, 32, 32, 3), plotable=False) @@ -59,58 +58,54 @@ def __len__(self): train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True) test_dataloader = DataLoader(test_dataset, batch_size=128) -# Define a simple CNN model + class SimpleCNN(Module): def __init__(self): super(SimpleCNN, self).__init__() self.conv1 = Conv2d(16, (3, 3), (1, 1), padding='SAME', act=tlx.nn.ReLU, in_channels=3) + self.conv2 = Conv2d(32, (3, 3), (1, 1), padding='SAME', act=tlx.nn.ReLU, in_channels=16) + self.maxpool1 = MaxPool2d((2, 2), (2, 2), padding='SAME') + self.conv3 = Conv2d(64, (3, 3), (1, 1), padding='SAME', act=tlx.nn.ReLU, in_channels=32) + self.bn1 = BatchNorm2d(num_features=64, act=tlx.nn.ReLU) + self.conv4 = Conv2d(128, (3, 3), (1, 1), padding='SAME', act=tlx.nn.ReLU, in_channels=64) + self.maxpool2 = MaxPool2d((2, 2), (2, 2), padding='SAME') self.flatten = Flatten() - self.fc1 = Linear(out_features=64, act=tlx.nn.ReLU, in_features=16 * 24 * 24) - self.fc2 = Linear(out_features=10, act=None, in_features=64) + self.fc1 = Linear(out_features=128, act=tlx.nn.ReLU, in_features=128 * 6 * 6) + self.fc2 = Linear(out_features=64, act=tlx.nn.ReLU, in_features=128) + self.fc3 = Linear(out_features=10, act=None, in_features=64) def forward(self, x): z = self.conv1(x) + z = self.conv2(z) + z = self.maxpool1(z) + z = self.conv3(z) + z = self.bn1(z) + z = self.conv4(z) + z = self.maxpool2(z) z = self.flatten(z) z = self.fc1(z) z = self.fc2(z) + z = self.fc3(z) return z + # Instantiate the model model = SimpleCNN() # Define the optimizer -optimizer = Adam(model.trainable_weights, lr=0.001) +optimizer = Adam(lr=0.001) +# optimizer = Adam(lr=0.001, params=model.trainable_weights ) # Define the loss function loss_fn = tlx.losses.softmax_cross_entropy_with_logits -# Training loop -n_epoch = 2 -for epoch in range(n_epoch): - start_time = time.time() - model.set_train() - train_loss, n_iter = 0, 0 - - with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch + 1}/{n_epoch}", unit="batch") as pbar: - for X_batch, y_batch in train_dataloader: - X_batch = tlx.convert_to_tensor(X_batch) - y_batch = tlx.convert_to_tensor(y_batch) - _logits = model(X_batch) - loss = loss_fn(_logits, y_batch) - - optimizer.zero_grad() - optimizer.step(loss) - - train_loss += loss.item() - n_iter += 1 - pbar.update(1) - - print(f"Epoch {epoch + 1} of {n_epoch} took {time.time() - start_time:.2f}s") - print(f" train loss: {train_loss / n_iter:.4f}") - - - -################################ TensorLayerX and Jittor can be mixed programming. 
################################# +# Use the built-in training method +metric = tlx.metrics.Recall() +tlx_model = tlx.model.Model(network=model, loss_fn=loss_fn, optimizer=optimizer, metrics=metric) +tlx_model.train(n_epoch=2, train_dataset=train_dataloader, print_freq=1, print_train_batch=True) + + +################################ TensorLayerX and Torch. ################################# diff --git a/examples/basic_tutorials/cifar10_cnn_dist.py b/examples/basic_tutorials/cifar10_cnn_dist.py index c4713e0..bff9efb 100644 --- a/examples/basic_tutorials/cifar10_cnn_dist.py +++ b/examples/basic_tutorials/cifar10_cnn_dist.py @@ -2,7 +2,8 @@ # -*- coding: utf-8 -*- import os -os.environ['TL_BACKEND'] = 'paddle' +# os.environ['TL_BACKEND'] = 'paddle' +os.environ['TL_BACKEND'] = 'jittor' # os.environ['TL_BACKEND'] = 'tensorflow' # os.environ['TL_BACKEND'] = 'mindspore' # os.environ['TL_BACKEND'] = 'torch' diff --git a/examples/basic_tutorials/cifar10_cnn_train.py b/examples/basic_tutorials/cifar10_cnn_train.py index 2661ce5..e98294d 100644 --- a/examples/basic_tutorials/cifar10_cnn_train.py +++ b/examples/basic_tutorials/cifar10_cnn_train.py @@ -9,7 +9,6 @@ os.environ['TL_BACKEND'] = 'jittor' # os.environ['TL_BACKEND'] = 'tensorflow' # os.environ['TL_BACKEND'] = 'mindspore' - # os.environ['TL_BACKEND'] = 'torch' @@ -76,7 +75,7 @@ def forward(self, x): # 定义损失函数、优化器等 loss_fn=tlx.losses.softmax_cross_entropy_with_logits -optimizer = tlx.optimizers.Adam(net.trainable_weights, lr=learning_rate) +optimizer = tlx.optimizers.Adam(lr=learning_rate) metrics = tlx.metrics.Accuracy() diff --git a/examples/basic_tutorials/gradient_clip_mixed_tensorflow.py b/examples/basic_tutorials/gradient_clip_mixed_tensorflow.py index 4432e81..d72b0c7 100644 --- a/examples/basic_tutorials/gradient_clip_mixed_tensorflow.py +++ b/examples/basic_tutorials/gradient_clip_mixed_tensorflow.py @@ -2,9 +2,10 @@ # -*- coding: utf-8 -*- # The tensorlayerx and tensorflow operators can be mixed import os -os.environ['TL_BACKEND'] = 'tensorflow' +# os.environ['TL_BACKEND'] = 'tensorflow' # os.environ['TL_BACKEND'] = 'paddle' # os.environ['TL_BACKEND'] = 'torch' +os.environ['TL_BACKEND'] = 'jittor' import time diff --git a/examples/basic_tutorials/jiitor_tutorial.py b/examples/basic_tutorials/jiitor_models_tutorial.py similarity index 67% rename from examples/basic_tutorials/jiitor_tutorial.py rename to examples/basic_tutorials/jiitor_models_tutorial.py index 654835f..afb4495 100644 --- a/examples/basic_tutorials/jiitor_tutorial.py +++ b/examples/basic_tutorials/jiitor_models_tutorial.py @@ -1,142 +1,123 @@ +# """" +# Here we have a Tutorial of Jittor backend being used with several different models, which includes: +# """ # +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -# +++++++++++++++++++++++++++++++++++++ Jittor CNN ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - -# import os -# import time -# import tensorlayerx as tlx -# from tensorlayerx.dataflow import Dataset, DataLoader -# from tensorlayerx.vision.transforms import ( -# Compose, Resize, RandomFlipHorizontal, RandomContrast, RandomBrightness, StandardizePerImage, RandomCrop -# ) -# from tensorlayerx.nn import Conv2d, Linear, Flatten, Module, MaxPool2d, BatchNorm2d -# from tensorlayerx.optimizers import Adam -# from tqdm import tqdm - -# # Enable debug logging -# tlx.logging.set_verbosity(tlx.logging.DEBUG) - -# os.environ['TL_BACKEND'] = 'jittor' - -# # Download and prepare the CIFAR10 
dataset -# print("Downloading CIFAR10 dataset...") -# X_train, y_train, X_test, y_test = tlx.files.load_cifar10_dataset(shape=(-1, 32, 32, 3), plotable=False) - -# # Define the CIFAR10 dataset -# class CIFAR10Dataset(Dataset): -# def __init__(self, data, label, transforms): -# self.data = data -# self.label = label -# self.transforms = transforms - -# def __getitem__(self, idx): -# x = self.data[idx].astype('uint8') -# y = self.label[idx].astype('int64') -# x = self.transforms(x) -# return x, y - -# def __len__(self): -# return len(self.label) - -# # Define the CIFAR10 images preprocessing pipeline -# train_transforms = Compose([ -# RandomCrop(size=[24, 24]), -# RandomFlipHorizontal(), -# RandomBrightness(brightness_factor=(0.5, 1.5)), -# RandomContrast(contrast_factor=(0.5, 1.5)), -# StandardizePerImage() -# ]) - -# test_transforms = Compose([Resize(size=(24, 24)), StandardizePerImage()]) - -# # Create DataLoaders for training and testing -# print("Processing CIFAR10 dataset...") -# train_dataset = CIFAR10Dataset(data=X_train, label=y_train, transforms=train_transforms) -# test_dataset = CIFAR10Dataset(data=X_test, label=y_test, transforms=test_transforms) - -# train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True) -# test_dataloader = DataLoader(test_dataset, batch_size=128) - - -# class SimpleCNN(Module): -# def __init__(self): -# super(SimpleCNN, self).__init__() -# self.conv1 = Conv2d(16, (3, 3), (1, 1), padding='SAME', act=tlx.nn.ReLU, in_channels=3) -# self.conv2 = Conv2d(32, (3, 3), (1, 1), padding='SAME', act=tlx.nn.ReLU, in_channels=16) -# self.maxpool1 = MaxPool2d((2, 2), (2, 2), padding='SAME') -# self.conv3 = Conv2d(64, (3, 3), (1, 1), padding='SAME', act=tlx.nn.ReLU, in_channels=32) -# self.bn1 = BatchNorm2d(num_features=64, act=tlx.nn.ReLU) -# self.conv4 = Conv2d(128, (3, 3), (1, 1), padding='SAME', act=tlx.nn.ReLU, in_channels=64) -# self.maxpool2 = MaxPool2d((2, 2), (2, 2), padding='SAME') -# self.flatten = Flatten() -# self.fc1 = Linear(out_features=128, act=tlx.nn.ReLU, in_features=128 * 6 * 6) -# self.fc2 = Linear(out_features=64, act=tlx.nn.ReLU, in_features=128) -# self.fc3 = Linear(out_features=10, act=None, in_features=64) - -# def forward(self, x): -# z = self.conv1(x) -# z = self.conv2(z) -# z = self.maxpool1(z) -# z = self.conv3(z) -# z = self.bn1(z) -# z = self.conv4(z) -# z = self.maxpool2(z) -# z = self.flatten(z) -# z = self.fc1(z) -# z = self.fc2(z) -# z = self.fc3(z) -# return z - - - - -# # Instantiate the model -# model = SimpleCNN() - -# # Define the optimizer -# optimizer = Adam(model.trainable_weights, lr=0.001) - -# # Define the loss function -# loss_fn = tlx.losses.softmax_cross_entropy_with_logits - -# # Training loop -# n_epoch = 2 -# for epoch in range(n_epoch): -# start_time = time.time() -# model.set_train() -# train_loss, n_iter = 0, 0 - -# with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch + 1}/{n_epoch}", unit="batch") as pbar: -# for X_batch, y_batch in train_dataloader: - -# X_batch = tlx.convert_to_tensor(X_batch) -# y_batch = tlx.convert_to_tensor(y_batch) -# _logits = model(X_batch) -# loss = loss_fn(_logits, y_batch) -# optimizer.zero_grad() -# optimizer.step(loss) - -# train_loss += loss.item() # Using .item() to get the scalar value -# n_iter += 1 -# pbar.update(1) - -# print(f"Epoch {epoch + 1} of {n_epoch} took {time.time() - start_time:.2f}s") -# print(f" train loss: {train_loss / n_iter:.4f}") - - +# +++++++++++++++++++++++++++++++++++++ Jittor CIFAR CNN 
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +import os +import time +import tensorlayerx as tlx +from tensorlayerx.dataflow import Dataset, DataLoader +from tensorlayerx.vision.transforms import ( + Compose, Resize, RandomFlipHorizontal, RandomContrast, RandomBrightness, StandardizePerImage, RandomCrop +) +from tensorlayerx.nn import Conv2d, Linear, Flatten, Module, MaxPool2d, BatchNorm2d +from tensorlayerx.optimizers import Adam +from tqdm import tqdm + +# Enable debug logging +tlx.logging.set_verbosity(tlx.logging.DEBUG) + +os.environ['TL_BACKEND'] = 'jittor' + +# Download and prepare the CIFAR10 dataset +print("Downloading CIFAR10 dataset...") +X_train, y_train, X_test, y_test = tlx.files.load_cifar10_dataset(shape=(-1, 32, 32, 3), plotable=False) + +# Define the CIFAR10 dataset +class CIFAR10Dataset(Dataset): + def __init__(self, data, label, transforms): + self.data = data + self.label = label + self.transforms = transforms + + def __getitem__(self, idx): + x = self.data[idx].astype('uint8') + y = self.label[idx].astype('int64') + x = self.transforms(x) + return x, y + + def __len__(self): + return len(self.label) + +# Define the CIFAR10 images preprocessing pipeline +train_transforms = Compose([ + RandomCrop(size=[24, 24]), + RandomFlipHorizontal(), + RandomBrightness(brightness_factor=(0.5, 1.5)), + RandomContrast(contrast_factor=(0.5, 1.5)), + StandardizePerImage() +]) + +test_transforms = Compose([Resize(size=(24, 24)), StandardizePerImage()]) + +# Create DataLoaders for training and testing +print("Processing CIFAR10 dataset...") +train_dataset = CIFAR10Dataset(data=X_train, label=y_train, transforms=train_transforms) +test_dataset = CIFAR10Dataset(data=X_test, label=y_test, transforms=test_transforms) + +train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True) +test_dataloader = DataLoader(test_dataset, batch_size=128) + + +class SimpleCNN(Module): + def __init__(self): + super(SimpleCNN, self).__init__() + self.conv1 = Conv2d(16, (3, 3), (1, 1), padding='SAME', act=tlx.nn.ReLU, in_channels=3) + self.conv2 = Conv2d(32, (3, 3), (1, 1), padding='SAME', act=tlx.nn.ReLU, in_channels=16) + self.maxpool1 = MaxPool2d((2, 2), (2, 2), padding='SAME') + self.conv3 = Conv2d(64, (3, 3), (1, 1), padding='SAME', act=tlx.nn.ReLU, in_channels=32) + self.bn1 = BatchNorm2d(num_features=64, act=tlx.nn.ReLU) + self.conv4 = Conv2d(128, (3, 3), (1, 1), padding='SAME', act=tlx.nn.ReLU, in_channels=64) + self.maxpool2 = MaxPool2d((2, 2), (2, 2), padding='SAME') + self.flatten = Flatten() + self.fc1 = Linear(out_features=128, act=tlx.nn.ReLU, in_features=128 * 6 * 6) + self.fc2 = Linear(out_features=64, act=tlx.nn.ReLU, in_features=128) + self.fc3 = Linear(out_features=10, act=None, in_features=64) + + def forward(self, x): + z = self.conv1(x) + z = self.conv2(z) + z = self.maxpool1(z) + z = self.conv3(z) + z = self.bn1(z) + z = self.conv4(z) + z = self.maxpool2(z) + z = self.flatten(z) + z = self.fc1(z) + z = self.fc2(z) + z = self.fc3(z) + return z + + +# Instantiate the model +model = SimpleCNN() + +# Define the optimizer +optimizer = Adam(lr=0.001) +# optimizer = Adam(lr=0.001, params=model.trainable_weights ) + +# Define the loss function +loss_fn = tlx.losses.softmax_cross_entropy_with_logits + +# Use the built-in training method +metric = tlx.metrics.Accuracy() +tlx_model = tlx.model.Model(network=model, loss_fn=loss_fn, optimizer=optimizer, metrics=metric) +tlx_model.train(n_epoch=2, train_dataset=train_dataloader, print_freq=1, print_train_batch=True) # 
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -# +++++++++++++++++++++++++++++++++++++ Jittor LSTM ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +# +++++++++++++++++++++++++++++++++++++ Jittor IMDB LSTM ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + # import os # import sys # import tensorlayerx as tlx -# from tensorlayerx.nn import Module, Linear, LSTM, Embedding +# from tensorlayerx.nn import Module, LSTM, Embedding, Linear # from tensorlayerx.dataflow import Dataset -# from keras.datasets import imdb -# from keras.preprocessing import sequence # import numpy as np + # os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0' # os.environ['TL_BACKEND'] = 'jittor' # sys.setrecursionlimit(10000) # Increase recursion limit @@ -144,15 +125,9 @@ # # Set parameters # max_features = 20000 # maxlen = 200 - # prev_h = np.random.random([1, 200, 64]).astype(np.float32) # prev_h = tlx.convert_to_tensor(prev_h) - -# # Load and preprocess the IMDB dataset -# (X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=max_features) -# X_train = sequence.pad_sequences(X_train, maxlen=maxlen) -# X_test = sequence.pad_sequences(X_test, maxlen=maxlen) - +# X_train, y_train, X_test, y_test = tlx.files.load_imdb_dataset('data', nb_words=20000, test_split=0.2) # vocab_size = max_features # seq_Len = 200 @@ -209,7 +184,7 @@ # print(net) # # Define optimizer, metric, and loss function using TLX functions -# optimizer = tlx.optimizers.Adam(lr=1e-3, params=net.trainable_weights) +# optimizer = tlx.optimizers.Adam(lr=1e-3) # metric = tlx.metrics.Accuracy() # loss_fn = tlx.losses.softmax_cross_entropy_with_logits @@ -217,11 +192,8 @@ # model = tlx.model.Model(network=net, loss_fn=loss_fn, optimizer=optimizer, metrics=metric) # model.train(n_epoch=n_epoch, train_dataset=train_loader, print_freq=print_freq, print_train_batch=True) - - - # +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -# +++++++++++++++++++++++++++++++++++++ Jittor MLP ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +# +++++++++++++++++++++++++++++++++++++ Jittor MNIST MLP ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # ! /usr/bin/python # # -*- coding: utf-8 -*- @@ -296,7 +268,7 @@ # # Get training parameters # train_weights = MLP.trainable_weights # # Define the optimizer, use the Momentum optimizer, and set the learning rate to 0.05, momentum to 0.9 -# optimizer = tlx.optimizers.Momentum(lr=0.05, momentum= 0.9, params = train_weights ) +# optimizer = tlx.optimizers.Momentum(lr=0.05, momentum= 0.9 ) # # Define evaluation metrics. # metric = tlx.metrics.Accuracy() # # Define loss function, this operator implements the cross entropy loss function with softmax. This function @@ -315,11 +287,12 @@ # +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # +++++++++++++++++++++++++++++++++++++ Jittor MNIST Sequential ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -#! /usr/bin/python +# ! 
/usr/bin/python # -*- coding: utf-8 -*- # import os # os.environ['TL_BACKEND'] = 'jittor' +# # os.environ['TL_BACKEND'] = 'torch' # from tensorlayerx.nn import Sequential # from tensorlayerx.nn import Linear @@ -358,7 +331,7 @@ # shuffle_buffer_size = 128 # train_weights = MLP.trainable_weights -# optimizer = tlx.optimizers.Momentum(lr=0.05,momentum= 0.9, params=train_weights) +# optimizer = tlx.optimizers.Momentum(lr=0.05,momentum= 0.9) # train_dataset = mnistdataset(data=X_train, label=y_train) # train_loader = tlx.dataflow.DataLoader(train_dataset, batch_size=batch_size, shuffle=True) # metric = tlx.metrics.Accuracy() @@ -367,14 +340,14 @@ # ) # model.train(n_epoch=n_epoch, train_dataset=train_loader, print_freq=print_freq, print_train_batch=False) # model.save_weights('./model.npz', format='npz_dict') -# model.load_weights('./model.npz', format='npz_dict') +# model.load_weights('./model.npz', format='npz_dict', skip=True) # +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # +++++++++++++++++++++++++++++++++++++ Jittor MNIST GAN ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -# #! /usr/bin/python -# # -*- coding: utf-8 -*- +#! /usr/bin/python +# -*- coding: utf-8 -*- # import os # os.environ['TL_BACKEND'] = 'jittor' @@ -487,22 +460,18 @@ # # loss_fn = tlx.losses.sigmoid_cross_entropy # # optimizer = tlx.optimizers.Momentum(learning_rate=5e-4, momentum=0.5) # loss_fn = tlx.losses.mean_squared_error - +# # Define the optimizers, use the Adam optimizer. +# optimizer_g = tlx.optimizers.Adam(lr=3e-4, beta_1=0.5, beta_2=0.999) +# optimizer_d = tlx.optimizers.Adam(lr=3e-4) # # Get training parameters # g_weights = G.trainable_weights # d_weights = D.trainable_weights - # net_with_loss_G = WithLossG(G, D, loss_fn) # net_with_loss_D = WithLossD(G, D, loss_fn) - -# # Define the optimizers, use the Adam optimizer. 
-# optimizer_g = tlx.optimizers.Adam(lr=3e-4, beta_1=0.5, beta_2=0.999, params= g_weights) -# optimizer_d = tlx.optimizers.Adam(lr=3e-4, params= d_weights) - # # Initialize one-step training # train_one_step_g = TrainOneStep(net_with_loss_G, optimizer_g, g_weights) # train_one_step_d = TrainOneStep(net_with_loss_D, optimizer_d, d_weights) -# n_epoch = 50 +# n_epoch = 2 # def plot_fake_image(fake_image, num): @@ -535,3 +504,96 @@ # print(" g loss: {}".format(g_loss / n_iter)) # fake_image = G(tlx.convert_to_tensor(np.random.random(size=(36, 100)), dtype=tlx.float32)) # plot_fake_image(fake_image, 36) + + + +# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +# +++++++++++++++++++++++++++++++++++++ Jittor IMDB RNN +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + + +# import os +# import sys +# import tensorlayerx as tlx +# from tensorlayerx.nn import Module, RNN, Embedding, Linear +# from tensorlayerx.dataflow import Dataset +# import numpy as np + + +# os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0' +# os.environ['TL_BACKEND'] = 'jittor' +# sys.setrecursionlimit(10000) # Increase recursion limit + +# # Set parameters +# max_features = 20000 +# maxlen = 200 +# prev_h = np.random.random([1, 200, 64]).astype(np.float32) +# prev_h = tlx.convert_to_tensor(prev_h) +# X_train, y_train, X_test, y_test = tlx.files.load_imdb_dataset('data', nb_words=20000, test_split=0.2) +# vocab_size = max_features +# seq_Len = 200 + + +# class ImdbDataset(Dataset): + +# def __init__(self, X, y): +# self.X = X +# self.y = y + +# def __getitem__(self, index): +# data = self.X[index] +# data = np.concatenate([data[:seq_Len], [0] * (seq_Len - len(data))]).astype('int64') # set +# label = self.y[index].astype('int64') +# return data, label + +# def __len__(self): +# return len(self.y) + + +# class ImdbNet(Module): + +# def __init__(self): +# super(ImdbNet, self).__init__() +# self.embedding = Embedding(num_embeddings=vocab_size, embedding_dim=64) +# self.rnn = RNN(input_size=64, hidden_size=64) +# self.linear1 = Linear(in_features=64, out_features=64, act=tlx.nn.ReLU) +# self.linear2 = Linear(in_features=64, out_features=2) + +# def forward(self, x): +# x = self.embedding(x) +# x, _ = self.rnn(x) +# x = tlx.reduce_mean(x, axis=1) +# x = self.linear1(x) +# x = self.linear2(x) +# return x + +# def __repr__(self): +# return "ImdbNet(embedding_dim=64, hidden_size=64, num_classes=2)" + +# def __str__(self): +# return self.__repr__() + +# # Training settings +# n_epoch = 1 +# batch_size = 64 +# print_freq = 2 + +# # Create DataLoader +# train_dataset = ImdbDataset(X=X_train, y=y_train) +# train_loader = tlx.dataflow.DataLoader(train_dataset, batch_size=batch_size, shuffle=True) + +# # Initialize the network +# net = ImdbNet() +# print(net) + +# # Define optimizer, metric, and loss function using TLX functions +# optimizer = tlx.optimizers.Adam(lr=1e-3) +# metric = tlx.metrics.Accuracy() +# loss_fn = tlx.losses.softmax_cross_entropy_with_logits + +# # Create and train the model +# model = tlx.model.Model(network=net, loss_fn=loss_fn, optimizer=optimizer, metrics=metric) +# model.train(n_epoch=n_epoch, train_dataset=train_loader, print_freq=print_freq, print_train_batch=True) +# Optionally, you could now dump the network weights to a file like this: +# model.save_weights('./rnn_model.npz', format='npz_dict') +# model.load_weights('./rnn_model.npz', format='npz_dict', skip= True) + diff --git a/examples/basic_tutorials/mnist_dataflow.py 
b/examples/basic_tutorials/mnist_dataflow.py index a4856d9..18af70a 100644 --- a/examples/basic_tutorials/mnist_dataflow.py +++ b/examples/basic_tutorials/mnist_dataflow.py @@ -2,7 +2,9 @@ # -*- coding: utf-8 -*- import os -os.environ['TL_BACKEND'] = 'tensorflow' +# os.environ['TL_BACKEND'] = 'tensorflow' +os.environ['TL_BACKEND'] = 'jittor' + # os.environ['TL_BACKEND'] = 'mindspore' # os.environ['TL_BACKEND'] = 'paddle' diff --git a/examples/basic_tutorials/mnist_gan.py b/examples/basic_tutorials/mnist_gan.py index 5700dd6..a40dd7a 100644 --- a/examples/basic_tutorials/mnist_gan.py +++ b/examples/basic_tutorials/mnist_gan.py @@ -5,7 +5,8 @@ # os.environ['TL_BACKEND'] = 'paddle' # os.environ['TL_BACKEND'] = 'tensorflow' # os.environ['TL_BACKEND'] = 'mindspore' -os.environ['TL_BACKEND'] = 'torch' +# os.environ['TL_BACKEND'] = 'torch' +os.environ['TL_BACKEND'] = 'jittor' import time import numpy as np diff --git a/examples/basic_tutorials/mnist_mlp_custom_train.py b/examples/basic_tutorials/mnist_mlp_custom_train.py index 60e0bce..fe66f50 100644 --- a/examples/basic_tutorials/mnist_mlp_custom_train.py +++ b/examples/basic_tutorials/mnist_mlp_custom_train.py @@ -5,7 +5,8 @@ import os # os.environ['TL_BACKEND'] = 'tensorflow' # os.environ['TL_BACKEND'] = 'mindspore' -os.environ['TL_BACKEND'] = 'paddle' +# os.environ['TL_BACKEND'] = 'paddle' +os.environ['TL_BACKEND'] = 'jittor' # os.environ['TL_BACKEND'] = 'oneflow' # os.environ['TL_BACKEND'] = 'torch' diff --git a/examples/basic_tutorials/mnist_mlp_mix_programming.py b/examples/basic_tutorials/mnist_mlp_mix_programming.py index 4f2035d..d05619e 100644 --- a/examples/basic_tutorials/mnist_mlp_mix_programming.py +++ b/examples/basic_tutorials/mnist_mlp_mix_programming.py @@ -1,39 +1,46 @@ -################################ TensorLayerX and TensorFlow can be mixed programming. ################################# -import os -os.environ['TL_BACKEND'] = 'tensorflow' +################################ TensorLayerX and Jittor can be mixed programming. 
################################# -import numpy as np +import os import time - -import tensorflow as tf +import numpy as np import tensorlayerx as tlx -from tensorlayerx.nn import Module -from tensorlayerx.nn import Linear, Dropout +import jittor as jt +from jittor import nn, optim +from tensorlayerx.nn import Module, Linear, Dropout +from tensorlayerx.dataflow import Dataset, DataLoader +from tqdm import tqdm + +# Enable debug logging +tlx.logging.set_verbosity(tlx.logging.DEBUG) + +# Set the backend environment variable +os.environ['TL_BACKEND'] = 'jittor' # Load MNIST data by TensorLayerX X_train, y_train, X_val, y_val, X_test, y_test = tlx.files.load_mnist_dataset(shape=(-1, 784)) -def generator_train(): - inputs = X_train - targets = y_train - if len(inputs) != len(targets): - raise AssertionError("The length of inputs and targets should be equal") - for _input, _target in zip(inputs, targets): - yield _input, _target +# Define the MNIST dataset using TensorLayerX +class MNISTDataset(Dataset): + def __init__(self, data, label): + self.data = data + self.label = label -# Make Dataset by TensorFlow -train_ds = tf.data.Dataset.from_generator(generator_train, output_types=(tf.float32, tf.int32)) -shuffle_buffer_size = 128 -batch_size = 128 -train_ds = train_ds.shuffle(shuffle_buffer_size) -train_ds = train_ds.batch(batch_size) + def __getitem__(self, index): + data = self.data[index].astype('float32') + label = self.label[index].astype('int64') + return data, label + def __len__(self): + return len(self.data) -# Define the network through tensorlayerx -class CustomModel(Module): +# Create DataLoaders for training and testing +train_dataset = MNISTDataset(data=X_train, label=y_train) +train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True) +# Define a simple MLP model using TensorLayerX +class MLP(Module): def __init__(self): - super(CustomModel, self).__init__() + super(MLP, self).__init__() self.dropout1 = Dropout(p=0.2) self.linear1 = Linear(out_features=800, in_features=784) self.dropout2 = Dropout(p=0.2) @@ -50,48 +57,160 @@ def forward(self, x): out = self.linear3(z) return out +# Instantiate the model +model = MLP() -MLP = CustomModel() +# Define the loss function +loss_fn = tlx.losses.softmax_cross_entropy_with_logits + +# Define the optimizer using Jittor +optimizer = optim.Adam(model.trainable_weights, lr=0.0001) + +# Custom training loop n_epoch = 50 -batch_size = 500 print_freq = 1 -train_weights = MLP.trainable_weights -# Define the optimizer through tensorlayerx -optimizer = tlx.optimizers.Adam(lr=0.0001) -for epoch in range(n_epoch): ## iterate the dataset n_epoch times +for epoch in range(n_epoch): start_time = time.time() - ## iterate over the entire training set once (shuffle the data via training) - for X_batch, y_batch in train_ds : - MLP.set_train() # enable dropout - with tf.GradientTape() as tape: # use tf.GradientTape() to record gradient - ## compute outputs - _logits = MLP(X_batch) - ## compute loss and update model - _loss = tlx.losses.softmax_cross_entropy_with_logits(_logits, y_batch) - grad = tape.gradient(_loss, train_weights) - optimizer.apply_gradients(zip(grad, train_weights)) - - ## use training and evaluation sets to evaluate the model every print_freq epoch - if epoch + 1 == 1 or (epoch + 1) % print_freq == 0: - print("Epoch {} of {} took {}".format(epoch + 1, n_epoch, time.time() - start_time)) - train_loss, train_acc, n_iter = 0, 0, 0 - for X_batch, y_batch in train_ds : - _logits = MLP(X_batch) - train_loss += 
tlx.losses.softmax_cross_entropy_with_logits(_logits, y_batch) - train_acc += np.mean(np.equal(np.argmax(_logits, 1), y_batch)) + model.set_train() + train_loss, train_acc, n_iter = 0, 0, 0 + + with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch + 1}/{n_epoch}", unit="batch") as pbar: + for X_batch, y_batch in train_dataloader: + X_batch = tlx.convert_to_tensor(X_batch) + y_batch = tlx.convert_to_tensor(y_batch) + + # Forward pass + _logits = model(X_batch) + # Compute loss + _loss = loss_fn(_logits, y_batch) + # Backward pass and optimization + optimizer.step(_loss) + + train_loss += _loss.item() + train_acc += np.mean(np.equal(np.argmax(_logits, axis=1), y_batch)) n_iter += 1 - print(" train loss: {}".format(train_loss / n_iter)) - print(" train acc: {}".format(train_acc / n_iter)) - - val_loss, val_acc, n_iter = 0, 0, 0 - for X_batch, y_batch in train_ds: - _logits = MLP(X_batch) # is_train=False, disable dropout - val_loss += tlx.losses.softmax_cross_entropy_with_logits(_logits, y_batch) - val_acc += np.mean(np.equal(np.argmax(_logits, 1), y_batch)) + + pbar.set_postfix({'loss': train_loss / n_iter, 'acc': train_acc / n_iter}) + pbar.update(1) + + # Print training progress + print("Epoch {} of {} took {:.2f}s".format(epoch + 1, n_epoch, time.time() - start_time)) + print(" train loss: {:.6f}".format(train_loss / n_iter)) + print(" train acc: {:.6f}".format(train_acc / n_iter)) + + # Validation (optional, using training data as a placeholder for validation) + val_loss, val_acc, n_iter = 0, 0, 0 + with tqdm(total=len(train_dataloader), desc="Validation", unit="batch") as pbar: + for X_batch, y_batch in train_dataloader: + X_batch = tlx.convert_to_tensor(X_batch) + y_batch = tlx.convert_to_tensor(y_batch) + _logits = model(X_batch) + val_loss += loss_fn(_logits, y_batch).item() + val_acc += np.mean(np.equal(np.argmax(_logits, axis=1), y_batch)) n_iter += 1 - print(" val loss: {}".format(val_loss / n_iter)) - print(" val acc: {}".format(val_acc / n_iter)) + + pbar.set_postfix({'val_loss': val_loss / n_iter, 'val_acc': val_acc / n_iter}) + pbar.update(1) + print(" val loss: {:.6f}".format(val_loss / n_iter)) + print(" val acc: {:.6f}".format(val_acc / n_iter)) + + + +################################ TensorLayerX and TensorFlow can be mixed programming. 
################################# +# import os +# os.environ['TL_BACKEND'] = 'tensorflow' + +# import numpy as np +# import time + +# import tensorflow as tf +# import tensorlayerx as tlx +# from tensorlayerx.nn import Module +# from tensorlayerx.nn import Linear, Dropout + +# # Load MNIST data by TensorLayerX +# X_train, y_train, X_val, y_val, X_test, y_test = tlx.files.load_mnist_dataset(shape=(-1, 784)) + +# def generator_train(): +# inputs = X_train +# targets = y_train +# if len(inputs) != len(targets): +# raise AssertionError("The length of inputs and targets should be equal") +# for _input, _target in zip(inputs, targets): +# yield _input, _target + +# # Make Dataset by TensorFlow +# train_ds = tf.data.Dataset.from_generator(generator_train, output_types=(tf.float32, tf.int32)) +# shuffle_buffer_size = 128 +# batch_size = 128 +# train_ds = train_ds.shuffle(shuffle_buffer_size) +# train_ds = train_ds.batch(batch_size) + + +# # Define the network through tensorlayerx +# class CustomModel(Module): + +# def __init__(self): +# super(CustomModel, self).__init__() +# self.dropout1 = Dropout(p=0.2) +# self.linear1 = Linear(out_features=800, in_features=784) +# self.dropout2 = Dropout(p=0.2) +# self.linear2 = Linear(out_features=800, act=tlx.nn.ReLU, in_features=800) +# self.dropout3 = Dropout(p=0.2) +# self.linear3 = Linear(out_features=10, act=tlx.nn.ReLU, in_features=800) + +# def forward(self, x): +# z = self.dropout1(x) +# z = self.linear1(z) +# z = self.dropout2(z) +# z = self.linear2(z) +# z = self.dropout3(z) +# out = self.linear3(z) +# return out + + +# MLP = CustomModel() +# n_epoch = 50 +# batch_size = 500 +# print_freq = 1 +# train_weights = MLP.trainable_weights +# # Define the optimizer through tensorlayerx +# optimizer = tlx.optimizers.Adam(lr=0.0001) + +# for epoch in range(n_epoch): ## iterate the dataset n_epoch times +# start_time = time.time() +# ## iterate over the entire training set once (shuffle the data via training) +# for X_batch, y_batch in train_ds : +# MLP.set_train() # enable dropout +# with tf.GradientTape() as tape: # use tf.GradientTape() to record gradient +# ## compute outputs +# _logits = MLP(X_batch) +# ## compute loss and update model +# _loss = tlx.losses.softmax_cross_entropy_with_logits(_logits, y_batch) +# grad = tape.gradient(_loss, train_weights) +# optimizer.apply_gradients(zip(grad, train_weights)) + +# ## use training and evaluation sets to evaluate the model every print_freq epoch +# if epoch + 1 == 1 or (epoch + 1) % print_freq == 0: +# print("Epoch {} of {} took {}".format(epoch + 1, n_epoch, time.time() - start_time)) +# train_loss, train_acc, n_iter = 0, 0, 0 +# for X_batch, y_batch in train_ds : +# _logits = MLP(X_batch) +# train_loss += tlx.losses.softmax_cross_entropy_with_logits(_logits, y_batch) +# train_acc += np.mean(np.equal(np.argmax(_logits, 1), y_batch)) +# n_iter += 1 +# print(" train loss: {}".format(train_loss / n_iter)) +# print(" train acc: {}".format(train_acc / n_iter)) + +# val_loss, val_acc, n_iter = 0, 0, 0 +# for X_batch, y_batch in train_ds: +# _logits = MLP(X_batch) # is_train=False, disable dropout +# val_loss += tlx.losses.softmax_cross_entropy_with_logits(_logits, y_batch) +# val_acc += np.mean(np.equal(np.argmax(_logits, 1), y_batch)) +# n_iter += 1 +# print(" val loss: {}".format(val_loss / n_iter)) +# print(" val acc: {}".format(val_acc / n_iter)) ################################ TensorLayerX and MindSpore can be mixed programming. 
################################# # import os diff --git a/examples/basic_tutorials/mnist_mlp_simple_train.py b/examples/basic_tutorials/mnist_mlp_simple_train.py index b9787ae..e169c08 100644 --- a/examples/basic_tutorials/mnist_mlp_simple_train.py +++ b/examples/basic_tutorials/mnist_mlp_simple_train.py @@ -5,7 +5,8 @@ import os # os.environ['TL_BACKEND'] = 'tensorflow' # os.environ['TL_BACKEND'] = 'mindspore' -os.environ['TL_BACKEND'] = 'paddle' +# os.environ['TL_BACKEND'] = 'paddle' +os.environ['TL_BACKEND'] = 'jittor' # os.environ['TL_BACKEND'] = 'oneflow' # os.environ['TL_BACKEND'] = 'torch' diff --git a/examples/basic_tutorials/mnist_sequential.py b/examples/basic_tutorials/mnist_sequential.py index edfe109..16f500b 100644 --- a/examples/basic_tutorials/mnist_sequential.py +++ b/examples/basic_tutorials/mnist_sequential.py @@ -3,7 +3,8 @@ import os # os.environ['TL_BACKEND'] = 'tensorflow' # os.environ['TL_BACKEND'] = 'mindspore' -os.environ['TL_BACKEND'] = 'paddle' +# os.environ['TL_BACKEND'] = 'paddle' +os.environ['TL_BACKEND'] = 'jittor' from tensorlayerx.nn import Sequential from tensorlayerx.nn import Linear @@ -51,4 +52,4 @@ def __len__(self): ) model.train(n_epoch=n_epoch, train_dataset=train_loader, print_freq=print_freq, print_train_batch=False) model.save_weights('./model.npz', format='npz_dict') -model.load_weights('./model.npz', format='npz_dict') +model.load_weights('./model.npz', format='npz_dict', skip = True) diff --git a/examples/basic_tutorials/module_container.py b/examples/basic_tutorials/module_container.py index ef57193..5b7cffc 100644 --- a/examples/basic_tutorials/module_container.py +++ b/examples/basic_tutorials/module_container.py @@ -2,7 +2,8 @@ # -*- coding: utf-8 -*- import os -os.environ['TL_BACKEND'] = 'tensorflow' +# os.environ['TL_BACKEND'] = 'tensorflow' +os.environ['TL_BACKEND'] = 'jittor' # os.environ['TL_BACKEND'] = 'mindspore' # os.environ['TL_BACKEND'] = 'paddle' # os.environ['TL_BACKEND'] = 'torch' diff --git a/examples/basic_tutorials/parameter_container.py b/examples/basic_tutorials/parameter_container.py index f780dc3..d45ab13 100644 --- a/examples/basic_tutorials/parameter_container.py +++ b/examples/basic_tutorials/parameter_container.py @@ -2,7 +2,8 @@ # os.environ['TL_BACKEND'] = 'tensorflow' # os.environ['TL_BACKEND'] = 'mindspore' # os.environ['TL_BACKEND'] = 'paddle' -os.environ['TL_BACKEND'] = 'torch' +os.environ['TL_BACKEND'] = 'jittor' +# os.environ['TL_BACKEND'] = 'torch' import tensorlayerx as tlx from tensorlayerx.nn import Module, Parameter, ParameterList, ParameterDict diff --git a/examples/basic_tutorials/quick_start.py b/examples/basic_tutorials/quick_start.py index 916f7ef..d428882 100644 --- a/examples/basic_tutorials/quick_start.py +++ b/examples/basic_tutorials/quick_start.py @@ -1,6 +1,8 @@ # TensorlayerX目前支持包括TensorFlow、Pytorch、PaddlePaddle、MindSpore作为计算后端,指定计算后端的方法也非常简单,只需要设置环境变量即可 import os -os.environ['TL_BACKEND'] = 'tensorflow' +# os.environ['TL_BACKEND'] = 'tensorflow' +os.environ['TL_BACKEND'] = 'jittor' +# os.environ['TL_BACKEND'] = 'torch' # os.environ['TL_BACKEND'] = 'mindspore' # os.environ['TL_BACKEND'] = 'paddle' @@ -30,35 +32,34 @@ class CNN(Module): def __init__(self): super(CNN, self).__init__() - # weights init W_init = tlx.nn.initializers.truncated_normal(stddev=5e-2) W_init2 = tlx.nn.initializers.truncated_normal(stddev=0.04) + b_init = tlx.nn.initializers.constant(value=0.1) b_init2 = tlx.nn.initializers.constant(value=0.1) - self.conv1 = Conv2d(64, (5, 5), (1, 1), padding='SAME', W_init=W_init, 
b_init=None, name='conv1', in_channels=3) - self.bn = BatchNorm2d(num_features=64, act=tlx.ReLU) - self.maxpool1 = MaxPool2d((3, 3), (2, 2), padding='SAME', name='pool1') + self.conv1 = Conv2d(32, (5, 5), (1, 1), padding='SAME', W_init=W_init, b_init=b_init, name='conv1', in_channels=3) + self.bn1 = BatchNorm2d(num_features=32, act=tlx.nn.ReLU) + self.maxpool1 = MaxPool2d((2, 2), (2, 2), padding='SAME', name='pool1') - self.conv2 = Conv2d( - 64, (5, 5), (1, 1), padding='SAME', act=tlx.ReLU, W_init=W_init, name='conv2', in_channels=64 - ) - self.maxpool2 = MaxPool2d((3, 3), (2, 2), padding='SAME', name='pool2') + self.conv2 = Conv2d(64, (5, 5), (1, 1), padding='SAME', act=tlx.nn.ReLU, W_init=W_init, b_init=b_init, name='conv2', in_channels=32) + self.bn2 = BatchNorm2d(num_features=64, act=tlx.nn.ReLU) + self.maxpool2 = MaxPool2d((2, 2), (2, 2), padding='SAME', name='pool2') self.flatten = Flatten(name='flatten') - self.linear1 = Linear(384, act=tlx.ReLU, W_init=W_init2, b_init=b_init2, name='linear1relu', in_features=2304) - self.linear2 = Linear(192, act=tlx.ReLU, W_init=W_init2, b_init=b_init2, name='linear2relu', in_features=384) - self.linear3 = Linear(10, act=None, W_init=W_init2, name='output', in_features=192) + self.linear1 = Linear(1024, act=tlx.nn.ReLU, W_init=W_init2, b_init=b_init2, name='linear1relu', in_features=2304) + + self.linear2 = Linear(10, act=None, W_init=W_init2, b_init=b_init2, name='output', in_features=1024) def forward(self, x): z = self.conv1(x) - z = self.bn(z) + z = self.bn1(z) z = self.maxpool1(z) z = self.conv2(z) + z = self.bn2(z) z = self.maxpool2(z) z = self.flatten(z) z = self.linear1(z) - z = self.linear2(z) - z = self.linear3(z) + z = self.linear2(z) return z X_train, y_train, X_test, y_test = tlx.files.load_cifar10_dataset(shape=(-1, 32, 32, 3), plotable=False) diff --git a/examples/basic_tutorials/tensorlayerx_graph.py b/examples/basic_tutorials/tensorlayerx_graph.py index 259d797..f76eefd 100644 --- a/examples/basic_tutorials/tensorlayerx_graph.py +++ b/examples/basic_tutorials/tensorlayerx_graph.py @@ -4,7 +4,8 @@ import os # os.environ['TL_BACKEND'] = 'tensorflow' # os.environ['TL_BACKEND'] = 'mindspore' -os.environ['TL_BACKEND'] = 'torch' +os.environ['TL_BACKEND'] = 'jittor' +# os.environ['TL_BACKEND'] = 'torch' import tensorlayerx as tlx from tensorlayerx.nn import Module @@ -14,39 +15,34 @@ class CNN(Module): def __init__(self): super(CNN, self).__init__() - # weights init W_init = tlx.nn.initializers.truncated_normal(stddev=5e-2) W_init2 = tlx.nn.initializers.truncated_normal(stddev=0.04) + b_init = tlx.nn.initializers.constant(value=0.1) b_init2 = tlx.nn.initializers.constant(value=0.1) - self.conv1 = Conv2d(64, (5, 5), (1, 1), padding='SAME', W_init=W_init, b_init=None, name='conv1', in_channels=3, act=tlx.nn.ReLU) - self.bn = BatchNorm2d(num_features=64, act=tlx.nn.ReLU) - self.maxpool1 = MaxPool2d((3, 3), (2, 2), padding='SAME', name='pool1') + self.conv1 = Conv2d(32, (5, 5), (1, 1), padding='SAME', W_init=W_init, b_init=b_init, name='conv1', in_channels=3) + self.bn1 = BatchNorm2d(num_features=32, act=tlx.nn.ReLU) + self.maxpool1 = MaxPool2d((2, 2), (2, 2), padding='SAME', name='pool1') - self.conv2 = Conv2d( - 64, (5, 5), (1, 1), padding='SAME', act=tlx.nn.ReLU, W_init=W_init, b_init=None, name='conv2', in_channels=64 - ) - self.maxpool2 = MaxPool2d((3, 3), (2, 2), padding='SAME', name='pool2') + self.conv2 = Conv2d(64, (5, 5), (1, 1), padding='SAME', act=tlx.nn.ReLU, W_init=W_init, b_init=b_init, name='conv2', in_channels=32) + self.bn2 
= BatchNorm2d(num_features=64, act=tlx.nn.ReLU) + self.maxpool2 = MaxPool2d((2, 2), (2, 2), padding='SAME', name='pool2') self.flatten = Flatten(name='flatten') - self.linear1 = Linear(384, act=tlx.nn.ReLU, W_init=W_init2, b_init=b_init2, name='linear1relu', in_features=2304) - self.linear2 = Linear(192, act=tlx.nn.ReLU, W_init=W_init2, b_init=b_init2, name='linear2relu', in_features=384) - self.linear3 = Linear(10, act=None, W_init=W_init2, name='output1', in_features=192) - self.linear4 = Linear(20, act=None, W_init=W_init2, name='output2', in_features=192) - self.concat = tlx.nn.Concat(name='concat') + self.linear1 = Linear(1024, act=tlx.nn.ReLU, W_init=W_init2, b_init=b_init2, name='linear1relu', in_features=2304) + + self.linear2 = Linear(10, act=None, W_init=W_init2, b_init=b_init2, name='output', in_features=1024) def forward(self, x): z = self.conv1(x) - z = self.bn(z) + z = self.bn1(z) z = self.maxpool1(z) z = self.conv2(z) + z = self.bn2(z) z = self.maxpool2(z) z = self.flatten(z) z = self.linear1(z) - z = self.linear2(z) - z1 = self.linear3(z) - z2 = self.linear4(z) - z = self.concat([z1, z2]) + z = self.linear2(z) return z model = CNN() diff --git a/examples/basic_tutorials/tensorlayerx_model_load.py b/examples/basic_tutorials/tensorlayerx_model_load.py index 49a5df5..567ecd7 100644 --- a/examples/basic_tutorials/tensorlayerx_model_load.py +++ b/examples/basic_tutorials/tensorlayerx_model_load.py @@ -2,7 +2,8 @@ # -*- coding: utf-8 -*- import os -os.environ['TL_BACKEND'] = 'tensorflow' +# os.environ['TL_BACKEND'] = 'tensorflow' +os.environ['TL_BACKEND'] = 'jittor' # os.environ['TL_BACKEND'] = 'paddle' # os.environ['TL_BACKEND'] = 'mindspore' # os.environ['TL_BACKEND'] = 'torch' @@ -61,11 +62,11 @@ def forward(self, x): z = self.conv1(x) print("conv1 outputs:", z[1, :, :, 1]) z = self.maxpool1(z) - print("maxpool outputs:", z[1, :, :, 1]) + # print("maxpool outputs:", z[1, :, :, 1]) z = self.conv2(z) - print("conv2 outputs:", z[1, :, :, 1]) + # print("conv2 outputs:", z[1, :, :, 1]) z = self.maxpool2(z) - print("max2 outputs:", z[1, :, :, 1]) + # print("max2 outputs:", z[1, :, :, 1]) z = self.flatten(z) z = self.linear1(z) z = self.linear2(z) @@ -87,7 +88,7 @@ def forward(self, x): # and imported into TensorFlow/PyTorch/PaddlePaddle/MindSpore. 
cnn = CNN() # cnn.save_standard_weights('./cnn.npz') -cnn.load_standard_weights('./cnn.npz', weights_from='torch', weights_to='tensorflow') +cnn.load_standard_weights('./cnn.npz', weights_from='torch', weights_to='tensorflow', skip= True) cnn.set_eval() inputs = tlx.nn.Input(shape=(10, 28, 28, 3), dtype=tlx.float32) diff --git a/tensorlayerx/backend/ops/jittor_backend.py b/tensorlayerx/backend/ops/jittor_backend.py index 774cd05..bbce67f 100644 --- a/tensorlayerx/backend/ops/jittor_backend.py +++ b/tensorlayerx/backend/ops/jittor_backend.py @@ -72,7 +72,7 @@ def zeros(shape, dtype=None, device = None): if device == 'gpu': jt.flags.use_cuda = 1 - return jt.zeros(shape=shape, dtype=dtype) + return jt.zeros(shape, dtype) def ones(shape, dtype=None, device = None): @@ -545,7 +545,7 @@ def reduce_mean(input_tensor, axis=None, keepdims=False): if axis is not None: if isinstance(axis, (tuple, list)): axis = tuple(axis) - return jt.mean(input_tensor, dims=axis, keepdims=keepdims) + return jt.mean(input_tensor, dim=axis, keepdims=keepdims) else: return jt.mean(input_tensor) diff --git a/tensorlayerx/backend/ops/jittor_nn.py b/tensorlayerx/backend/ops/jittor_nn.py index 5ebd1a2..985f3f0 100644 --- a/tensorlayerx/backend/ops/jittor_nn.py +++ b/tensorlayerx/backend/ops/jittor_nn.py @@ -552,7 +552,7 @@ def same_padding(input, weight, strides, dilations): # H(out) = = floor( -------------------------------------------------------------- + 1 ) # stride[0] - print(type(weight)) + if isinstance(weight, jt.Var): if len(input.shape) == 3: filter_rows = weight.size(2) @@ -594,9 +594,6 @@ def same_padding(input, weight, strides, dilations): out_cols = (input_cols + strides[1] - 1) // strides[1] - # print(f"4D output rows: {out_rows}, output cols: {out_cols}") - # print(f"4D dilations: {dilations}") - padding_rows = max(0, (out_rows - 1) * strides[0] + (filter_rows - 1) * dilations[0] + 1 - input_rows) padding_cols = max(0, (out_cols - 1) * strides[1] + (filter_cols - 1) * dilations[1] + 1 - input_cols) @@ -609,11 +606,6 @@ def same_padding(input, weight, strides, dilations): # if cols_odd: # padding_cols += 1 - # print(f"Filter Rows: {filter_rows}, Filter Cols: {filter_cols}") - # print(f"Input Rows: {input_rows}, Input Cols: {input_cols}") - # print(f"Output Rows: {out_rows}, Output Cols: {out_cols}") - # print(f"Padding Rows: {padding_rows}, Padding Cols: {padding_cols}") - # print(f"Rows Odd: {rows_odd}, Cols Odd: {cols_odd}") return rows_odd, cols_odd, padding_rows, padding_cols @@ -653,15 +645,10 @@ def __init__(self, strides, padding, data_format='NHWC', dilations=None, out_cha self.strides = (strides[1], strides[2]) self.dilations = (dilations[1], dilations[2]) self.groups = groups - # print(f"strides = {strides}") + def __call__(self, input, filters): - # print(f"Conv2D_Input shape: {input.shape}") - # print(f"Conv2D_Filters shape: {filters.shape}") - # print(f"Conv2D_Strides: {self.strides}") - # print(f"Conv2D_Padding: {self.padding}") - # print(f"Conv2D_Dilations: {self.dilations}") - # print(f"Conv2D_Groups: {self.groups}") + if self.data_format == 'NHWC': input = nhwc_to_nchw(input) @@ -678,7 +665,6 @@ def __call__(self, input, filters): def 
conv2d_same_padding(self, input, weight, bias=None): rows_odd, cols_odd, padding_rows, padding_cols = same_padding(input, weight, self.strides, self.dilations) - # print(f"Padding rows: {padding_rows}, Padding cols: {padding_cols}") if rows_odd or cols_odd: input = nn.pad(input, [0, int(cols_odd), 0, int(rows_odd)]) @@ -1316,10 +1302,7 @@ def same_padding_deconvolution(input, weight, strides, dilations): out_cols = (input_cols - 1) * strides[1] + filter_cols out_depth = (input_depth - 1) * strides[2] + filter_depth - # print(f"SAME_PADDING_Stride : {strides}") - # print(f"out_rows = {input_rows} * {strides[0]} - {strides[0]} + 1") - # print(f"out_cols = {input_cols} * {strides[1]} - {strides[1]} + 1") - # print(f"out_depth = {input_depth} * {strides[2]} - {strides[2]} + 1") + padding_rows = max(0, (input_rows - 1) * strides[0] + (filter_rows - 1) * dilations[0] + 1 - out_rows) @@ -1330,12 +1313,6 @@ def same_padding_deconvolution(input, weight, strides, dilations): cols_odd = (padding_cols % 2 != 0) depth_odd = (padding_depth % 2 != 0) - # print(f"SAME_PADDING_Filter: {filter_rows}, {filter_cols}, {filter_depth if 'filter_depth' in locals() else 'N/A'}") - # print(f"SAME_PADDING_Input : {input_rows}, {input_cols}, {input_depth}") - # print(f"SAME_PADDING_Output : {out_rows}, {out_cols}, {out_depth}") - - # print(f"SAME_PADDING_Padding: {padding_rows}, {padding_cols}, {padding_depth}") - # print(f"SAME_PADDING_Rows Odd: {rows_odd}, Cols Odd: {cols_odd}, Depth Odd: {depth_odd}") return rows_odd, cols_odd, depth_odd, padding_rows, padding_cols, padding_depth @@ -1632,12 +1609,9 @@ def __init__( self.name = name self.out_channel = out_channel self.data_format, self.padding = preprocess_3d_format(data_format, padding) - - # print(f'__init__Conv3D_TRANSPOSE_Stride = {self.strides}' ) - # print(f'__init__SAME_PADDING_Dialation = {self.dilations}' ) + def __call__(self, input, filters): - # print(f"conv3D_Transpose_Call: input shape={input.shape}, filters shape={filters.shape}") if self.data_format == 'NDHWC': input = nhwc_to_nchw(input) @@ -1658,9 +1632,6 @@ def __call__(self, input, filters): def conv3d_transpore_same(self, input, filters): - # print(f'conv3d_transpore_same_Conv3D_TRANSPOSE_Stride = {self.strides}' ) - # print(f'conv3d_transpore_same_SAME_PADDING_Dialation = {self.dilations}' ) - rows_odd, cols_odd, depth_odd, padding_rows, padding_cols, padding_depth = same_padding_deconvolution( input, filters, self.strides, (1, 1, 1)) @@ -1861,10 +1832,8 @@ class SeparableConv2D(object): def __init__(self, strides, padding, data_format, dilations, out_channel, k_size, in_channel, depth_multiplier): self.data_format, self.padding = preprocess_2d_format(data_format, padding) - # print(f"SeparableConv2D-_strides = {strides}") dilations = dilations[1:] + [dilations[0]] - # print(f"SeparableConv2D-_dilations = {dilations}") self.depthwise_conv = Conv2D(strides, self.padding, self.data_format, dilations, groups=in_channel) self.strides = (0,1,1,0) self.dialations = (1,1) @@ -1875,7 +1844,6 @@ def __call__(self, input, filter, point_filter=None): depthwise_conv = self.depthwise_conv(input, filter) pointwise_conv = self.pointwise_conv(depthwise_conv, point_filter) - # print(f'pointwise_conv = {pointwise_conv.shape}' ) return pointwise_conv @@ -1987,17 +1955,7 @@ def __call__(self, inputs): raise NotImplementedError # if self.data_format == 'NDHWC': # inputs = nhwc_to_nchw(inputs) - - # # Debugging print statements - # print(f"Input shape before pooling: {inputs.shape}") - # print(f"Input type before 
pooling: {type(inputs)}") - # output = self.op(inputs) - - # # Debugging print statements - # print(f"Output shape after pooling: {output.shape}") - # print(f"Output type after pooling: {type(output)}") - # if self.data_format == 'NDHWC': # output = nchw_to_nhwc(output) # return output diff --git a/tensorlayerx/files/utils.py b/tensorlayerx/files/utils.py index 2dbf3e1..c1d00d7 100644 --- a/tensorlayerx/files/utils.py +++ b/tensorlayerx/files/utils.py @@ -1981,8 +1981,16 @@ def save_npz_dict(save_list=None, name='model.npz'): save_list_var.append(values.cpu().detach().numpy()) else: raise NotImplementedError('Not implemented') + + save_var_dict = {save_list_names[idx]: val for idx, val in enumerate(save_list_var)} - np.savez(name, **save_var_dict) + + if isinstance(save_var_dict, dict): + save_var_dict = {str(k): v for k, v in save_var_dict.items()} + np.savez(name, **save_var_dict) + else: + raise ValueError("save_var_dict must be a dictionary") + save_list_var = None save_var_dict = None del save_list_var @@ -1990,7 +1998,8 @@ def save_npz_dict(save_list=None, name='model.npz'): logging.info("[*] Model saved in npz_dict %s" % name) -def load_and_assign_npz_dict(name='model.npz', network=None, skip=False): +def load_and_assign_npz_dict(name='model.npz', network=None, skip=False, name_map=None): + """Restore the parameters saved by ``tlx.files.save_npz_dict()``. Parameters @@ -2015,16 +2024,20 @@ def load_and_assign_npz_dict(name='model.npz', network=None, skip=False): if tlx.BACKEND == 'torch': net_weights_name = [n for n, v in network.named_parameters()] torch_weights_dict = {n: v for n, v in network.named_parameters()} + elif tlx.BACKEND == 'jittor': + net_weights_name = [w.name() for w in network.all_weights] + else: net_weights_name = [w.name for w in network.all_weights] for key in weights.keys(): + if key not in net_weights_name: if skip: logging.warning("Weights named '%s' not found in network. Skip it." % key) else: raise RuntimeError( - "Weights named '%s' not found in network. Hint: set argument skip=Ture " + "Weights named '%s' not found in network. Hint: set argument skip=True " "if you want to skip redundant or mismatch weights." 
% key ) else: @@ -2037,6 +2050,8 @@ def load_and_assign_npz_dict(name='model.npz', network=None, skip=False): assign_pd_variable(network.all_weights[net_weights_name.index(key)], weights[key]) elif tlx.BACKEND == 'torch': assign_th_variable(torch_weights_dict[key], weights[key]) + elif tlx.BACKEND == 'jittor': + network.all_weights[net_weights_name.index(key)].update(weights[key]) else: raise NotImplementedError('Not implemented') diff --git a/tensorlayerx/metrics/jittor_metric.py b/tensorlayerx/metrics/jittor_metric.py index d5a163c..81a6e2c 100644 --- a/tensorlayerx/metrics/jittor_metric.py +++ b/tensorlayerx/metrics/jittor_metric.py @@ -35,67 +35,66 @@ def reset(self): - - -class Accuracy(Metric): - def __init__(self, topk=1): - super(Accuracy, self).__init__() - self.topk = int(topk) # Ensure topk is an integer - self.reset() +class Accuracy: + def __init__(self): + self.correct = 0 + self.total = 0 def update(self, y_pred, y_true): - y_pred = jt.argsort(y_pred, dim=-1, descending=True)[0] - - if (len(y_true.shape) == 1) or (len(y_true.shape) == 2 and y_true.shape[-1] == 1): - y_true = jt.reshape(y_true, (-1, 1)) - elif y_true.shape[-1] != 1: - y_true = jt.argmax(y_true, dim=-1, keepdim=True) - - correct = y_pred == y_true - correct = correct.to(jt.float32) - correct = correct.numpy() - num_samples = np.prod(np.array(correct.shape[:-1])) - num_corrects = correct.sum() - self.total += num_corrects - self.count += num_samples + # Step 1: Get the predicted class labels using argmax + y_pred = jt.argmax(y_pred, dim=-1) + + # Step 2: Ensure y_true is reshaped to match y_pred + y_true = np.reshape(y_true, (-1,)) + + # Step 3: Compare the predicted labels to the true labels + correct_predictions = np.equal(y_pred, y_true) + + # Step 4: Count the number of correct predictions + num_correct_predictions = np.sum(correct_predictions).item() + + # Update the running totals + self.correct += num_correct_predictions + self.total += y_true.shape[0] def result(self): - return float(self.total) / self.count if self.count > 0 else 0. 
+ # Calculate the accuracy + return self.correct / self.total if self.total > 0 else 0.0 def reset(self): - self.total = 0.0 - self.count = 0.0 + # Reset the counters + self.correct = 0 + self.total = 0 -class Auc(object): - def __init__( - self, - curve='ROC', - num_thresholds=4095, - ): - self.curve = curve +class Auc: + def __init__(self, num_thresholds=4095): self.num_thresholds = num_thresholds self.reset() def update(self, y_pred, y_true): - if isinstance(y_true, jt.array()): - y_true = y_true.cpu().numpy() - elif not isinstance(y_pred, np.ndarray): - raise TypeError("The y_true must be a numpy array or Tensor.") - - if isinstance(y_pred, jt.array): - y_pred = y_pred.cpu().numpy() - elif not isinstance(y_pred, np.ndarray): - raise TypeError("The y_pred must be a numpy array or Tensor.") - + # Convert Jittor tensors to NumPy arrays if necessary + if isinstance(y_true, jt.Var): + y_true = y_true.numpy() + if isinstance(y_pred, jt.Var): + y_pred = y_pred.numpy() + + # Flatten y_true to ensure it's 1-dimensional + y_true = np.reshape(y_true, (-1,)) + + # Get the positive class probabilities + pos_prob = y_pred[:, 1] + + # Bin the predictions into thresholds + bin_idx = np.floor(pos_prob * self.num_thresholds).astype(int) + bin_idx = np.clip(bin_idx, 0, self.num_thresholds) + + # Update the histogram bins for i, label in enumerate(y_true): - value = y_pred[i, 1] # positive probability - bin_idx = int(value * self.num_thresholds) - assert bin_idx <= self.num_thresholds if label: - self._stat_pos[bin_idx] += 1.0 + self._stat_pos[bin_idx[i]] += 1 else: - self._stat_neg[bin_idx] += 1.0 + self._stat_neg[bin_idx[i]] += 1 @staticmethod def trapezoid_area(x1, x2, y1, y2): @@ -105,91 +104,80 @@ def result(self): tot_pos = 0.0 tot_neg = 0.0 auc = 0.0 - idx = self.num_thresholds - while idx > 0: + + for idx in range(self.num_thresholds, 0, -1): tot_pos_prev = tot_pos tot_neg_prev = tot_neg tot_pos += self._stat_pos[idx] tot_neg += self._stat_neg[idx] auc += self.trapezoid_area(tot_neg, tot_neg_prev, tot_pos, tot_pos_prev) - idx -= 1 - return auc / tot_pos / tot_neg if tot_pos > 0.0 and tot_neg > 0.0 else 0.0 + return auc / (tot_pos * tot_neg) if tot_pos > 0.0 and tot_neg > 0.0 else 0.0 def reset(self): - """ - Reset states and result - """ - _num_pred_buckets = self.num_thresholds + 1 - self._stat_pos = np.zeros(_num_pred_buckets) - self._stat_neg = np.zeros(_num_pred_buckets) + self._stat_pos = np.zeros(self.num_thresholds + 1) + self._stat_neg = np.zeros(self.num_thresholds + 1) -class Precision(object): - +class Precision: def __init__(self): self.reset() def update(self, y_pred, y_true): - if isinstance(y_true, jt.array): - y_true = y_true.cpu().numpy() - elif not isinstance(y_pred, np.ndarray): - raise TypeError("The y_true must be a numpy array or Tensor.") - - if isinstance(y_pred, jt.array): - y_pred = y_pred.cpu().numpy() - elif not isinstance(y_pred, np.ndarray): - raise TypeError("The y_pred must be a numpy array or Tensor.") - - sample_num = y_true.shape[0] - y_pred = np.rint(y_pred).astype('int32') - - for i in range(sample_num): - pred = y_pred[i] - label = y_true[i] - if pred == 1: - if pred == label: - self.tp += 1 - else: - self.fp += 1 + # Convert Jittor tensors to NumPy arrays if necessary + if isinstance(y_true, jt.Var): + y_true = y_true.numpy() + if isinstance(y_pred, jt.Var): + y_pred = y_pred.numpy() + + # Ensure y_true is reshaped to match y_pred + y_true = np.reshape(y_true, (-1,)) + + # Convert probabilities to class predictions + y_pred = np.argmax(y_pred, axis=1) + + # 
Update true positives (tp) and false positives (fp) + self.tp += np.sum((y_pred == 1) & (y_true == 1)) + self.fp += np.sum((y_pred == 1) & (y_true == 0)) def result(self): - ap = self.tp + self.fp - return float(self.tp) / ap if ap != 0 else .0 + ap = self.tp + self.fp + return float(self.tp) / ap if ap != 0 else 0.0 def reset(self): self.tp = 0 self.fp = 0 -class Recall(object): - +class Recall: def __init__(self): self.reset() def update(self, y_pred, y_true): - if isinstance(y_true, jt.array): - y_true = y_true.cpu().numpy() - elif not isinstance(y_pred, np.ndarray): - raise TypeError("The y_true must be a numpy array or Tensor.") - - if isinstance(y_pred, jt.array): - y_pred = y_pred.cpu().numpy() - elif not isinstance(y_pred, np.ndarray): - raise TypeError("The y_pred must be a numpy array or Tensor.") - - sample_num = y_true.shape[0] - y_pred = np.rint(y_pred).astype('int32') - - for i in range(sample_num): - pred = y_pred[i] - label = y_true[i] - if label == 1: - if pred == label: - self.tp += 1 - else: - self.fn += 1 + # Convert Jittor tensors to NumPy arrays if necessary + if isinstance(y_true, jt.Var): + y_true = y_true.numpy() + if isinstance(y_pred, jt.Var): + y_pred = y_pred.numpy() + + # Ensure y_true is reshaped to match y_pred + y_true = np.reshape(y_true, (-1,)) + + # Convert probabilities to class predictions + y_pred = np.argmax(y_pred, axis=1) + + # Update true positives (tp) and false negatives (fn) + self.tp += np.sum((y_pred == 1) & (y_true == 1)) + self.fn += np.sum((y_true == 1) & (y_pred == 0)) + + def result(self): + recall = self.tp + self.fn + return float(self.tp) / recall if recall != 0 else 0.0 + + def reset(self): + self.tp = 0 + self.fn = 0 + def result(self): diff --git a/tensorlayerx/model/core.py b/tensorlayerx/model/core.py index 218d40e..02ef47f 100644 --- a/tensorlayerx/model/core.py +++ b/tensorlayerx/model/core.py @@ -24,7 +24,7 @@ if tlx.BACKEND == 'torch': import torch if tlx.BACKEND == 'jittor': - import torch + import jittor as jt __all__ = ['Model', 'WithLoss', 'WithGrad', 'TrainOneStep', 'TrainOneStepWithGradientClipping'] @@ -662,17 +662,21 @@ def jt_train( network.set_train() output = network(X_batch) loss = loss_fn(output, y_batch) + # optimizer.apply_gradients(loss, train_weights) # grads = optimizer.gradient(loss, train_weights) # optimizer.apply_gradients(zip(grads, train_weights)) + + optimizer.set(train_weights) optimizer.zero_grad() optimizer.step(loss) - train_loss += loss + train_loss += loss.item() + if metrics: - metrics.update(output, y_batch) - train_acc += metrics.result() + metrics.update(y_pred=output, y_true=y_batch) + train_acc += metrics.result() metrics.reset() else: - train_acc += (output.argmax(1) == y_batch).type(torch.float).mean().item() + train_acc += np.mean(np.equal(np.argmax(output, axis=1), y_batch)) n_iter += 1 if print_train_batch: @@ -701,7 +705,7 @@ def jt_train( val_acc += metrics.result() metrics.reset() else: - val_acc += (_logits.argmax(1) == y_batch).type(torch.float).mean().item() + val_acc += (_logits.argmax(1) == y_batch).type(jt.float).mean().item() n_iter += 1 print(" val loss: {}".format(val_loss / n_iter)) print(" val acc: {}".format(val_acc / n_iter)) diff --git a/tensorlayerx/model/utils.py b/tensorlayerx/model/utils.py index d7d9c2a..a1229b7 100644 --- a/tensorlayerx/model/utils.py +++ b/tensorlayerx/model/utils.py @@ -144,9 +144,10 @@ def __init__(self, network, loss_fn=None, optimizer=None): self.network.set_train() def __call__(self, inputs, label): - loss = self.network_with_loss(inputs, label) - grads = 
self.optimizer.gradient(loss, self.train_weights) - return grads + raise NotImplementedError("WithGradJT not Implemented") + # loss = self.network_with_loss(inputs, label) + # grads = self.optimizer.gradient(loss, self.train_weights) + # return grads @@ -227,12 +228,14 @@ def __init__(self, net_with_loss, optimizer, train_weights): self.optimizer = optimizer self.train_weights = train_weights - def __call__(self, data, label, *args, **kwargs): - # loss = self.net_with_loss(data, label, *args, **kwargs) - # grads = self.optimizer.gradient(loss, self.train_weights) - # self.optimizer.apply_gradients(zip(grads, self.train_weights)) - # return loss.numpy() - return NotImplementedError('TrainOneStep With jittor is not Implemented') + def __call__(self, data, label): + loss = self.net_with_loss(data, label) + self.optimizer.set(self.train_weights) + self.optimizer.zero_grad() + # if self.grad_clip is not None: + # self.grad_clip(self.train_weights) + self.optimizer.step(loss) + return loss.numpy() class TrainOneStepWithGradientClippingTF(object): @@ -296,7 +299,7 @@ def __call__(self, data, label): class TrainOneStepWithGradientClippingJT(object): - def __init__(self, net_with_loss, optimizer, train_weights, gradient_clipping): + def __init__(self, net_with_loss, optimizer, train_weights, gradient_clipping=None): self.net_with_loss = net_with_loss self.optimizer = optimizer self.train_weights = train_weights @@ -304,7 +307,11 @@ def __init__(self, net_with_loss, optimizer, train_weights, gradient_clipping): def __call__(self, data, label): loss = self.net_with_loss(data, label) - grads = self.optimizer.gradient(loss, self.train_weights, grad_clip=self.gradient_clipping) - self.optimizer.apply_gradients(zip(grads, self.train_weights)) - return loss.numpy() - + self.optimizer.set(self.train_weights) + self.optimizer.zero_grad() + + if self.gradient_clipping is not None: + self.gradient_clipping(self.train_weights) + + self.optimizer.step(loss) + return loss.numpy() \ No newline at end of file diff --git a/tensorlayerx/nn/core/core_jittor.py b/tensorlayerx/nn/core/core_jittor.py index f69d712..0a49980 100644 --- a/tensorlayerx/nn/core/core_jittor.py +++ b/tensorlayerx/nn/core/core_jittor.py @@ -406,7 +406,7 @@ def forward(self, input_data): # tensor._info = (new_node, idx) -class ModuleList(Module): +class ModuleList(): """ Holds Modules in a list. @@ -448,8 +448,26 @@ class ModuleList(Module): def __init__(self, modules=None): super(ModuleList, self).__init__() + + # Force _modules to be an OrderedDict right after parent's __init__ + self._modules = OrderedDict() + if modules is not None: self.extend(modules) + + def extend(self, layers): + """ + Appends layers from a Python iterable to the end of the list. + """ + if not isinstance(layers, list): + raise TypeError('Modules should be a list of sublayers') + + for layer in layers: + if _valid_module(layer): + self._modules[str(len(self._modules))] = layer + # print(f"self._modules after layers added: {self._modules}") + + return self def __getitem__(self, index): if isinstance(index, slice): @@ -503,18 +521,7 @@ def insert(self, index, layer): length -= 1 self._modules[str(idx)] = layer - def extend(self, layers): - """ - Appends layers from a Python iterable to the end of the list. 
- - """ - if not isinstance(layers, list): - raise TypeError('Modules {} should be list of sublayers'.format(layers)) - for layer in layers: - if _valid_module(layer): - self._modules[str(len(self))] = layer - return self def append(self, layer): """ @@ -529,6 +536,8 @@ def forward(self, *inputs): raise NotImplementedError + + class ModuleDict(Module): def __init__(self, modules=None): @@ -680,11 +689,14 @@ def __call__(self, input): -class ParameterDict(Module): - +class ParameterDict(): def __init__(self, parameters=None): super(ParameterDict, self).__init__() self._initialized = True + + # Bypass the __setattr__ method's restriction by directly setting _parameters + self.__dict__['_parameters'] = OrderedDict() + if parameters is not None: self.update(parameters) @@ -699,12 +711,21 @@ def __getitem__(self, key): def __setitem__(self, key, parameter): self.register_parameter(key, parameter) + def register_parameter(self, key, parameter): + # Ensure that parameter is of type jt.Var or jt.nn.Parameter + if not isinstance(parameter, (jt.Var, jt.nn.Parameter)): + raise TypeError(f"Expected jt.nn.Parameter or jt.Var, but got {type(parameter)} for key '{key}'") + + # Add the parameter to the _parameters dictionary + self._parameters[key] = parameter + print(f"Registered parameter: {key} -> type: {type(parameter)}, shape: {parameter.shape}") + def __delitem__(self, key): del self._parameters[key] def __setattr__(self, key, value): if getattr(self, "_initialized", False): - if not hasattr(self, key) and not isinstance(value, jt.nn.Parameter): + if not hasattr(self, key) and not isinstance(value, (jt.nn.Parameter, jt.Var)): warnings.warn("Setting attributes on ParameterDict is not supported.") super(ParameterDict, self).__setattr__(key, value) @@ -718,7 +739,6 @@ def __reversed__(self): return reversed(list(self._parameters.keys())) def copy(self): - return ParameterDict(self._parameters.copy()) def __contains__(self, key): @@ -742,58 +762,35 @@ def popitem(self): return self._parameters.popitem() def get(self, key, default=None): - return self._parameters.get(key, default) def fromkeys(self, keys, default=None): - - return ParameterDict(self._parameters.fromkeys(keys, default)) # type: ignore[arg-type] + return ParameterDict(self._parameters.fromkeys(keys, default)) def keys(self): - return self._parameters.keys() def items(self): - return self._parameters.items() def values(self): - return self._parameters.values() def update(self, parameters): - if not isinstance(parameters, container_abcs.Iterable): - raise TypeError( - "ParametersDict.update should be called with an " - "iterable of key/value pairs, but got " + type(parameters).__name__ - ) - - if isinstance(parameters, (OrderedDict, ParameterDict)): - for key, parameter in parameters.items(): - self[key] = parameter - elif isinstance(parameters, container_abcs.Mapping): - for key, parameter in sorted(parameters.items()): - self[key] = parameter - else: - for j, p in enumerate(parameters): - if not isinstance(p, container_abcs.Iterable): - raise TypeError( - "ParameterDict update sequence element " - "#" + str(j) + " should be Iterable; is" + type(p).__name__ - ) - if not len(p) == 2: - raise ValueError( - "ParameterDict update sequence element " - "#" + str(j) + " has length " + str(len(p)) + "; 2 is required" - ) - # parameters as length-2 list too cumbersome to type, see ModuleDict.update comment - self[p[0]] = p[1] # type: ignore[assignment] + if not isinstance(parameters, dict): + raise TypeError("ParametersDict.update should be called 
with a dictionary.") + + for key, parameter in parameters.items(): + self[key] = parameter def __call__(self, input): raise RuntimeError('ParameterDict should not be called.') + + def _valid_index(layer_num, index): + if not isinstance(index, int): raise TypeError("Index {} is not int type") if not -layer_num <= index < layer_num: @@ -801,6 +798,8 @@ def _valid_index(layer_num, index): return index % layer_num + + def _valid_module(layer): if issubclass(layer.__class__, Module): return True diff --git a/tensorlayerx/nn/layers/convolution/deformable_conv.py b/tensorlayerx/nn/layers/convolution/deformable_conv.py index db74a43..299d1c2 100644 --- a/tensorlayerx/nn/layers/convolution/deformable_conv.py +++ b/tensorlayerx/nn/layers/convolution/deformable_conv.py @@ -89,8 +89,15 @@ def __init__( self.in_channels = in_channels self.kernel_n = kernel_size[0] * kernel_size[1] - if self.offset_layer.get_shape()[-1] != 2 * self.kernel_n: - raise AssertionError("offset.get_shape()[-1] is not equal to: %d" % 2 * self.kernel_n) + + # Check if offset_layer has get_shape method, if not use reshape + if hasattr(self.offset_layer, 'get_shape'): + offset_shape = self.offset_layer.get_shape()[-1] + else: + offset_shape = self.offset_layer.shape[-1] + + if offset_shape != 2 * self.kernel_n: + raise AssertionError("offset shape[-1] is not equal to: %d" % (2 * self.kernel_n)) logging.info( "DeformableConv2d %s: out_channels: %d, kernel_size: %s act: %s" % ( diff --git a/tensorlayerx/optimizers/jittor_optimizers.py b/tensorlayerx/optimizers/jittor_optimizers.py index 4905940..fb5c70f 100644 --- a/tensorlayerx/optimizers/jittor_optimizers.py +++ b/tensorlayerx/optimizers/jittor_optimizers.py @@ -28,10 +28,58 @@ def app_gradients(self): raise Exception('Adagrad optimizer function not implemented') + # params, + # self.optimizer = optimizer.Adam( + # params, + # lr=lr, + # eps=eps, + # betas=(beta_1, beta_2), + # weight_decay=weight_decay) + # @jt.no_grad() + # def apply_gradients(self, loss, weights): + # if weights is None: + # raise AttributeError("Parameter train_weights must be entered.") + + # if not self.init_optim: + # self.optimizer_adam = optimizer.Adam( + # params=weights, lr=get_lr(self.lr), betas=self.betas, eps=self.eps, + # weight_decay=self.weight_decay + # ) + # self.init_optim = True + + # self.optimizer_adam.zero_grad() + + # # Compute and apply gradients + # self.optimizer_adam.step(loss) + + + # def gradient(self, loss, weights=None, return_grad=True): + # if weights is None: + # raise AttributeError("Parameter train_weights must be entered.") + + # if not self.init_optim: + # self.optimizer_adam = optimizer.Adam( + # params=weights, lr=get_lr(self.lr), betas=self.betas, eps=self.eps, + # weight_decay=self.weight_decay + # ) + # self.init_optim = True + + # self.optimizer_adam.zero_grad() + + # # Compute gradients + # self.optimizer_adam.step(loss) + + # grads = [p.opt_grad(self.optimizer_adam) for p in weights] + + # # Optionally clip gradients + # if self.grad_clip is not None: + # self.grad_clip(grads, self.optimizer_adam) + + # if return_grad: + # return grads class Adam(object): def __init__( self, - params, lr=0.001, beta_1=0.9, beta_2=0.999, @@ -39,134 +87,103 @@ def __init__( weight_decay=0.0, momentum = 0.0, grad_clip=None - ): - self.optimizer = optimizer.Adam( - params, - lr=lr, - eps=eps, - betas=(beta_1, beta_2), - weight_decay=weight_decay) - self.lr = lr self.beta_1 = beta_1 self.beta_2 = beta_2 + self.betas = (beta_1,beta_2) self.eps = eps self.init_optim = False 
self.weight_decay = weight_decay self.grad_clip = grad_clip - - @jt.no_grad() - def apply_gradients(self, grads_and_vars=None, closure=None): - if not self.init_optim: - raise AttributeError("Can not apply gradients before zero_grad call.") - loss = None - if closure is not None: - with jt.enable_grad(): - loss = closure() - - for group in self.optimizer_adam.param_groups: - params_with_grad = [] - grads = [] - exp_avgs = [] - exp_avg_sqs = [] - max_exp_avg_sqs = [] - state_steps = [] - beta1, beta2 = group['betas'] - - for p in group['params']: - if p.grad is not None: - params_with_grad.append(p) - if p.grad.is_sparse: - raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') - grads.append(p.grad) - - state = self.optimizer_adam.state[p] - # Lazy state initialization - if len(state) == 0: - state['step'] = 0 - # Exponential moving average of gradient values - state['exp_avg'] = jt.zeros_like(p) - # Exponential moving average of squared gradient values - state['exp_avg_sq'] = jt.zeros_like(p) - if group['amsgrad']: - # Maintains max of all exp. moving avg. of sq. grad. values - state['max_exp_avg_sq'] = jt.zeros_like(p) - - exp_avgs.append(state['exp_avg']) - exp_avg_sqs.append(state['exp_avg_sq']) - - if group['amsgrad']: - max_exp_avg_sqs.append(state['max_exp_avg_sq']) - - # update the steps for each param group update - state['step'] += 1 - # record the step after step update - state_steps.append(state['step']) - - jt.optim.Adam(params_with_grad, - grads, - exp_avgs, - exp_avg_sqs, - max_exp_avg_sqs, - state_steps, - amsgrad=group['amsgrad'], - beta1=beta1, - beta2=beta2, - lr=get_lr(self.lr), - weight_decay=group['weight_decay'], - eps=group['eps']) - return loss - - def gradient(self, loss, weights=None, return_grad=True): - if weights is None: - raise AttributeError("Parameter train_weights must be entered.") + def set(self, weights): if not self.init_optim: self.optimizer_adam = optimizer.Adam( - params=weights, lr=get_lr(self.lr), betas=(self.beta_1, self.beta_2), eps=self.eps, + params=weights, lr=self.lr, betas=self.betas, eps=self.eps, weight_decay=self.weight_decay ) self.init_optim = True - self.optimizer_adam.zero_grad() - self.optimizer_adam.step(loss) - if self.grad_clip is not None: - self.grad_clip(weights) + def zero_grad(self): + self.optimizer_adam.zero_grad() - if return_grad ==True: - return _grads(weights) - else: - return None + def step(self, loss=None): + self.optimizer_adam.step(loss) +class AdamW(object): + def __init__( + self, + lr=0.001, + beta_1=0.9, + beta_2=0.999, + eps=1e-8, + weight_decay=0.01, + grad_clip=None + ): + + self.lr = lr + self.beta_1 = beta_1 + self.beta_2 = beta_2 + self.betas = (beta_1, beta_2) + self.eps = eps + self.init_optim = False + self.weight_decay = weight_decay + self.grad_clip = grad_clip - def step(self, loss=None): - self.optimizer.step(loss) + def set(self, weights): + if not self.init_optim: + self.optimizer_adamw = optimizer.AdamW( + params=weights, lr=self.lr, betas=self.betas, eps=self.eps, + weight_decay=self.weight_decay + ) + self.init_optim = True def zero_grad(self): - self.optimizer.zero_grad() - -class AdamW(object): - def __init__(self, params, lr=0.001, beta_1=0.9, beta_2=0.999, eps=1e-8, weight_decay=0.0): - self.optimizer = optimizer.AdamW(params, lr=lr, eps=eps, betas=(beta_1, beta_2), weight_decay=weight_decay) + self.optimizer_adamw.zero_grad() def step(self, loss=None): - self.optimizer.step(loss) + self.optimizer_adamw.step(loss) - def zero_grad(self): - 
self.optimizer.zero_grad() class Adan(object): - def __init__(self, params, lr=0.001, beta_1=0.9, beta_2=0.999, eps=1e-8, weight_decay=0.0): - self.optimizer = optimizer.Adan(params, lr=lr, eps=eps, betas=(beta_1, beta_2), weight_decay=weight_decay) + def __init__( + self, + lr=0.001, + beta_1=0.9, + beta_2=0.999, + beta_3=0.99, + eps=1e-8, + weight_decay=0.0, + grad_clip=None + ): + + self.lr = lr + self.beta_1 = beta_1 + self.beta_2 = beta_2 + self.beta_3 = beta_3 + self.betas = (beta_1, beta_2, beta_3) + self.eps = eps + self.init_optim = False + self.weight_decay = weight_decay + self.grad_clip = grad_clip - def step(self, loss=None): - self.optimizer.step(loss) + def set(self, weights): + if not self.init_optim: + self.optimizer_adan = optimizer.Adan( + params=weights, lr=self.lr, betas=self.betas, eps=self.eps, + weight_decay=self.weight_decay + ) + self.init_optim = True def zero_grad(self): - self.optimizer.zero_grad() + self.optimizer_adan.zero_grad() + + def step(self, loss=None): + self.optimizer_adan.step(loss) + class Adamax(object): @@ -204,201 +221,81 @@ def gradient(self, train_weights=None): class RMSprop(object): - def __init__( - self, - lr=0.001, - rho=0.99, - momentum=0.0, - eps=1e-08, - centered=False, - weight_decay=0.0, - grad_clip=None, - ): + self, + lr=0.001, + eps=1e-8, + alpha=0.99, + # weight_decay=0.0, + grad_clip=None + ): + self.lr = lr - self.rho = rho - self.momentum = momentum self.eps = eps - self.centered = centered + self.alpha = alpha self.init_optim = False - self.weight_decay = weight_decay + # self.weight_decay = weight_decay self.grad_clip = grad_clip - @jt.no_grad() - def apply_gradients(self, grads_and_vars=None, closure=None): - if not self.init_optim: - raise AttributeError("Can not apply gradients before zero_grad call.") - - loss = None - if closure is not None: - with jt.enable_grad(): - loss = closure() - - for group in self.optimizer_rmsprop.param_groups: - params_with_grad = [] - grads = [] - square_avgs = [] - grad_avgs = [] - momentum_buffer_list = [] - - for p in group['params']: - if p.grad is None: - continue - params_with_grad.append(p) - - if p.grad.is_sparse: - raise RuntimeError('RMSprop does not support sparse gradients') - grads.append(p.grad) - - state = self.optimizer_rmsprop.state[p] - - # State initialization - if len(state) == 0: - state['step'] = 0 - state['square_avg'] = jt.zeros_like(p) - if group['momentum'] > 0: - state['momentum_buffer'] = jt.zeros_like(p) - if group['centered']: - state['grad_avg'] = jt.zeros_like(p) - - square_avgs.append(state['square_avg']) - - if group['momentum'] > 0: - momentum_buffer_list.append(state['momentum_buffer']) - if group['centered']: - grad_avgs.append(state['grad_avg']) - - state['step'] += 1 - - optimizer.RMSprop(params_with_grad, - grads, - square_avgs, - grad_avgs, - momentum_buffer_list, - lr=get_lr(self.lr), - alpha=group['alpha'], - eps=group['eps'], - weight_decay=group['weight_decay'], - momentum=group['momentum'], - centered=group['centered']) - - return loss - - def gradient(self, loss, weights=None, return_grad=True): - if weights is None: - raise AttributeError("Parameter train_weights must be entered.") + def set(self, weights): if not self.init_optim: self.optimizer_rmsprop = optimizer.RMSprop( - params=weights, lr=get_lr(self.lr), alpha=self.rho, eps=self.eps, momentum=self.momentum, - centered=self.centered, weight_decay=self.weight_decay + params=weights, lr=self.lr, eps=self.eps, alpha=self.alpha, ) self.init_optim = True + + def zero_grad(self): 
self.optimizer_rmsprop.zero_grad() - loss.backward() - if self.grad_clip is not None: - self.grad_clip(weights) + def step(self, loss=None): + self.optimizer_rmsprop.step(loss) - if return_grad ==True: - return _grads(weights) - else: - return None class SGD(object): - def __init__( - self, - lr=0.001, - momentum=0, - weight_decay=0.0, - grad_clip=None, - ): + self, + lr=0.01, + momentum=0.0, + weight_decay=0.0, + dampening=0.0, + nesterov=False, + grad_clip=None + ): + self.lr = lr self.momentum = momentum - self.init_optim = False self.weight_decay = weight_decay + self.dampening = dampening + self.nesterov = nesterov + self.init_optim = False self.grad_clip = grad_clip - @jt.no_grad() - def apply_gradients(self, grads_and_vars=None, closure=None): - if not self.init_optim: - raise AttributeError("Can not apply gradients before zero_grad call.") - - loss = None - if closure is not None: - with jt.enable_grad(): - loss = closure() - - for group in self.optimizer_sgd.param_groups: - params_with_grad = [] - d_p_list = [] - momentum_buffer_list = [] - weight_decay = group['weight_decay'] - momentum = group['momentum'] - dampening = group['dampening'] - nesterov = group['nesterov'] - lr = get_lr(self.lr) - - for p in group['params']: - if p.grad is not None: - params_with_grad.append(p) - d_p_list.append(p.grad) - - state = self.optimizer_sgd.state[p] - if 'momentum_buffer' not in state: - momentum_buffer_list.append(None) - else: - momentum_buffer_list.append(state['momentum_buffer']) - - optimizer.SGD(params_with_grad, - d_p_list, - momentum_buffer_list, - weight_decay=weight_decay, - momentum=momentum, - lr=lr, - dampening=dampening, - nesterov=nesterov) - - # update momentum_buffers in state - for p, momentum_buffer in zip(params_with_grad, momentum_buffer_list): - state = self.optimizer_sgd.state[p] - state['momentum_buffer'] = momentum_buffer - - return loss - - def gradient(self, loss, weights=None, return_grad=True): - if weights is None: - raise AttributeError("Parameter train_weights must be entered.") + def set(self, weights): if not self.init_optim: self.optimizer_sgd = optimizer.SGD( - params=weights, lr=get_lr(self.lr), momentum=self.momentum, weight_decay=self.weight_decay + params=weights, lr=self.lr, momentum=self.momentum, weight_decay=self.weight_decay, + dampening=self.dampening, nesterov=self.nesterov ) self.init_optim = True - self.optimizer_sgd.zero_grad() - loss.backward() - if self.grad_clip is not None: - self.grad_clip(weights) - - if return_grad ==True: - return _grads(weights) - else: - return None + def zero_grad(self): + self.optimizer_sgd.zero_grad() + def step(self, loss=None): + self.optimizer_sgd.step(loss) class Momentum(object): - def __init__( - self, - params, # Add params to the constructor - lr=0.001, - momentum=0.9, - weight_decay=0.0, - nesterov=False, - grad_clip=None, - ): + self, + lr=0.001, + momentum=0.9, + weight_decay=0.0, + nesterov=False, + grad_clip=None + ): + self.lr = lr self.momentum = momentum self.weight_decay = weight_decay @@ -406,76 +303,19 @@ def __init__( self.grad_clip = grad_clip self.init_optim = False - self.optimizer = optimizer.SGD( # Initialize the Jittor SGD optimizer - params, lr=lr, momentum=momentum, weight_decay=weight_decay, nesterov=nesterov - ) - - @jt.no_grad() - def apply_gradients(self, grads_and_vars=None, closure=None): - if not self.init_optim: - raise AttributeError("Cannot apply gradients before zero_grad call.") - - loss = None - if closure is not None: - with jt.enable_grad(): - loss = closure() - - for 
group in self.optimizer.param_groups: - params_with_grad = [] - d_p_list = [] - momentum_buffer_list = [] - - for p in group['params']: - if p.grad is not None: - params_with_grad.append(p) - d_p_list.append(p.grad) - - state = self.optimizer.state[p] - if 'momentum_buffer' not in state: - momentum_buffer_list.append(None) - else: - momentum_buffer_list.append(state['momentum_buffer']) - - optimizer.SGD(params_with_grad, - d_p_list, - momentum_buffer_list, - weight_decay=group['weight_decay'], - momentum=group['momentum'], - lr=self.lr, - dampening=group['dampening'], - nesterov=group['nesterov']) - - # Update momentum_buffers in state - for p, momentum_buffer in zip(params_with_grad, momentum_buffer_list): - state = self.optimizer.state[p] - state['momentum_buffer'] = momentum_buffer - - return loss - - def gradient(self, loss, weights=None, return_grad=True): - if weights is None: - raise AttributeError("Parameter train_weights must be entered.") + def set(self, weights): if not self.init_optim: - self.optimizer = optimizer.SGD( + self.optimizer_momentum = optimizer.SGD( params=weights, lr=self.lr, momentum=self.momentum, weight_decay=self.weight_decay, nesterov=self.nesterov ) self.init_optim = True - self.optimizer.zero_grad() - loss.backward() - if self.grad_clip is not None: - self.grad_clip(weights) - - if return_grad: - return _grads(weights) - else: - return None + def zero_grad(self): + self.optimizer_momentum.zero_grad() def step(self, loss=None): - self.optimizer.step(loss) + self.optimizer_momentum.step(loss) - def zero_grad(self): - self.optimizer.zero_grad() @@ -487,10 +327,10 @@ def LARS(**kwargs): raise Exception('LARS optimizer function not implemented') -def _grads(weights, optimizer_adam): +def _grads(weights, optimizer): grads = [] for w in weights: - grads.append(w.opt_grad(optimizer_adam)) + grads.append(w.opt_grad(optimizer)) return grads
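For reference, a minimal usage sketch of the reworked Jittor optimizer wrappers above, following the lazy set() / zero_grad() / step(loss) flow that jt_train and TrainOneStepJT use in this patch. The model and train_dataloader names below are illustrative placeholders, not part of the patch:

    # Illustrative only: drives the new wrapper interface with the Jittor backend.
    import os
    os.environ['TL_BACKEND'] = 'jittor'        # must be set before importing tensorlayerx
    import tensorlayerx as tlx
    from tensorlayerx.optimizers.jittor_optimizers import Adam

    optimizer = Adam(lr=0.001)                 # no params passed at construction time
    optimizer.set(model.trainable_weights)     # builds the underlying jittor Adam on first call
    for X_batch, y_batch in train_dataloader:  # placeholder TensorLayerX DataLoader
        logits = model(X_batch)
        loss = tlx.losses.softmax_cross_entropy_with_logits(logits, y_batch)
        optimizer.zero_grad()
        optimizer.step(loss)                   # Jittor derives the gradients from the loss here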