-- Loading model -- Tokenizer: /home/john/Projects/Python/text-models/text-generation-webui/models/TheBloke_guanaco-33B-GPTQ/tokenizer.model -- Model config: /home/john/Projects/Python/text-models/text-generation-webui/models/TheBloke_guanaco-33B-GPTQ/config.json -- Model: /home/john/Projects/Python/text-models/text-generation-webui/models/TheBloke_guanaco-33B-GPTQ/Guanaco-33B-GPTQ-4bit.act-order.safetensors -- Sequence length: 2048 -- Options: ['attention: pytorch_scaled_dp', 'matmul: switched', 'mlp: normal', 'perplexity', 'validate', 'debug', 'gpu_split: 4,20'] !! Available CUDA devices: " !! - cuda:0: NVIDIA GeForce RTX 4090 " !! - cuda:1: NVIDIA GeForce RTX 4090 !! Loading safetensors file: /home/john/Projects/Python/text-models/text-generation-webui/models/TheBloke_guanaco-33B-GPTQ/Guanaco-33B-GPTQ-4bit.act-order.safetensors !! Begin auto device map !! Decoder size: 267,855,616 bytes !! Decoder size, DQ: 1,070,098,432 bytes !! Norm size: 13,312 bytes !! Head size: 425,984,000 bytes !! Device map: !! - embed_tokens: cpu !! - layers [0:10]: cuda:0 cuda:0 cuda:0 cuda:0 cuda:0 cuda:0 cuda:0 cuda:0 cuda:0 cuda:0 !! - layers [10:20]: cuda:0 cuda:0 cuda:0 cuda:0 cuda:0 cuda:0 cuda:1 cuda:1 cuda:1 cuda:1 !! - layers [20:30]: cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 !! - layers [30:40]: cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 !! - layers [40:50]: cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 !! - layers [50:60]: cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 !! - norm: cuda:1 !! - lm_head: cuda:1 !! Begin load tensors !! - lm_head.weight read: device: cpu, shape: [32000, 6656], dtype: bfloat16 !! - lm_head.weight map: device: cuda:1, shape: [32000, 6656], dtype: float16, min: -0.398438, max: 0.335938, std: 0.018646 !! - model.embed_tokens.weight read: device: cpu, shape: [32000, 6656], dtype: bfloat16 !! - model.embed_tokens.weight map: device: cpu, shape: [32000, 6656], dtype: float16 !! - model.layers.0.input_layernorm.weight read: device: cpu, shape: [6656], dtype: bfloat16 !! - model.layers.0.input_layernorm.weight map: device: cuda:0, shape: [6656], dtype: float16, min: 0.000099, max: 0.431641, std: 0.028687 !! - model.layers.0.mlp.down_proj.qweight read: device: cpu, shape: [2240, 6656], dtype: int32 !! - model.layers.0.mlp.down_proj.qweight map: device: cuda:0, shape: [2240, 6656], dtype: int32, min: -2146138552, max: 2146069173 !! - model.layers.0.mlp.down_proj.qzeros read: device: cpu, shape: [1, 832], dtype: int32 !! - model.layers.0.mlp.down_proj.qzeros map: device: cuda:0, shape: [1, 832], dtype: int32, min: -2040174985, max: 2021090934 !! - model.layers.0.mlp.down_proj.scales read: device: cpu, shape: [1, 6656], dtype: float32 !! - model.layers.0.mlp.down_proj.scales map: device: cuda:0, shape: [1, 6656], dtype: float16, min: 0.006851, max: 0.076843, std: 0.001938 !! - model.layers.0.mlp.gate_proj.qweight read: device: cpu, shape: [832, 17920], dtype: int32 !! - model.layers.0.mlp.gate_proj.qweight map: device: cuda:0, shape: [832, 17920], dtype: int32, min: -2146081160, max: 2145818758 !! - model.layers.0.mlp.gate_proj.qzeros read: device: cpu, shape: [1, 2240], dtype: int32 !! - model.layers.0.mlp.gate_proj.qzeros map: device: cuda:0, shape: [1, 2240], dtype: int32, min: -2056882553, max: 2022205030 !! - model.layers.0.mlp.gate_proj.scales read: device: cpu, shape: [1, 17920], dtype: float32 !! - model.layers.0.mlp.gate_proj.scales map: device: cuda:0, shape: [1, 17920], dtype: float16, min: 0.003914, max: 0.053009, std: 0.001388 !! - model.layers.0.mlp.up_proj.qweight read: device: cpu, shape: [832, 17920], dtype: int32 !! - model.layers.0.mlp.up_proj.qweight map: device: cuda:0, shape: [832, 17920], dtype: int32, min: -2146989736, max: 2146015097 !! - model.layers.0.mlp.up_proj.qzeros read: device: cpu, shape: [1, 2240], dtype: int32 !! - model.layers.0.mlp.up_proj.qzeros map: device: cuda:0, shape: [1, 2240], dtype: int32, min: -2056882570, max: 2022078054 !! - model.layers.0.mlp.up_proj.scales read: device: cpu, shape: [1, 17920], dtype: float32 !! - model.layers.0.mlp.up_proj.scales map: device: cuda:0, shape: [1, 17920], dtype: float16, min: 0.003979, max: 0.028458, std: 0.000864 !! - model.layers.0.post_attention_layernorm.weight read: device: cpu, shape: [6656], dtype: bfloat16 !! - model.layers.0.post_attention_layernorm.weight map: device: cuda:0, shape: [6656], dtype: float16, min: -0.147461, max: 0.371094, std: 0.015366 !! - model.layers.0.self_attn.k_proj.qweight read: device: cpu, shape: [832, 6656], dtype: int32 !! - model.layers.0.self_attn.k_proj.qweight map: device: cuda:0, shape: [832, 6656], dtype: int32, min: -2145941149, max: 2144831623 !! - model.layers.0.self_attn.k_proj.qzeros read: device: cpu, shape: [1, 832], dtype: int32 !! - model.layers.0.self_attn.k_proj.qzeros map: device: cuda:0, shape: [1, 832], dtype: int32, min: -2058000042, max: 2022082182 !! - model.layers.0.self_attn.k_proj.scales read: device: cpu, shape: [1, 6656], dtype: float32 !! - model.layers.0.self_attn.k_proj.scales map: device: cuda:0, shape: [1, 6656], dtype: float16, min: 0.003426, max: 0.092468, std: 0.007042 !! - model.layers.0.self_attn.o_proj.qweight read: device: cpu, shape: [832, 6656], dtype: int32 !! - model.layers.0.self_attn.o_proj.qweight map: device: cuda:0, shape: [832, 6656], dtype: int32, min: -2146968999, max: 2147125857 !! - model.layers.0.self_attn.o_proj.qzeros read: device: cpu, shape: [1, 832], dtype: int32 !! - model.layers.0.self_attn.o_proj.qzeros map: device: cuda:0, shape: [1, 832], dtype: int32, min: -2057935001, max: 2022078310 !! - model.layers.0.self_attn.o_proj.scales read: device: cpu, shape: [1, 6656], dtype: float32 !! - model.layers.0.self_attn.o_proj.scales map: device: cuda:0, shape: [1, 6656], dtype: float16, min: 0.004063, max: 0.066406, std: 0.001719 !! - model.layers.0.self_attn.q_proj.qweight read: device: cpu, shape: [832, 6656], dtype: int32 !! - model.layers.0.self_attn.q_proj.qweight map: device: cuda:0, shape: [832, 6656], dtype: int32, min: -2144888953, max: 2144892026 !! - model.layers.0.self_attn.q_proj.qzeros read: device: cpu, shape: [1, 832], dtype: int32 !! - model.layers.0.self_attn.q_proj.qzeros map: device: cuda:0, shape: [1, 832], dtype: int32, min: -2055838105, max: 2022143846 !! - model.layers.0.self_attn.q_proj.scales read: device: cpu, shape: [1, 6656], dtype: float32 !! - model.layers.0.self_attn.q_proj.scales map: device: cuda:0, shape: [1, 6656], dtype: float16, min: 0.003687, max: 0.076843, std: 0.005463 !! - model.layers.0.self_attn.v_proj.qweight read: device: cpu, shape: [832, 6656], dtype: int32 !! - model.layers.0.self_attn.v_proj.qweight map: device: cuda:0, shape: [832, 6656], dtype: int32, min: -2145871737, max: 2146928759 !! - model.layers.0.self_attn.v_proj.qzeros read: device: cpu, shape: [1, 832], dtype: int32 !! - model.layers.0.self_attn.v_proj.qzeros map: device: cuda:0, shape: [1, 832], dtype: int32, min: -2040109209, max: 2021025654 !! - model.layers.0.self_attn.v_proj.scales read: device: cpu, shape: [1, 6656], dtype: float32 !! - model.layers.0.self_attn.v_proj.scales map: device: cuda:0, shape: [1, 6656], dtype: float16, min: 0.002279, max: 0.012497, std: 0.001468 !! - model.norm.weight read: device: cpu, shape: [6656], dtype: bfloat16 !! - model.norm.weight map: device: cuda:1, shape: [6656], dtype: float16, min: 0.140625, max: 1.851562, std: 0.055695 !! Computing RoPE table for seq length: 2048 !! - stored for device: cuda:1 !! - stored for device: cuda:0 ** Time, Load model: 3.41 seconds -- Groupsize (inferred): None -- Act-order (inferred): no ** VRAM, Model: [cuda:0] 4,142.07 MB - [cuda:1] 11,797.71 MB !! Inference, debug pass !! Begin forward pass !! Built initial hidden state: device: cpu, shape: [1, 1920, 6656], dtype: float16, min: -0.138672, max: 0.148438, std: 0.014305 !! Prepared buffer for device: cuda:1 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! Prepared buffer for device: cuda:0 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! Moving hidden states from cpu to cuda:0 ------- tensor([[[-0.0104980469, 0.0034790039, -0.0078735352, ..., -0.0140380859, -0.0246582031, -0.0111694336], [-0.0006103516, -0.0145263672, 0.0233154297, ..., -0.0128173828, 0.0093383789, -0.0061340332], [-0.0069885254, -0.0322265625, 0.0021667480, ..., 0.0125732422, -0.0085449219, 0.0125122070], ..., [-0.0069885254, -0.0048217773, 0.0253906250, ..., 0.0015029907, 0.0145263672, -0.0108032227], [ 0.0019073486, -0.0035400391, 0.0410156250, ..., 0.0023803711, 0.0057678223, 0.0026397705], [ 0.0058593750, 0.0101928711, 0.0178222656, ..., 0.0197753906, -0.0424804688, 0.0251464844]]], dtype=torch.float16) ------- tensor([[[-0.0104980469, 0.0034790039, -0.0078735352, ..., -0.0140380859, -0.0246582031, -0.0111694336], [-0.0006103516, -0.0145263672, 0.0233154297, ..., -0.0128173828, 0.0093383789, -0.0061340332], [-0.0069885254, -0.0322265625, 0.0021667480, ..., 0.0125732422, -0.0085449219, 0.0125122070], ..., [-0.0069885254, -0.0048217773, 0.0253906250, ..., 0.0015029907, 0.0145263672, -0.0108032227], [ 0.0019073486, -0.0035400391, 0.0410156250, ..., 0.0023803711, 0.0057678223, 0.0026397705], [ 0.0058593750, 0.0101928711, 0.0178222656, ..., 0.0197753906, -0.0424804688, 0.0251464844]]], device='cuda:0', dtype=torch.float16) ------- !! Begin decoder 0 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -0.138672, max: 0.148438, std: 0.014305 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.000099, max: 0.431641, std: 0.028687 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q] scales min/max/std: 0.003687/0.076843/0.005463 !! - self_attn.k_proj: cuda:0 [Q] scales min/max/std: 0.003426/0.092468/0.007042 !! - self_attn.v_proj: cuda:0 [Q] scales min/max/std: 0.002279/0.012497/0.001468 !! - self_attn.o_proj: cuda:0 [Q] scales min/max/std: 0.004063/0.066406/0.001719 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -1.016602, max: 0.744141, std: 0.021774 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: -0.147461, max: 0.371094, std: 0.015366 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q] scales min/max/std: 0.003914/0.053009/0.001388 !! - mlp.up_proj: cuda:0 [Q] scales min/max/std: 0.003979/0.028458/0.000864 !! - mlp.down_proj: cuda:0 [Q] scales min/max/std: 0.006851/0.076843/0.001938 !! - method: normal !! Begin decoder 1 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -22.890625, max: 23.984375, std: 0.128662 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.015381, max: 0.324219, std: 0.018555 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q] scales min/max/std: 0.002424/0.050140/0.005402 !! - self_attn.k_proj: cuda:0 [Q] scales min/max/std: 0.002337/0.058197/0.006481 !! - self_attn.v_proj: cuda:0 [Q] scales min/max/std: 0.002319/0.011330/0.001104 !! - self_attn.o_proj: cuda:0 [Q] scales min/max/std: 0.004444/0.069519/0.001976 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -19.781250, max: 21.156250, std: 0.085571 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: -0.000671, max: 0.155273, std: 0.010117 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q] scales min/max/std: 0.006138/0.075256/0.001514 !! - mlp.up_proj: cuda:0 [Q] scales min/max/std: 0.005680/0.052460/0.001062 !! - mlp.down_proj: cuda:0 [Q] scales min/max/std: 0.007244/0.147339/0.002571 !! - method: normal !! Begin decoder 2 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -25.562500, max: 27.906250, std: 0.096436 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.000816, max: 0.427734, std: 0.013031 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q] scales min/max/std: 0.004768/0.062744/0.005123 !! - self_attn.k_proj: cuda:0 [Q] scales min/max/std: 0.003914/0.052094/0.004726 !! - self_attn.v_proj: cuda:0 [Q] scales min/max/std: 0.003191/0.016769/0.000727 !! - self_attn.o_proj: cuda:0 [Q] scales min/max/std: 0.004917/0.069275/0.001596 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -24.421875, max: 26.859375, std: 0.095215 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.007019, max: 0.184570, std: 0.009277 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q] scales min/max/std: 0.006039/0.066162/0.001669 !! - mlp.up_proj: cuda:0 [Q] scales min/max/std: 0.005939/0.041138/0.000927 !! - mlp.down_proj: cuda:0 [Q] scales min/max/std: 0.007339/0.080444/0.001779 !! - method: normal !! Begin decoder 3 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -25.343750, max: 28.343750, std: 0.143066 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.054688, max: 0.427734, std: 0.013016 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q] scales min/max/std: 0.004475/0.063416/0.004517 !! - self_attn.k_proj: cuda:0 [Q] scales min/max/std: 0.003727/0.061981/0.005016 !! - self_attn.v_proj: cuda:0 [Q] scales min/max/std: 0.003084/0.020569/0.000896 !! - self_attn.o_proj: cuda:0 [Q] scales min/max/std: 0.005047/0.087769/0.002256 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -21.562500, max: 22.906250, std: 0.187866 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.008911, max: 0.202148, std: 0.009621 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q] scales min/max/std: 0.005939/0.066040/0.001658 !! - mlp.up_proj: cuda:0 [Q] scales min/max/std: 0.005714/0.056122/0.001195 !! - mlp.down_proj: cuda:0 [Q] scales min/max/std: 0.007324/0.267822/0.003691 !! - method: normal !! Begin decoder 4 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -90.062500, max: 2140.000000, std: 0.827148 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.017456, max: 0.480469, std: 0.015640 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q] scales min/max/std: 0.004036/0.045837/0.003664 !! - self_attn.k_proj: cuda:0 [Q] scales min/max/std: 0.003801/0.035950/0.003613 !! - self_attn.v_proj: cuda:0 [Q] scales min/max/std: 0.003468/0.015656/0.000794 !! - self_attn.o_proj: cuda:0 [Q] scales min/max/std: 0.005535/0.054565/0.001162 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -88.437500, max: 2138.000000, std: 0.847168 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.009338, max: 0.158203, std: 0.009377 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q] scales min/max/std: 0.006054/0.055481/0.001560 !! - mlp.up_proj: cuda:0 [Q] scales min/max/std: 0.005939/0.055725/0.001166 !! - mlp.down_proj: cuda:0 [Q] scales min/max/std: 0.007454/0.062866/0.001432 !! - method: normal !! Begin decoder 5 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -88.000000, max: 2140.000000, std: 0.847168 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.020508, max: 0.478516, std: 0.016983 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q] scales min/max/std: 0.004158/0.042328/0.003759 !! - self_attn.k_proj: cuda:0 [Q] scales min/max/std: 0.004135/0.033783/0.003658 !! - self_attn.v_proj: cuda:0 [Q] scales min/max/std: 0.004028/0.013931/0.000834 !! - self_attn.o_proj: cuda:0 [Q] scales min/max/std: 0.005470/0.041016/0.001056 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -86.312500, max: 2140.000000, std: 0.870117 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.025024, max: 0.162109, std: 0.008591 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q] scales min/max/std: 0.005760/0.046295/0.001429 !! - mlp.up_proj: cuda:0 [Q] scales min/max/std: 0.005550/0.052094/0.001036 !! - mlp.down_proj: cuda:0 [Q] scales min/max/std: 0.007374/0.055206/0.001354 !! - method: normal !! Begin decoder 6 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -86.062500, max: 2142.000000, std: 0.865723 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.018799, max: 0.472656, std: 0.015656 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q] scales min/max/std: 0.004192/0.052856/0.004028 !! - self_attn.k_proj: cuda:0 [Q] scales min/max/std: 0.004208/0.035034/0.003809 !! - self_attn.v_proj: cuda:0 [Q] scales min/max/std: 0.004158/0.014488/0.000650 !! - self_attn.o_proj: cuda:0 [Q] scales min/max/std: 0.005615/0.058594/0.001074 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -85.437500, max: 2142.000000, std: 0.898438 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.029297, max: 0.173828, std: 0.008652 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q] scales min/max/std: 0.006023/0.051697/0.001435 !! - mlp.up_proj: cuda:0 [Q] scales min/max/std: 0.006023/0.049622/0.001033 !! - mlp.down_proj: cuda:0 [Q] scales min/max/std: 0.007359/0.056915/0.001345 !! - method: normal !! Begin decoder 7 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -85.437500, max: 2142.000000, std: 0.890625 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.023682, max: 0.492188, std: 0.014145 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q] scales min/max/std: 0.004395/0.043488/0.003618 !! - self_attn.k_proj: cuda:0 [Q] scales min/max/std: 0.004539/0.032104/0.003489 !! - self_attn.v_proj: cuda:0 [Q] scales min/max/std: 0.004982/0.012306/0.000669 !! - self_attn.o_proj: cuda:0 [Q] scales min/max/std: 0.005760/0.042175/0.000939 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -83.250000, max: 2140.000000, std: 0.923828 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.026489, max: 0.183594, std: 0.009079 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q] scales min/max/std: 0.006382/0.038269/0.001271 !! - mlp.up_proj: cuda:0 [Q] scales min/max/std: 0.006382/0.050140/0.001077 !! - mlp.down_proj: cuda:0 [Q] scales min/max/std: 0.007210/0.052338/0.001207 !! - method: normal !! Begin decoder 8 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -83.250000, max: 2140.000000, std: 0.911621 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.026978, max: 0.531250, std: 0.014130 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q] scales min/max/std: 0.003889/0.046234/0.003653 !! - self_attn.k_proj: cuda:0 [Q] scales min/max/std: 0.003979/0.031189/0.003502 !! - self_attn.v_proj: cuda:0 [Q] scales min/max/std: 0.004803/0.012077/0.000616 !! - self_attn.o_proj: cuda:0 [Q] scales min/max/std: 0.005909/0.044922/0.000819 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -85.187500, max: 2140.000000, std: 0.943359 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.037109, max: 0.194336, std: 0.008949 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q] scales min/max/std: 0.006462/0.044006/0.001260 !! - mlp.up_proj: cuda:0 [Q] scales min/max/std: 0.006542/0.049744/0.001063 !! - mlp.down_proj: cuda:0 [Q] scales min/max/std: 0.007534/0.072937/0.001604 !! - method: normal !! Begin decoder 9 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -85.125000, max: 2142.000000, std: 0.928711 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.031250, max: 0.527344, std: 0.014053 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q] scales min/max/std: 0.004444/0.045441/0.003422 !! - self_attn.k_proj: cuda:0 [Q] scales min/max/std: 0.004379/0.032166/0.003338 !! - self_attn.v_proj: cuda:0 [Q] scales min/max/std: 0.004589/0.014809/0.000655 !! - self_attn.o_proj: cuda:0 [Q] scales min/max/std: 0.005989/0.047913/0.001200 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -88.875000, max: 2142.000000, std: 0.959473 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.051270, max: 0.249023, std: 0.008759 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q] scales min/max/std: 0.006332/0.044128/0.001369 !! - mlp.up_proj: cuda:0 [Q] scales min/max/std: 0.006542/0.048187/0.001155 !! - mlp.down_proj: cuda:0 [Q] scales min/max/std: 0.007454/0.119019/0.002102 !! - method: normal !! Begin decoder 10 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -86.625000, max: 2170.000000, std: 0.958008 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.039062, max: 0.550781, std: 0.014366 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q] scales min/max/std: 0.004013/0.052612/0.003538 !! - self_attn.k_proj: cuda:0 [Q] scales min/max/std: 0.004108/0.037903/0.003376 !! - self_attn.v_proj: cuda:0 [Q] scales min/max/std: 0.005028/0.012306/0.000686 !! - self_attn.o_proj: cuda:0 [Q] scales min/max/std: 0.006054/0.056641/0.001063 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -89.750000, max: 2168.000000, std: 0.992188 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.052246, max: 0.439453, std: 0.009697 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q] scales min/max/std: 0.005795/0.053131/0.001467 !! - mlp.up_proj: cuda:0 [Q] scales min/max/std: 0.006512/0.050903/0.001198 !! - mlp.down_proj: cuda:0 [Q] scales min/max/std: 0.007534/0.173462/0.002787 !! - method: normal !! Begin decoder 11 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -90.687500, max: 2508.000000, std: 1.178711 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.046143, max: 0.574219, std: 0.015808 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q] scales min/max/std: 0.004021/0.051056/0.004143 !! - self_attn.k_proj: cuda:0 [Q] scales min/max/std: 0.003994/0.034119/0.003323 !! - self_attn.v_proj: cuda:0 [Q] scales min/max/std: 0.005028/0.017609/0.000787 !! - self_attn.o_proj: cuda:0 [Q] scales min/max/std: 0.006104/0.071594/0.001420 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -88.875000, max: 2510.000000, std: 1.208984 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.041016, max: 0.218750, std: 0.009247 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q] scales min/max/std: 0.006248/0.064209/0.001389 !! - mlp.up_proj: cuda:0 [Q] scales min/max/std: 0.006641/0.054565/0.001169 !! - mlp.down_proj: cuda:0 [Q] scales min/max/std: 0.007259/0.103394/0.001874 !! - method: normal !! Begin decoder 12 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -88.500000, max: 2512.000000, std: 1.197266 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.052246, max: 0.601562, std: 0.015060 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q] scales min/max/std: 0.004402/0.053131/0.003906 !! - self_attn.k_proj: cuda:0 [Q] scales min/max/std: 0.003906/0.042969/0.003477 !! - self_attn.v_proj: cuda:0 [Q] scales min/max/std: 0.005226/0.018158/0.000777 !! - self_attn.o_proj: cuda:0 [Q] scales min/max/std: 0.006069/0.082825/0.001306 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -88.187500, max: 2514.000000, std: 1.227539 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.059570, max: 0.228516, std: 0.008789 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q] scales min/max/std: 0.005695/0.050140/0.001423 !! - mlp.up_proj: cuda:0 [Q] scales min/max/std: 0.006218/0.060944/0.001321 !! - mlp.down_proj: cuda:0 [Q] scales min/max/std: 0.007359/0.105713/0.001951 !! - method: normal !! Begin decoder 13 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -87.500000, max: 2516.000000, std: 1.217773 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.064941, max: 0.613281, std: 0.016144 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q] scales min/max/std: 0.004116/0.049225/0.003771 !! - self_attn.k_proj: cuda:0 [Q] scales min/max/std: 0.004044/0.036194/0.003403 !! - self_attn.v_proj: cuda:0 [Q] scales min/max/std: 0.003922/0.019241/0.000883 !! - self_attn.o_proj: cuda:0 [Q] scales min/max/std: 0.006184/0.059113/0.001060 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -89.062500, max: 2516.000000, std: 1.253906 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.050049, max: 0.229492, std: 0.008698 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q] scales min/max/std: 0.006023/0.040558/0.001410 !! - mlp.up_proj: cuda:0 [Q] scales min/max/std: 0.006332/0.045776/0.001092 !! - mlp.down_proj: cuda:0 [Q] scales min/max/std: 0.007633/0.083618/0.001544 !! - method: normal !! Begin decoder 14 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -88.937500, max: 2516.000000, std: 1.247070 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.067871, max: 0.640625, std: 0.016388 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q] scales min/max/std: 0.004021/0.046356/0.003773 !! - self_attn.k_proj: cuda:0 [Q] scales min/max/std: 0.003752/0.035431/0.003366 !! - self_attn.v_proj: cuda:0 [Q] scales min/max/std: 0.005142/0.018295/0.000790 !! - self_attn.o_proj: cuda:0 [Q] scales min/max/std: 0.006248/0.069519/0.001388 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -89.250000, max: 2520.000000, std: 1.276367 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.049805, max: 0.229492, std: 0.008499 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q] scales min/max/std: 0.005959/0.048950/0.001568 !! - mlp.up_proj: cuda:0 [Q] scales min/max/std: 0.006447/0.066467/0.001188 !! - mlp.down_proj: cuda:0 [Q] scales min/max/std: 0.007404/0.095337/0.001607 !! - method: normal !! Begin decoder 15 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -89.187500, max: 2520.000000, std: 1.260742 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.069336, max: 0.664062, std: 0.015411 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q] scales min/max/std: 0.004143/0.051575/0.003738 !! - self_attn.k_proj: cuda:0 [Q] scales min/max/std: 0.003906/0.040497/0.003508 !! - self_attn.v_proj: cuda:0 [Q] scales min/max/std: 0.005436/0.019791/0.000878 !! - self_attn.o_proj: cuda:0 [Q] scales min/max/std: 0.005760/0.103882/0.001384 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -88.750000, max: 2522.000000, std: 1.292969 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.065430, max: 0.235352, std: 0.008469 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q] scales min/max/std: 0.005028/0.074768/0.001600 !! - mlp.up_proj: cuda:0 [Q] scales min/max/std: 0.006348/0.064697/0.001232 !! - mlp.down_proj: cuda:0 [Q] scales min/max/std: 0.007439/0.103882/0.002073 !! - method: normal !! Moving hidden states from cuda:0 to cuda:1 ------- tensor([[[ 0.0593261719, -0.7446289062, 0.7128906250, ..., 0.6777343750, 0.3395996094, 0.5122070312], [ 0.0980224609, -0.7451171875, 0.6752929688, ..., 0.6308593750, 0.3728027344, 0.4899902344], [ 0.1042480469, -0.7041015625, 0.6972656250, ..., 0.6777343750, 0.3779296875, 0.5922851562], ..., [-0.5024414062, 0.1296386719, -0.2453613281, ..., 0.4074707031, 0.2282714844, -0.1484375000], [-0.1105957031, -0.2471923828, -0.4714355469, ..., -0.6914062500, 0.2412109375, -0.0307006836], [ 0.3864746094, -0.3850097656, -0.7495117188, ..., -0.5483398438, -0.1068115234, 0.2222900391]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], ..., [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.]]], device='cuda:1', dtype=torch.float16) ------- !! Begin decoder 16 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.059570, max: 0.671875, std: 0.015488 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.004230/0.053131/0.003641 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.003994/0.037048/0.003208 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005451/0.025070/0.000992 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006023/0.060425/0.001118 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.035156, max: 0.241211, std: 0.009331 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.005974/0.067078/0.001477 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006657/0.039703/0.001064 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007553/0.096619/0.001308 !! - method: normal !! Begin decoder 17 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.060791, max: 0.671875, std: 0.015564 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003914/0.048035/0.003668 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.003134/0.035675/0.003126 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.004890/0.022491/0.000873 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006447/0.095825/0.001544 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.035400, max: 0.244141, std: 0.009109 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006039/0.052856/0.001426 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006721/0.053528/0.001071 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007359/0.071350/0.001134 !! - method: normal !! Begin decoder 18 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.068848, max: 0.679688, std: 0.015129 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003613/0.045837/0.003527 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.003523/0.039978/0.003304 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.004982/0.015686/0.000898 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.005760/0.064819/0.000910 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.044922, max: 0.245117, std: 0.008545 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.005142/0.056000/0.001453 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006641/0.053253/0.001026 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007584/0.077881/0.001364 !! - method: normal !! Begin decoder 19 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.065430, max: 0.687500, std: 0.014000 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003801/0.046082/0.003609 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.003393/0.037170/0.003099 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005112/0.022858/0.000912 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.005894/0.059906/0.001084 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.039062, max: 0.249023, std: 0.008957 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.005161/0.037506/0.001387 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006382/0.036560/0.001002 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007668/0.082825/0.001348 !! - method: normal !! Begin decoder 20 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.056641, max: 0.703125, std: 0.013672 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003784/0.052338/0.003933 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.003353/0.035614/0.003248 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005272/0.020309/0.000875 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006577/0.106506/0.002047 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.025269, max: 0.257812, std: 0.009445 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006088/0.049469/0.001395 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006721/0.061584/0.001181 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007633/0.085693/0.001389 !! - method: normal !! Begin decoder 21 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.059814, max: 0.734375, std: 0.013329 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003801/0.053009/0.003752 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.003256/0.033722/0.003275 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005386/0.020508/0.000772 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006233/0.066162/0.001053 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.035400, max: 0.257812, std: 0.009483 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006313/0.056824/0.001446 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006641/0.038788/0.000963 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007713/0.060425/0.001002 !! - method: normal !! Begin decoder 22 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.059326, max: 0.703125, std: 0.012497 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.002922/0.052460/0.004002 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002678/0.033142/0.003107 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.006039/0.018646/0.000729 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006592/0.086182/0.001471 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.020386, max: 0.267578, std: 0.010315 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006397/0.047668/0.001308 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006542/0.045624/0.001020 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007713/0.107056/0.001574 !! - method: normal !! Begin decoder 23 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.060303, max: 0.765625, std: 0.014275 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003565/0.055847/0.004059 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002800/0.033905/0.003366 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005077/0.016434/0.000742 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006382/0.047272/0.000874 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.022949, max: 0.275391, std: 0.010490 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006203/0.045044/0.001418 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006577/0.035034/0.000887 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007553/0.071472/0.001110 !! - method: normal !! Begin decoder 24 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.060547, max: 0.703125, std: 0.013268 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003345/0.049866/0.004017 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002897/0.032166/0.003105 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005859/0.016769/0.000706 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006771/0.057556/0.001293 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.021973, max: 0.279297, std: 0.010460 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006382/0.041412/0.001295 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006771/0.058868/0.001081 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007668/0.084534/0.001464 !! - method: normal !! Begin decoder 25 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.062988, max: 0.679688, std: 0.013092 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003069/0.050903/0.003702 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002857/0.039062/0.003319 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005581/0.018814/0.000751 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006657/0.099243/0.001510 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.031982, max: 0.285156, std: 0.010880 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.005581/0.047394/0.001300 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006706/0.043243/0.001004 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007618/0.081482/0.001511 !! - method: normal !! Begin decoder 26 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.053467, max: 0.718750, std: 0.013550 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003443/0.057953/0.004169 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.003181/0.035095/0.003294 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005405/0.019562/0.000778 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006836/0.027542/0.000661 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.012085, max: 0.296875, std: 0.012276 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006771/0.052094/0.001257 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006607/0.030212/0.000802 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007618/0.063660/0.001120 !! - method: normal !! Begin decoder 27 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.041504, max: 0.664062, std: 0.013268 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003246/0.061188/0.004597 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002808/0.042969/0.003384 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.004257/0.020767/0.000847 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006737/0.140137/0.002386 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.008850, max: 0.306641, std: 0.012772 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006397/0.040039/0.001224 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006512/0.039978/0.000906 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007519/0.074219/0.001472 !! - method: normal !! Begin decoder 28 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.036865, max: 0.734375, std: 0.013893 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003670/0.061188/0.004440 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.003345/0.039459/0.003338 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005760/0.014290/0.000641 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006886/0.062225/0.000916 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.003601, max: 0.314453, std: 0.013245 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006756/0.039124/0.001158 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006626/0.041290/0.000786 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007584/0.099976/0.001548 !! - method: normal !! Begin decoder 29 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.030640, max: 0.750000, std: 0.014641 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003246/0.070435/0.004562 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002693/0.036469/0.003307 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.006248/0.022598/0.000674 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006870/0.058594/0.001038 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: -0.000912, max: 0.320312, std: 0.013840 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006821/0.035034/0.001091 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006786/0.052734/0.000846 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007504/0.074219/0.001188 !! - method: normal !! Begin decoder 30 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.032715, max: 0.750000, std: 0.015472 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003101/0.059509/0.004513 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.003157/0.040894/0.003395 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005550/0.015137/0.000741 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006802/0.122681/0.001928 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: -0.002228, max: 0.328125, std: 0.014160 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006966/0.035156/0.001039 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006626/0.033020/0.000750 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007423/0.075500/0.001152 !! - method: normal !! Begin decoder 31 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.023193, max: 0.769531, std: 0.015266 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003410/0.072510/0.004620 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002954/0.043610/0.003323 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005630/0.015366/0.000771 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006950/0.072021/0.001338 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: -0.003433, max: 0.335938, std: 0.014183 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007114/0.036194/0.001136 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006657/0.045319/0.000835 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007423/0.084534/0.001247 !! - method: normal !! Begin decoder 32 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.031006, max: 0.726562, std: 0.015465 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003475/0.058594/0.004498 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.003166/0.039581/0.003384 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.004539/0.016357/0.000813 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006935/0.147949/0.002407 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.001793, max: 0.345703, std: 0.014183 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006901/0.047150/0.001182 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006657/0.044800/0.000997 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007603/0.066406/0.001269 !! - method: normal !! Begin decoder 33 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.031982, max: 0.828125, std: 0.016769 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003117/0.058472/0.004299 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002913/0.041931/0.003408 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005207/0.013672/0.000735 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006916/0.049622/0.000923 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.005981, max: 0.375000, std: 0.014603 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006981/0.073547/0.001303 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006737/0.063049/0.000965 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007389/0.092957/0.001473 !! - method: normal !! Begin decoder 34 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.031494, max: 0.785156, std: 0.016815 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003500/0.060822/0.004780 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.003117/0.043488/0.003405 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005714/0.016998/0.000655 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006851/0.058990/0.001354 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: -0.000751, max: 0.343750, std: 0.015251 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006821/0.052094/0.001061 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006737/0.075500/0.001045 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007488/0.100403/0.001640 !! - method: normal !! Begin decoder 35 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.032715, max: 0.804688, std: 0.017563 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.002930/0.060150/0.004387 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002848/0.042847/0.003531 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.004963/0.013313/0.000789 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.007030/0.063660/0.001132 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.004547, max: 0.353516, std: 0.015144 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006901/0.048370/0.001162 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006821/0.033997/0.000875 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007504/0.095032/0.001330 !! - method: normal !! Begin decoder 36 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.029785, max: 0.839844, std: 0.017242 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003223/0.058716/0.004532 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002962/0.041290/0.003490 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.004574/0.019562/0.000918 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.007195/0.117737/0.001760 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.010925, max: 0.359375, std: 0.015305 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007030/0.061859/0.001213 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006771/0.048553/0.000885 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007374/0.075012/0.001202 !! - method: normal !! Begin decoder 37 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.025879, max: 0.828125, std: 0.017578 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003199/0.067688/0.004490 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.003044/0.045441/0.003639 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.006004/0.021515/0.000705 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006870/0.085144/0.001255 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.009827, max: 0.361328, std: 0.015839 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007210/0.050079/0.001101 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006821/0.032745/0.000821 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007633/0.119019/0.001538 !! - method: normal !! Begin decoder 38 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.027100, max: 0.843750, std: 0.018066 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003328/0.055847/0.004547 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.003012/0.039703/0.003664 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005695/0.021423/0.000836 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.007195/0.046753/0.001000 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.005066, max: 0.382812, std: 0.016144 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007160/0.043304/0.001045 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006786/0.029694/0.000715 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007095/0.076050/0.001014 !! - method: normal !! Begin decoder 39 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.029907, max: 0.808594, std: 0.017258 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.002848/0.063660/0.004723 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002539/0.042847/0.003702 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005501/0.018784/0.000795 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.007030/0.089844/0.001469 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.022949, max: 0.371094, std: 0.016068 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006771/0.061584/0.001096 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006851/0.065613/0.000909 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007584/0.081482/0.001101 !! - method: normal !! Begin decoder 40 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.031982, max: 0.898438, std: 0.019562 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003418/0.057556/0.004429 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002800/0.037903/0.003567 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005974/0.018036/0.000762 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.007046/0.041718/0.000777 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.008423, max: 0.394531, std: 0.016434 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007114/0.048431/0.001036 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006851/0.026566/0.000744 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007633/0.073181/0.000994 !! - method: normal !! Begin decoder 41 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.031250, max: 0.867188, std: 0.019302 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.002848/0.075500/0.005047 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002645/0.038727/0.003561 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005421/0.020111/0.000769 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.007030/0.158813/0.002529 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.015076, max: 0.402344, std: 0.016510 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007195/0.052460/0.001122 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006886/0.036011/0.000782 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007488/0.067078/0.000942 !! - method: normal !! Begin decoder 42 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.028442, max: 0.855469, std: 0.018463 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003141/0.062225/0.004654 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002987/0.042847/0.003576 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005077/0.019669/0.000777 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006916/0.068726/0.001397 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.013184, max: 0.404297, std: 0.016388 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007179/0.059631/0.001117 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006836/0.029617/0.000745 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007080/0.058197/0.000825 !! - method: normal !! Begin decoder 43 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.045166, max: 0.882812, std: 0.020325 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003279/0.056366/0.004562 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002865/0.035950/0.003454 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005421/0.019440/0.000834 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.007195/0.037384/0.000866 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.034912, max: 0.408203, std: 0.015991 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007275/0.040222/0.001178 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006901/0.043488/0.000858 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007439/0.083313/0.001134 !! - method: normal !! Begin decoder 44 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.038818, max: 0.804688, std: 0.019043 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.002954/0.067444/0.004791 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002790/0.041138/0.003475 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.006023/0.017288/0.000857 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.007145/0.081787/0.001302 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.037354, max: 0.406250, std: 0.016190 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006771/0.059753/0.001092 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006737/0.027084/0.000705 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.005371/0.114075/0.001456 !! - method: normal !! Begin decoder 45 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.033691, max: 0.839844, std: 0.020630 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003304/0.061340/0.004360 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002890/0.035950/0.003561 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.004173/0.022720/0.000947 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006981/0.033539/0.000868 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.023560, max: 0.417969, std: 0.016037 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007244/0.053528/0.001163 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006771/0.030991/0.000765 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007423/0.060822/0.001042 !! - method: normal !! Begin decoder 46 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.034912, max: 0.812500, std: 0.018784 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.002995/0.049866/0.004936 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002539/0.037384/0.003443 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005161/0.013802/0.000974 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006935/0.116150/0.001753 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.038574, max: 0.423828, std: 0.015762 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006966/0.081482/0.001361 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006771/0.050446/0.000865 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007603/0.110168/0.001574 !! - method: normal !! Begin decoder 47 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.036133, max: 0.839844, std: 0.019562 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.002832/0.056763/0.004982 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002758/0.040100/0.003574 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.004833/0.013931/0.000822 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.007210/0.078125/0.001239 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.036621, max: 0.441406, std: 0.015945 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007046/0.041809/0.001195 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006657/0.032410/0.000795 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007046/0.097656/0.001348 !! - method: normal !! Begin decoder 48 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.036133, max: 0.800781, std: 0.019073 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003084/0.055328/0.004616 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002783/0.037445/0.003414 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.004135/0.015594/0.000752 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.007145/0.035614/0.000791 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.036133, max: 0.437500, std: 0.016052 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007179/0.036835/0.001137 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006836/0.042114/0.000801 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.006218/0.084351/0.001168 !! - method: normal !! Begin decoder 49 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.043213, max: 0.902344, std: 0.019379 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003296/0.063049/0.004436 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002987/0.038025/0.003288 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005974/0.023300/0.001044 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.007553/0.071106/0.001290 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.036377, max: 0.421875, std: 0.016068 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006802/0.048309/0.001147 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006935/0.041656/0.000799 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007195/0.068481/0.001121 !! - method: normal !! Begin decoder 50 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.037109, max: 0.847656, std: 0.019165 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003109/0.070923/0.004910 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002613/0.042694/0.003321 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005192/0.015526/0.000921 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.007454/0.074768/0.001302 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.037842, max: 0.433594, std: 0.016251 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007065/0.037750/0.001138 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006721/0.028976/0.000724 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.006168/0.105713/0.001479 !! - method: normal !! Begin decoder 51 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.038330, max: 0.867188, std: 0.019226 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003418/0.064209/0.004681 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002613/0.034637/0.003159 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.003540/0.012924/0.001039 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.007259/0.076538/0.001249 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.044189, max: 0.449219, std: 0.016373 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007080/0.050140/0.001146 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006935/0.028000/0.000729 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007469/0.057434/0.001116 !! - method: normal !! Begin decoder 52 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.061035, max: 0.679688, std: 0.020767 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.002872/0.047668/0.004078 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002050/0.034241/0.003626 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.003979/0.017059/0.000966 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006981/0.065491/0.001202 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.044189, max: 0.464844, std: 0.016251 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007130/0.061981/0.001325 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006363/0.082153/0.001008 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007618/0.079712/0.001740 !! - method: normal !! Begin decoder 53 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.060303, max: 0.863281, std: 0.020859 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.002743/0.058868/0.003712 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002295/0.036194/0.003431 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.004250/0.014290/0.000860 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.007015/0.103149/0.001647 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.045166, max: 0.443359, std: 0.016922 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007015/0.035278/0.001202 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006737/0.030212/0.000880 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007454/0.096619/0.001512 !! - method: normal !! Begin decoder 54 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.053711, max: 0.890625, std: 0.020660 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.002783/0.047668/0.004032 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002377/0.036774/0.003460 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005142/0.014290/0.000920 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.007534/0.093506/0.001501 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.054199, max: 0.464844, std: 0.017532 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007244/0.093079/0.001365 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006886/0.059692/0.000940 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007683/0.149780/0.002497 !! - method: normal !! Begin decoder 55 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.058105, max: 0.937500, std: 0.021545 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.002678/0.043610/0.003986 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002100/0.040741/0.003601 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005127/0.014778/0.001210 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.007469/0.052094/0.001225 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.054199, max: 0.480469, std: 0.017639 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007080/0.041656/0.001164 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006721/0.044983/0.000927 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007618/0.155151/0.002712 !! - method: normal !! Begin decoder 56 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.063477, max: 0.882812, std: 0.020828 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.002434/0.054810/0.004154 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002108/0.037903/0.003489 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005451/0.014809/0.000838 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.007534/0.059235/0.001205 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.045166, max: 0.562500, std: 0.018280 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007130/0.046600/0.001348 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006557/0.044922/0.001155 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007763/0.253174/0.003845 !! - method: normal !! Begin decoder 57 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.000561, max: 0.945312, std: 0.023651 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.002205/0.064087/0.005341 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.001758/0.045197/0.003632 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.004539/0.013252/0.001018 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.007160/0.059235/0.001537 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.057129, max: 0.578125, std: 0.018356 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007046/0.050385/0.001329 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006592/0.057037/0.001279 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007713/0.225464/0.004005 !! - method: normal !! Begin decoder 58 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.018066, max: 0.968750, std: 0.025986 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.001770/0.091675/0.005989 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002068/0.038147/0.003706 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005371/0.024643/0.000950 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006836/0.104675/0.001700 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.116211, max: 0.498047, std: 0.016815 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006691/0.038422/0.001431 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006039/0.072632/0.001469 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007713/0.087219/0.003716 !! - method: normal !! Begin decoder 59 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.040283, max: 1.000000, std: 0.027527 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.001673/0.081787/0.006100 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.001400/0.034760/0.003492 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.003922/0.018036/0.001408 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006851/0.092957/0.002872 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.148438, max: 0.500000, std: 0.024963 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006203/0.051300/0.001739 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.005501/0.087769/0.002033 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.008026/0.162476/0.005074 !! - method: normal !! pre norm, hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! pre lm_head, hidden_states: device: cuda:1, shape: [1, 1, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! logits: device: cuda:1, shape: [1, 1, 32000], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 ** Time, Inference: 1.74 seconds -- Loading dataset... -- Testing------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-2.7618408203e-03, -3.3691406250e-02, -1.8066406250e-02, ..., 4.0588378906e-03, 8.1176757812e-03, 1.8432617188e-02], [ 2.3956298828e-03, -1.1596679688e-02, 7.9345703125e-04, ..., -1.5319824219e-02, 3.5400390625e-03, 3.0517578125e-03], ..., [ 1.7089843750e-02, -5.3100585938e-03, 2.8076171875e-02, ..., -7.9345703125e-03, 2.4414062500e-02, -1.3610839844e-02], [ 7.5378417969e-03, -3.9367675781e-03, -6.3781738281e-03, ..., -7.5073242188e-03, -1.3977050781e-02, -4.0893554688e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-2.7618408203e-03, -3.3691406250e-02, -1.8066406250e-02, ..., 4.0588378906e-03, 8.1176757812e-03, 1.8432617188e-02], [ 2.3956298828e-03, -1.1596679688e-02, 7.9345703125e-04, ..., -1.5319824219e-02, 3.5400390625e-03, 3.0517578125e-03], ..., [ 1.7089843750e-02, -5.3100585938e-03, 2.8076171875e-02, ..., -7.9345703125e-03, 2.4414062500e-02, -1.3610839844e-02], [ 7.5378417969e-03, -3.9367675781e-03, -6.3781738281e-03, ..., -7.5073242188e-03, -1.3977050781e-02, -4.0893554688e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], ..., [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-2.7618408203e-03, -3.3691406250e-02, -1.8066406250e-02, ..., 4.0588378906e-03, 8.1176757812e-03, 1.8432617188e-02], [ 2.3956298828e-03, -1.1596679688e-02, 7.9345703125e-04, ..., -1.5319824219e-02, 3.5400390625e-03, 3.0517578125e-03], ..., [-5.4931640625e-04, 1.3427734375e-02, 4.5776367188e-03, ..., -4.3945312500e-03, 1.3122558594e-02, 1.3549804688e-02], [ 3.1738281250e-03, 8.7280273438e-03, 2.0385742188e-02, ..., -1.6113281250e-02, -4.0039062500e-02, -1.2145996094e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-2.7618408203e-03, -3.3691406250e-02, -1.8066406250e-02, ..., 4.0588378906e-03, 8.1176757812e-03, 1.8432617188e-02], [ 2.3956298828e-03, -1.1596679688e-02, 7.9345703125e-04, ..., -1.5319824219e-02, 3.5400390625e-03, 3.0517578125e-03], ..., [-5.4931640625e-04, 1.3427734375e-02, 4.5776367188e-03, ..., -4.3945312500e-03, 1.3122558594e-02, 1.3549804688e-02], [ 3.1738281250e-03, 8.7280273438e-03, 2.0385742188e-02, ..., -1.6113281250e-02, -4.0039062500e-02, -1.2145996094e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], ..., [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 2.9541015625e-02, 1.8066406250e-02, 1.1352539062e-02, ..., -3.4423828125e-02, -7.0800781250e-03, -2.9182434082e-04], ..., [ 7.3852539062e-03, -1.2145996094e-02, -1.8798828125e-02, ..., -1.3885498047e-03, -2.1972656250e-03, 1.9989013672e-03], [ 1.6357421875e-02, 4.2724609375e-03, 2.3071289062e-02, ..., -1.7623901367e-03, 2.0874023438e-02, 1.0192871094e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 2.9541015625e-02, 1.8066406250e-02, 1.1352539062e-02, ..., -3.4423828125e-02, -7.0800781250e-03, -2.9182434082e-04], ..., [ 7.3852539062e-03, -1.2145996094e-02, -1.8798828125e-02, ..., -1.3885498047e-03, -2.1972656250e-03, 1.9989013672e-03], [ 1.6357421875e-02, 4.2724609375e-03, 2.3071289062e-02, ..., -1.7623901367e-03, 2.0874023438e-02, 1.0192871094e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], ..., [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [-4.9743652344e-03, -3.2424926758e-04, 1.1108398438e-02, ..., 1.6326904297e-03, 3.7536621094e-03, 2.4414062500e-02], ..., [ 3.3935546875e-02, 1.9683837891e-03, 2.2460937500e-02, ..., 2.2705078125e-02, -7.2021484375e-03, -1.0498046875e-02], [ 3.4667968750e-02, 1.1230468750e-02, 1.6723632812e-02, ..., -4.3106079102e-04, -1.2111663818e-04, 1.4221191406e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [-4.9743652344e-03, -3.2424926758e-04, 1.1108398438e-02, ..., 1.6326904297e-03, 3.7536621094e-03, 2.4414062500e-02], ..., [ 3.3935546875e-02, 1.9683837891e-03, 2.2460937500e-02, ..., 2.2705078125e-02, -7.2021484375e-03, -1.0498046875e-02], [ 3.4667968750e-02, 1.1230468750e-02, 1.6723632812e-02, ..., -4.3106079102e-04, -1.2111663818e-04, 1.4221191406e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], ..., [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-0.0119018555, 0.0015945435, -0.0010452271, ..., -0.0003910065, 0.0015029907, -0.0015106201], [-0.0142822266, -0.0049438477, 0.0106201172, ..., -0.0164794922, 0.0082397461, -0.0002498627], [ 0.0161132812, 0.0095825195, 0.0114135742, ..., 0.0008850098, -0.0012664795, 0.0119018555], ..., [ 0.0383300781, 0.0007209778, -0.0052185059, ..., 0.0031585693, 0.0004558563, 0.0014114380], [-0.0084228516, 0.0233154297, 0.0415039062, ..., -0.0336914062, -0.0076904297, 0.0037841797], [ 0.0094604492, -0.0095825195, -0.0241699219, ..., -0.0134887695, -0.0072326660, 0.0074157715]]], dtype=torch.float16) ------- tensor([[[-0.0119018555, 0.0015945435, -0.0010452271, ..., -0.0003910065, 0.0015029907, -0.0015106201], [-0.0142822266, -0.0049438477, 0.0106201172, ..., -0.0164794922, 0.0082397461, -0.0002498627], [ 0.0161132812, 0.0095825195, 0.0114135742, ..., 0.0008850098, -0.0012664795, 0.0119018555], ..., [ 0.0383300781, 0.0007209778, -0.0052185059, ..., 0.0031585693, 0.0004558563, 0.0014114380], [-0.0084228516, 0.0233154297, 0.0415039062, ..., -0.0336914062, -0.0076904297, 0.0037841797], [ 0.0094604492, -0.0095825195, -0.0241699219, ..., -0.0134887695, -0.0072326660, 0.0074157715]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[-6.5504000000e+04, -6.5504000000e+04, -6.5504000000e+04, ..., 0.0000000000e+00, 0.0000000000e+00, 0.0000000000e+00], [ 0.0000000000e+00, 0.0000000000e+00, 0.0000000000e+00, ..., 0.0000000000e+00, 0.0000000000e+00, 0.0000000000e+00], [ 0.0000000000e+00, 0.0000000000e+00, 0.0000000000e+00, ..., 0.0000000000e+00, 0.0000000000e+00, 0.0000000000e+00], ..., [ 0.0000000000e+00, 0.0000000000e+00, 0.0000000000e+00, ..., 0.0000000000e+00, 0.0000000000e+00, 0.0000000000e+00], [ 0.0000000000e+00, 0.0000000000e+00, 0.0000000000e+00, ..., 1.8681640625e+00, -1.0320000000e+03, 1.8691406250e+00], [-3.9160156250e+00, 1.8710937500e+00, -1.5566406250e+00, ..., 1.8671875000e+00, -1.1572265625e+00, 1.8691406250e+00]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [-5.6152343750e-03, -2.3437500000e-02, -4.7851562500e-02, ..., 1.2145996094e-02, -1.2023925781e-02, -6.4697265625e-03], ..., [-1.0131835938e-02, -1.1413574219e-02, 4.9133300781e-03, ..., 2.1972656250e-02, -2.7587890625e-02, 3.5644531250e-02], [ 5.9204101562e-03, -3.8909912109e-03, -1.3671875000e-02, ..., 4.7302246094e-03, -1.9989013672e-03, -3.1250000000e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [-5.6152343750e-03, -2.3437500000e-02, -4.7851562500e-02, ..., 1.2145996094e-02, -1.2023925781e-02, -6.4697265625e-03], ..., [-1.0131835938e-02, -1.1413574219e-02, 4.9133300781e-03, ..., 2.1972656250e-02, -2.7587890625e-02, 3.5644531250e-02], [ 5.9204101562e-03, -3.8909912109e-03, -1.3671875000e-02, ..., 4.7302246094e-03, -1.9989013672e-03, -3.1250000000e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[-0.0136718750, 0.5136718750, -0.4663085938, ..., 0.7558593750, 0.0037231445, 0.1134643555], [-0.3908691406, 0.3562011719, -0.2714843750, ..., -0.1633300781, -0.6416015625, 0.3332519531], [-0.6704101562, -1.0908203125, 1.3046875000, ..., -0.9755859375, 0.6386718750, 1.8837890625], ..., [-0.9624023438, 0.4008789062, -0.2507324219, ..., -0.1226806641, -0.0985107422, -0.1401367188], [ 0.2436523438, -0.2132568359, 0.0051879883, ..., 0.3486328125, -0.1623535156, -0.1499023438], [-0.0914306641, 0.4765625000, -0.1501464844, ..., -0.4157714844, 0.2880859375, 0.3554687500]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[-65504., -65504., -65504., ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], ..., [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 3.0639648438e-02, 5.9509277344e-03, 2.9373168945e-04, ..., -2.4902343750e-02, 4.6081542969e-03, -2.0019531250e-02], ..., [-1.4282226562e-02, -8.2778930664e-04, 5.7067871094e-03, ..., 1.0803222656e-02, 1.0986328125e-02, -1.8310546875e-04], [ 2.0751953125e-02, 1.8554687500e-02, -4.9209594727e-04, ..., 8.9111328125e-03, 5.7678222656e-03, 2.2338867188e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 3.0639648438e-02, 5.9509277344e-03, 2.9373168945e-04, ..., -2.4902343750e-02, 4.6081542969e-03, -2.0019531250e-02], ..., [-1.4282226562e-02, -8.2778930664e-04, 5.7067871094e-03, ..., 1.0803222656e-02, 1.0986328125e-02, -1.8310546875e-04], [ 2.0751953125e-02, 1.8554687500e-02, -4.9209594727e-04, ..., 8.9111328125e-03, 5.7678222656e-03, 2.2338867188e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[ 0.0589904785, -0.3146972656, 0.1052856445, ..., 0.0791015625, -0.6176757812, -0.4550781250], [ 0.8803710938, 0.0490112305, -1.2441406250, ..., -0.6054687500, 1.0312500000, 0.1311035156], [-0.0364685059, -1.4716796875, 0.2253417969, ..., -0.4050292969, 1.0771484375, 0.9663085938], ..., [-0.1828613281, -0.1293945312, -0.2341308594, ..., -0.3740234375, -0.0330200195, -0.2900390625], [-0.2705078125, 0.3598632812, 0.6093750000, ..., -0.6264648438, 0.0189208984, -0.0944824219], [-0.0614013672, 0.3945312500, 0.0489501953, ..., -0.3039550781, -0.0233764648, -0.1340332031]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[-65504., -65504., -65504., ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], ..., [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 1.8188476562e-02, -3.4667968750e-02, 2.1850585938e-02, ..., -1.2817382812e-02, -1.7578125000e-02, 1.5869140625e-02], ..., [-3.1127929688e-03, 9.3994140625e-03, 1.1413574219e-02, ..., 4.8522949219e-03, -6.0729980469e-03, -1.4648437500e-02], [-1.7929077148e-03, 8.0566406250e-03, 2.0996093750e-02, ..., 1.4831542969e-02, -7.7209472656e-03, 1.2207031250e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 1.8188476562e-02, -3.4667968750e-02, 2.1850585938e-02, ..., -1.2817382812e-02, -1.7578125000e-02, 1.5869140625e-02], ..., [-3.1127929688e-03, 9.3994140625e-03, 1.1413574219e-02, ..., 4.8522949219e-03, -6.0729980469e-03, -1.4648437500e-02], [-1.7929077148e-03, 8.0566406250e-03, 2.0996093750e-02, ..., 1.4831542969e-02, -7.7209472656e-03, 1.2207031250e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[-0.0624389648, -0.0535278320, -0.3315429688, ..., 0.3723144531, 0.0762939453, -0.2093505859], [-0.3833007812, -0.2397460938, -0.5288085938, ..., -0.3215332031, 0.9023437500, 0.7578125000], [-0.8349609375, 1.0791015625, 0.8325195312, ..., 0.2423095703, 0.5688476562, -0.7285156250], ..., [ 0.3789062500, 0.2065429688, 0.4116210938, ..., 0.1331787109, 1.4189453125, 0.6328125000], [-2.1347656250, -4.2734375000, 0.5483398438, ..., -0.9853515625, 0.3745117188, 0.9931640625], [-0.4565429688, 0.1628417969, 0.0540771484, ..., -0.0531005859, 0.3945312500, -0.3032226562]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 1.0681152344e-02, 1.2969970703e-03, -1.1474609375e-02, ..., -1.8157958984e-03, -1.7822265625e-02, 1.0070800781e-02], [ 1.5441894531e-02, 2.7923583984e-03, 6.7138671875e-03, ..., 1.1901855469e-02, -1.4343261719e-02, 3.9367675781e-03], ..., [-4.9072265625e-02, -2.9683113098e-05, -5.1574707031e-03, ..., 4.0054321289e-04, 3.2806396484e-04, -2.8381347656e-03], [ 1.9531250000e-02, 1.4221191406e-02, 1.6723632812e-02, ..., 1.0681152344e-02, 6.4468383789e-04, 9.2773437500e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 1.0681152344e-02, 1.2969970703e-03, -1.1474609375e-02, ..., -1.8157958984e-03, -1.7822265625e-02, 1.0070800781e-02], [ 1.5441894531e-02, 2.7923583984e-03, 6.7138671875e-03, ..., 1.1901855469e-02, -1.4343261719e-02, 3.9367675781e-03], ..., [-4.9072265625e-02, -2.9683113098e-05, -5.1574707031e-03, ..., 4.0054321289e-04, 3.2806396484e-04, -2.8381347656e-03], [ 1.9531250000e-02, 1.4221191406e-02, 1.6723632812e-02, ..., 1.0681152344e-02, 6.4468383789e-04, 9.2773437500e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[-9.2138671875e-01, -1.0097656250e+00, 1.9765625000e+00, ..., -1.7626953125e+00, 4.3286132812e-01, 2.6582031250e+00], [-1.5195312500e+00, -1.6250000000e+00, 3.1386718750e+00, ..., 1.0898437500e+00, 1.9848632812e-01, 6.0429687500e+00], [-9.2431640625e-01, -2.4570312500e+00, 1.7851562500e+00, ..., -1.4638671875e+00, 3.4472656250e-01, 2.9296875000e+00], ..., [-7.5244140625e-01, -4.3847656250e-01, -3.8354492188e-01, ..., -1.5112304688e-01, 1.5026855469e-01, 9.7778320312e-02], [-1.2753906250e+00, 5.0659179688e-03, -3.6523437500e-01, ..., 6.0156250000e-01, -8.5388183594e-02, -5.7617187500e-02], [-1.4916992188e-01, -5.8349609375e-02, -7.9589843750e-01, ..., -3.1176757812e-01, -2.7172851562e-01, -8.0749511719e-02]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], ..., [-512.0000000000, -7.4960937500, -512.0000000000, ..., 0.0000000000, 0.0000000000, 0.0000000000], [ 0.0000000000, 0.0000000000, 0.0000000000, ..., 0.0000000000, 0.0000000000, 0.0000000000], [ 0.0000000000, 0.0000000000, 0.0000000000, ..., -7.4960937500, -512.0000000000, -7.4960937500]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 5.5847167969e-03, -8.3618164062e-03, 4.1198730469e-03, ..., -2.7618408203e-03, -6.5612792969e-03, -1.4160156250e-02], ..., [-7.7209472656e-03, -1.0192871094e-02, 1.2512207031e-02, ..., -2.0874023438e-02, 1.3351440430e-03, 3.3874511719e-03], [-1.1230468750e-02, 1.1657714844e-02, 9.3994140625e-03, ..., -1.2573242188e-02, 2.4536132812e-02, 2.0751953125e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 5.5847167969e-03, -8.3618164062e-03, 4.1198730469e-03, ..., -2.7618408203e-03, -6.5612792969e-03, -1.4160156250e-02], ..., [-7.7209472656e-03, -1.0192871094e-02, 1.2512207031e-02, ..., -2.0874023438e-02, 1.3351440430e-03, 3.3874511719e-03], [-1.1230468750e-02, 1.1657714844e-02, 9.3994140625e-03, ..., -1.2573242188e-02, 2.4536132812e-02, 2.0751953125e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- .------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-2.7618408203e-03, -3.3691406250e-02, -1.8066406250e-02, ..., 4.0588378906e-03, 8.1176757812e-03, 1.8432617188e-02], [ 2.3956298828e-03, -1.1596679688e-02, 7.9345703125e-04, ..., -1.5319824219e-02, 3.5400390625e-03, 3.0517578125e-03], ..., [ 2.7465820312e-03, -4.6997070312e-03, -9.5825195312e-03, ..., -2.3559570312e-02, 1.5380859375e-02, -3.4484863281e-03], [ 1.9287109375e-02, -1.5380859375e-02, -1.0253906250e-02, ..., -2.9052734375e-02, 1.5869140625e-02, -1.2451171875e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-2.7618408203e-03, -3.3691406250e-02, -1.8066406250e-02, ..., 4.0588378906e-03, 8.1176757812e-03, 1.8432617188e-02], [ 2.3956298828e-03, -1.1596679688e-02, 7.9345703125e-04, ..., -1.5319824219e-02, 3.5400390625e-03, 3.0517578125e-03], ..., [ 2.7465820312e-03, -4.6997070312e-03, -9.5825195312e-03, ..., -2.3559570312e-02, 1.5380859375e-02, -3.4484863281e-03], [ 1.9287109375e-02, -1.5380859375e-02, -1.0253906250e-02, ..., -2.9052734375e-02, 1.5869140625e-02, -1.2451171875e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[ 0.3781738281, -0.5107421875, -0.3872070312, ..., 3.2597656250, -0.4221191406, -0.0762329102], [-2.0175781250, -3.2656250000, 1.3632812500, ..., 1.2978515625, 0.4873046875, 2.0937500000], [-1.0644531250, -1.6757812500, 0.7480468750, ..., -1.4501953125, -2.7070312500, -0.2126464844], ..., [ 0.0731201172, -0.0905761719, -0.5668945312, ..., -0.0690917969, 0.3610839844, -0.1778564453], [-0.3122558594, 0.2282714844, 0.2294921875, ..., -0.3735351562, 0.5595703125, 0.1550292969], [ 0.0814208984, 0.4826660156, -0.4240722656, ..., -0.1112670898, -0.1386718750, 0.3964843750]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 5.7220458984e-04, -2.0599365234e-03, 1.8920898438e-02, ..., -2.1240234375e-02, -1.8310546875e-02, -1.2084960938e-02], ..., [-4.2724609375e-03, -1.2817382812e-02, 1.4160156250e-02, ..., 2.2029876709e-04, 9.0332031250e-03, -3.6132812500e-02], [ 1.2207031250e-02, -2.6123046875e-02, -2.2705078125e-02, ..., -9.6435546875e-03, -1.0620117188e-02, -6.8359375000e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 5.7220458984e-04, -2.0599365234e-03, 1.8920898438e-02, ..., -2.1240234375e-02, -1.8310546875e-02, -1.2084960938e-02], ..., [-4.2724609375e-03, -1.2817382812e-02, 1.4160156250e-02, ..., 2.2029876709e-04, 9.0332031250e-03, -3.6132812500e-02], [ 1.2207031250e-02, -2.6123046875e-02, -2.2705078125e-02, ..., -9.6435546875e-03, -1.0620117188e-02, -6.8359375000e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.9836425781e-04, 7.2021484375e-03, -1.1779785156e-02, ..., -3.6865234375e-02, -2.1728515625e-02, -1.1367797852e-03], [-1.3656616211e-03, 1.9165039062e-02, -7.6293945312e-03, ..., -5.5236816406e-03, -1.0253906250e-02, -3.1433105469e-03], ..., [-1.1352539062e-02, -6.6223144531e-03, 2.0996093750e-02, ..., -3.6865234375e-02, -3.8085937500e-02, -2.9754638672e-03], [-5.4121017456e-05, -1.5487670898e-03, 3.8818359375e-02, ..., -1.7089843750e-03, 1.0375976562e-03, -1.5075683594e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.9836425781e-04, 7.2021484375e-03, -1.1779785156e-02, ..., -3.6865234375e-02, -2.1728515625e-02, -1.1367797852e-03], [-1.3656616211e-03, 1.9165039062e-02, -7.6293945312e-03, ..., -5.5236816406e-03, -1.0253906250e-02, -3.1433105469e-03], ..., [-1.1352539062e-02, -6.6223144531e-03, 2.0996093750e-02, ..., -3.6865234375e-02, -3.8085937500e-02, -2.9754638672e-03], [-5.4121017456e-05, -1.5487670898e-03, 3.8818359375e-02, ..., -1.7089843750e-03, 1.0375976562e-03, -1.5075683594e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[ 0.5688476562, -0.2102050781, -1.6777343750, ..., 0.6181640625, 0.3344726562, -0.1972656250], [-0.8613281250, 0.1494140625, 1.7050781250, ..., -0.5405273438, -0.0888671875, 0.4533691406], [ 0.7724609375, -0.3144531250, 0.9404296875, ..., 1.0205078125, 1.4980468750, -0.0450439453], ..., [ 0.2871093750, 0.2452392578, -0.2333984375, ..., 0.2312011719, 0.1772460938, -0.0313720703], [ 0.0098876953, 0.1782226562, -0.0915527344, ..., -0.3532714844, 0.0632324219, -0.0461425781], [ 0.0128173828, 0.2849121094, -0.2863769531, ..., -0.2073974609, 0.1999511719, 0.0579833984]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[-65504., -65504., -65504., ..., 0., 0., 0.], [ 0., 0., 0., ..., 0., 0., 0.], [ 0., 0., 0., ..., 0., 0., 0.], ..., [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 7.8125000000e-03, 1.5625000000e-02, -1.0009765625e-02, ..., -2.3925781250e-02, -7.6293945312e-04, 8.4228515625e-03], [ 7.8125000000e-03, 1.6307830811e-04, 8.4228515625e-03, ..., -1.2573242188e-02, 2.5177001953e-03, 7.3852539062e-03], ..., [-4.5166015625e-02, 8.2015991211e-04, 7.3623657227e-04, ..., -7.8582763672e-04, 1.5830993652e-04, 3.9367675781e-03], [-1.2207031250e-03, -3.1982421875e-02, -2.4047851562e-02, ..., -6.4086914062e-04, -9.8266601562e-03, -2.2827148438e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 7.8125000000e-03, 1.5625000000e-02, -1.0009765625e-02, ..., -2.3925781250e-02, -7.6293945312e-04, 8.4228515625e-03], [ 7.8125000000e-03, 1.6307830811e-04, 8.4228515625e-03, ..., -1.2573242188e-02, 2.5177001953e-03, 7.3852539062e-03], ..., [-4.5166015625e-02, 8.2015991211e-04, 7.3623657227e-04, ..., -7.8582763672e-04, 1.5830993652e-04, 3.9367675781e-03], [-1.2207031250e-03, -3.1982421875e-02, -2.4047851562e-02, ..., -6.4086914062e-04, -9.8266601562e-03, -2.2827148438e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[ 0.6699218750, 0.9267578125, -0.4541015625, ..., -0.3295898438, -0.5942382812, -0.3322753906], [-0.3154296875, -1.4218750000, 0.0122070312, ..., -0.1420898438, 0.2741699219, -0.7402343750], [-0.7377929688, 0.0358886719, 4.0156250000, ..., 1.0888671875, 1.0058593750, 2.5156250000], ..., [-0.9023437500, 0.2814941406, -0.2851562500, ..., -0.4978027344, -0.6440429688, -0.5273437500], [-1.8935546875, -1.7753906250, 0.0541992188, ..., 1.2871093750, 1.0449218750, 3.0625000000], [-0.5644531250, -1.5683593750, 1.3076171875, ..., 1.2050781250, -0.5078125000, 3.8085937500]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], ..., [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [-1.8676757812e-02, 4.0893554688e-03, -1.0314941406e-02, ..., -1.5747070312e-02, 1.3732910156e-03, -2.0507812500e-02], ..., [-1.2329101562e-02, -2.0996093750e-02, -1.1840820312e-02, ..., 2.6092529297e-03, -1.6845703125e-02, -5.4016113281e-03], [ 6.2011718750e-02, 5.7373046875e-03, -3.0395507812e-02, ..., 7.9345703125e-03, 1.1291503906e-03, 3.6163330078e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [-1.8676757812e-02, 4.0893554688e-03, -1.0314941406e-02, ..., -1.5747070312e-02, 1.3732910156e-03, -2.0507812500e-02], ..., [-1.2329101562e-02, -2.0996093750e-02, -1.1840820312e-02, ..., 2.6092529297e-03, -1.6845703125e-02, -5.4016113281e-03], [ 6.2011718750e-02, 5.7373046875e-03, -3.0395507812e-02, ..., 7.9345703125e-03, 1.1291503906e-03, 3.6163330078e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[ 0.2489013672, 0.3498535156, 0.5263671875, ..., -0.0549316406, -0.4404296875, -0.7827148438], [ 0.0475158691, -1.2060546875, -1.0488281250, ..., -2.3613281250, 2.5820312500, 3.2968750000], [-0.5874023438, -1.6640625000, 1.6923828125, ..., -0.8608398438, 0.0735473633, 3.1328125000], ..., [ 0.3847656250, -0.0551452637, -0.1420898438, ..., 0.6083984375, 0.0056762695, -0.2707519531], [-0.0474853516, -0.0794677734, -0.6372070312, ..., 0.0113830566, 0.3256835938, -0.4748535156], [-0.3881835938, 0.0191040039, -0.2829589844, ..., -0.6284179688, -0.0999755859, 0.3271484375]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[-65504., -65504., -65504., ..., 0., 0., 0.], [ 0., 0., 0., ..., 0., 0., 0.], [ 0., 0., 0., ..., -65504., -65504., -65504.], ..., [ 0., 0., 0., ..., -65504., -65504., -65504.], [-65504., -65504., -65504., ..., -65504., -65504., -65504.], [-65504., -65504., -65504., ..., -65504., -65504., -65504.]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 8.4228515625e-03, 1.7089843750e-02, -2.3498535156e-03, ..., -1.1901855469e-02, -8.2397460938e-03, -1.7822265625e-02], ..., [ 7.8125000000e-02, 3.0517578125e-03, 3.3447265625e-02, ..., -1.1535644531e-02, 1.7089843750e-02, -1.9775390625e-02], [ 6.2179565430e-04, -8.7356567383e-04, 4.1259765625e-02, ..., -2.8198242188e-02, -5.9814453125e-03, -6.7138671875e-04], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 8.4228515625e-03, 1.7089843750e-02, -2.3498535156e-03, ..., -1.1901855469e-02, -8.2397460938e-03, -1.7822265625e-02], ..., [ 7.8125000000e-02, 3.0517578125e-03, 3.3447265625e-02, ..., -1.1535644531e-02, 1.7089843750e-02, -1.9775390625e-02], [ 6.2179565430e-04, -8.7356567383e-04, 4.1259765625e-02, ..., -2.8198242188e-02, -5.9814453125e-03, -6.7138671875e-04], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[ 0.2043457031, -0.1730957031, 5.8125000000, ..., -3.6328125000, -3.7050781250, 3.1582031250], [ 1.5136718750, 1.9453125000, 4.7382812500, ..., 0.6528320312, -0.5512695312, 2.2636718750], [ 0.1496582031, 0.9736328125, 1.4873046875, ..., 0.7114257812, 0.2968750000, -0.4135742188], ..., [ 0.1815185547, -0.7128906250, -0.1015625000, ..., -0.3068847656, -0.2612304688, 0.1342773438], [ 0.5654296875, -0.4191894531, 0.1219482422, ..., 0.0346069336, -0.3771972656, 1.0673828125], [ 0.4660644531, -0.4299316406, -0.6635742188, ..., -0.5507812500, -0.0261230469, 0.7236328125]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[-6.5504000000e+04, -6.5504000000e+04, -6.5504000000e+04, ..., -1.3769531250e-01, 6.9580078125e-03, -1.5588378906e-01], [ 1.2084960938e-01, -1.2805175781e-01, 3.7231445312e-01, ..., 6.5039062500e-01, -1.5563964844e-01, -2.5073242188e-01], [ 2.7053833008e-02, -1.2658691406e-01, 8.4277343750e-01, ..., 6.4257812500e-01, 2.5927734375e-01, -2.4658203125e-02], ..., [ 3.3630371094e-02, 3.4973144531e-02, 3.5186767578e-02, ..., 3.6560058594e-02, 3.5369873047e-02, 3.3142089844e-02], [ 2.0370483398e-02, 0.0000000000e+00, 0.0000000000e+00, ..., 4.5104980469e-02, 0.0000000000e+00, 0.0000000000e+00], [ 0.0000000000e+00, 0.0000000000e+00, 0.0000000000e+00, ..., 0.0000000000e+00, 0.0000000000e+00, 0.0000000000e+00]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-2.7618408203e-03, -3.3691406250e-02, -1.8066406250e-02, ..., 4.0588378906e-03, 8.1176757812e-03, 1.8432617188e-02], [ 2.3956298828e-03, -1.1596679688e-02, 7.9345703125e-04, ..., -1.5319824219e-02, 3.5400390625e-03, 3.0517578125e-03], ..., [ 2.8930664062e-02, 1.0986328125e-02, 2.0217895508e-04, ..., -2.6245117188e-02, -3.7841796875e-03, 5.3710937500e-03], [ 2.1728515625e-02, -4.6081542969e-03, 5.1269531250e-03, ..., -7.3242187500e-03, -4.5776367188e-03, 2.9602050781e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-2.7618408203e-03, -3.3691406250e-02, -1.8066406250e-02, ..., 4.0588378906e-03, 8.1176757812e-03, 1.8432617188e-02], [ 2.3956298828e-03, -1.1596679688e-02, 7.9345703125e-04, ..., -1.5319824219e-02, 3.5400390625e-03, 3.0517578125e-03], ..., [ 2.8930664062e-02, 1.0986328125e-02, 2.0217895508e-04, ..., -2.6245117188e-02, -3.7841796875e-03, 5.3710937500e-03], [ 2.1728515625e-02, -4.6081542969e-03, 5.1269531250e-03, ..., -7.3242187500e-03, -4.5776367188e-03, 2.9602050781e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[-2.2988281250e+00, -2.5371093750e+00, -1.9531250000e+00, ..., -2.4628906250e+00, -1.6943359375e+00, -1.0097656250e+00], [-1.6621093750e+00, -3.6206054688e-01, -1.7011718750e+00, ..., 6.6357421875e-01, -1.1679687500e+00, 1.0781250000e+00], [ 1.0888671875e-01, -1.0205078125e-01, -1.3125000000e+00, ..., 4.3320312500e+00, 1.2072753906e-01, 4.3632812500e+00], ..., [ 4.8828125000e-01, 7.1435546875e-01, 1.7395019531e-01, ..., 3.7988281250e-01, 1.2768554688e-01, -5.9326171875e-01], [-2.0751953125e-03, 1.5356445312e-01, -6.8054199219e-02, ..., 9.1491699219e-02, -1.7260742188e-01, 2.0520019531e-01], [ 6.1083984375e-01, 9.0917968750e-01, 3.8159179688e-01, ..., 1.1456298828e-01, -2.1484375000e-01, -5.5566406250e-01]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[-6.5504000000e+04, -6.5504000000e+04, -6.5504000000e+04, ..., 7.2148437500e+00, 3.8964843750e-01, 1.4218750000e+00], [-5.4218750000e+00, 3.1816406250e+00, -3.6914062500e-01, ..., 6.2187500000e+00, -9.1406250000e-01, 1.4492187500e+00], [-3.8828125000e+00, 2.2265625000e+00, -1.1132812500e-01, ..., 6.1406250000e+00, -2.0917968750e+00, 1.7500000000e+00], ..., [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-2.7618408203e-03, -3.3691406250e-02, -1.8066406250e-02, ..., 4.0588378906e-03, 8.1176757812e-03, 1.8432617188e-02], [ 2.3956298828e-03, -1.1596679688e-02, 7.9345703125e-04, ..., -1.5319824219e-02, 3.5400390625e-03, 3.0517578125e-03], ..., [ 1.6113281250e-02, -5.1879882812e-04, -1.3122558594e-03, ..., 2.8533935547e-03, -3.9062500000e-03, 1.3046264648e-03], [-1.0192871094e-02, 9.0942382812e-03, -4.6386718750e-03, ..., 1.2634277344e-02, -1.3656616211e-03, -6.5307617188e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-2.7618408203e-03, -3.3691406250e-02, -1.8066406250e-02, ..., 4.0588378906e-03, 8.1176757812e-03, 1.8432617188e-02], [ 2.3956298828e-03, -1.1596679688e-02, 7.9345703125e-04, ..., -1.5319824219e-02, 3.5400390625e-03, 3.0517578125e-03], ..., [ 1.6113281250e-02, -5.1879882812e-04, -1.3122558594e-03, ..., 2.8533935547e-03, -3.9062500000e-03, 1.3046264648e-03], [-1.0192871094e-02, 9.0942382812e-03, -4.6386718750e-03, ..., 1.2634277344e-02, -1.3656616211e-03, -6.5307617188e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[-1.6044921875, 0.8100585938, 2.3671875000, ..., -0.5493164062, -0.8740234375, 1.5058593750], [-3.1875000000, -2.3828125000, 1.0224609375, ..., -2.1640625000, -2.8437500000, 0.0739746094], [-0.1015625000, 0.8139648438, 1.4423828125, ..., 1.0712890625, 0.0764160156, 0.7373046875], ..., [-2.0195312500, -3.9140625000, -0.1146240234, ..., -2.6289062500, 1.0195312500, 2.6230468750], [-0.5141601562, 0.5087890625, 1.7314453125, ..., -0.0693359375, -1.6894531250, -0.6147460938], [-0.0394287109, 0.1286621094, 0.4343261719, ..., -0.2247314453, -0.6845703125, 0.3295898438]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-2.3956298828e-03, -8.7280273438e-03, -1.6113281250e-02, ..., 9.4413757324e-05, -2.5939941406e-03, 1.1169433594e-02], [-4.4250488281e-03, -8.0566406250e-03, -2.4414062500e-03, ..., 3.2958984375e-03, 1.0070800781e-02, 2.2705078125e-02], ..., [ 2.1240234375e-02, 5.0964355469e-03, -8.6669921875e-03, ..., -8.2397460938e-03, -2.1514892578e-03, -1.9897460938e-02], [-3.7353515625e-02, -2.5634765625e-03, 1.0498046875e-02, ..., 8.5449218750e-03, 3.2043457031e-04, -6.1645507812e-03], [-5.7983398438e-03, -7.8735351562e-03, 1.7456054688e-02, ..., -5.0354003906e-03, -6.5994262695e-04, -1.5869140625e-02]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-2.3956298828e-03, -8.7280273438e-03, -1.6113281250e-02, ..., 9.4413757324e-05, -2.5939941406e-03, 1.1169433594e-02], [-4.4250488281e-03, -8.0566406250e-03, -2.4414062500e-03, ..., 3.2958984375e-03, 1.0070800781e-02, 2.2705078125e-02], ..., [ 2.1240234375e-02, 5.0964355469e-03, -8.6669921875e-03, ..., -8.2397460938e-03, -2.1514892578e-03, -1.9897460938e-02], [-3.7353515625e-02, -2.5634765625e-03, 1.0498046875e-02, ..., 8.5449218750e-03, 3.2043457031e-04, -6.1645507812e-03], [-5.7983398438e-03, -7.8735351562e-03, 1.7456054688e-02, ..., -5.0354003906e-03, -6.5994262695e-04, -1.5869140625e-02]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[-1.2314453125, -3.8769531250, 1.3085937500, ..., -4.9453125000, -1.7353515625, 3.1835937500], [-0.3657226562, 1.2548828125, -0.1363525391, ..., -1.4257812500, 1.2978515625, 0.5463867188], [-0.3271484375, -2.0937500000, 1.6542968750, ..., -1.9345703125, -0.7041015625, 1.3984375000], ..., [-0.1384277344, -0.2343750000, -0.3310546875, ..., 0.3222656250, -0.5517578125, -0.0854492188], [-0.3371582031, 0.1511230469, 0.1896972656, ..., 0.4018554688, -0.6450195312, -0.1232910156], [-0.0537109375, 1.4150390625, 0.2307128906, ..., 1.0039062500, -0.2548828125, 0.1217041016]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[-5.1200000000e+02, -7.4960937500e+00, -5.1200000000e+02, ..., 0.0000000000e+00, 0.0000000000e+00, 0.0000000000e+00], [ 0.0000000000e+00, 0.0000000000e+00, 0.0000000000e+00, ..., 9.8632812500e-01, -7.8125000000e-03, -1.5302734375e+00], [-0.0000000000e+00, 1.4912109375e+00, 7.8125000000e-03, ..., -1.6923828125e+00, 5.1200000000e+02, 1.7148437500e+00], ..., [ 6.5722656250e-01, 1.8798828125e-02, 1.6582031250e+00, ..., -2.5073242188e-01, -6.6894531250e-02, 7.1105957031e-02], [-9.4970703125e-02, 8.2714843750e-01, -1.6796875000e-01, ..., -6.3964843750e-02, 1.2519531250e+00, 6.9384765625e-01], [-5.8398437500e-01, -1.0126953125e+00, -5.0244140625e-01, ..., -5.5322265625e-01, -4.8974609375e-01, 5.5078125000e-01]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 6.8054199219e-03, -2.8076171875e-02, -1.3061523438e-02, ..., 4.0292739868e-05, 2.5939941406e-03, -8.4228515625e-03], [-3.0212402344e-03, 2.7770996094e-03, 1.6845703125e-02, ..., 8.7738037109e-04, 1.4648437500e-02, 1.3122558594e-02], ..., [ 2.8686523438e-03, 4.4921875000e-02, 2.1209716797e-03, ..., -2.9174804688e-02, 1.3671875000e-02, -9.6511840820e-04], [ 4.1503906250e-03, -6.7749023438e-03, 1.5136718750e-02, ..., -2.4658203125e-02, -3.2470703125e-02, -3.1982421875e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 6.8054199219e-03, -2.8076171875e-02, -1.3061523438e-02, ..., 4.0292739868e-05, 2.5939941406e-03, -8.4228515625e-03], [-3.0212402344e-03, 2.7770996094e-03, 1.6845703125e-02, ..., 8.7738037109e-04, 1.4648437500e-02, 1.3122558594e-02], ..., [ 2.8686523438e-03, 4.4921875000e-02, 2.1209716797e-03, ..., -2.9174804688e-02, 1.3671875000e-02, -9.6511840820e-04], [ 4.1503906250e-03, -6.7749023438e-03, 1.5136718750e-02, ..., -2.4658203125e-02, -3.2470703125e-02, -3.1982421875e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[ 0.1318359375, 0.0963745117, -0.5239257812, ..., 0.3188476562, 0.0900878906, 0.1636962891], [ 0.4804687500, -0.2299804688, -0.0261840820, ..., -0.2802734375, 0.5561523438, -0.2253417969], [ 1.3398437500, -4.8203125000, -1.2441406250, ..., -4.6718750000, -0.2766113281, 1.4287109375], ..., [-0.6743164062, 3.6367187500, 0.4086914062, ..., 1.4257812500, 1.0976562500, -1.0175781250], [-3.0156250000, -5.7421875000, 0.8837890625, ..., -3.3593750000, 0.2739257812, 3.8808593750], [-0.1469726562, -2.5097656250, -0.5380859375, ..., -2.8593750000, -1.3115234375, 0.4545898438]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[ 0., -65504., -65504., ..., -65504., -65504., -65504.], [-65504., -65504., -65504., ..., 0., 0., 0.], [ 0., 0., 0., ..., -65504., -65504., -65504.], ..., [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- .------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-2.7618408203e-03, -3.3691406250e-02, -1.8066406250e-02, ..., 4.0588378906e-03, 8.1176757812e-03, 1.8432617188e-02], [ 2.3956298828e-03, -1.1596679688e-02, 7.9345703125e-04, ..., -1.5319824219e-02, 3.5400390625e-03, 3.0517578125e-03], ..., [-8.4838867188e-03, -6.0424804688e-03, 9.8419189453e-04, ..., 1.4709472656e-02, -1.9042968750e-02, 5.0354003906e-03], [ 2.6367187500e-02, 1.3977050781e-02, -1.4099121094e-02, ..., 1.6113281250e-02, 1.1352539062e-02, -1.1825561523e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-2.7618408203e-03, -3.3691406250e-02, -1.8066406250e-02, ..., 4.0588378906e-03, 8.1176757812e-03, 1.8432617188e-02], [ 2.3956298828e-03, -1.1596679688e-02, 7.9345703125e-04, ..., -1.5319824219e-02, 3.5400390625e-03, 3.0517578125e-03], ..., [-8.4838867188e-03, -6.0424804688e-03, 9.8419189453e-04, ..., 1.4709472656e-02, -1.9042968750e-02, 5.0354003906e-03], [ 2.6367187500e-02, 1.3977050781e-02, -1.4099121094e-02, ..., 1.6113281250e-02, 1.1352539062e-02, -1.1825561523e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[ 0.6308593750, 1.9941406250, -0.4223632812, ..., -1.0732421875, -0.8720703125, -0.4919433594], [-0.5996093750, -1.7587890625, -0.4052734375, ..., -0.0158691406, -1.3974609375, 0.7192382812], [ 0.0781250000, 0.3374023438, -1.5878906250, ..., -0.5761718750, -0.6762695312, 1.9990234375], ..., [-0.2332763672, -0.3906250000, -0.3171386719, ..., 0.1889648438, 0.0993652344, -0.0518798828], [ 0.0469055176, -0.0645751953, -0.6201171875, ..., -0.0428161621, -0.2553710938, -0.2454833984], [ 0.0382080078, 0.3908691406, -0.8168945312, ..., -0.3842773438, 0.0972290039, 0.2756347656]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[-65504., -65504., -65504., ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], ..., [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-3.7597656250e-02, -3.3416748047e-03, -3.0761718750e-02, ..., -2.0874023438e-02, 7.3242187500e-03, -1.2023925781e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04], ..., [ 3.4423828125e-02, -8.3160400391e-04, 1.5075683594e-02, ..., -7.5988769531e-03, -1.1779785156e-02, -1.4526367188e-02], [ 1.2634277344e-02, 2.6550292969e-03, 7.1105957031e-03, ..., -1.9683837891e-03, -4.7683715820e-04, -1.2756347656e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-3.7597656250e-02, -3.3416748047e-03, -3.0761718750e-02, ..., -2.0874023438e-02, 7.3242187500e-03, -1.2023925781e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04], ..., [ 3.4423828125e-02, -8.3160400391e-04, 1.5075683594e-02, ..., -7.5988769531e-03, -1.1779785156e-02, -1.4526367188e-02], [ 1.2634277344e-02, 2.6550292969e-03, 7.1105957031e-03, ..., -1.9683837891e-03, -4.7683715820e-04, -1.2756347656e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[-2.6953125000, -1.3037109375, 0.0882568359, ..., -0.4042968750, -0.3505859375, 1.2705078125], [-1.7607421875, -1.4277343750, -3.6230468750, ..., -1.3632812500, 1.0800781250, 2.9960937500], [-3.4140625000, -3.5644531250, 2.8750000000, ..., -2.4531250000, -3.8203125000, -0.3706054688], ..., [-0.3364257812, -0.1379394531, -0.6284179688, ..., 0.1129150391, -0.4257812500, -0.2125244141], [-0.4912109375, -0.4804687500, -0.5825195312, ..., -0.0738525391, -0.3735351562, -0.7504882812], [-1.3222656250, 2.8007812500, -0.3537597656, ..., -0.5585937500, -0.1903076172, 0.7119140625]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [-3.1250000000e-02, -4.4555664062e-03, 1.9149780273e-03, ..., -1.4160156250e-02, -1.3061523438e-02, -3.3721923828e-03], ..., [ 7.8125000000e-02, 3.0517578125e-03, 3.3447265625e-02, ..., -1.1535644531e-02, 1.7089843750e-02, -1.9775390625e-02], [ 5.2185058594e-03, 4.0283203125e-03, -1.9287109375e-02, ..., -3.0273437500e-02, 7.9345703125e-04, 1.1169433594e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [-3.1250000000e-02, -4.4555664062e-03, 1.9149780273e-03, ..., -1.4160156250e-02, -1.3061523438e-02, -3.3721923828e-03], ..., [ 7.8125000000e-02, 3.0517578125e-03, 3.3447265625e-02, ..., -1.1535644531e-02, 1.7089843750e-02, -1.9775390625e-02], [ 5.2185058594e-03, 4.0283203125e-03, -1.9287109375e-02, ..., -3.0273437500e-02, 7.9345703125e-04, 1.1169433594e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[ 0.5800781250, -1.3828125000, -1.0693359375, ..., -0.4638671875, 0.1979980469, -0.0833740234], [ 0.4604492188, -2.0488281250, 0.5131835938, ..., -2.6406250000, -0.0357666016, 2.3144531250], [ 0.7021484375, -0.7167968750, 1.7490234375, ..., 0.6284179688, -0.0223999023, 1.2919921875], ..., [-0.7890625000, -0.4025878906, -0.7333984375, ..., -0.0662231445, 0.0307617188, -0.7353515625], [-0.5185546875, 0.3505859375, 0.3596191406, ..., 0.6430664062, -0.1447753906, -1.0166015625], [-0.3505859375, 0.5634765625, -0.1955566406, ..., -0.3876953125, -0.3950195312, 0.2958984375]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-0.0119018555, 0.0015945435, -0.0010452271, ..., -0.0003910065, 0.0015029907, -0.0015106201], [-0.0142822266, -0.0049438477, 0.0106201172, ..., -0.0164794922, 0.0082397461, -0.0002498627], [-0.0231933594, -0.0295410156, 0.0126342773, ..., -0.0260009766, 0.0244140625, -0.0332031250], ..., [-0.0167236328, 0.0161132812, -0.0090332031, ..., -0.0142822266, 0.0156250000, -0.0004920959], [-0.0169677734, -0.0128173828, 0.0153808594, ..., -0.0273437500, 0.0087890625, 0.0100097656], [-0.0163574219, 0.0156250000, 0.0030212402, ..., -0.0072326660, 0.0015869141, 0.0134887695]]], dtype=torch.float16) ------- tensor([[[-0.0119018555, 0.0015945435, -0.0010452271, ..., -0.0003910065, 0.0015029907, -0.0015106201], [-0.0142822266, -0.0049438477, 0.0106201172, ..., -0.0164794922, 0.0082397461, -0.0002498627], [-0.0231933594, -0.0295410156, 0.0126342773, ..., -0.0260009766, 0.0244140625, -0.0332031250], ..., [-0.0167236328, 0.0161132812, -0.0090332031, ..., -0.0142822266, 0.0156250000, -0.0004920959], [-0.0169677734, -0.0128173828, 0.0153808594, ..., -0.0273437500, 0.0087890625, 0.0100097656], [-0.0163574219, 0.0156250000, 0.0030212402, ..., -0.0072326660, 0.0015869141, 0.0134887695]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[-3.9575195312e-01, -3.6816406250e-01, 1.4953613281e-01, ..., -6.4404296875e-01, -1.7297363281e-01, -1.0870361328e-01], [ 2.3864746094e-01, -1.1074218750e+00, 6.2792968750e-01, ..., -5.5761718750e-01, -1.2744140625e+00, -6.4013671875e-01], [-1.0961914062e-01, -2.5957031250e+00, 1.1025390625e+00, ..., -1.7150878906e-02, 7.6074218750e-01, 1.0068359375e+00], ..., [-9.7167968750e-02, -1.9760131836e-02, -5.5908203125e-01, ..., 1.9799804688e-01, -3.8476562500e-01, 9.8022460938e-02], [-3.0224609375e-01, -3.7036132812e-01, -3.1713867188e-01, ..., -2.6196289062e-01, -1.6931152344e-01, 4.2724609375e-04], [-1.4599609375e-01, 3.3032226562e-01, -6.1279296875e-01, ..., -6.4550781250e-01, -4.2480468750e-02, 7.0751953125e-01]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[-65504., -65504., -65504., ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], ..., [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 7.2326660156e-03, -2.1850585938e-02, 1.6601562500e-02, ..., 8.7280273438e-03, 3.6315917969e-03, -1.7333984375e-02], [-7.2021484375e-03, 9.9487304688e-03, 1.7211914062e-02, ..., -7.1105957031e-03, -2.1972656250e-02, -3.9062500000e-03], ..., [-1.7944335938e-02, -1.6357421875e-02, 2.3315429688e-02, ..., -1.6357421875e-02, 1.4591217041e-04, -8.7280273438e-03], [ 1.2756347656e-02, -2.3925781250e-02, -9.5214843750e-03, ..., 7.5531005859e-04, 1.3183593750e-02, 1.6845703125e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 7.2326660156e-03, -2.1850585938e-02, 1.6601562500e-02, ..., 8.7280273438e-03, 3.6315917969e-03, -1.7333984375e-02], [-7.2021484375e-03, 9.9487304688e-03, 1.7211914062e-02, ..., -7.1105957031e-03, -2.1972656250e-02, -3.9062500000e-03], ..., [-1.7944335938e-02, -1.6357421875e-02, 2.3315429688e-02, ..., -1.6357421875e-02, 1.4591217041e-04, -8.7280273438e-03], [ 1.2756347656e-02, -2.3925781250e-02, -9.5214843750e-03, ..., 7.5531005859e-04, 1.3183593750e-02, 1.6845703125e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[ 0.6074218750, -3.5761718750, 2.1191406250, ..., -1.4365234375, -0.1625976562, 3.0507812500], [-1.0371093750, -0.2348632812, -2.3964843750, ..., 0.6806640625, -0.8071289062, -0.1777343750], [-0.7314453125, 1.1503906250, 1.4345703125, ..., 0.8510742188, -0.5678710938, -1.4355468750], ..., [ 5.1054687500, 3.3632812500, 3.4023437500, ..., 1.6025390625, -1.2871093750, -2.7656250000], [ 0.4121093750, -0.4467773438, -0.4606933594, ..., -0.0379333496, -0.3156738281, -0.0278320312], [-0.2481689453, 0.0076599121, -0.1416015625, ..., 0.5722656250, 0.0829467773, -0.4086914062]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-4.8217773438e-03, -6.3476562500e-03, 1.8188476562e-02, ..., 2.0751953125e-03, 5.2490234375e-03, 8.9111328125e-03], [ 3.9978027344e-03, -1.0986328125e-03, -2.1240234375e-02, ..., -1.6601562500e-02, -1.2084960938e-02, 1.7700195312e-02], ..., [ 7.6293945312e-03, -5.3100585938e-03, 1.5441894531e-02, ..., 7.7514648438e-03, -2.5512695312e-02, 1.0009765625e-02], [-1.0070800781e-02, 2.3437500000e-02, 8.8500976562e-03, ..., 6.8359375000e-03, -1.5258789062e-03, 7.6599121094e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-4.8217773438e-03, -6.3476562500e-03, 1.8188476562e-02, ..., 2.0751953125e-03, 5.2490234375e-03, 8.9111328125e-03], [ 3.9978027344e-03, -1.0986328125e-03, -2.1240234375e-02, ..., -1.6601562500e-02, -1.2084960938e-02, 1.7700195312e-02], ..., [ 7.6293945312e-03, -5.3100585938e-03, 1.5441894531e-02, ..., 7.7514648438e-03, -2.5512695312e-02, 1.0009765625e-02], [-1.0070800781e-02, 2.3437500000e-02, 8.8500976562e-03, ..., 6.8359375000e-03, -1.5258789062e-03, 7.6599121094e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [-6.8969726562e-03, -7.6904296875e-03, 3.1494140625e-02, ..., 9.6435546875e-03, -5.0964355469e-03, -6.5002441406e-03], ..., [-7.5683593750e-03, -1.2664794922e-03, -9.0942382812e-03, ..., 6.5917968750e-03, -5.5236816406e-03, -1.1108398438e-02], [ 1.2634277344e-02, 2.6550292969e-03, 7.1105957031e-03, ..., -1.9683837891e-03, -4.7683715820e-04, -1.2756347656e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [-6.8969726562e-03, -7.6904296875e-03, 3.1494140625e-02, ..., 9.6435546875e-03, -5.0964355469e-03, -6.5002441406e-03], ..., [-7.5683593750e-03, -1.2664794922e-03, -9.0942382812e-03, ..., 6.5917968750e-03, -5.5236816406e-03, -1.1108398438e-02], [ 1.2634277344e-02, 2.6550292969e-03, 7.1105957031e-03, ..., -1.9683837891e-03, -4.7683715820e-04, -1.2756347656e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[-0.2128906250, 1.8789062500, 1.9375000000, ..., 2.1015625000, 0.6791992188, -2.0761718750], [ 2.4648437500, 5.0507812500, 0.0464477539, ..., 4.6601562500, 0.9545898438, 0.5942382812], [ 0.0637207031, -0.9951171875, 3.8007812500, ..., -1.9941406250, 0.5761718750, 1.8623046875], ..., [ 0.0494384766, 0.2133789062, -0.0300445557, ..., 0.2875976562, -0.0150146484, 0.3435058594], [ 0.1953125000, -0.0679931641, -0.5278320312, ..., -0.4758300781, 0.0186767578, -0.4399414062], [ 0.1525878906, 0.1134033203, -0.4414062500, ..., -0.1617431641, -0.1402587891, 0.5087890625]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-0.0119018555, 0.0015945435, -0.0010452271, ..., -0.0003910065, 0.0015029907, -0.0015106201], [ 0.0079956055, 0.0258789062, 0.0180664062, ..., -0.0344238281, 0.0090332031, 0.0066528320], [ 0.0228271484, -0.0239257812, 0.0194091797, ..., -0.0033569336, 0.0014724731, 0.0109863281], ..., [-0.0244140625, 0.0125732422, 0.0218505859, ..., 0.0031280518, -0.0044555664, -0.0008850098], [ 0.0139160156, -0.0341796875, 0.0354003906, ..., -0.0183105469, -0.0405273438, -0.0191650391], [-0.0163574219, 0.0156250000, 0.0030212402, ..., -0.0072326660, 0.0015869141, 0.0134887695]]], dtype=torch.float16) ------- tensor([[[-0.0119018555, 0.0015945435, -0.0010452271, ..., -0.0003910065, 0.0015029907, -0.0015106201], [ 0.0079956055, 0.0258789062, 0.0180664062, ..., -0.0344238281, 0.0090332031, 0.0066528320], [ 0.0228271484, -0.0239257812, 0.0194091797, ..., -0.0033569336, 0.0014724731, 0.0109863281], ..., [-0.0244140625, 0.0125732422, 0.0218505859, ..., 0.0031280518, -0.0044555664, -0.0008850098], [ 0.0139160156, -0.0341796875, 0.0354003906, ..., -0.0183105469, -0.0405273438, -0.0191650391], [-0.0163574219, 0.0156250000, 0.0030212402, ..., -0.0072326660, 0.0015869141, 0.0134887695]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-4.8217773438e-03, -6.3476562500e-03, 1.8188476562e-02, ..., 2.0751953125e-03, 5.2490234375e-03, 8.9111328125e-03], [ 3.9978027344e-03, -1.0986328125e-03, -2.1240234375e-02, ..., -1.6601562500e-02, -1.2084960938e-02, 1.7700195312e-02], ..., [ 1.1047363281e-02, 4.4250488281e-03, 5.6457519531e-03, ..., 1.6479492188e-02, 8.1787109375e-03, 8.1176757812e-03], [ 1.0559082031e-02, -1.1108398438e-02, 7.9345703125e-03, ..., -1.0986328125e-02, -4.5013427734e-04, 6.1645507812e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-4.8217773438e-03, -6.3476562500e-03, 1.8188476562e-02, ..., 2.0751953125e-03, 5.2490234375e-03, 8.9111328125e-03], [ 3.9978027344e-03, -1.0986328125e-03, -2.1240234375e-02, ..., -1.6601562500e-02, -1.2084960938e-02, 1.7700195312e-02], ..., [ 1.1047363281e-02, 4.4250488281e-03, 5.6457519531e-03, ..., 1.6479492188e-02, 8.1787109375e-03, 8.1176757812e-03], [ 1.0559082031e-02, -1.1108398438e-02, 7.9345703125e-03, ..., -1.0986328125e-02, -4.5013427734e-04, 6.1645507812e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4404296875e-02, 5.0354003906e-03, 1.9287109375e-02, ..., -1.0681152344e-03, -8.3007812500e-03, 1.2084960938e-02], [-4.9072265625e-02, -2.9683113098e-05, -5.1574707031e-03, ..., 4.0054321289e-04, 3.2806396484e-04, -2.8381347656e-03], ..., [-1.5441894531e-02, 5.5694580078e-04, -1.0314941406e-02, ..., 1.1367797852e-03, 2.2430419922e-03, -2.8991699219e-03], [-1.9042968750e-02, -6.5917968750e-03, 2.6123046875e-02, ..., -9.1552734375e-03, 1.0498046875e-02, 2.3126602173e-05], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4404296875e-02, 5.0354003906e-03, 1.9287109375e-02, ..., -1.0681152344e-03, -8.3007812500e-03, 1.2084960938e-02], [-4.9072265625e-02, -2.9683113098e-05, -5.1574707031e-03, ..., 4.0054321289e-04, 3.2806396484e-04, -2.8381347656e-03], ..., [-1.5441894531e-02, 5.5694580078e-04, -1.0314941406e-02, ..., 1.1367797852e-03, 2.2430419922e-03, -2.8991699219e-03], [-1.9042968750e-02, -6.5917968750e-03, 2.6123046875e-02, ..., -9.1552734375e-03, 1.0498046875e-02, 2.3126602173e-05], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[-1.2783203125, 0.6689453125, 3.7343750000, ..., -1.0371093750, -1.5341796875, 2.3242187500], [ 2.8203125000, -1.1601562500, 2.6445312500, ..., -3.0000000000, -1.9716796875, 0.3395996094], [-1.7988281250, -2.2753906250, 0.9199218750, ..., -0.9360351562, 0.3703613281, 2.0546875000], ..., [-0.6987304688, 0.3447265625, -0.0333251953, ..., 0.2822265625, -0.0956420898, 0.0727539062], [-0.3383789062, 0.4497070312, -0.0281982422, ..., -0.3754882812, -0.4851074219, -0.8305664062], [-0.6103515625, -0.0689086914, -0.3156738281, ..., 0.1789550781, 0.3417968750, 0.3320312500]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- .------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-4.8217773438e-03, -6.3476562500e-03, 1.8188476562e-02, ..., 2.0751953125e-03, 5.2490234375e-03, 8.9111328125e-03], [ 3.9978027344e-03, -1.0986328125e-03, -2.1240234375e-02, ..., -1.6601562500e-02, -1.2084960938e-02, 1.7700195312e-02], ..., [-7.2326660156e-03, 7.5531005859e-04, -1.4953613281e-03, ..., -8.5830688477e-04, -3.4179687500e-03, 4.9438476562e-03], [ 3.9978027344e-03, 1.7333984375e-02, 2.3071289062e-02, ..., 7.9345703125e-03, 9.3994140625e-03, 1.0620117188e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-4.8217773438e-03, -6.3476562500e-03, 1.8188476562e-02, ..., 2.0751953125e-03, 5.2490234375e-03, 8.9111328125e-03], [ 3.9978027344e-03, -1.0986328125e-03, -2.1240234375e-02, ..., -1.6601562500e-02, -1.2084960938e-02, 1.7700195312e-02], ..., [-7.2326660156e-03, 7.5531005859e-04, -1.4953613281e-03, ..., -8.5830688477e-04, -3.4179687500e-03, 4.9438476562e-03], [ 3.9978027344e-03, 1.7333984375e-02, 2.3071289062e-02, ..., 7.9345703125e-03, 9.3994140625e-03, 1.0620117188e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 6.0424804688e-03, 4.0588378906e-03, 2.9052734375e-02, ..., -1.9653320312e-02, 5.4931640625e-03, 9.3994140625e-03], [-7.2021484375e-03, 3.4332275391e-03, 8.7280273438e-03, ..., -2.1972656250e-02, -2.3841857910e-04, 1.4526367188e-02], ..., [ 5.3405761719e-03, -3.2226562500e-02, 1.6601562500e-02, ..., -3.0761718750e-02, 2.1606445312e-02, -1.8676757812e-02], [ 2.8564453125e-02, 1.6937255859e-03, 2.2338867188e-02, ..., 1.3732910156e-02, -1.2695312500e-02, -2.2949218750e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 6.0424804688e-03, 4.0588378906e-03, 2.9052734375e-02, ..., -1.9653320312e-02, 5.4931640625e-03, 9.3994140625e-03], [-7.2021484375e-03, 3.4332275391e-03, 8.7280273438e-03, ..., -2.1972656250e-02, -2.3841857910e-04, 1.4526367188e-02], ..., [ 5.3405761719e-03, -3.2226562500e-02, 1.6601562500e-02, ..., -3.0761718750e-02, 2.1606445312e-02, -1.8676757812e-02], [ 2.8564453125e-02, 1.6937255859e-03, 2.2338867188e-02, ..., 1.3732910156e-02, -1.2695312500e-02, -2.2949218750e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-4.8217773438e-03, -6.3476562500e-03, 1.8188476562e-02, ..., 2.0751953125e-03, 5.2490234375e-03, 8.9111328125e-03], [ 3.9978027344e-03, -1.0986328125e-03, -2.1240234375e-02, ..., -1.6601562500e-02, -1.2084960938e-02, 1.7700195312e-02], ..., [ 2.1240234375e-02, 5.0964355469e-03, -8.6669921875e-03, ..., -8.2397460938e-03, -2.1514892578e-03, -1.9897460938e-02], [ 1.2634277344e-02, 2.6550292969e-03, 7.1105957031e-03, ..., -1.9683837891e-03, -4.7683715820e-04, -1.2756347656e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-4.8217773438e-03, -6.3476562500e-03, 1.8188476562e-02, ..., 2.0751953125e-03, 5.2490234375e-03, 8.9111328125e-03], [ 3.9978027344e-03, -1.0986328125e-03, -2.1240234375e-02, ..., -1.6601562500e-02, -1.2084960938e-02, 1.7700195312e-02], ..., [ 2.1240234375e-02, 5.0964355469e-03, -8.6669921875e-03, ..., -8.2397460938e-03, -2.1514892578e-03, -1.9897460938e-02], [ 1.2634277344e-02, 2.6550292969e-03, 7.1105957031e-03, ..., -1.9683837891e-03, -4.7683715820e-04, -1.2756347656e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-4.8217773438e-03, -6.3476562500e-03, 1.8188476562e-02, ..., 2.0751953125e-03, 5.2490234375e-03, 8.9111328125e-03], [ 3.9978027344e-03, -1.0986328125e-03, -2.1240234375e-02, ..., -1.6601562500e-02, -1.2084960938e-02, 1.7700195312e-02], ..., [ 1.2634277344e-02, 2.6550292969e-03, 7.1105957031e-03, ..., -1.9683837891e-03, -4.7683715820e-04, -1.2756347656e-02], [-9.0332031250e-02, -2.3803710938e-03, -4.2724609375e-03, ..., -1.8081665039e-03, -1.6250610352e-03, -9.9182128906e-04], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-4.8217773438e-03, -6.3476562500e-03, 1.8188476562e-02, ..., 2.0751953125e-03, 5.2490234375e-03, 8.9111328125e-03], [ 3.9978027344e-03, -1.0986328125e-03, -2.1240234375e-02, ..., -1.6601562500e-02, -1.2084960938e-02, 1.7700195312e-02], ..., [ 1.2634277344e-02, 2.6550292969e-03, 7.1105957031e-03, ..., -1.9683837891e-03, -4.7683715820e-04, -1.2756347656e-02], [-9.0332031250e-02, -2.3803710938e-03, -4.2724609375e-03, ..., -1.8081665039e-03, -1.6250610352e-03, -9.9182128906e-04], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 2.2460937500e-02, 1.0620117188e-02, 1.6235351562e-02, ..., -9.7656250000e-03, -9.7656250000e-03, -5.2795410156e-03], [-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], ..., [ 1.2634277344e-02, 2.6550292969e-03, 7.1105957031e-03, ..., -1.9683837891e-03, -4.7683715820e-04, -1.2756347656e-02], [ 2.1240234375e-02, 5.0964355469e-03, -8.6669921875e-03, ..., -8.2397460938e-03, -2.1514892578e-03, -1.9897460938e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 2.2460937500e-02, 1.0620117188e-02, 1.6235351562e-02, ..., -9.7656250000e-03, -9.7656250000e-03, -5.2795410156e-03], [-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], ..., [ 1.2634277344e-02, 2.6550292969e-03, 7.1105957031e-03, ..., -1.9683837891e-03, -4.7683715820e-04, -1.2756347656e-02], [ 2.1240234375e-02, 5.0964355469e-03, -8.6669921875e-03, ..., -8.2397460938e-03, -2.1514892578e-03, -1.9897460938e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-4.8217773438e-03, -6.3476562500e-03, 1.8188476562e-02, ..., 2.0751953125e-03, 5.2490234375e-03, 8.9111328125e-03], [ 3.9978027344e-03, -1.0986328125e-03, -2.1240234375e-02, ..., -1.6601562500e-02, -1.2084960938e-02, 1.7700195312e-02], ..., [ 7.2326660156e-03, -2.1850585938e-02, 1.6601562500e-02, ..., 8.7280273438e-03, 3.6315917969e-03, -1.7333984375e-02], [-1.9073486328e-03, -3.6621093750e-02, -1.0986328125e-02, ..., 4.1809082031e-03, 2.4414062500e-03, 1.8554687500e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-4.8217773438e-03, -6.3476562500e-03, 1.8188476562e-02, ..., 2.0751953125e-03, 5.2490234375e-03, 8.9111328125e-03], [ 3.9978027344e-03, -1.0986328125e-03, -2.1240234375e-02, ..., -1.6601562500e-02, -1.2084960938e-02, 1.7700195312e-02], ..., [ 7.2326660156e-03, -2.1850585938e-02, 1.6601562500e-02, ..., 8.7280273438e-03, 3.6315917969e-03, -1.7333984375e-02], [-1.9073486328e-03, -3.6621093750e-02, -1.0986328125e-02, ..., 4.1809082031e-03, 2.4414062500e-03, 1.8554687500e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 2.2460937500e-02, 1.0620117188e-02, 1.6235351562e-02, ..., -9.7656250000e-03, -9.7656250000e-03, -5.2795410156e-03], [-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], ..., [ 5.4199218750e-02, -2.5749206543e-04, -1.0070800781e-02, ..., 1.0253906250e-02, 4.6997070312e-03, 2.0446777344e-03], [ 1.4465332031e-02, -1.5258789062e-02, 1.0253906250e-02, ..., 4.4250488281e-03, 4.4250488281e-03, -2.2216796875e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 2.2460937500e-02, 1.0620117188e-02, 1.6235351562e-02, ..., -9.7656250000e-03, -9.7656250000e-03, -5.2795410156e-03], [-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], ..., [ 5.4199218750e-02, -2.5749206543e-04, -1.0070800781e-02, ..., 1.0253906250e-02, 4.6997070312e-03, 2.0446777344e-03], [ 1.4465332031e-02, -1.5258789062e-02, 1.0253906250e-02, ..., 4.4250488281e-03, 4.4250488281e-03, -2.2216796875e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 5.4321289062e-03, -7.8735351562e-03, 2.0141601562e-03, ..., -3.2653808594e-03, 7.4157714844e-03, -3.4179687500e-03], [ 4.8339843750e-02, 2.6855468750e-03, -1.7166137695e-03, ..., 1.3061523438e-02, 3.2501220703e-03, -5.0354003906e-03], ..., [-1.6723632812e-02, -3.1738281250e-03, -6.6528320312e-03, ..., -1.2695312500e-02, 1.2817382812e-02, 1.4648437500e-03], [-9.0332031250e-02, -2.3803710938e-03, -4.2724609375e-03, ..., -1.8081665039e-03, -1.6250610352e-03, -9.9182128906e-04], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 5.4321289062e-03, -7.8735351562e-03, 2.0141601562e-03, ..., -3.2653808594e-03, 7.4157714844e-03, -3.4179687500e-03], [ 4.8339843750e-02, 2.6855468750e-03, -1.7166137695e-03, ..., 1.3061523438e-02, 3.2501220703e-03, -5.0354003906e-03], ..., [-1.6723632812e-02, -3.1738281250e-03, -6.6528320312e-03, ..., -1.2695312500e-02, 1.2817382812e-02, 1.4648437500e-03], [-9.0332031250e-02, -2.3803710938e-03, -4.2724609375e-03, ..., -1.8081665039e-03, -1.6250610352e-03, -9.9182128906e-04], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 5.4321289062e-03, -7.8735351562e-03, 2.0141601562e-03, ..., -3.2653808594e-03, 7.4157714844e-03, -3.4179687500e-03], [ 4.8339843750e-02, 2.6855468750e-03, -1.7166137695e-03, ..., 1.3061523438e-02, 3.2501220703e-03, -5.0354003906e-03], ..., [-6.8969726562e-03, -5.2490234375e-03, 2.0141601562e-02, ..., 3.1127929688e-03, 5.1574707031e-03, 8.1176757812e-03], [-1.1474609375e-02, -1.6357421875e-02, 2.7832031250e-02, ..., -1.9653320312e-02, 1.3183593750e-02, 1.7089843750e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 5.4321289062e-03, -7.8735351562e-03, 2.0141601562e-03, ..., -3.2653808594e-03, 7.4157714844e-03, -3.4179687500e-03], [ 4.8339843750e-02, 2.6855468750e-03, -1.7166137695e-03, ..., 1.3061523438e-02, 3.2501220703e-03, -5.0354003906e-03], ..., [-6.8969726562e-03, -5.2490234375e-03, 2.0141601562e-02, ..., 3.1127929688e-03, 5.1574707031e-03, 8.1176757812e-03], [-1.1474609375e-02, -1.6357421875e-02, 2.7832031250e-02, ..., -1.9653320312e-02, 1.3183593750e-02, 1.7089843750e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[-1.4238281250, -0.9462890625, 0.7260742188, ..., -2.0332031250, -2.5429687500, 1.9179687500], [-1.1484375000, -2.3203125000, -0.1903076172, ..., 1.2607421875, 1.0263671875, -0.0539550781], [-1.4785156250, -5.3281250000, 0.8876953125, ..., -0.8183593750, 0.7666015625, 1.6718750000], ..., [-0.2241210938, 0.4738769531, -0.3852539062, ..., 0.2832031250, -0.4494628906, -0.1879882812], [-0.6098632812, 0.7626953125, -0.2126464844, ..., 0.3410644531, 0.0756835938, -0.2875976562], [ 0.5546875000, 0.5639648438, 0.1552734375, ..., 0.2604980469, -0.5405273438, -0.8398437500]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-0.0119018555, 0.0015945435, -0.0010452271, ..., -0.0003910065, 0.0015029907, -0.0015106201], [-0.0598144531, -0.0134887695, 0.0020141602, ..., -0.0185546875, 0.0093383789, -0.0118408203], [-0.0224609375, 0.0041198730, 0.0139160156, ..., 0.0057067871, -0.0009307861, -0.0195312500], ..., [-0.0167236328, -0.0031738281, -0.0066528320, ..., -0.0126953125, 0.0128173828, 0.0014648438], [ 0.0541992188, -0.0002574921, -0.0100708008, ..., 0.0102539062, 0.0046997070, 0.0020446777], [ 0.0620117188, 0.0057373047, -0.0303955078, ..., 0.0079345703, 0.0011291504, 0.0036163330]]], dtype=torch.float16) ------- tensor([[[-0.0119018555, 0.0015945435, -0.0010452271, ..., -0.0003910065, 0.0015029907, -0.0015106201], [-0.0598144531, -0.0134887695, 0.0020141602, ..., -0.0185546875, 0.0093383789, -0.0118408203], [-0.0224609375, 0.0041198730, 0.0139160156, ..., 0.0057067871, -0.0009307861, -0.0195312500], ..., [-0.0167236328, -0.0031738281, -0.0066528320, ..., -0.0126953125, 0.0128173828, 0.0014648438], [ 0.0541992188, -0.0002574921, -0.0100708008, ..., 0.0102539062, 0.0046997070, 0.0020446777], [ 0.0620117188, 0.0057373047, -0.0303955078, ..., 0.0079345703, 0.0011291504, 0.0036163330]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- .------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-5.9814453125e-02, -1.3488769531e-02, 2.0141601562e-03, ..., -1.8554687500e-02, 9.3383789062e-03, -1.1840820312e-02], [-2.2460937500e-02, 4.1198730469e-03, 1.3916015625e-02, ..., 5.7067871094e-03, -9.3078613281e-04, -1.9531250000e-02], ..., [ 4.6691894531e-03, -2.3315429688e-02, -1.5380859375e-02, ..., -1.9531250000e-02, -5.3100585938e-03, 1.7944335938e-02], [-1.6723632812e-02, -3.2714843750e-02, 3.6865234375e-02, ..., -2.1606445312e-02, 5.7678222656e-03, 7.7209472656e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-5.9814453125e-02, -1.3488769531e-02, 2.0141601562e-03, ..., -1.8554687500e-02, 9.3383789062e-03, -1.1840820312e-02], [-2.2460937500e-02, 4.1198730469e-03, 1.3916015625e-02, ..., 5.7067871094e-03, -9.3078613281e-04, -1.9531250000e-02], ..., [ 4.6691894531e-03, -2.3315429688e-02, -1.5380859375e-02, ..., -1.9531250000e-02, -5.3100585938e-03, 1.7944335938e-02], [-1.6723632812e-02, -3.2714843750e-02, 3.6865234375e-02, ..., -2.1606445312e-02, 5.7678222656e-03, 7.7209472656e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [-8.5449218750e-03, 3.4484863281e-03, -9.7045898438e-03, ..., -2.0996093750e-02, -1.0498046875e-02, -4.6386718750e-03], ..., [ 5.0354003906e-03, -3.4942626953e-03, -4.5166015625e-03, ..., -1.3854980469e-02, 7.2021484375e-03, 9.7656250000e-03], [ 1.2023925781e-02, 4.1809082031e-03, -2.6611328125e-02, ..., 1.0803222656e-02, -2.0874023438e-02, -9.2163085938e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [-8.5449218750e-03, 3.4484863281e-03, -9.7045898438e-03, ..., -2.0996093750e-02, -1.0498046875e-02, -4.6386718750e-03], ..., [ 5.0354003906e-03, -3.4942626953e-03, -4.5166015625e-03, ..., -1.3854980469e-02, 7.2021484375e-03, 9.7656250000e-03], [ 1.2023925781e-02, 4.1809082031e-03, -2.6611328125e-02, ..., 1.0803222656e-02, -2.0874023438e-02, -9.2163085938e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 2.9541015625e-02, 1.8066406250e-02, 1.1352539062e-02, ..., -3.4423828125e-02, -7.0800781250e-03, -2.9182434082e-04], ..., [-7.2326660156e-03, 7.5531005859e-04, -1.4953613281e-03, ..., -8.5830688477e-04, -3.4179687500e-03, 4.9438476562e-03], [ 4.1748046875e-02, 3.4179687500e-03, -9.2773437500e-03, ..., 8.4228515625e-03, -8.6669921875e-03, 3.7231445312e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 2.9541015625e-02, 1.8066406250e-02, 1.1352539062e-02, ..., -3.4423828125e-02, -7.0800781250e-03, -2.9182434082e-04], ..., [-7.2326660156e-03, 7.5531005859e-04, -1.4953613281e-03, ..., -8.5830688477e-04, -3.4179687500e-03, 4.9438476562e-03], [ 4.1748046875e-02, 3.4179687500e-03, -9.2773437500e-03, ..., 8.4228515625e-03, -8.6669921875e-03, 3.7231445312e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[ 2.3808593750, -0.2951660156, 0.9565429688, ..., 0.8349609375, -2.6015625000, 0.3461914062], [-0.5219726562, -0.6601562500, 0.4135742188, ..., -0.0653076172, 1.2050781250, 3.3906250000], [ 1.0058593750, -1.4501953125, -0.3403320312, ..., -2.1738281250, -0.5957031250, -0.0131225586], ..., [-0.3132324219, -0.0123901367, -0.6240234375, ..., 0.0076904297, 0.0312500000, -0.1198120117], [-0.6757812500, 0.4482421875, -0.7832031250, ..., 0.3239746094, -0.1018676758, -0.1125488281], [ 0.3178710938, 0.2556152344, -0.5839843750, ..., 0.1662597656, -0.5253906250, -0.0218658447]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[-65504., -65504., -65504., ..., 0., 0., 0.], [ 0., 0., 0., ..., 0., 0., 0.], [ 0., 0., 0., ..., 0., 0., 0.], ..., [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 8.1787109375e-03, 1.4038085938e-02, -1.1230468750e-02, ..., -7.8125000000e-03, -1.9165039062e-02, 3.3935546875e-02], ..., [ 4.3945312500e-02, -4.8522949219e-03, 1.7944335938e-02, ..., -4.3869018555e-04, -7.9956054688e-03, -2.8533935547e-03], [-2.6550292969e-03, 7.8125000000e-03, -8.1176757812e-03, ..., -8.1787109375e-03, 1.5380859375e-02, -1.2695312500e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 8.1787109375e-03, 1.4038085938e-02, -1.1230468750e-02, ..., -7.8125000000e-03, -1.9165039062e-02, 3.3935546875e-02], ..., [ 4.3945312500e-02, -4.8522949219e-03, 1.7944335938e-02, ..., -4.3869018555e-04, -7.9956054688e-03, -2.8533935547e-03], [-2.6550292969e-03, 7.8125000000e-03, -8.1176757812e-03, ..., -8.1787109375e-03, 1.5380859375e-02, -1.2695312500e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[-4.8193359375e-01, 3.2324218750e-01, -6.2988281250e-01, ..., 3.7011718750e-01, -3.3105468750e-01, 3.4414062500e+00], [-8.7695312500e-01, -2.4570312500e+00, 5.4931640625e-01, ..., -2.1308593750e+00, 1.4716796875e+00, 1.5722656250e+00], [-4.8193359375e-01, -1.1503906250e+00, -2.1289062500e-01, ..., 6.2890625000e-01, 1.3701171875e+00, -5.5566406250e-01], ..., [-2.6000976562e-01, 2.1728515625e-01, 2.1459960938e-01, ..., 1.8920898438e-01, 2.4383544922e-02, -2.5927734375e-01], [-2.3791503906e-01, -4.8046875000e-01, -9.8437500000e-01, ..., -1.2145996094e-01, -3.5717773438e-01, 1.8164062500e-01], [-1.7163085938e-01, -9.7656250000e-04, -8.4667968750e-01, ..., 4.2968750000e-01, 7.5073242188e-02, -1.7492675781e-01]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 7.2021484375e-03, -6.0119628906e-03, 9.5825195312e-03, ..., -1.2634277344e-02, -2.5268554688e-02, -6.6757202148e-04], [-7.1105957031e-03, 3.0029296875e-02, 9.0332031250e-03, ..., -4.7912597656e-03, 4.6691894531e-03, 3.2226562500e-02], ..., [ 3.6865234375e-02, 9.5214843750e-03, 3.3447265625e-02, ..., -2.4047851562e-02, 2.5268554688e-02, -1.3427734375e-02], [ 1.2329101562e-02, 1.8554687500e-02, 6.7138671875e-03, ..., 1.4709472656e-02, 1.6845703125e-02, 1.3671875000e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 7.2021484375e-03, -6.0119628906e-03, 9.5825195312e-03, ..., -1.2634277344e-02, -2.5268554688e-02, -6.6757202148e-04], [-7.1105957031e-03, 3.0029296875e-02, 9.0332031250e-03, ..., -4.7912597656e-03, 4.6691894531e-03, 3.2226562500e-02], ..., [ 3.6865234375e-02, 9.5214843750e-03, 3.3447265625e-02, ..., -2.4047851562e-02, 2.5268554688e-02, -1.3427734375e-02], [ 1.2329101562e-02, 1.8554687500e-02, 6.7138671875e-03, ..., 1.4709472656e-02, 1.6845703125e-02, 1.3671875000e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[-1.7607421875e+00, 2.1757812500e+00, 1.0527343750e+00, ..., 3.8691406250e+00, 9.1455078125e-01, 1.6149902344e-01], [ 2.9824218750e+00, 1.3549804688e-02, -1.0117187500e+00, ..., -1.2578125000e+00, 2.2924804688e-01, 1.0449218750e+00], [-9.9072265625e-01, 9.9316406250e-01, -9.2333984375e-01, ..., 1.7753906250e+00, 3.7011718750e-01, -3.7524414062e-01], ..., [ 1.9042968750e-01, -5.9765625000e-01, -5.5175781250e-01, ..., -6.1401367188e-02, 3.2885742188e-01, -2.9541015625e-01], [-3.0517578125e-05, -3.8378906250e-01, -7.8320312500e-01, ..., 6.2011718750e-02, 8.4228515625e-02, 4.8828125000e-01], [-1.5625000000e-01, 5.3344726562e-02, -5.3417968750e-01, ..., 1.0443115234e-01, -1.6577148438e-01, 7.5976562500e-01]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.0009765625e-02, -5.6982040405e-05, -1.8676757812e-02, ..., -1.0742187500e-02, 6.2561035156e-03, -7.3852539062e-03], [-8.9721679688e-03, -7.0800781250e-03, 3.1738281250e-02, ..., 3.7109375000e-02, -3.6376953125e-02, 1.0009765625e-02], ..., [ 1.1474609375e-02, -8.7890625000e-03, 2.3315429688e-02, ..., -1.0681152344e-02, 8.1787109375e-03, 4.7912597656e-03], [ 3.2958984375e-02, -1.5747070312e-02, -4.1198730469e-03, ..., 2.9296875000e-02, -9.4604492188e-03, 5.3100585938e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.0009765625e-02, -5.6982040405e-05, -1.8676757812e-02, ..., -1.0742187500e-02, 6.2561035156e-03, -7.3852539062e-03], [-8.9721679688e-03, -7.0800781250e-03, 3.1738281250e-02, ..., 3.7109375000e-02, -3.6376953125e-02, 1.0009765625e-02], ..., [ 1.1474609375e-02, -8.7890625000e-03, 2.3315429688e-02, ..., -1.0681152344e-02, 8.1787109375e-03, 4.7912597656e-03], [ 3.2958984375e-02, -1.5747070312e-02, -4.1198730469e-03, ..., 2.9296875000e-02, -9.4604492188e-03, 5.3100585938e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[-0.6440429688, -1.8105468750, 1.4062500000, ..., -0.2878417969, -1.4677734375, 1.1972656250], [ 0.4714355469, -0.7587890625, -1.4453125000, ..., -0.4243164062, -2.9804687500, 3.6445312500], [-1.1005859375, -0.6689453125, -2.3925781250, ..., 2.3183593750, -0.9458007812, -0.5898437500], ..., [-0.0181884766, -0.2792968750, -0.2565917969, ..., 0.3291015625, -0.6860351562, -0.6875000000], [-0.3947753906, -1.7089843750, 0.0820312500, ..., 0.1977539062, -1.3369140625, 2.5488281250], [ 0.3374023438, -0.1308593750, -0.4106445312, ..., 0.0267028809, -0.0668945312, 0.2648925781]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.2329101562e-02, -4.0283203125e-03, -2.3559570312e-02, ..., -1.7822265625e-02, 7.9345703125e-03, -1.4038085938e-02], [-1.2878417969e-02, 6.0424804688e-03, -3.8719177246e-04, ..., -1.6601562500e-02, 4.7119140625e-02, 1.2634277344e-02], ..., [-2.2460937500e-02, 8.8500976562e-04, -1.2817382812e-02, ..., -1.4221191406e-02, 1.0742187500e-02, 3.4942626953e-03], [ 4.5898437500e-02, -1.4572143555e-03, -6.5917968750e-03, ..., 1.5625000000e-02, 2.9754638672e-03, 3.9978027344e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.2329101562e-02, -4.0283203125e-03, -2.3559570312e-02, ..., -1.7822265625e-02, 7.9345703125e-03, -1.4038085938e-02], [-1.2878417969e-02, 6.0424804688e-03, -3.8719177246e-04, ..., -1.6601562500e-02, 4.7119140625e-02, 1.2634277344e-02], ..., [-2.2460937500e-02, 8.8500976562e-04, -1.2817382812e-02, ..., -1.4221191406e-02, 1.0742187500e-02, 3.4942626953e-03], [ 4.5898437500e-02, -1.4572143555e-03, -6.5917968750e-03, ..., 1.5625000000e-02, 2.9754638672e-03, 3.9978027344e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[ 1.5937500000, -0.6425781250, 2.1367187500, ..., -3.7519531250, -0.7695312500, 2.1835937500], [-3.2226562500, -0.7177734375, -3.0449218750, ..., -1.5556640625, 0.4963378906, 4.4726562500], [-0.3339843750, -0.9204101562, 1.4257812500, ..., -0.8842773438, -0.7705078125, 0.7387695312], ..., [ 0.4680175781, 1.9306640625, 3.4707031250, ..., 0.4721679688, -3.6250000000, 0.6494140625], [ 0.2241210938, -0.0957031250, 0.1861572266, ..., -0.1574707031, 0.1049804688, -0.6401367188], [ 0.1937255859, 0.0987548828, -0.4792480469, ..., 0.1159667969, 0.4902343750, -0.2866210938]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], ..., [-4.3515625000, 2.7675781250, 2.2929687500, ..., 0.9462890625, 0.5664062500, -4.2226562500], [-4.0507812500, -0.0681152344, 1.1875000000, ..., 1.1591796875, -2.8417968750, 2.4101562500], [-7.5117187500, -0.0412597656, 6.7578125000, ..., 4.4882812500, 1.2441406250, 1.8671875000]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 1.0009765625e-02, -3.6926269531e-03, -9.5214843750e-03, ..., -1.4221191406e-02, -1.3885498047e-03, -4.7874450684e-04], [-1.8798828125e-02, 5.3710937500e-03, 1.5014648438e-02, ..., 3.7231445312e-03, -1.4648437500e-02, -2.9541015625e-02], ..., [ 1.8066406250e-02, 3.9482116699e-04, -3.5644531250e-02, ..., -2.0996093750e-02, 3.7841796875e-03, -3.8909912109e-03], [ 2.5634765625e-03, 1.5869140625e-02, 1.3366699219e-02, ..., -2.1972656250e-02, -1.9165039062e-02, -1.4587402344e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 1.0009765625e-02, -3.6926269531e-03, -9.5214843750e-03, ..., -1.4221191406e-02, -1.3885498047e-03, -4.7874450684e-04], [-1.8798828125e-02, 5.3710937500e-03, 1.5014648438e-02, ..., 3.7231445312e-03, -1.4648437500e-02, -2.9541015625e-02], ..., [ 1.8066406250e-02, 3.9482116699e-04, -3.5644531250e-02, ..., -2.0996093750e-02, 3.7841796875e-03, -3.8909912109e-03], [ 2.5634765625e-03, 1.5869140625e-02, 1.3366699219e-02, ..., -2.1972656250e-02, -1.9165039062e-02, -1.4587402344e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.0009765625e-02, -5.6982040405e-05, -1.8676757812e-02, ..., -1.0742187500e-02, 6.2561035156e-03, -7.3852539062e-03], [-8.9721679688e-03, -7.0800781250e-03, 3.1738281250e-02, ..., 3.7109375000e-02, -3.6376953125e-02, 1.0009765625e-02], ..., [-6.2866210938e-03, 1.9836425781e-03, -2.5634765625e-02, ..., -7.9345703125e-03, -1.0375976562e-02, 1.0498046875e-02], [-5.0048828125e-03, -6.3781738281e-03, 2.5268554688e-02, ..., -1.1596679688e-02, -8.6669921875e-03, -8.5449218750e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.0009765625e-02, -5.6982040405e-05, -1.8676757812e-02, ..., -1.0742187500e-02, 6.2561035156e-03, -7.3852539062e-03], [-8.9721679688e-03, -7.0800781250e-03, 3.1738281250e-02, ..., 3.7109375000e-02, -3.6376953125e-02, 1.0009765625e-02], ..., [-6.2866210938e-03, 1.9836425781e-03, -2.5634765625e-02, ..., -7.9345703125e-03, -1.0375976562e-02, 1.0498046875e-02], [-5.0048828125e-03, -6.3781738281e-03, 2.5268554688e-02, ..., -1.1596679688e-02, -8.6669921875e-03, -8.5449218750e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.0009765625e-02, -5.6982040405e-05, -1.8676757812e-02, ..., -1.0742187500e-02, 6.2561035156e-03, -7.3852539062e-03], [-8.9721679688e-03, -7.0800781250e-03, 3.1738281250e-02, ..., 3.7109375000e-02, -3.6376953125e-02, 1.0009765625e-02], ..., [-4.5166015625e-02, 8.2015991211e-04, 7.3623657227e-04, ..., -7.8582763672e-04, 1.5830993652e-04, 3.9367675781e-03], [-2.1728515625e-02, -2.6489257812e-02, 9.4604492188e-03, ..., 1.5441894531e-02, -1.2390136719e-02, -3.1494140625e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.0009765625e-02, -5.6982040405e-05, -1.8676757812e-02, ..., -1.0742187500e-02, 6.2561035156e-03, -7.3852539062e-03], [-8.9721679688e-03, -7.0800781250e-03, 3.1738281250e-02, ..., 3.7109375000e-02, -3.6376953125e-02, 1.0009765625e-02], ..., [-4.5166015625e-02, 8.2015991211e-04, 7.3623657227e-04, ..., -7.8582763672e-04, 1.5830993652e-04, 3.9367675781e-03], [-2.1728515625e-02, -2.6489257812e-02, 9.4604492188e-03, ..., 1.5441894531e-02, -1.2390136719e-02, -3.1494140625e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- .------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-2.1362304688e-02, -2.5939941406e-03, 5.0354003906e-04, ..., 1.1901855469e-02, 2.7008056641e-03, -8.7890625000e-03], [-8.5449218750e-03, 2.2705078125e-02, -4.1259765625e-02, ..., -2.8564453125e-02, 3.7841796875e-02, -1.5487670898e-03], ..., [-1.2329101562e-02, -4.0283203125e-03, -2.3559570312e-02, ..., -1.7822265625e-02, 7.9345703125e-03, -1.4038085938e-02], [-1.7089843750e-02, -3.1250000000e-02, 9.8876953125e-03, ..., -4.8217773438e-03, -2.5390625000e-02, 3.8909912109e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-2.1362304688e-02, -2.5939941406e-03, 5.0354003906e-04, ..., 1.1901855469e-02, 2.7008056641e-03, -8.7890625000e-03], [-8.5449218750e-03, 2.2705078125e-02, -4.1259765625e-02, ..., -2.8564453125e-02, 3.7841796875e-02, -1.5487670898e-03], ..., [-1.2329101562e-02, -4.0283203125e-03, -2.3559570312e-02, ..., -1.7822265625e-02, 7.9345703125e-03, -1.4038085938e-02], [-1.7089843750e-02, -3.1250000000e-02, 9.8876953125e-03, ..., -4.8217773438e-03, -2.5390625000e-02, 3.8909912109e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 9.3994140625e-03, -5.9204101562e-03, 9.4604492188e-03, ..., -2.5634765625e-02, 1.0803222656e-02, 4.0283203125e-02], [-4.9072265625e-02, -2.9683113098e-05, -5.1574707031e-03, ..., 4.0054321289e-04, 3.2806396484e-04, -2.8381347656e-03], ..., [-1.4953613281e-02, -1.4099121094e-02, 3.0517578125e-02, ..., -4.7302246094e-03, 5.5847167969e-03, -2.6855468750e-02], [ 1.5380859375e-02, 1.5502929688e-02, -4.6386718750e-03, ..., -1.5441894531e-02, 9.5214843750e-03, -1.1230468750e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 9.3994140625e-03, -5.9204101562e-03, 9.4604492188e-03, ..., -2.5634765625e-02, 1.0803222656e-02, 4.0283203125e-02], [-4.9072265625e-02, -2.9683113098e-05, -5.1574707031e-03, ..., 4.0054321289e-04, 3.2806396484e-04, -2.8381347656e-03], ..., [-1.4953613281e-02, -1.4099121094e-02, 3.0517578125e-02, ..., -4.7302246094e-03, 5.5847167969e-03, -2.6855468750e-02], [ 1.5380859375e-02, 1.5502929688e-02, -4.6386718750e-03, ..., -1.5441894531e-02, 9.5214843750e-03, -1.1230468750e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-3.3691406250e-02, 1.3198852539e-03, -1.4221191406e-02, ..., -1.5869140625e-03, -1.1657714844e-02, 1.5869140625e-02], [-4.8828125000e-03, -1.2695312500e-02, 5.1513671875e-02, ..., -6.4697265625e-03, -4.0588378906e-03, -2.6245117188e-02], ..., [-4.9072265625e-02, -2.9683113098e-05, -5.1574707031e-03, ..., 4.0054321289e-04, 3.2806396484e-04, -2.8381347656e-03], [-3.4637451172e-03, 9.5825195312e-03, 1.0620117188e-02, ..., 9.5825195312e-03, 3.0029296875e-02, -1.6967773438e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-3.3691406250e-02, 1.3198852539e-03, -1.4221191406e-02, ..., -1.5869140625e-03, -1.1657714844e-02, 1.5869140625e-02], [-4.8828125000e-03, -1.2695312500e-02, 5.1513671875e-02, ..., -6.4697265625e-03, -4.0588378906e-03, -2.6245117188e-02], ..., [-4.9072265625e-02, -2.9683113098e-05, -5.1574707031e-03, ..., 4.0054321289e-04, 3.2806396484e-04, -2.8381347656e-03], [-3.4637451172e-03, 9.5825195312e-03, 1.0620117188e-02, ..., 9.5825195312e-03, 3.0029296875e-02, -1.6967773438e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.0009765625e-02, -5.6982040405e-05, -1.8676757812e-02, ..., -1.0742187500e-02, 6.2561035156e-03, -7.3852539062e-03], [-8.9721679688e-03, -7.0800781250e-03, 3.1738281250e-02, ..., 3.7109375000e-02, -3.6376953125e-02, 1.0009765625e-02], ..., [ 4.7607421875e-02, 1.1779785156e-02, 1.5747070312e-02, ..., -1.7456054688e-02, 9.5214843750e-03, 1.0299682617e-03], [-1.9042968750e-02, 1.5563964844e-02, 1.3504028320e-03, ..., 1.5735626221e-04, 3.0136108398e-04, 1.6601562500e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.0009765625e-02, -5.6982040405e-05, -1.8676757812e-02, ..., -1.0742187500e-02, 6.2561035156e-03, -7.3852539062e-03], [-8.9721679688e-03, -7.0800781250e-03, 3.1738281250e-02, ..., 3.7109375000e-02, -3.6376953125e-02, 1.0009765625e-02], ..., [ 4.7607421875e-02, 1.1779785156e-02, 1.5747070312e-02, ..., -1.7456054688e-02, 9.5214843750e-03, 1.0299682617e-03], [-1.9042968750e-02, 1.5563964844e-02, 1.3504028320e-03, ..., 1.5735626221e-04, 3.0136108398e-04, 1.6601562500e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-0.0119018555, 0.0015945435, -0.0010452271, ..., -0.0003910065, 0.0015029907, -0.0015106201], [ 0.0087890625, -0.0128784180, 0.0027923584, ..., -0.0402832031, 0.0026397705, -0.0168457031], [-0.0341796875, 0.0134277344, 0.0156250000, ..., 0.0118408203, -0.0336914062, -0.0251464844], ..., [ 0.0084228516, -0.0246582031, -0.0029602051, ..., -0.0027313232, -0.0336914062, 0.0017700195], [ 0.0266113281, -0.0119018555, -0.0019989014, ..., 0.0081787109, 0.0125732422, -0.0113525391], [-0.0163574219, 0.0156250000, 0.0030212402, ..., -0.0072326660, 0.0015869141, 0.0134887695]]], dtype=torch.float16) ------- tensor([[[-0.0119018555, 0.0015945435, -0.0010452271, ..., -0.0003910065, 0.0015029907, -0.0015106201], [ 0.0087890625, -0.0128784180, 0.0027923584, ..., -0.0402832031, 0.0026397705, -0.0168457031], [-0.0341796875, 0.0134277344, 0.0156250000, ..., 0.0118408203, -0.0336914062, -0.0251464844], ..., [ 0.0084228516, -0.0246582031, -0.0029602051, ..., -0.0027313232, -0.0336914062, 0.0017700195], [ 0.0266113281, -0.0119018555, -0.0019989014, ..., 0.0081787109, 0.0125732422, -0.0113525391], [-0.0163574219, 0.0156250000, 0.0030212402, ..., -0.0072326660, 0.0015869141, 0.0134887695]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 9.1552734375e-03, 2.3315429688e-02, -2.5482177734e-03, ..., 6.7443847656e-03, 1.3366699219e-02, 1.7700195312e-02], [ 1.3366699219e-02, -8.6975097656e-04, 1.8554687500e-02, ..., 8.2397460938e-04, -1.0620117188e-02, 1.4709472656e-02], ..., [ 1.0070800781e-02, 8.3160400391e-04, 3.8337707520e-04, ..., 3.8146972656e-04, 1.1749267578e-03, 5.7220458984e-04], [ 1.2817382812e-02, 1.4770507812e-02, 1.3885498047e-03, ..., -1.0498046875e-02, -7.8125000000e-03, -1.5380859375e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 9.1552734375e-03, 2.3315429688e-02, -2.5482177734e-03, ..., 6.7443847656e-03, 1.3366699219e-02, 1.7700195312e-02], [ 1.3366699219e-02, -8.6975097656e-04, 1.8554687500e-02, ..., 8.2397460938e-04, -1.0620117188e-02, 1.4709472656e-02], ..., [ 1.0070800781e-02, 8.3160400391e-04, 3.8337707520e-04, ..., 3.8146972656e-04, 1.1749267578e-03, 5.7220458984e-04], [ 1.2817382812e-02, 1.4770507812e-02, 1.3885498047e-03, ..., -1.0498046875e-02, -7.8125000000e-03, -1.5380859375e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], ..., [-0.1798095703, -0.0490722656, -0.2359619141, ..., 0.2432861328, -0.7338867188, 0.3510742188], [ 0.5107421875, -0.2202148438, 0.8100585938, ..., -0.4462890625, -0.6870117188, 0.2132568359], [ 0.3837890625, -0.7670898438, -1.0976562500, ..., -4.5039062500, 2.0000000000, 1.9833984375]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-0.0119018555, 0.0015945435, -0.0010452271, ..., -0.0003910065, 0.0015029907, -0.0015106201], [-0.0124511719, 0.0184326172, -0.0012817383, ..., -0.0180664062, 0.0018463135, -0.0009002686], [-0.0055541992, 0.0071105957, -0.0132446289, ..., 0.0083007812, -0.0150756836, 0.0220947266], ..., [ 0.0417480469, 0.0034179688, -0.0092773438, ..., 0.0084228516, -0.0086669922, 0.0037231445], [ 0.0212402344, 0.0050964355, -0.0086669922, ..., -0.0082397461, -0.0021514893, -0.0198974609], [-0.0057983398, -0.0078735352, 0.0174560547, ..., -0.0050354004, -0.0006599426, -0.0158691406]]], dtype=torch.float16) ------- tensor([[[-0.0119018555, 0.0015945435, -0.0010452271, ..., -0.0003910065, 0.0015029907, -0.0015106201], [-0.0124511719, 0.0184326172, -0.0012817383, ..., -0.0180664062, 0.0018463135, -0.0009002686], [-0.0055541992, 0.0071105957, -0.0132446289, ..., 0.0083007812, -0.0150756836, 0.0220947266], ..., [ 0.0417480469, 0.0034179688, -0.0092773438, ..., 0.0084228516, -0.0086669922, 0.0037231445], [ 0.0212402344, 0.0050964355, -0.0086669922, ..., -0.0082397461, -0.0021514893, -0.0198974609], [-0.0057983398, -0.0078735352, 0.0174560547, ..., -0.0050354004, -0.0006599426, -0.0158691406]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[ 2.5421142578e-02, 7.1875000000e-01, -1.3417968750e+00, ..., -1.7548828125e+00, 1.0058593750e+00, 6.9335937500e-02], [-2.1484375000e+00, -1.4965820312e-01, 2.3027343750e+00, ..., -2.1933593750e+00, -1.9003906250e+00, 4.4128417969e-02], [ 3.6926269531e-03, -2.3867187500e+00, 2.0332031250e+00, ..., 3.6972656250e+00, 9.3066406250e-01, -1.3710937500e+00], ..., [ 2.4707031250e-01, -2.4072265625e-01, -1.0473632812e-01, ..., 2.7539062500e-01, -4.2846679688e-01, -1.5051269531e-01], [-5.1953125000e-01, -7.2363281250e-01, 4.2749023438e-01, ..., 1.3232421875e-01, -8.1494140625e-01, -1.0703125000e+00], [-3.0029296875e-02, 3.1616210938e-02, -5.7556152344e-02, ..., 3.2421875000e-01, -1.5942382812e-01, -2.1948242188e-01]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.7395019531e-03, -7.6904296875e-03, -7.5683593750e-03, ..., 1.0604858398e-03, 3.4332275391e-03, 2.8198242188e-02], [ 2.3803710938e-02, 4.7302246094e-03, 9.0332031250e-03, ..., 2.5756835938e-02, -6.1035156250e-03, 1.6723632812e-02], ..., [-4.9072265625e-02, -2.9683113098e-05, -5.1574707031e-03, ..., 4.0054321289e-04, 3.2806396484e-04, -2.8381347656e-03], [ 1.0681152344e-03, -1.0131835938e-02, 3.5644531250e-02, ..., -1.6967773438e-02, -1.3732910156e-02, -1.6357421875e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.7395019531e-03, -7.6904296875e-03, -7.5683593750e-03, ..., 1.0604858398e-03, 3.4332275391e-03, 2.8198242188e-02], [ 2.3803710938e-02, 4.7302246094e-03, 9.0332031250e-03, ..., 2.5756835938e-02, -6.1035156250e-03, 1.6723632812e-02], ..., [-4.9072265625e-02, -2.9683113098e-05, -5.1574707031e-03, ..., 4.0054321289e-04, 3.2806396484e-04, -2.8381347656e-03], [ 1.0681152344e-03, -1.0131835938e-02, 3.5644531250e-02, ..., -1.6967773438e-02, -1.3732910156e-02, -1.6357421875e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 5.3100585938e-03, -1.2207031250e-03, 1.7456054688e-02, ..., -1.2084960938e-02, -3.1250000000e-02, 1.7623901367e-03], ..., [ 5.4199218750e-02, -2.5749206543e-04, -1.0070800781e-02, ..., 1.0253906250e-02, 4.6997070312e-03, 2.0446777344e-03], [ 2.8991699219e-03, -2.1118164062e-02, -1.0009765625e-02, ..., -1.3916015625e-02, -1.5625000000e-02, 5.6762695312e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 5.3100585938e-03, -1.2207031250e-03, 1.7456054688e-02, ..., -1.2084960938e-02, -3.1250000000e-02, 1.7623901367e-03], ..., [ 5.4199218750e-02, -2.5749206543e-04, -1.0070800781e-02, ..., 1.0253906250e-02, 4.6997070312e-03, 2.0446777344e-03], [ 2.8991699219e-03, -2.1118164062e-02, -1.0009765625e-02, ..., -1.3916015625e-02, -1.5625000000e-02, 5.6762695312e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-3.5552978516e-03, 8.3007812500e-03, 3.0639648438e-02, ..., -4.5898437500e-02, 5.3405761719e-03, 2.5756835938e-02], [-4.9072265625e-02, -2.9683113098e-05, -5.1574707031e-03, ..., 4.0054321289e-04, 3.2806396484e-04, -2.8381347656e-03], ..., [ 3.0395507812e-02, -2.7770996094e-03, 3.9062500000e-03, ..., 1.1474609375e-02, -4.0588378906e-03, 1.8554687500e-02], [ 6.8664550781e-03, 4.1503906250e-03, 3.0273437500e-02, ..., -1.0986328125e-02, 5.0659179688e-03, 1.6601562500e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-3.5552978516e-03, 8.3007812500e-03, 3.0639648438e-02, ..., -4.5898437500e-02, 5.3405761719e-03, 2.5756835938e-02], [-4.9072265625e-02, -2.9683113098e-05, -5.1574707031e-03, ..., 4.0054321289e-04, 3.2806396484e-04, -2.8381347656e-03], ..., [ 3.0395507812e-02, -2.7770996094e-03, 3.9062500000e-03, ..., 1.1474609375e-02, -4.0588378906e-03, 1.8554687500e-02], [ 6.8664550781e-03, 4.1503906250e-03, 3.0273437500e-02, ..., -1.0986328125e-02, 5.0659179688e-03, 1.6601562500e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[-0.4611816406, 2.8691406250, -2.0117187500, ..., 5.3593750000, 0.8193359375, -3.0546875000], [ 1.4003906250, -0.1439208984, 4.0390625000, ..., -1.3525390625, -1.0234375000, 1.0585937500], [-0.6352539062, -0.2695312500, 0.1967773438, ..., 1.5576171875, -2.4101562500, 1.2626953125], ..., [-2.2480468750, -1.4912109375, -3.7675781250, ..., 0.4819335938, 1.0634765625, 2.5625000000], [-0.5502929688, -0.3811035156, -0.1346435547, ..., -0.2130126953, -0.3305664062, -0.5556640625], [ 0.5332031250, -1.7089843750, -0.1943359375, ..., -0.7802734375, -0.5400390625, 1.5888671875]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- .------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 1.0681152344e-03, -1.0131835938e-02, 3.5644531250e-02, ..., -1.6967773438e-02, -1.3732910156e-02, -1.6357421875e-02], ..., [ 1.2084960938e-02, 9.7656250000e-03, -1.6479492188e-02, ..., -8.5067749023e-04, 1.1962890625e-02, 2.9296875000e-03], [-9.1552734375e-03, 8.6669921875e-03, 2.8686523438e-02, ..., -2.4902343750e-02, -2.2216796875e-02, 1.5014648438e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 1.0681152344e-03, -1.0131835938e-02, 3.5644531250e-02, ..., -1.6967773438e-02, -1.3732910156e-02, -1.6357421875e-02], ..., [ 1.2084960938e-02, 9.7656250000e-03, -1.6479492188e-02, ..., -8.5067749023e-04, 1.1962890625e-02, 2.9296875000e-03], [-9.1552734375e-03, 8.6669921875e-03, 2.8686523438e-02, ..., -2.4902343750e-02, -2.2216796875e-02, 1.5014648438e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 5.5541992188e-03, -1.2023925781e-02, -6.8359375000e-03, ..., 1.2817382812e-02, 9.0942382812e-03, -7.1716308594e-03], [-8.4304809570e-04, -1.0803222656e-02, 6.8664550781e-03, ..., -3.6315917969e-03, -2.7954101562e-02, 1.0192871094e-02], ..., [-4.9072265625e-02, -2.9683113098e-05, -5.1574707031e-03, ..., 4.0054321289e-04, 3.2806396484e-04, -2.8381347656e-03], [ 5.3100585938e-03, -1.2207031250e-03, 1.7456054688e-02, ..., -1.2084960938e-02, -3.1250000000e-02, 1.7623901367e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 5.5541992188e-03, -1.2023925781e-02, -6.8359375000e-03, ..., 1.2817382812e-02, 9.0942382812e-03, -7.1716308594e-03], [-8.4304809570e-04, -1.0803222656e-02, 6.8664550781e-03, ..., -3.6315917969e-03, -2.7954101562e-02, 1.0192871094e-02], ..., [-4.9072265625e-02, -2.9683113098e-05, -5.1574707031e-03, ..., 4.0054321289e-04, 3.2806396484e-04, -2.8381347656e-03], [ 5.3100585938e-03, -1.2207031250e-03, 1.7456054688e-02, ..., -1.2084960938e-02, -3.1250000000e-02, 1.7623901367e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[ 1.7675781250, -1.0664062500, -0.6508789062, ..., 0.9580078125, 1.5019531250, -1.6914062500], [-0.5043945312, -1.8154296875, -0.2263183594, ..., -1.5634765625, 0.9941406250, 0.0864257812], [ 0.7875976562, -2.3417968750, 0.3540039062, ..., -0.2065429688, 1.2705078125, 0.2165527344], ..., [-0.6567382812, -0.5854492188, -0.8876953125, ..., -0.2868652344, -0.3029785156, -0.2316894531], [-0.4267578125, -0.2888183594, -0.7734375000, ..., -0.9594726562, 0.3344726562, -0.0925292969], [-0.0974731445, -0.1306152344, -0.4289550781, ..., -0.7089843750, 0.4907226562, 0.6621093750]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[-65504., -65504., -65504., ..., 0., 0., 0.], [ 0., 0., 0., ..., -65504., -65504., -65504.], [-65504., -65504., -65504., ..., -65504., -65504., -65504.], ..., [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 7.9956054688e-03, 2.5878906250e-02, 1.8066406250e-02, ..., -3.4423828125e-02, 9.0332031250e-03, 6.6528320312e-03], [ 1.0253906250e-02, -3.1433105469e-03, 1.8554687500e-02, ..., 1.4404296875e-02, -2.2216796875e-02, 2.5756835938e-02], ..., [ 1.9897460938e-02, 1.1108398438e-02, 1.9653320312e-02, ..., 8.4228515625e-03, -8.6669921875e-03, -9.4604492188e-03], [ 3.6315917969e-03, -2.2949218750e-02, 7.6904296875e-03, ..., -2.2094726562e-02, -2.0874023438e-02, 1.0147094727e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 7.9956054688e-03, 2.5878906250e-02, 1.8066406250e-02, ..., -3.4423828125e-02, 9.0332031250e-03, 6.6528320312e-03], [ 1.0253906250e-02, -3.1433105469e-03, 1.8554687500e-02, ..., 1.4404296875e-02, -2.2216796875e-02, 2.5756835938e-02], ..., [ 1.9897460938e-02, 1.1108398438e-02, 1.9653320312e-02, ..., 8.4228515625e-03, -8.6669921875e-03, -9.4604492188e-03], [ 3.6315917969e-03, -2.2949218750e-02, 7.6904296875e-03, ..., -2.2094726562e-02, -2.0874023438e-02, 1.0147094727e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.6708374023e-03, -4.8522949219e-03, 2.1362304688e-02, ..., -3.0029296875e-02, -6.5002441406e-03, -1.2329101562e-02], [ 1.3488769531e-02, 1.0986328125e-03, 6.6406250000e-02, ..., -2.7847290039e-04, -3.8909912109e-03, 4.6691894531e-03], ..., [ 4.1748046875e-02, 3.4179687500e-03, -9.2773437500e-03, ..., 8.4228515625e-03, -8.6669921875e-03, 3.7231445312e-03], [ 5.4199218750e-02, -2.5749206543e-04, -1.0070800781e-02, ..., 1.0253906250e-02, 4.6997070312e-03, 2.0446777344e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.6708374023e-03, -4.8522949219e-03, 2.1362304688e-02, ..., -3.0029296875e-02, -6.5002441406e-03, -1.2329101562e-02], [ 1.3488769531e-02, 1.0986328125e-03, 6.6406250000e-02, ..., -2.7847290039e-04, -3.8909912109e-03, 4.6691894531e-03], ..., [ 4.1748046875e-02, 3.4179687500e-03, -9.2773437500e-03, ..., 8.4228515625e-03, -8.6669921875e-03, 3.7231445312e-03], [ 5.4199218750e-02, -2.5749206543e-04, -1.0070800781e-02, ..., 1.0253906250e-02, 4.6997070312e-03, 2.0446777344e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 9.1552734375e-03, 2.3315429688e-02, -2.5482177734e-03, ..., 6.7443847656e-03, 1.3366699219e-02, 1.7700195312e-02], [ 1.3366699219e-02, -8.6975097656e-04, 1.8554687500e-02, ..., 8.2397460938e-04, -1.0620117188e-02, 1.4709472656e-02], ..., [ 2.5024414062e-02, 6.1340332031e-03, 8.3618164062e-03, ..., -1.0375976562e-03, 5.7373046875e-03, -1.1718750000e-02], [ 4.5898437500e-02, -1.4572143555e-03, -6.5917968750e-03, ..., 1.5625000000e-02, 2.9754638672e-03, 3.9978027344e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 9.1552734375e-03, 2.3315429688e-02, -2.5482177734e-03, ..., 6.7443847656e-03, 1.3366699219e-02, 1.7700195312e-02], [ 1.3366699219e-02, -8.6975097656e-04, 1.8554687500e-02, ..., 8.2397460938e-04, -1.0620117188e-02, 1.4709472656e-02], ..., [ 2.5024414062e-02, 6.1340332031e-03, 8.3618164062e-03, ..., -1.0375976562e-03, 5.7373046875e-03, -1.1718750000e-02], [ 4.5898437500e-02, -1.4572143555e-03, -6.5917968750e-03, ..., 1.5625000000e-02, 2.9754638672e-03, 3.9978027344e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [-2.0446777344e-03, 1.2756347656e-02, 1.8920898438e-02, ..., 1.4099121094e-02, 4.9743652344e-03, -1.7700195312e-02], ..., [-7.2326660156e-03, 7.5531005859e-04, -1.4953613281e-03, ..., -8.5830688477e-04, -3.4179687500e-03, 4.9438476562e-03], [ 3.4423828125e-02, -8.3160400391e-04, 1.5075683594e-02, ..., -7.5988769531e-03, -1.1779785156e-02, -1.4526367188e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [-2.0446777344e-03, 1.2756347656e-02, 1.8920898438e-02, ..., 1.4099121094e-02, 4.9743652344e-03, -1.7700195312e-02], ..., [-7.2326660156e-03, 7.5531005859e-04, -1.4953613281e-03, ..., -8.5830688477e-04, -3.4179687500e-03, 4.9438476562e-03], [ 3.4423828125e-02, -8.3160400391e-04, 1.5075683594e-02, ..., -7.5988769531e-03, -1.1779785156e-02, -1.4526367188e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 1.0681152344e-03, -1.0131835938e-02, 3.5644531250e-02, ..., -1.6967773438e-02, -1.3732910156e-02, -1.6357421875e-02], ..., [-9.2163085938e-03, 1.0864257812e-02, 1.8066406250e-02, ..., -2.1057128906e-03, -1.0147094727e-03, -2.7313232422e-03], [-2.5146484375e-02, 7.3242187500e-03, 2.6367187500e-02, ..., -2.5177001953e-04, -2.1850585938e-02, -9.1171264648e-04], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 1.0681152344e-03, -1.0131835938e-02, 3.5644531250e-02, ..., -1.6967773438e-02, -1.3732910156e-02, -1.6357421875e-02], ..., [-9.2163085938e-03, 1.0864257812e-02, 1.8066406250e-02, ..., -2.1057128906e-03, -1.0147094727e-03, -2.7313232422e-03], [-2.5146484375e-02, 7.3242187500e-03, 2.6367187500e-02, ..., -2.5177001953e-04, -2.1850585938e-02, -9.1171264648e-04], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 7.9956054688e-03, 2.5878906250e-02, 1.8066406250e-02, ..., -3.4423828125e-02, 9.0332031250e-03, 6.6528320312e-03], [-1.5441894531e-02, 5.5694580078e-04, -1.0314941406e-02, ..., 1.1367797852e-03, 2.2430419922e-03, -2.8991699219e-03], ..., [ 2.1118164062e-02, 4.4860839844e-03, 2.9907226562e-02, ..., -2.5268554688e-02, -9.0332031250e-03, 3.2653808594e-03], [ 8.7738037109e-04, -1.3000488281e-02, 2.3498535156e-03, ..., -3.3111572266e-03, -7.3547363281e-03, -1.0803222656e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 7.9956054688e-03, 2.5878906250e-02, 1.8066406250e-02, ..., -3.4423828125e-02, 9.0332031250e-03, 6.6528320312e-03], [-1.5441894531e-02, 5.5694580078e-04, -1.0314941406e-02, ..., 1.1367797852e-03, 2.2430419922e-03, -2.8991699219e-03], ..., [ 2.1118164062e-02, 4.4860839844e-03, 2.9907226562e-02, ..., -2.5268554688e-02, -9.0332031250e-03, 3.2653808594e-03], [ 8.7738037109e-04, -1.3000488281e-02, 2.3498535156e-03, ..., -3.3111572266e-03, -7.3547363281e-03, -1.0803222656e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 6.0424804688e-03, 4.0588378906e-03, 2.9052734375e-02, ..., -1.9653320312e-02, 5.4931640625e-03, 9.3994140625e-03], [-4.9072265625e-02, -2.9683113098e-05, -5.1574707031e-03, ..., 4.0054321289e-04, 3.2806396484e-04, -2.8381347656e-03], ..., [ 1.2207031250e-02, 6.3476562500e-03, -2.0874023438e-02, ..., 5.3100585938e-03, 1.6357421875e-02, 1.1230468750e-02], [ 1.4221191406e-02, -1.7578125000e-02, 1.6479492188e-02, ..., -2.6733398438e-02, -3.4912109375e-02, 1.9287109375e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 6.0424804688e-03, 4.0588378906e-03, 2.9052734375e-02, ..., -1.9653320312e-02, 5.4931640625e-03, 9.3994140625e-03], [-4.9072265625e-02, -2.9683113098e-05, -5.1574707031e-03, ..., 4.0054321289e-04, 3.2806396484e-04, -2.8381347656e-03], ..., [ 1.2207031250e-02, 6.3476562500e-03, -2.0874023438e-02, ..., 5.3100585938e-03, 1.6357421875e-02, 1.1230468750e-02], [ 1.4221191406e-02, -1.7578125000e-02, 1.6479492188e-02, ..., -2.6733398438e-02, -3.4912109375e-02, 1.9287109375e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 7.8735351562e-03, -6.1798095703e-04, -5.0659179688e-03, ..., -2.3803710938e-02, 2.0629882812e-02, -1.3427734375e-02], [ 8.7890625000e-03, 1.4190673828e-03, -1.7333984375e-02, ..., -1.4648437500e-02, 9.2163085938e-03, 2.9907226562e-02], ..., [-1.9531250000e-02, 1.9531250000e-02, 1.3504028320e-03, ..., -9.8876953125e-03, -9.8266601562e-03, -4.3640136719e-03], [ 1.1749267578e-03, 2.7465820312e-02, -1.2634277344e-02, ..., 4.0893554688e-03, 1.8692016602e-03, -5.3405761719e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 7.8735351562e-03, -6.1798095703e-04, -5.0659179688e-03, ..., -2.3803710938e-02, 2.0629882812e-02, -1.3427734375e-02], [ 8.7890625000e-03, 1.4190673828e-03, -1.7333984375e-02, ..., -1.4648437500e-02, 9.2163085938e-03, 2.9907226562e-02], ..., [-1.9531250000e-02, 1.9531250000e-02, 1.3504028320e-03, ..., -9.8876953125e-03, -9.8266601562e-03, -4.3640136719e-03], [ 1.1749267578e-03, 2.7465820312e-02, -1.2634277344e-02, ..., 4.0893554688e-03, 1.8692016602e-03, -5.3405761719e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- .------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 1.0559082031e-02, 1.5258789062e-02, 7.0495605469e-03, ..., -4.0283203125e-03, -1.5319824219e-02, 3.3691406250e-02], [-3.7353515625e-02, -2.5634765625e-03, 1.0498046875e-02, ..., 8.5449218750e-03, 3.2043457031e-04, -6.1645507812e-03], ..., [ 1.5014648438e-02, 7.1716308594e-03, -1.1657714844e-02, ..., 1.1962890625e-02, 2.2705078125e-02, 9.3994140625e-03], [ 1.2634277344e-02, 2.6550292969e-03, 7.1105957031e-03, ..., -1.9683837891e-03, -4.7683715820e-04, -1.2756347656e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 1.0559082031e-02, 1.5258789062e-02, 7.0495605469e-03, ..., -4.0283203125e-03, -1.5319824219e-02, 3.3691406250e-02], [-3.7353515625e-02, -2.5634765625e-03, 1.0498046875e-02, ..., 8.5449218750e-03, 3.2043457031e-04, -6.1645507812e-03], ..., [ 1.5014648438e-02, 7.1716308594e-03, -1.1657714844e-02, ..., 1.1962890625e-02, 2.2705078125e-02, 9.3994140625e-03], [ 1.2634277344e-02, 2.6550292969e-03, 7.1105957031e-03, ..., -1.9683837891e-03, -4.7683715820e-04, -1.2756347656e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-0.0119018555, 0.0015945435, -0.0010452271, ..., -0.0003910065, 0.0015029907, -0.0015106201], [ 0.0091552734, 0.0233154297, -0.0025482178, ..., 0.0067443848, 0.0133666992, 0.0177001953], [ 0.0133666992, -0.0008697510, 0.0185546875, ..., 0.0008239746, -0.0106201172, 0.0147094727], ..., [ 0.0483398438, 0.0026855469, -0.0017166138, ..., 0.0130615234, 0.0032501221, -0.0050354004], [ 0.0209960938, 0.0045776367, 0.0059814453, ..., -0.0011215210, -0.0222167969, 0.0051269531], [ 0.0029144287, 0.0050354004, 0.0004882812, ..., 0.0006332397, -0.0057373047, 0.0007133484]]], dtype=torch.float16) ------- tensor([[[-0.0119018555, 0.0015945435, -0.0010452271, ..., -0.0003910065, 0.0015029907, -0.0015106201], [ 0.0091552734, 0.0233154297, -0.0025482178, ..., 0.0067443848, 0.0133666992, 0.0177001953], [ 0.0133666992, -0.0008697510, 0.0185546875, ..., 0.0008239746, -0.0106201172, 0.0147094727], ..., [ 0.0483398438, 0.0026855469, -0.0017166138, ..., 0.0130615234, 0.0032501221, -0.0050354004], [ 0.0209960938, 0.0045776367, 0.0059814453, ..., -0.0011215210, -0.0222167969, 0.0051269531], [ 0.0029144287, 0.0050354004, 0.0004882812, ..., 0.0006332397, -0.0057373047, 0.0007133484]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 7.9956054688e-03, 2.5878906250e-02, 1.8066406250e-02, ..., -3.4423828125e-02, 9.0332031250e-03, 6.6528320312e-03], [-9.2773437500e-03, 8.7890625000e-03, 1.3854980469e-02, ..., 9.3994140625e-03, 1.0314941406e-02, 1.1413574219e-02], ..., [ 2.4871826172e-03, -1.6357421875e-02, -2.0294189453e-03, ..., -7.1716308594e-04, -1.6479492188e-03, 9.7656250000e-03], [-1.3549804688e-02, -5.1879882812e-03, -1.0559082031e-02, ..., -1.8066406250e-02, 2.0874023438e-02, -1.4099121094e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 7.9956054688e-03, 2.5878906250e-02, 1.8066406250e-02, ..., -3.4423828125e-02, 9.0332031250e-03, 6.6528320312e-03], [-9.2773437500e-03, 8.7890625000e-03, 1.3854980469e-02, ..., 9.3994140625e-03, 1.0314941406e-02, 1.1413574219e-02], ..., [ 2.4871826172e-03, -1.6357421875e-02, -2.0294189453e-03, ..., -7.1716308594e-04, -1.6479492188e-03, 9.7656250000e-03], [-1.3549804688e-02, -5.1879882812e-03, -1.0559082031e-02, ..., -1.8066406250e-02, 2.0874023438e-02, -1.4099121094e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 2.4536132812e-02, -1.4190673828e-03, 2.6123046875e-02, ..., 1.5945434570e-03, 6.2866210938e-03, -1.2329101562e-02], [ 9.1552734375e-03, 2.3315429688e-02, -2.5482177734e-03, ..., 6.7443847656e-03, 1.3366699219e-02, 1.7700195312e-02], ..., [-7.0190429688e-03, -1.8997192383e-03, 1.1352539062e-02, ..., -2.0751953125e-02, 5.4931640625e-03, 1.6601562500e-02], [-3.2196044922e-03, -1.5075683594e-02, -4.8217773438e-03, ..., -7.4462890625e-03, -1.5319824219e-02, 4.0527343750e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 2.4536132812e-02, -1.4190673828e-03, 2.6123046875e-02, ..., 1.5945434570e-03, 6.2866210938e-03, -1.2329101562e-02], [ 9.1552734375e-03, 2.3315429688e-02, -2.5482177734e-03, ..., 6.7443847656e-03, 1.3366699219e-02, 1.7700195312e-02], ..., [-7.0190429688e-03, -1.8997192383e-03, 1.1352539062e-02, ..., -2.0751953125e-02, 5.4931640625e-03, 1.6601562500e-02], [-3.2196044922e-03, -1.5075683594e-02, -4.8217773438e-03, ..., -7.4462890625e-03, -1.5319824219e-02, 4.0527343750e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-9.7656250000e-03, -7.2021484375e-03, -2.9174804688e-02, ..., 4.8217773438e-03, -1.0131835938e-02, 2.4414062500e-02], [-4.9072265625e-02, -2.9683113098e-05, -5.1574707031e-03, ..., 4.0054321289e-04, 3.2806396484e-04, -2.8381347656e-03], ..., [-8.2397460938e-03, 7.4768066406e-04, 8.7890625000e-03, ..., 2.1240234375e-02, -2.0019531250e-02, -2.9174804688e-02], [-8.3007812500e-03, -2.2583007812e-03, -1.6357421875e-02, ..., 2.9541015625e-02, -4.9438476562e-03, 2.3437500000e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-9.7656250000e-03, -7.2021484375e-03, -2.9174804688e-02, ..., 4.8217773438e-03, -1.0131835938e-02, 2.4414062500e-02], [-4.9072265625e-02, -2.9683113098e-05, -5.1574707031e-03, ..., 4.0054321289e-04, 3.2806396484e-04, -2.8381347656e-03], ..., [-8.2397460938e-03, 7.4768066406e-04, 8.7890625000e-03, ..., 2.1240234375e-02, -2.0019531250e-02, -2.9174804688e-02], [-8.3007812500e-03, -2.2583007812e-03, -1.6357421875e-02, ..., 2.9541015625e-02, -4.9438476562e-03, 2.3437500000e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[-512.0000000000, -7.4960937500, -512.0000000000, ..., 0.0000000000, 0.0000000000, 0.0000000000], [ 0.0000000000, 0.0000000000, 0.0000000000, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], ..., [ 0.0000000000, 0.0000000000, 0.0000000000, ..., 0.0000000000, 0.0000000000, 0.0000000000], [ 0.0000000000, 0.0000000000, 0.0000000000, ..., 0.0000000000, 0.0000000000, 0.0000000000], [ 0.0000000000, 0.0000000000, 0.0000000000, ..., 0.0000000000, 0.0000000000, 0.0000000000]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [-1.9775390625e-02, -1.7822265625e-02, 2.3437500000e-02, ..., -1.3427734375e-02, -4.1809082031e-03, -1.0620117188e-02], ..., [-1.5441894531e-02, 5.5694580078e-04, -1.0314941406e-02, ..., 1.1367797852e-03, 2.2430419922e-03, -2.8991699219e-03], [ 1.5075683594e-02, -9.3383789062e-03, 1.7211914062e-02, ..., -1.5945434570e-03, -2.0629882812e-02, 8.3618164062e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [-1.9775390625e-02, -1.7822265625e-02, 2.3437500000e-02, ..., -1.3427734375e-02, -4.1809082031e-03, -1.0620117188e-02], ..., [-1.5441894531e-02, 5.5694580078e-04, -1.0314941406e-02, ..., 1.1367797852e-03, 2.2430419922e-03, -2.8991699219e-03], [ 1.5075683594e-02, -9.3383789062e-03, 1.7211914062e-02, ..., -1.5945434570e-03, -2.0629882812e-02, 8.3618164062e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[ 3.3203125000e-01, 3.5009765625e-01, -7.4267578125e-01, ..., -1.1962890625e+00, -2.0312500000e+00, 1.2333984375e+00], [ 7.3144531250e-01, -2.8457031250e+00, 5.9521484375e-01, ..., -1.9804687500e+00, -9.7656250000e-04, 5.6933593750e-01], [-5.2441406250e-01, -1.3144531250e+00, 6.6699218750e-01, ..., -7.3144531250e-01, -6.7773437500e-01, 2.3886718750e+00], ..., [-2.5756835938e-01, -5.2441406250e-01, -1.3171386719e-01, ..., -9.8632812500e-02, -6.0107421875e-01, 2.7270507812e-01], [-1.6625976562e-01, -2.4499511719e-01, -3.7866210938e-01, ..., -1.6455078125e-01, -3.7524414062e-01, 3.8635253906e-02], [-5.1220703125e-01, -3.1054687500e-01, -5.6396484375e-02, ..., -2.3535156250e-01, -9.0942382812e-02, 1.9360351562e-01]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[-65504., -65504., -65504., ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], ..., [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-9.7656250000e-03, -7.2021484375e-03, -2.9174804688e-02, ..., 4.8217773438e-03, -1.0131835938e-02, 2.4414062500e-02], [-4.9072265625e-02, -2.9683113098e-05, -5.1574707031e-03, ..., 4.0054321289e-04, 3.2806396484e-04, -2.8381347656e-03], ..., [-5.7678222656e-03, -3.3935546875e-02, 6.2255859375e-03, ..., 2.8442382812e-02, 9.4604492188e-03, 2.0751953125e-02], [ 3.7536621094e-03, 2.5024414062e-03, 1.1215209961e-03, ..., -5.7067871094e-03, -1.8188476562e-02, -2.7709960938e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-9.7656250000e-03, -7.2021484375e-03, -2.9174804688e-02, ..., 4.8217773438e-03, -1.0131835938e-02, 2.4414062500e-02], [-4.9072265625e-02, -2.9683113098e-05, -5.1574707031e-03, ..., 4.0054321289e-04, 3.2806396484e-04, -2.8381347656e-03], ..., [-5.7678222656e-03, -3.3935546875e-02, 6.2255859375e-03, ..., 2.8442382812e-02, 9.4604492188e-03, 2.0751953125e-02], [ 3.7536621094e-03, 2.5024414062e-03, 1.1215209961e-03, ..., -5.7067871094e-03, -1.8188476562e-02, -2.7709960938e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[-0.3444824219, 1.0429687500, 1.7890625000, ..., 1.8681640625, -1.5263671875, -0.9838867188], [-2.7695312500, 1.6621093750, 1.3447265625, ..., -4.7734375000, 0.0698242188, -2.1835937500], [ 0.4252929688, 0.1026611328, 1.2578125000, ..., 0.3854980469, -1.4316406250, -1.3486328125], ..., [-0.8652343750, -1.8232421875, -0.7700195312, ..., 2.5332031250, 1.3261718750, 1.4326171875], [-0.4956054688, -0.3552246094, 0.0700073242, ..., -0.3266601562, 0.0793457031, 0.2575683594], [-0.6142578125, -1.0625000000, -0.3342285156, ..., 0.2753906250, -0.2166748047, -0.1727294922]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[-512.0000000000, -7.4960937500, -512.0000000000, ..., 0.0000000000, 0.0000000000, 0.0000000000], [ 0.0000000000, 0.0000000000, 0.0000000000, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], ..., [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-0.0119018555, 0.0015945435, -0.0010452271, ..., -0.0003910065, 0.0015029907, -0.0015106201], [ 0.0091552734, 0.0233154297, -0.0025482178, ..., 0.0067443848, 0.0133666992, 0.0177001953], [ 0.0133666992, -0.0008697510, 0.0185546875, ..., 0.0008239746, -0.0106201172, 0.0147094727], ..., [-0.0059814453, -0.0067749023, -0.0246582031, ..., 0.0079345703, 0.0133056641, 0.0043945312], [ 0.0407714844, 0.0127563477, 0.0211181641, ..., 0.0054321289, 0.0093994141, -0.0145263672], [ 0.0094604492, -0.0095825195, -0.0241699219, ..., -0.0134887695, -0.0072326660, 0.0074157715]]], dtype=torch.float16) ------- tensor([[[-0.0119018555, 0.0015945435, -0.0010452271, ..., -0.0003910065, 0.0015029907, -0.0015106201], [ 0.0091552734, 0.0233154297, -0.0025482178, ..., 0.0067443848, 0.0133666992, 0.0177001953], [ 0.0133666992, -0.0008697510, 0.0185546875, ..., 0.0008239746, -0.0106201172, 0.0147094727], ..., [-0.0059814453, -0.0067749023, -0.0246582031, ..., 0.0079345703, 0.0133056641, 0.0043945312], [ 0.0407714844, 0.0127563477, 0.0211181641, ..., 0.0054321289, 0.0093994141, -0.0145263672], [ 0.0094604492, -0.0095825195, -0.0241699219, ..., -0.0134887695, -0.0072326660, 0.0074157715]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[-0.9624023438, -0.5078125000, 1.2578125000, ..., 1.2988281250, -0.4765625000, -1.9433593750], [-1.5029296875, -2.8437500000, 0.4162597656, ..., -1.7167968750, -0.3393554688, 0.4558105469], [-0.9589843750, 0.5791015625, 1.0273437500, ..., 0.2763671875, 0.7875976562, -0.2375488281], ..., [-0.6503906250, -0.4538574219, -0.3691406250, ..., -0.1927490234, 0.0711669922, -0.3635253906], [ 0.0161132812, -0.9204101562, -0.2246093750, ..., -0.2690429688, 0.0480957031, 0.0792236328], [-0.4648437500, -2.2031250000, -0.5390625000, ..., 0.4575195312, 3.0722656250, 0.7041015625]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-3.1738281250e-02, -3.6315917969e-03, -2.3437500000e-02, ..., -3.8574218750e-02, -1.4404296875e-02, -1.2023925781e-02], [-9.0332031250e-02, -2.3803710938e-03, -4.2724609375e-03, ..., -1.8081665039e-03, -1.6250610352e-03, -9.9182128906e-04], [-2.7465820312e-03, 2.1972656250e-02, 1.9165039062e-02, ..., -1.6479492188e-02, 2.6550292969e-03, 1.0833740234e-03], ..., [-3.3691406250e-02, 1.3198852539e-03, -1.4221191406e-02, ..., -1.5869140625e-03, -1.1657714844e-02, 1.5869140625e-02], [ 1.7944335938e-02, 2.1240234375e-02, -3.9550781250e-02, ..., -7.0800781250e-03, -1.9409179688e-02, 2.3437500000e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-3.1738281250e-02, -3.6315917969e-03, -2.3437500000e-02, ..., -3.8574218750e-02, -1.4404296875e-02, -1.2023925781e-02], [-9.0332031250e-02, -2.3803710938e-03, -4.2724609375e-03, ..., -1.8081665039e-03, -1.6250610352e-03, -9.9182128906e-04], [-2.7465820312e-03, 2.1972656250e-02, 1.9165039062e-02, ..., -1.6479492188e-02, 2.6550292969e-03, 1.0833740234e-03], ..., [-3.3691406250e-02, 1.3198852539e-03, -1.4221191406e-02, ..., -1.5869140625e-03, -1.1657714844e-02, 1.5869140625e-02], [ 1.7944335938e-02, 2.1240234375e-02, -3.9550781250e-02, ..., -7.0800781250e-03, -1.9409179688e-02, 2.3437500000e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[ 0.5483398438, 0.6240234375, 0.2263183594, ..., -0.0869140625, -0.1269531250, -0.3583984375], [ 0.8442382812, -1.0791015625, 1.4033203125, ..., -3.1796875000, 1.1318359375, 2.6523437500], [ 1.1708984375, -0.5502929688, 0.6254882812, ..., -0.1043701172, 0.7880859375, 2.7734375000], ..., [-0.4348144531, 0.2968750000, -0.3969726562, ..., -0.2233886719, -0.0921630859, -0.2622070312], [-0.6494140625, 0.6464843750, -0.2844238281, ..., 0.1682128906, 0.1643066406, 0.0490722656], [-0.1276855469, 0.3291015625, -0.6074218750, ..., -0.2045898438, 0.1060791016, -0.1083984375]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[-65504., -65504., -65504., ..., 0., 0., 0.], [ 0., 0., 0., ..., 0., 0., 0.], [ 0., 0., 0., ..., 0., 0., 0.], ..., [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-3.1738281250e-02, -3.6315917969e-03, -2.3437500000e-02, ..., -3.8574218750e-02, -1.4404296875e-02, -1.2023925781e-02], [-7.2326660156e-03, 7.5531005859e-04, -1.4953613281e-03, ..., -8.5830688477e-04, -3.4179687500e-03, 4.9438476562e-03], [ 1.9287109375e-02, -2.0507812500e-02, 2.0874023438e-02, ..., -4.2968750000e-02, 6.2561035156e-03, -2.6397705078e-03], ..., [-3.3691406250e-02, 1.3198852539e-03, -1.4221191406e-02, ..., -1.5869140625e-03, -1.1657714844e-02, 1.5869140625e-02], [ 1.7944335938e-02, 2.1240234375e-02, -3.9550781250e-02, ..., -7.0800781250e-03, -1.9409179688e-02, 2.3437500000e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-3.1738281250e-02, -3.6315917969e-03, -2.3437500000e-02, ..., -3.8574218750e-02, -1.4404296875e-02, -1.2023925781e-02], [-7.2326660156e-03, 7.5531005859e-04, -1.4953613281e-03, ..., -8.5830688477e-04, -3.4179687500e-03, 4.9438476562e-03], [ 1.9287109375e-02, -2.0507812500e-02, 2.0874023438e-02, ..., -4.2968750000e-02, 6.2561035156e-03, -2.6397705078e-03], ..., [-3.3691406250e-02, 1.3198852539e-03, -1.4221191406e-02, ..., -1.5869140625e-03, -1.1657714844e-02, 1.5869140625e-02], [ 1.7944335938e-02, 2.1240234375e-02, -3.9550781250e-02, ..., -7.0800781250e-03, -1.9409179688e-02, 2.3437500000e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[ 9.5581054688e-02, 4.0747070312e-01, 2.0104980469e-01, ..., -2.9492187500e-01, -1.9140625000e-01, -3.1591796875e-01], [ 8.3056640625e-01, 1.0742187500e+00, -2.2910156250e+00, ..., 1.3842773438e-01, 3.4277343750e-01, 4.8486328125e-01], [-3.3496093750e-01, -4.5825195312e-01, -1.3085937500e+00, ..., 4.7668457031e-02, 1.2512207031e-01, 1.6442871094e-01], ..., [-4.4140625000e-01, 2.7050781250e-01, 9.7656250000e-04, ..., -6.9482421875e-01, 2.1166992188e-01, 5.0634765625e-01], [-2.8710937500e-01, -4.3762207031e-02, -2.3950195312e-01, ..., 1.6894531250e-01, 3.5351562500e-01, 2.4719238281e-01], [-1.7529296875e-01, 2.5488281250e-01, -4.2211914062e-01, ..., -6.2255859375e-02, -5.3710937500e-03, 5.7128906250e-01]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[-65504., -65504., -65504., ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], ..., [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- .------- tensor([[[-3.1738281250e-02, -3.6315917969e-03, -2.3437500000e-02, ..., -3.8574218750e-02, -1.4404296875e-02, -1.2023925781e-02], [-7.5683593750e-03, -1.2664794922e-03, -9.0942382812e-03, ..., 6.5917968750e-03, -5.5236816406e-03, -1.1108398438e-02], [ 9.1552734375e-03, 2.3315429688e-02, -2.5482177734e-03, ..., 6.7443847656e-03, 1.3366699219e-02, 1.7700195312e-02], ..., [-1.6723632812e-02, -3.1738281250e-03, -6.6528320312e-03, ..., -1.2695312500e-02, 1.2817382812e-02, 1.4648437500e-03], [-9.0332031250e-02, -2.3803710938e-03, -4.2724609375e-03, ..., -1.8081665039e-03, -1.6250610352e-03, -9.9182128906e-04], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-3.1738281250e-02, -3.6315917969e-03, -2.3437500000e-02, ..., -3.8574218750e-02, -1.4404296875e-02, -1.2023925781e-02], [-7.5683593750e-03, -1.2664794922e-03, -9.0942382812e-03, ..., 6.5917968750e-03, -5.5236816406e-03, -1.1108398438e-02], [ 9.1552734375e-03, 2.3315429688e-02, -2.5482177734e-03, ..., 6.7443847656e-03, 1.3366699219e-02, 1.7700195312e-02], ..., [-1.6723632812e-02, -3.1738281250e-03, -6.6528320312e-03, ..., -1.2695312500e-02, 1.2817382812e-02, 1.4648437500e-03], [-9.0332031250e-02, -2.3803710938e-03, -4.2724609375e-03, ..., -1.8081665039e-03, -1.6250610352e-03, -9.9182128906e-04], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[ 0.9628906250, 0.3212890625, -1.9863281250, ..., 3.2304687500, 2.4980468750, -1.6650390625], [-1.0234375000, 0.0256347656, 1.3105468750, ..., -0.2279052734, 1.6162109375, 1.4726562500], [ 0.8505859375, 0.7641601562, 2.1132812500, ..., 0.8789062500, 2.3964843750, 2.4023437500], ..., [ 0.0882568359, 0.2756347656, -0.8154296875, ..., 0.1193847656, -0.1929931641, -0.1730957031], [ 0.1696777344, -0.0462646484, -0.2418212891, ..., 0.4882812500, -0.2060546875, 0.5708007812], [ 0.2705078125, 0.1927490234, -0.0051040649, ..., 0.2379150391, -0.3107910156, 0.7065429688]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[-65504., -65504., -65504., ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], ..., [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-3.1738281250e-02, -3.6315917969e-03, -2.3437500000e-02, ..., -3.8574218750e-02, -1.4404296875e-02, -1.2023925781e-02], [ 2.1240234375e-02, 5.0964355469e-03, -8.6669921875e-03, ..., -8.2397460938e-03, -2.1514892578e-03, -1.9897460938e-02], [ 9.1552734375e-03, 2.3315429688e-02, -2.5482177734e-03, ..., 6.7443847656e-03, 1.3366699219e-02, 1.7700195312e-02], ..., [ 7.4768066406e-03, -2.6733398438e-02, 3.5156250000e-02, ..., -1.1672973633e-03, 1.0253906250e-02, 5.6457519531e-03], [-3.8146972656e-03, -2.6855468750e-02, -4.8522949219e-03, ..., -1.8676757812e-02, 1.2130737305e-03, 1.8676757812e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-3.1738281250e-02, -3.6315917969e-03, -2.3437500000e-02, ..., -3.8574218750e-02, -1.4404296875e-02, -1.2023925781e-02], [ 2.1240234375e-02, 5.0964355469e-03, -8.6669921875e-03, ..., -8.2397460938e-03, -2.1514892578e-03, -1.9897460938e-02], [ 9.1552734375e-03, 2.3315429688e-02, -2.5482177734e-03, ..., 6.7443847656e-03, 1.3366699219e-02, 1.7700195312e-02], ..., [ 7.4768066406e-03, -2.6733398438e-02, 3.5156250000e-02, ..., -1.1672973633e-03, 1.0253906250e-02, 5.6457519531e-03], [-3.8146972656e-03, -2.6855468750e-02, -4.8522949219e-03, ..., -1.8676757812e-02, 1.2130737305e-03, 1.8676757812e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[ 0.2442626953, 0.6079101562, 0.2220458984, ..., 0.0783691406, -1.1367187500, -0.4853515625], [ 0.4460449219, -0.2028808594, -0.7382812500, ..., -0.1939697266, 0.0634765625, 0.6850585938], [ 0.4611816406, -0.5341796875, 0.4531250000, ..., -0.1877441406, 1.8212890625, 2.0097656250], ..., [-0.1067504883, -0.2036132812, -0.2365722656, ..., 0.3510742188, 0.1472167969, 0.0203552246], [ 0.1171264648, -0.3452148438, -0.5737304688, ..., 0.1052246094, 0.2099609375, -0.0270385742], [-0.3466796875, -0.0289306641, -0.3696289062, ..., 0.2189941406, 0.1966552734, 0.3632812500]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[-65504., -65504., -65504., ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], ..., [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-3.1738281250e-02, -3.6315917969e-03, -2.3437500000e-02, ..., -3.8574218750e-02, -1.4404296875e-02, -1.2023925781e-02], [ 1.2634277344e-02, 2.6550292969e-03, 7.1105957031e-03, ..., -1.9683837891e-03, -4.7683715820e-04, -1.2756347656e-02], [-1.0498046875e-02, 1.9042968750e-02, 1.0925292969e-02, ..., 3.0212402344e-03, 1.8676757812e-02, 1.6601562500e-02], ..., [ 7.4768066406e-03, -2.6733398438e-02, 3.5156250000e-02, ..., -1.1672973633e-03, 1.0253906250e-02, 5.6457519531e-03], [-3.8146972656e-03, -2.6855468750e-02, -4.8522949219e-03, ..., -1.8676757812e-02, 1.2130737305e-03, 1.8676757812e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-3.1738281250e-02, -3.6315917969e-03, -2.3437500000e-02, ..., -3.8574218750e-02, -1.4404296875e-02, -1.2023925781e-02], [ 1.2634277344e-02, 2.6550292969e-03, 7.1105957031e-03, ..., -1.9683837891e-03, -4.7683715820e-04, -1.2756347656e-02], [-1.0498046875e-02, 1.9042968750e-02, 1.0925292969e-02, ..., 3.0212402344e-03, 1.8676757812e-02, 1.6601562500e-02], ..., [ 7.4768066406e-03, -2.6733398438e-02, 3.5156250000e-02, ..., -1.1672973633e-03, 1.0253906250e-02, 5.6457519531e-03], [-3.8146972656e-03, -2.6855468750e-02, -4.8522949219e-03, ..., -1.8676757812e-02, 1.2130737305e-03, 1.8676757812e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[ 0.0284118652, 1.1201171875, 0.4645996094, ..., -0.1074218750, -0.3730468750, -0.0762939453], [ 0.3530273438, 1.1513671875, 0.3955078125, ..., 0.1126708984, -0.4353027344, -1.1347656250], [ 0.2070312500, -2.1972656250, 0.7084960938, ..., 0.8642578125, 1.0605468750, 1.5048828125], ..., [ 0.6083984375, 0.1029052734, -0.5620117188, ..., 0.3154296875, -0.1080932617, 0.0324096680], [ 0.4636230469, 0.1179199219, -0.2316894531, ..., -0.4050292969, -0.0203247070, 0.4633789062], [-0.4189453125, -0.0289306641, -0.4184570312, ..., -0.2861328125, 0.0611877441, -0.2976074219]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-3.1738281250e-02, -3.6315917969e-03, -2.3437500000e-02, ..., -3.8574218750e-02, -1.4404296875e-02, -1.2023925781e-02], [ 4.1748046875e-02, 3.4179687500e-03, -9.2773437500e-03, ..., 8.4228515625e-03, -8.6669921875e-03, 3.7231445312e-03], [ 9.1552734375e-03, 2.3315429688e-02, -2.5482177734e-03, ..., 6.7443847656e-03, 1.3366699219e-02, 1.7700195312e-02], ..., [ 2.2216796875e-02, 1.8066406250e-02, -2.5268554688e-02, ..., 6.5612792969e-03, -1.2939453125e-02, 1.2664794922e-03], [ 2.9449462891e-03, -1.1108398438e-02, -1.8920898438e-02, ..., -1.8798828125e-02, 2.4047851562e-02, 6.1340332031e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-3.1738281250e-02, -3.6315917969e-03, -2.3437500000e-02, ..., -3.8574218750e-02, -1.4404296875e-02, -1.2023925781e-02], [ 4.1748046875e-02, 3.4179687500e-03, -9.2773437500e-03, ..., 8.4228515625e-03, -8.6669921875e-03, 3.7231445312e-03], [ 9.1552734375e-03, 2.3315429688e-02, -2.5482177734e-03, ..., 6.7443847656e-03, 1.3366699219e-02, 1.7700195312e-02], ..., [ 2.2216796875e-02, 1.8066406250e-02, -2.5268554688e-02, ..., 6.5612792969e-03, -1.2939453125e-02, 1.2664794922e-03], [ 2.9449462891e-03, -1.1108398438e-02, -1.8920898438e-02, ..., -1.8798828125e-02, 2.4047851562e-02, 6.1340332031e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[-4.2749023438e-01, -1.0876464844e-01, -1.5051269531e-01, ..., -3.4350585938e-01, -1.9989013672e-02, -4.4616699219e-02], [ 7.5000000000e-01, 1.4443359375e+00, -3.9233398438e-01, ..., -4.5727539062e-01, -9.5336914062e-02, -5.0195312500e-01], [ 8.7402343750e-01, -1.0126953125e+00, 1.1523437500e+00, ..., -4.9438476562e-02, 8.1103515625e-01, 2.2734375000e+00], ..., [-1.5686035156e-01, -1.4245605469e-01, -2.4291992188e-02, ..., -2.3925781250e-01, 1.8066406250e-02, -3.5937500000e-01], [ 1.0429687500e+00, 1.1289062500e+00, 1.7607421875e+00, ..., 8.3691406250e-01, 1.9931640625e+00, 4.4765625000e+00], [-3.0786132812e-01, 1.4880371094e-01, -7.0507812500e-01, ..., 2.1667480469e-03, 2.4389648438e-01, 4.3652343750e-01]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[-65504., -65504., -65504., ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], ..., [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-3.1738281250e-02, -3.6315917969e-03, -2.3437500000e-02, ..., -3.8574218750e-02, -1.4404296875e-02, -1.2023925781e-02], [-1.6723632812e-02, -3.1738281250e-03, -6.6528320312e-03, ..., -1.2695312500e-02, 1.2817382812e-02, 1.4648437500e-03], [ 2.3193359375e-02, -2.1667480469e-03, -9.7656250000e-03, ..., 1.2817382812e-02, 1.1352539062e-02, 4.0588378906e-03], ..., [ 9.3383789062e-03, 2.7587890625e-02, -1.1230468750e-02, ..., 1.7211914062e-02, -7.9345703125e-03, 1.4465332031e-02], [ 2.9449462891e-03, -1.1108398438e-02, -1.8920898438e-02, ..., -1.8798828125e-02, 2.4047851562e-02, 6.1340332031e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-3.1738281250e-02, -3.6315917969e-03, -2.3437500000e-02, ..., -3.8574218750e-02, -1.4404296875e-02, -1.2023925781e-02], [-1.6723632812e-02, -3.1738281250e-03, -6.6528320312e-03, ..., -1.2695312500e-02, 1.2817382812e-02, 1.4648437500e-03], [ 2.3193359375e-02, -2.1667480469e-03, -9.7656250000e-03, ..., 1.2817382812e-02, 1.1352539062e-02, 4.0588378906e-03], ..., [ 9.3383789062e-03, 2.7587890625e-02, -1.1230468750e-02, ..., 1.7211914062e-02, -7.9345703125e-03, 1.4465332031e-02], [ 2.9449462891e-03, -1.1108398438e-02, -1.8920898438e-02, ..., -1.8798828125e-02, 2.4047851562e-02, 6.1340332031e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[-0.2636718750, 0.8437500000, -0.0212402344, ..., -0.3330078125, -0.6123046875, -0.1538085938], [ 1.6005859375, -0.8256835938, 0.2575683594, ..., -0.0908203125, -0.5249023438, 0.6420898438], [ 1.9013671875, 1.7480468750, -0.6816406250, ..., 4.3515625000, 1.5039062500, 2.3007812500], ..., [-0.2583007812, -0.2154541016, -0.2399902344, ..., 0.0930786133, 0.1721191406, 0.4067382812], [-0.1094970703, 0.2062988281, -0.1739501953, ..., -0.3452148438, 0.0653076172, 0.0270996094], [ 0.3974609375, 0.0529785156, -0.4438476562, ..., 0.2524414062, -0.1716308594, 0.4353027344]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[-65504., -65504., -65504., ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], ..., [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-3.1738281250e-02, -3.6315917969e-03, -2.3437500000e-02, ..., -3.8574218750e-02, -1.4404296875e-02, -1.2023925781e-02], [ 1.5014648438e-02, 7.1716308594e-03, -1.1657714844e-02, ..., 1.1962890625e-02, 2.2705078125e-02, 9.3994140625e-03], [-2.5268554688e-02, -1.5319824219e-02, -1.9531250000e-02, ..., 6.9274902344e-03, 8.9111328125e-03, 2.6550292969e-03], ..., [-3.3691406250e-02, 1.3198852539e-03, -1.4221191406e-02, ..., -1.5869140625e-03, -1.1657714844e-02, 1.5869140625e-02], [ 1.7944335938e-02, 2.1240234375e-02, -3.9550781250e-02, ..., -7.0800781250e-03, -1.9409179688e-02, 2.3437500000e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-3.1738281250e-02, -3.6315917969e-03, -2.3437500000e-02, ..., -3.8574218750e-02, -1.4404296875e-02, -1.2023925781e-02], [ 1.5014648438e-02, 7.1716308594e-03, -1.1657714844e-02, ..., 1.1962890625e-02, 2.2705078125e-02, 9.3994140625e-03], [-2.5268554688e-02, -1.5319824219e-02, -1.9531250000e-02, ..., 6.9274902344e-03, 8.9111328125e-03, 2.6550292969e-03], ..., [-3.3691406250e-02, 1.3198852539e-03, -1.4221191406e-02, ..., -1.5869140625e-03, -1.1657714844e-02, 1.5869140625e-02], [ 1.7944335938e-02, 2.1240234375e-02, -3.9550781250e-02, ..., -7.0800781250e-03, -1.9409179688e-02, 2.3437500000e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[ 0.0209350586, 0.4465332031, 0.5366210938, ..., -0.2666015625, -0.2993164062, -0.4318847656], [-0.7656250000, -0.1882324219, -0.5957031250, ..., -0.7001953125, -0.3461914062, 0.2017822266], [ 0.6572265625, -0.8549804688, 0.3471679688, ..., -2.7558593750, 0.3410644531, 2.2011718750], ..., [-1.0019531250, 0.3530273438, -0.6220703125, ..., -0.4714355469, -0.3776855469, -0.2197265625], [-0.4448242188, 0.2844238281, 0.0415039062, ..., 0.1014404297, 0.2910156250, -0.5883789062], [-0.0909423828, 0.1695556641, -0.3996582031, ..., 0.0053176880, 0.0296630859, 0.4438476562]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[-65504., -65504., -65504., ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], ..., [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-3.1738281250e-02, -3.6315917969e-03, -2.3437500000e-02, ..., -3.8574218750e-02, -1.4404296875e-02, -1.2023925781e-02], [ 3.4423828125e-02, -8.3160400391e-04, 1.5075683594e-02, ..., -7.5988769531e-03, -1.1779785156e-02, -1.4526367188e-02], [-1.7944335938e-02, -2.2094726562e-02, 1.3366699219e-02, ..., 9.5825195312e-03, -1.9042968750e-02, -1.0253906250e-02], ..., [ 2.2216796875e-02, 1.8066406250e-02, -2.5268554688e-02, ..., 6.5612792969e-03, -1.2939453125e-02, 1.2664794922e-03], [ 2.9449462891e-03, -1.1108398438e-02, -1.8920898438e-02, ..., -1.8798828125e-02, 2.4047851562e-02, 6.1340332031e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-3.1738281250e-02, -3.6315917969e-03, -2.3437500000e-02, ..., -3.8574218750e-02, -1.4404296875e-02, -1.2023925781e-02], [ 3.4423828125e-02, -8.3160400391e-04, 1.5075683594e-02, ..., -7.5988769531e-03, -1.1779785156e-02, -1.4526367188e-02], [-1.7944335938e-02, -2.2094726562e-02, 1.3366699219e-02, ..., 9.5825195312e-03, -1.9042968750e-02, -1.0253906250e-02], ..., [ 2.2216796875e-02, 1.8066406250e-02, -2.5268554688e-02, ..., 6.5612792969e-03, -1.2939453125e-02, 1.2664794922e-03], [ 2.9449462891e-03, -1.1108398438e-02, -1.8920898438e-02, ..., -1.8798828125e-02, 2.4047851562e-02, 6.1340332031e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[-0.2565917969, 0.4855957031, 0.4235839844, ..., -0.4543457031, -0.9135742188, -0.2609863281], [ 0.2524414062, 0.0737915039, -0.9331054688, ..., -0.2827148438, 0.7001953125, -0.6572265625], [ 0.7709960938, -1.1503906250, -0.4453125000, ..., -0.1975097656, 0.3725585938, 2.4765625000], ..., [-0.1765136719, 0.1202392578, -0.1799316406, ..., -0.3823242188, 0.1323242188, -0.3549804688], [ 0.0320434570, 0.2012939453, -0.1497802734, ..., -0.4980468750, 0.0661010742, 0.1383056641], [ 0.3564453125, 0.0657958984, -0.3334960938, ..., -0.0328063965, 0.1800537109, 0.6962890625]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[-65504., -65504., -65504., ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], ..., [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 9.1552734375e-03, 2.3315429688e-02, -2.5482177734e-03, ..., 6.7443847656e-03, 1.3366699219e-02, 1.7700195312e-02], [ 1.3366699219e-02, -8.6975097656e-04, 1.8554687500e-02, ..., 8.2397460938e-04, -1.0620117188e-02, 1.4709472656e-02], ..., [ 2.2277832031e-03, -1.9653320312e-02, 3.5400390625e-02, ..., -2.3193359375e-02, -3.3691406250e-02, -3.5156250000e-02], [-1.5075683594e-02, -3.3691406250e-02, -3.6468505859e-03, ..., -9.8266601562e-03, 9.7656250000e-03, 1.0620117188e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 9.1552734375e-03, 2.3315429688e-02, -2.5482177734e-03, ..., 6.7443847656e-03, 1.3366699219e-02, 1.7700195312e-02], [ 1.3366699219e-02, -8.6975097656e-04, 1.8554687500e-02, ..., 8.2397460938e-04, -1.0620117188e-02, 1.4709472656e-02], ..., [ 2.2277832031e-03, -1.9653320312e-02, 3.5400390625e-02, ..., -2.3193359375e-02, -3.3691406250e-02, -3.5156250000e-02], [-1.5075683594e-02, -3.3691406250e-02, -3.6468505859e-03, ..., -9.8266601562e-03, 9.7656250000e-03, 1.0620117188e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[-1.2666015625, 1.1787109375, 0.5502929688, ..., 2.8046875000, 1.2226562500, 0.3129882812], [-1.1455078125, 0.3935546875, 4.0195312500, ..., 2.3789062500, 1.8525390625, -1.1523437500], [-1.2597656250, 1.2070312500, 0.6113281250, ..., 2.8046875000, 1.2402343750, 0.3361816406], ..., [-0.0787963867, -0.1614990234, -0.2631835938, ..., 0.0612182617, -0.3022460938, -0.3234863281], [-0.7128906250, -1.6318359375, 2.3574218750, ..., 1.0634765625, -1.4697265625, 1.1240234375], [-0.0499267578, 0.2729492188, -0.5722656250, ..., 0.4458007812, -0.1456298828, -0.0917968750]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.6723632812e-02, -3.2714843750e-02, 3.6865234375e-02, ..., -2.1606445312e-02, 5.7678222656e-03, 7.7209472656e-03], [-4.2114257812e-03, -1.2023925781e-02, 1.2451171875e-02, ..., -2.6611328125e-02, 1.1108398438e-02, 2.6123046875e-02], ..., [ 5.4199218750e-02, -2.5749206543e-04, -1.0070800781e-02, ..., 1.0253906250e-02, 4.6997070312e-03, 2.0446777344e-03], [ 2.8991699219e-03, -2.1118164062e-02, -1.0009765625e-02, ..., -1.3916015625e-02, -1.5625000000e-02, 5.6762695312e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.6723632812e-02, -3.2714843750e-02, 3.6865234375e-02, ..., -2.1606445312e-02, 5.7678222656e-03, 7.7209472656e-03], [-4.2114257812e-03, -1.2023925781e-02, 1.2451171875e-02, ..., -2.6611328125e-02, 1.1108398438e-02, 2.6123046875e-02], ..., [ 5.4199218750e-02, -2.5749206543e-04, -1.0070800781e-02, ..., 1.0253906250e-02, 4.6997070312e-03, 2.0446777344e-03], [ 2.8991699219e-03, -2.1118164062e-02, -1.0009765625e-02, ..., -1.3916015625e-02, -1.5625000000e-02, 5.6762695312e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[ 0.4516601562, -1.5380859375, 0.6176757812, ..., -2.0761718750, -0.9243164062, 0.8027343750], [-0.3601074219, 2.1875000000, 1.6953125000, ..., -0.7773437500, -1.0742187500, -3.1503906250], [ 0.0117797852, -1.7832031250, 0.0449523926, ..., -2.4765625000, -0.6337890625, 1.6142578125], ..., [ 0.0197753906, -0.3259277344, -0.0363769531, ..., -0.7070312500, -0.7851562500, -0.4699707031], [ 0.8471679688, -2.3750000000, 1.8535156250, ..., -2.6425781250, 1.1347656250, 1.1933593750], [-0.0250244141, -0.5092773438, -0.1564941406, ..., 0.1486816406, 0.0920410156, 0.6562500000]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 1.0681152344e-03, -1.0131835938e-02, 3.5644531250e-02, ..., -1.6967773438e-02, -1.3732910156e-02, -1.6357421875e-02], ..., [ 2.9296875000e-02, -2.6092529297e-03, -1.1840820312e-02, ..., 1.3183593750e-02, -3.4484863281e-03, -9.8876953125e-03], [-6.4697265625e-03, 1.0559082031e-02, -1.9775390625e-02, ..., 4.6997070312e-03, -2.8320312500e-02, -2.0385742188e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 1.0681152344e-03, -1.0131835938e-02, 3.5644531250e-02, ..., -1.6967773438e-02, -1.3732910156e-02, -1.6357421875e-02], ..., [ 2.9296875000e-02, -2.6092529297e-03, -1.1840820312e-02, ..., 1.3183593750e-02, -3.4484863281e-03, -9.8876953125e-03], [-6.4697265625e-03, 1.0559082031e-02, -1.9775390625e-02, ..., 4.6997070312e-03, -2.8320312500e-02, -2.0385742188e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[-1.3125000000e+00, -1.9941406250e+00, 1.5830078125e+00, ..., 1.2724609375e+00, -1.2226562500e+00, 1.1123046875e+00], [ 3.7280273438e-01, -2.0996093750e+00, 1.7158203125e+00, ..., 4.1162109375e-01, -1.8222656250e+00, 6.6650390625e-01], [ 7.3242187500e-04, 4.6132812500e+00, -4.9072265625e-02, ..., 4.2656250000e+00, 1.7529296875e+00, -1.5468750000e+00], ..., [ 2.4902343750e-02, 7.7685546875e-01, -4.2749023438e-01, ..., 8.8330078125e-01, 5.2490234375e-01, 2.0080566406e-02], [-8.5693359375e-01, -1.3857421875e+00, 3.6269531250e+00, ..., -1.2929687500e+00, -3.0000000000e+00, 2.4277343750e+00], [ 5.0048828125e-01, 2.6123046875e-01, -8.7353515625e-01, ..., 4.8779296875e-01, -2.1026611328e-02, 2.5341796875e-01]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- .------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 1.1840820312e-02, -1.7395019531e-03, -1.2084960938e-02, ..., -9.2163085938e-03, -5.3405761719e-03, 4.2419433594e-03], [ 8.3618164062e-03, 1.2023925781e-02, 3.5156250000e-02, ..., -1.9287109375e-02, -1.5380859375e-02, 4.5654296875e-02], ..., [ 1.1749267578e-03, 2.7465820312e-02, -1.2634277344e-02, ..., 4.0893554688e-03, 1.8692016602e-03, -5.3405761719e-03], [-1.3977050781e-02, 2.7221679688e-02, 2.2983551025e-04, ..., -1.8310546875e-02, 1.3061523438e-02, -1.0375976562e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 1.1840820312e-02, -1.7395019531e-03, -1.2084960938e-02, ..., -9.2163085938e-03, -5.3405761719e-03, 4.2419433594e-03], [ 8.3618164062e-03, 1.2023925781e-02, 3.5156250000e-02, ..., -1.9287109375e-02, -1.5380859375e-02, 4.5654296875e-02], ..., [ 1.1749267578e-03, 2.7465820312e-02, -1.2634277344e-02, ..., 4.0893554688e-03, 1.8692016602e-03, -5.3405761719e-03], [-1.3977050781e-02, 2.7221679688e-02, 2.2983551025e-04, ..., -1.8310546875e-02, 1.3061523438e-02, -1.0375976562e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[ 0.8623046875, -1.3046875000, 0.0819091797, ..., -2.3476562500, 0.8178710938, 1.2285156250], [-0.0980834961, -0.0697021484, 0.4782714844, ..., -0.8950195312, -1.4218750000, 1.7636718750], [-1.9423828125, -0.7646484375, 1.4296875000, ..., 1.3105468750, -0.7373046875, 0.9243164062], ..., [ 0.0080566406, -0.6235351562, 0.6245117188, ..., -0.4377441406, -0.1055297852, -0.2416992188], [-0.3776855469, -0.1645507812, -0.3242187500, ..., 0.1141357422, -0.4868164062, 0.1079101562], [-0.6660156250, -0.6689453125, 0.6977539062, ..., -0.1469726562, 0.3991699219, 0.4226074219]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 9.9487304688e-03, 1.9775390625e-02, -9.0942382812e-03, ..., 3.6926269531e-03, -1.0314941406e-02, 3.0975341797e-03], ..., [ 1.3366699219e-02, -8.6975097656e-04, 1.8554687500e-02, ..., 8.2397460938e-04, -1.0620117188e-02, 1.4709472656e-02], [ 5.6762695312e-03, 2.5634765625e-03, -2.5558471680e-04, ..., 1.5014648438e-02, -1.9836425781e-03, 5.2490234375e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 9.9487304688e-03, 1.9775390625e-02, -9.0942382812e-03, ..., 3.6926269531e-03, -1.0314941406e-02, 3.0975341797e-03], ..., [ 1.3366699219e-02, -8.6975097656e-04, 1.8554687500e-02, ..., 8.2397460938e-04, -1.0620117188e-02, 1.4709472656e-02], [ 5.6762695312e-03, 2.5634765625e-03, -2.5558471680e-04, ..., 1.5014648438e-02, -1.9836425781e-03, 5.2490234375e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[ 1.2188720703e-01, 4.3164062500e-01, 3.9843750000e-01, ..., 4.9658203125e-01, -2.3803710938e-03, -8.7353515625e-01], [-2.5122070312e-01, 2.4572753906e-01, -3.9489746094e-02, ..., 2.0935058594e-01, 9.5581054688e-02, -4.0014648438e-01], [-3.9062500000e+00, -3.6777343750e+00, -1.2363281250e+00, ..., -4.9648437500e+00, -1.4130859375e+00, 2.0214843750e+00], ..., [-1.8261718750e-01, 1.7939453125e+00, 2.7294921875e-01, ..., 5.9814453125e-01, 4.1229248047e-02, -8.1250000000e-01], [-1.8007812500e+00, -2.0781250000e+00, -8.5156250000e-01, ..., -2.0312500000e+00, -8.3398437500e-01, 1.4599609375e+00], [-2.2070312500e+00, -2.1601562500e+00, -3.6108398438e-01, ..., -2.0507812500e+00, -3.0102539062e-01, 1.3798828125e+00]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[ 0., -65504., -65504., ..., -65504., -65504., -65504.], [-65504., -65504., -65504., ..., -65504., 0., 0.], [ 0., 0., 0., ..., -65504., -65504., -65504.], ..., [ 0., 0., 0., ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 1.1535644531e-02, 1.0009765625e-02, 1.0757446289e-03, ..., 2.7008056641e-03, -1.6357421875e-02, -2.8686523438e-03], ..., [ 6.9580078125e-03, -1.0131835938e-02, 2.8442382812e-02, ..., 8.0566406250e-03, 5.5236816406e-03, 1.1413574219e-02], [-1.3183593750e-02, -1.1596679688e-02, 8.9721679688e-03, ..., 4.1809082031e-03, -2.2094726562e-02, -1.4831542969e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 1.1535644531e-02, 1.0009765625e-02, 1.0757446289e-03, ..., 2.7008056641e-03, -1.6357421875e-02, -2.8686523438e-03], ..., [ 6.9580078125e-03, -1.0131835938e-02, 2.8442382812e-02, ..., 8.0566406250e-03, 5.5236816406e-03, 1.1413574219e-02], [-1.3183593750e-02, -1.1596679688e-02, 8.9721679688e-03, ..., 4.1809082031e-03, -2.2094726562e-02, -1.4831542969e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[-1.4587402344e-01, 4.2773437500e-01, -1.7785644531e-01, ..., 8.1884765625e-01, 1.0076904297e-01, -1.0410156250e+00], [ 4.2089843750e-01, -3.5400390625e-01, 3.7011718750e+00, ..., -4.2031250000e+00, 2.8710937500e+00, 1.9628906250e+00], [ 5.1806640625e-01, 2.6093750000e+00, -7.7392578125e-01, ..., -1.2382812500e+00, 1.6416015625e+00, -4.5288085938e-01], ..., [-3.9428710938e-01, 2.1093750000e-01, -3.9746093750e-01, ..., -1.6613769531e-01, -2.2412109375e-01, -4.9316406250e-01], [-4.2041015625e-01, -2.7709960938e-01, -5.1696777344e-02, ..., -3.0957031250e-01, -3.5888671875e-01, 3.1958007812e-01], [-3.7695312500e-01, 1.5063476562e-01, -1.1767578125e+00, ..., -1.9201660156e-01, -2.5024414062e-03, -1.6967773438e-01]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-0.0119018555, 0.0015945435, -0.0010452271, ..., -0.0003910065, 0.0015029907, -0.0015106201], [-0.0213623047, -0.0025939941, 0.0005035400, ..., 0.0119018555, 0.0027008057, -0.0087890625], [-0.0108032227, 0.0061035156, 0.0205078125, ..., 0.0070190430, -0.0006141663, 0.0060424805], ..., [-0.1113281250, 0.0017700195, -0.0085449219, ..., 0.0029144287, -0.0007934570, -0.0029296875], [-0.0295410156, 0.0324707031, -0.0017166138, ..., 0.0044250488, 0.0126342773, 0.0028381348], [-0.0057983398, -0.0078735352, 0.0174560547, ..., -0.0050354004, -0.0006599426, -0.0158691406]]], dtype=torch.float16) ------- tensor([[[-0.0119018555, 0.0015945435, -0.0010452271, ..., -0.0003910065, 0.0015029907, -0.0015106201], [-0.0213623047, -0.0025939941, 0.0005035400, ..., 0.0119018555, 0.0027008057, -0.0087890625], [-0.0108032227, 0.0061035156, 0.0205078125, ..., 0.0070190430, -0.0006141663, 0.0060424805], ..., [-0.1113281250, 0.0017700195, -0.0085449219, ..., 0.0029144287, -0.0007934570, -0.0029296875], [-0.0295410156, 0.0324707031, -0.0017166138, ..., 0.0044250488, 0.0126342773, 0.0028381348], [-0.0057983398, -0.0078735352, 0.0174560547, ..., -0.0050354004, -0.0006599426, -0.0158691406]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[ 0.6630859375, 1.7021484375, 1.1669921875, ..., 0.0830688477, -0.5312500000, 0.2546386719], [-0.6455078125, 1.1835937500, 2.0605468750, ..., 0.6113281250, 0.3742675781, 1.8457031250], [-2.1152343750, -3.8398437500, 2.7089843750, ..., 1.9677734375, -0.4433593750, 1.7998046875], ..., [-0.9570312500, 0.6044921875, -0.4467773438, ..., 0.3615722656, -0.2209472656, -0.1911621094], [ 0.0389404297, -0.4921875000, -0.3818359375, ..., -0.4262695312, -0.7866210938, -0.6562500000], [ 0.0233154297, 0.4116210938, -0.3354492188, ..., 0.2492675781, 0.4663085938, -0.4592285156]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 2.2460937500e-02, 1.0620117188e-02, 1.6235351562e-02, ..., -9.7656250000e-03, -9.7656250000e-03, -5.2795410156e-03], [-4.9072265625e-02, -2.9683113098e-05, -5.1574707031e-03, ..., 4.0054321289e-04, 3.2806396484e-04, -2.8381347656e-03], ..., [ 6.5612792969e-03, -2.9907226562e-03, -1.5991210938e-02, ..., 1.3549804688e-02, 3.3691406250e-02, 2.3925781250e-02], [-1.0803222656e-02, 6.1035156250e-03, 2.0507812500e-02, ..., 7.0190429688e-03, -6.1416625977e-04, 6.0424804688e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 2.2460937500e-02, 1.0620117188e-02, 1.6235351562e-02, ..., -9.7656250000e-03, -9.7656250000e-03, -5.2795410156e-03], [-4.9072265625e-02, -2.9683113098e-05, -5.1574707031e-03, ..., 4.0054321289e-04, 3.2806396484e-04, -2.8381347656e-03], ..., [ 6.5612792969e-03, -2.9907226562e-03, -1.5991210938e-02, ..., 1.3549804688e-02, 3.3691406250e-02, 2.3925781250e-02], [-1.0803222656e-02, 6.1035156250e-03, 2.0507812500e-02, ..., 7.0190429688e-03, -6.1416625977e-04, 6.0424804688e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[-0.8325195312, -0.3034667969, 0.2412109375, ..., 0.6650390625, -0.3041992188, 0.2475585938], [ 0.8173828125, -0.9863281250, 1.5683593750, ..., 1.9296875000, -0.7045898438, 1.3388671875], [-2.2421875000, -2.4667968750, -1.4677734375, ..., 0.9111328125, -1.0546875000, 1.7412109375], ..., [-0.8681640625, 0.0595703125, -0.1542968750, ..., 0.2346191406, -0.2313232422, -0.1901855469], [-3.6875000000, -3.2285156250, 0.9594726562, ..., 1.6552734375, -0.8334960938, 0.5756835938], [-0.1125488281, -0.6445312500, 0.2255859375, ..., -0.2561035156, 0.1381835938, -0.0161132812]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 1.0681152344e-03, -1.0131835938e-02, 3.5644531250e-02, ..., -1.6967773438e-02, -1.3732910156e-02, -1.6357421875e-02], ..., [-9.0332031250e-02, -2.3803710938e-03, -4.2724609375e-03, ..., -1.8081665039e-03, -1.6250610352e-03, -9.9182128906e-04], [ 5.4199218750e-02, -2.5749206543e-04, -1.0070800781e-02, ..., 1.0253906250e-02, 4.6997070312e-03, 2.0446777344e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 1.0681152344e-03, -1.0131835938e-02, 3.5644531250e-02, ..., -1.6967773438e-02, -1.3732910156e-02, -1.6357421875e-02], ..., [-9.0332031250e-02, -2.3803710938e-03, -4.2724609375e-03, ..., -1.8081665039e-03, -1.6250610352e-03, -9.9182128906e-04], [ 5.4199218750e-02, -2.5749206543e-04, -1.0070800781e-02, ..., 1.0253906250e-02, 4.6997070312e-03, 2.0446777344e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 9.1552734375e-03, 2.3315429688e-02, -2.5482177734e-03, ..., 6.7443847656e-03, 1.3366699219e-02, 1.7700195312e-02], [ 1.3366699219e-02, -8.6975097656e-04, 1.8554687500e-02, ..., 8.2397460938e-04, -1.0620117188e-02, 1.4709472656e-02], ..., [ 5.4443359375e-02, -6.4086914062e-03, 1.1108398438e-02, ..., 2.4261474609e-03, -4.9743652344e-03, 1.5869140625e-02], [-1.3305664062e-02, 3.7689208984e-03, -4.3334960938e-03, ..., 4.9743652344e-03, -2.6367187500e-02, 3.5400390625e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 9.1552734375e-03, 2.3315429688e-02, -2.5482177734e-03, ..., 6.7443847656e-03, 1.3366699219e-02, 1.7700195312e-02], [ 1.3366699219e-02, -8.6975097656e-04, 1.8554687500e-02, ..., 8.2397460938e-04, -1.0620117188e-02, 1.4709472656e-02], ..., [ 5.4443359375e-02, -6.4086914062e-03, 1.1108398438e-02, ..., 2.4261474609e-03, -4.9743652344e-03, 1.5869140625e-02], [-1.3305664062e-02, 3.7689208984e-03, -4.3334960938e-03, ..., 4.9743652344e-03, -2.6367187500e-02, 3.5400390625e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[-8.4228515625e-01, -2.1640625000e+00, 1.3085937500e+00, ..., 1.6113281250e+00, 2.2766113281e-01, -5.0683593750e-01], [-1.0527343750e+00, -2.4921875000e+00, 1.6250000000e+00, ..., 2.0153808594e-01, -1.0195312500e+00, 1.1298828125e+00], [-7.6562500000e-01, 2.4169921875e-01, 8.9648437500e-01, ..., -1.1071777344e-01, 1.0908203125e+00, 2.8076171875e-03], ..., [ 4.7119140625e-01, 7.3242187500e-02, -9.3554687500e-01, ..., -9.3066406250e-01, -1.7656250000e+00, 9.5166015625e-01], [-2.1997070312e-01, -4.2944335938e-01, -9.8266601562e-02, ..., 2.3291015625e-01, -1.2451171875e-01, 3.2177734375e-01], [-3.1054687500e-01, 6.1035156250e-04, -3.5742187500e-01, ..., 4.7436523438e-01, -5.4785156250e-01, -7.2753906250e-01]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 1.5869140625e-03, 7.6293945312e-03, -4.3029785156e-03, ..., 1.8188476562e-02, -1.1962890625e-02, 1.6845703125e-02], [-4.9072265625e-02, -2.9683113098e-05, -5.1574707031e-03, ..., 4.0054321289e-04, 3.2806396484e-04, -2.8381347656e-03], ..., [-4.9072265625e-02, -2.9683113098e-05, -5.1574707031e-03, ..., 4.0054321289e-04, 3.2806396484e-04, -2.8381347656e-03], [-2.7709960938e-02, -2.2460937500e-02, 1.0986328125e-02, ..., -1.5945434570e-03, -1.8554687500e-02, 5.1269531250e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 1.5869140625e-03, 7.6293945312e-03, -4.3029785156e-03, ..., 1.8188476562e-02, -1.1962890625e-02, 1.6845703125e-02], [-4.9072265625e-02, -2.9683113098e-05, -5.1574707031e-03, ..., 4.0054321289e-04, 3.2806396484e-04, -2.8381347656e-03], ..., [-4.9072265625e-02, -2.9683113098e-05, -5.1574707031e-03, ..., 4.0054321289e-04, 3.2806396484e-04, -2.8381347656e-03], [-2.7709960938e-02, -2.2460937500e-02, 1.0986328125e-02, ..., -1.5945434570e-03, -1.8554687500e-02, 5.1269531250e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 1.9897460938e-02, 2.3651123047e-03, 1.8432617188e-02, ..., -2.3315429688e-02, -2.3345947266e-03, 2.9418945312e-02], [-4.5166015625e-02, 8.2015991211e-04, 7.3623657227e-04, ..., -7.8582763672e-04, 1.5830993652e-04, 3.9367675781e-03], ..., [ 5.4199218750e-02, -2.5749206543e-04, -1.0070800781e-02, ..., 1.0253906250e-02, 4.6997070312e-03, 2.0446777344e-03], [ 5.4199218750e-02, -2.5749206543e-04, -1.0070800781e-02, ..., 1.0253906250e-02, 4.6997070312e-03, 2.0446777344e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [ 1.9897460938e-02, 2.3651123047e-03, 1.8432617188e-02, ..., -2.3315429688e-02, -2.3345947266e-03, 2.9418945312e-02], [-4.5166015625e-02, 8.2015991211e-04, 7.3623657227e-04, ..., -7.8582763672e-04, 1.5830993652e-04, 3.9367675781e-03], ..., [ 5.4199218750e-02, -2.5749206543e-04, -1.0070800781e-02, ..., 1.0253906250e-02, 4.6997070312e-03, 2.0446777344e-03], [ 5.4199218750e-02, -2.5749206543e-04, -1.0070800781e-02, ..., 1.0253906250e-02, 4.6997070312e-03, 2.0446777344e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [-4.8828125000e-03, 3.1738281250e-02, 1.7578125000e-02, ..., 1.5502929688e-02, -1.4770507812e-02, -6.4086914062e-03], ..., [-1.0803222656e-02, -4.6157836914e-04, 5.2795410156e-03, ..., -3.6926269531e-03, -1.3305664062e-02, -4.3945312500e-02], [-5.5694580078e-04, 6.8664550781e-04, 2.1484375000e-02, ..., -4.9209594727e-04, 3.9062500000e-03, 6.6528320312e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [-4.8828125000e-03, 3.1738281250e-02, 1.7578125000e-02, ..., 1.5502929688e-02, -1.4770507812e-02, -6.4086914062e-03], ..., [-1.0803222656e-02, -4.6157836914e-04, 5.2795410156e-03, ..., -3.6926269531e-03, -1.3305664062e-02, -4.3945312500e-02], [-5.5694580078e-04, 6.8664550781e-04, 2.1484375000e-02, ..., -4.9209594727e-04, 3.9062500000e-03, 6.6528320312e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- . ** Perplexity: nan -- Testing------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-2.7618408203e-03, -3.3691406250e-02, -1.8066406250e-02, ..., 4.0588378906e-03, 8.1176757812e-03, 1.8432617188e-02], [ 2.3956298828e-03, -1.1596679688e-02, 7.9345703125e-04, ..., -1.5319824219e-02, 3.5400390625e-03, 3.0517578125e-03], ..., [ 1.7089843750e-02, -5.3100585938e-03, 2.8076171875e-02, ..., -7.9345703125e-03, 2.4414062500e-02, -1.3610839844e-02], [ 7.5378417969e-03, -3.9367675781e-03, -6.3781738281e-03, ..., -7.5073242188e-03, -1.3977050781e-02, -4.0893554688e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-2.7618408203e-03, -3.3691406250e-02, -1.8066406250e-02, ..., 4.0588378906e-03, 8.1176757812e-03, 1.8432617188e-02], [ 2.3956298828e-03, -1.1596679688e-02, 7.9345703125e-04, ..., -1.5319824219e-02, 3.5400390625e-03, 3.0517578125e-03], ..., [ 1.7089843750e-02, -5.3100585938e-03, 2.8076171875e-02, ..., -7.9345703125e-03, 2.4414062500e-02, -1.3610839844e-02], [ 7.5378417969e-03, -3.9367675781e-03, -6.3781738281e-03, ..., -7.5073242188e-03, -1.3977050781e-02, -4.0893554688e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-2.7618408203e-03, -3.3691406250e-02, -1.8066406250e-02, ..., 4.0588378906e-03, 8.1176757812e-03, 1.8432617188e-02], [ 2.3956298828e-03, -1.1596679688e-02, 7.9345703125e-04, ..., -1.5319824219e-02, 3.5400390625e-03, 3.0517578125e-03], ..., [-5.4931640625e-04, 1.3427734375e-02, 4.5776367188e-03, ..., -4.3945312500e-03, 1.3122558594e-02, 1.3549804688e-02], [ 3.1738281250e-03, 8.7280273438e-03, 2.0385742188e-02, ..., -1.6113281250e-02, -4.0039062500e-02, -1.2145996094e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-2.7618408203e-03, -3.3691406250e-02, -1.8066406250e-02, ..., 4.0588378906e-03, 8.1176757812e-03, 1.8432617188e-02], [ 2.3956298828e-03, -1.1596679688e-02, 7.9345703125e-04, ..., -1.5319824219e-02, 3.5400390625e-03, 3.0517578125e-03], ..., [-5.4931640625e-04, 1.3427734375e-02, 4.5776367188e-03, ..., -4.3945312500e-03, 1.3122558594e-02, 1.3549804688e-02], [ 3.1738281250e-03, 8.7280273438e-03, 2.0385742188e-02, ..., -1.6113281250e-02, -4.0039062500e-02, -1.2145996094e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 2.9541015625e-02, 1.8066406250e-02, 1.1352539062e-02, ..., -3.4423828125e-02, -7.0800781250e-03, -2.9182434082e-04], ..., [ 7.3852539062e-03, -1.2145996094e-02, -1.8798828125e-02, ..., -1.3885498047e-03, -2.1972656250e-03, 1.9989013672e-03], [ 1.6357421875e-02, 4.2724609375e-03, 2.3071289062e-02, ..., -1.7623901367e-03, 2.0874023438e-02, 1.0192871094e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 2.9541015625e-02, 1.8066406250e-02, 1.1352539062e-02, ..., -3.4423828125e-02, -7.0800781250e-03, -2.9182434082e-04], ..., [ 7.3852539062e-03, -1.2145996094e-02, -1.8798828125e-02, ..., -1.3885498047e-03, -2.1972656250e-03, 1.9989013672e-03], [ 1.6357421875e-02, 4.2724609375e-03, 2.3071289062e-02, ..., -1.7623901367e-03, 2.0874023438e-02, 1.0192871094e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[-0.7871093750, 1.0478515625, 1.6533203125, ..., 0.8828125000, -0.2326660156, -1.1933593750], [-0.0867919922, -0.4360351562, 1.8164062500, ..., -0.7001953125, -0.8671875000, -0.8076171875], [-2.1679687500, 2.0136718750, 0.9418945312, ..., 0.2369384766, -1.0322265625, 3.0429687500], ..., [-0.5488281250, -0.3032226562, -0.4819335938, ..., 0.6665039062, -0.2822265625, 0.6582031250], [-0.7299804688, -0.3325195312, -0.4301757812, ..., 0.0543212891, -0.2253417969, -0.3256835938], [ 0.3400878906, -0.0252685547, -0.0664062500, ..., -0.4184570312, 0.1245117188, -0.3710937500]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [-4.9743652344e-03, -3.2424926758e-04, 1.1108398438e-02, ..., 1.6326904297e-03, 3.7536621094e-03, 2.4414062500e-02], ..., [ 3.3935546875e-02, 1.9683837891e-03, 2.2460937500e-02, ..., 2.2705078125e-02, -7.2021484375e-03, -1.0498046875e-02], [ 3.4667968750e-02, 1.1230468750e-02, 1.6723632812e-02, ..., -4.3106079102e-04, -1.2111663818e-04, 1.4221191406e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [-4.9743652344e-03, -3.2424926758e-04, 1.1108398438e-02, ..., 1.6326904297e-03, 3.7536621094e-03, 2.4414062500e-02], ..., [ 3.3935546875e-02, 1.9683837891e-03, 2.2460937500e-02, ..., 2.2705078125e-02, -7.2021484375e-03, -1.0498046875e-02], [ 3.4667968750e-02, 1.1230468750e-02, 1.6723632812e-02, ..., -4.3106079102e-04, -1.2111663818e-04, 1.4221191406e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[ 0.4472656250, 0.5410156250, 0.0219726562, ..., -0.6611328125, -0.7734375000, -0.1252441406], [-0.3256835938, -0.4965820312, 2.1289062500, ..., 1.2031250000, -1.1464843750, 1.0166015625], [-1.4570312500, -1.8251953125, 3.2500000000, ..., 2.7480468750, -2.0117187500, 0.4162597656], ..., [-0.0941162109, 0.6162109375, -0.1776123047, ..., -0.2924804688, -0.1795654297, -0.4008789062], [-0.3862304688, 0.4013671875, -0.6230468750, ..., -0.2348632812, 0.3715820312, -0.1457519531], [-0.2802734375, 0.7397460938, -0.2861328125, ..., -0.5590820312, -0.1784667969, 0.6645507812]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[-512.0000000000, -7.4960937500, -512.0000000000, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], ..., [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-0.0119018555, 0.0015945435, -0.0010452271, ..., -0.0003910065, 0.0015029907, -0.0015106201], [-0.0142822266, -0.0049438477, 0.0106201172, ..., -0.0164794922, 0.0082397461, -0.0002498627], [ 0.0161132812, 0.0095825195, 0.0114135742, ..., 0.0008850098, -0.0012664795, 0.0119018555], ..., [ 0.0383300781, 0.0007209778, -0.0052185059, ..., 0.0031585693, 0.0004558563, 0.0014114380], [-0.0084228516, 0.0233154297, 0.0415039062, ..., -0.0336914062, -0.0076904297, 0.0037841797], [ 0.0094604492, -0.0095825195, -0.0241699219, ..., -0.0134887695, -0.0072326660, 0.0074157715]]], dtype=torch.float16) ------- tensor([[[-0.0119018555, 0.0015945435, -0.0010452271, ..., -0.0003910065, 0.0015029907, -0.0015106201], [-0.0142822266, -0.0049438477, 0.0106201172, ..., -0.0164794922, 0.0082397461, -0.0002498627], [ 0.0161132812, 0.0095825195, 0.0114135742, ..., 0.0008850098, -0.0012664795, 0.0119018555], ..., [ 0.0383300781, 0.0007209778, -0.0052185059, ..., 0.0031585693, 0.0004558563, 0.0014114380], [-0.0084228516, 0.0233154297, 0.0415039062, ..., -0.0336914062, -0.0076904297, 0.0037841797], [ 0.0094604492, -0.0095825195, -0.0241699219, ..., -0.0134887695, -0.0072326660, 0.0074157715]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[-0.7324218750, 1.3085937500, 1.5068359375, ..., 1.5595703125, -2.2500000000, -0.5898437500], [ 0.6269531250, -2.7578125000, 0.3417968750, ..., -0.6538085938, -0.4272460938, 0.6879882812], [-0.4228515625, 0.2966308594, 2.5117187500, ..., 0.4858398438, -0.7207031250, 1.7373046875], ..., [-0.0709228516, -0.3212890625, -0.4619140625, ..., 0.0441894531, -0.0559692383, -0.0299682617], [ 0.0282440186, -1.0810546875, -0.7373046875, ..., -0.2197265625, 0.3937988281, 0.3671875000], [ 0.2817382812, -0.3356933594, -0.3977050781, ..., 0.1318359375, -0.2983398438, -0.2153320312]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[-65504., -65504., -65504., ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], ..., [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [-5.6152343750e-03, -2.3437500000e-02, -4.7851562500e-02, ..., 1.2145996094e-02, -1.2023925781e-02, -6.4697265625e-03], ..., [-1.0131835938e-02, -1.1413574219e-02, 4.9133300781e-03, ..., 2.1972656250e-02, -2.7587890625e-02, 3.5644531250e-02], [ 5.9204101562e-03, -3.8909912109e-03, -1.3671875000e-02, ..., 4.7302246094e-03, -1.9989013672e-03, -3.1250000000e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [-5.6152343750e-03, -2.3437500000e-02, -4.7851562500e-02, ..., 1.2145996094e-02, -1.2023925781e-02, -6.4697265625e-03], ..., [-1.0131835938e-02, -1.1413574219e-02, 4.9133300781e-03, ..., 2.1972656250e-02, -2.7587890625e-02, 3.5644531250e-02], [ 5.9204101562e-03, -3.8909912109e-03, -1.3671875000e-02, ..., 4.7302246094e-03, -1.9989013672e-03, -3.1250000000e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[-0.2629394531, -0.0207366943, 0.7177734375, ..., -0.1286621094, -0.6025390625, 0.6416015625], [-0.0887451172, -0.1285400391, 1.6386718750, ..., -0.8247070312, 0.2788085938, 1.6699218750], [ 0.2919921875, 0.4104003906, 3.5859375000, ..., -0.6967773438, 0.5263671875, 1.1191406250], ..., [-0.9482421875, -0.1206054688, -0.2424316406, ..., 0.0160217285, -0.2661132812, -0.5517578125], [ 0.0267333984, -0.1447753906, 0.1833496094, ..., 0.3388671875, -0.0269775391, 0.1219482422], [ 0.0639648438, 0.0290527344, 0.4936523438, ..., -0.1206054688, 0.0475158691, 0.2421875000]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[-65504., -65504., -65504., ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], ..., [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 3.0639648438e-02, 5.9509277344e-03, 2.9373168945e-04, ..., -2.4902343750e-02, 4.6081542969e-03, -2.0019531250e-02], ..., [-1.4282226562e-02, -8.2778930664e-04, 5.7067871094e-03, ..., 1.0803222656e-02, 1.0986328125e-02, -1.8310546875e-04], [ 2.0751953125e-02, 1.8554687500e-02, -4.9209594727e-04, ..., 8.9111328125e-03, 5.7678222656e-03, 2.2338867188e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 3.0639648438e-02, 5.9509277344e-03, 2.9373168945e-04, ..., -2.4902343750e-02, 4.6081542969e-03, -2.0019531250e-02], ..., [-1.4282226562e-02, -8.2778930664e-04, 5.7067871094e-03, ..., 1.0803222656e-02, 1.0986328125e-02, -1.8310546875e-04], [ 2.0751953125e-02, 1.8554687500e-02, -4.9209594727e-04, ..., 8.9111328125e-03, 5.7678222656e-03, 2.2338867188e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[ 0.0771484375, 0.1373291016, 0.2763671875, ..., -0.0061645508, -1.0927734375, -1.0234375000], [ 0.0526123047, -2.1777343750, 0.2434082031, ..., -0.4533691406, 0.7041015625, -0.2680664062], [-1.5126953125, -2.4062500000, 1.3095703125, ..., -0.6250000000, 0.9321289062, 0.5996093750], ..., [ 0.1334228516, -0.0905761719, -0.2399902344, ..., -0.2592773438, -0.0706176758, -0.1936035156], [-0.2215576172, 0.4433593750, 0.4460449219, ..., -0.0117187500, -0.4987792969, -0.5146484375], [ 1.0029296875, 0.2851562500, 0.3789062500, ..., -0.2849121094, -0.5595703125, -0.7392578125]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[-65504., -65504., -65504., ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], ..., [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 1.8188476562e-02, -3.4667968750e-02, 2.1850585938e-02, ..., -1.2817382812e-02, -1.7578125000e-02, 1.5869140625e-02], ..., [-3.1127929688e-03, 9.3994140625e-03, 1.1413574219e-02, ..., 4.8522949219e-03, -6.0729980469e-03, -1.4648437500e-02], [-1.7929077148e-03, 8.0566406250e-03, 2.0996093750e-02, ..., 1.4831542969e-02, -7.7209472656e-03, 1.2207031250e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 1.8188476562e-02, -3.4667968750e-02, 2.1850585938e-02, ..., -1.2817382812e-02, -1.7578125000e-02, 1.5869140625e-02], ..., [-3.1127929688e-03, 9.3994140625e-03, 1.1413574219e-02, ..., 4.8522949219e-03, -6.0729980469e-03, -1.4648437500e-02], [-1.7929077148e-03, 8.0566406250e-03, 2.0996093750e-02, ..., 1.4831542969e-02, -7.7209472656e-03, 1.2207031250e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[ 0.2800292969, 0.2524414062, -0.1623535156, ..., -0.2497558594, -0.6586914062, -0.0318603516], [ 0.1654052734, -0.6816406250, 1.4335937500, ..., 0.9013671875, -0.5346679688, 0.9555664062], [ 0.0408630371, -2.1835937500, 0.7387695312, ..., -0.4550781250, 0.9794921875, 1.4843750000], ..., [ 0.2851562500, -0.3632812500, -0.1903076172, ..., -0.1697998047, 0.3022460938, 0.7534179688], [ 0.4909667969, -0.1528320312, -0.1265869141, ..., -0.1707763672, 0.1041259766, 0.3007812500], [ 0.4240722656, 0.0872192383, -0.2949218750, ..., 0.1313476562, 0.1643066406, 0.3686523438]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- . ** Perplexity (switched): nan -- Testing------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-2.7618408203e-03, -3.3691406250e-02, -1.8066406250e-02, ..., 4.0588378906e-03, 8.1176757812e-03, 1.8432617188e-02], [ 2.3956298828e-03, -1.1596679688e-02, 7.9345703125e-04, ..., -1.5319824219e-02, 3.5400390625e-03, 3.0517578125e-03], ..., [ 1.7089843750e-02, -5.3100585938e-03, 2.8076171875e-02, ..., -7.9345703125e-03, 2.4414062500e-02, -1.3610839844e-02], [ 7.5378417969e-03, -3.9367675781e-03, -6.3781738281e-03, ..., -7.5073242188e-03, -1.3977050781e-02, -4.0893554688e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-2.7618408203e-03, -3.3691406250e-02, -1.8066406250e-02, ..., 4.0588378906e-03, 8.1176757812e-03, 1.8432617188e-02], [ 2.3956298828e-03, -1.1596679688e-02, 7.9345703125e-04, ..., -1.5319824219e-02, 3.5400390625e-03, 3.0517578125e-03], ..., [ 1.7089843750e-02, -5.3100585938e-03, 2.8076171875e-02, ..., -7.9345703125e-03, 2.4414062500e-02, -1.3610839844e-02], [ 7.5378417969e-03, -3.9367675781e-03, -6.3781738281e-03, ..., -7.5073242188e-03, -1.3977050781e-02, -4.0893554688e-03], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[-1.2089843750, -2.9472656250, 2.0644531250, ..., -2.4511718750, -0.7998046875, 2.1816406250], [-0.2734375000, -1.9082031250, 3.6015625000, ..., -3.5117187500, 2.1191406250, 3.4101562500], [ 0.5898437500, -0.6328125000, 1.4101562500, ..., -3.2734375000, -2.2304687500, 2.5703125000], ..., [ 0.2269287109, 0.0401611328, -0.1689453125, ..., -0.1770019531, -0.2279052734, -0.5175781250], [ 0.6748046875, -0.3857421875, -0.6269531250, ..., 0.0364990234, -0.0400390625, -0.2827148438], [ 0.0534057617, 0.0549316406, -0.8398437500, ..., -0.1809082031, 0.2493896484, 0.0058898926]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-2.7618408203e-03, -3.3691406250e-02, -1.8066406250e-02, ..., 4.0588378906e-03, 8.1176757812e-03, 1.8432617188e-02], [ 2.3956298828e-03, -1.1596679688e-02, 7.9345703125e-04, ..., -1.5319824219e-02, 3.5400390625e-03, 3.0517578125e-03], ..., [-5.4931640625e-04, 1.3427734375e-02, 4.5776367188e-03, ..., -4.3945312500e-03, 1.3122558594e-02, 1.3549804688e-02], [ 3.1738281250e-03, 8.7280273438e-03, 2.0385742188e-02, ..., -1.6113281250e-02, -4.0039062500e-02, -1.2145996094e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-2.7618408203e-03, -3.3691406250e-02, -1.8066406250e-02, ..., 4.0588378906e-03, 8.1176757812e-03, 1.8432617188e-02], [ 2.3956298828e-03, -1.1596679688e-02, 7.9345703125e-04, ..., -1.5319824219e-02, 3.5400390625e-03, 3.0517578125e-03], ..., [-5.4931640625e-04, 1.3427734375e-02, 4.5776367188e-03, ..., -4.3945312500e-03, 1.3122558594e-02, 1.3549804688e-02], [ 3.1738281250e-03, 8.7280273438e-03, 2.0385742188e-02, ..., -1.6113281250e-02, -4.0039062500e-02, -1.2145996094e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[-2.9765625000, -3.2128906250, 2.7421875000, ..., 0.2012939453, -0.1105957031, 1.6230468750], [-0.1389160156, -2.7929687500, 0.1635742188, ..., -3.6171875000, -0.8876953125, 1.3789062500], [-3.6035156250, -4.4023437500, 1.4335937500, ..., 0.7832031250, 1.9941406250, 3.9550781250], ..., [-0.1154785156, -0.1612548828, -0.7758789062, ..., 0.0250854492, -0.5087890625, -0.2075195312], [-0.0711669922, -0.4743652344, -0.5527343750, ..., -0.5375976562, -0.6474609375, -0.2858886719], [-0.0401916504, -0.2224121094, -0.5942382812, ..., -0.0122680664, -0.3193359375, -0.0757446289]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 2.9541015625e-02, 1.8066406250e-02, 1.1352539062e-02, ..., -3.4423828125e-02, -7.0800781250e-03, -2.9182434082e-04], ..., [ 7.3852539062e-03, -1.2145996094e-02, -1.8798828125e-02, ..., -1.3885498047e-03, -2.1972656250e-03, 1.9989013672e-03], [ 1.6357421875e-02, 4.2724609375e-03, 2.3071289062e-02, ..., -1.7623901367e-03, 2.0874023438e-02, 1.0192871094e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 2.9541015625e-02, 1.8066406250e-02, 1.1352539062e-02, ..., -3.4423828125e-02, -7.0800781250e-03, -2.9182434082e-04], ..., [ 7.3852539062e-03, -1.2145996094e-02, -1.8798828125e-02, ..., -1.3885498047e-03, -2.1972656250e-03, 1.9989013672e-03], [ 1.6357421875e-02, 4.2724609375e-03, 2.3071289062e-02, ..., -1.7623901367e-03, 2.0874023438e-02, 1.0192871094e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[-0.4833984375, -0.3696289062, 0.3198242188, ..., 0.6259765625, -0.1232299805, -0.7617187500], [-0.1505126953, -0.5683593750, 1.0009765625, ..., -2.1835937500, -0.9228515625, 0.4958496094], [-3.8964843750, -0.6723632812, 1.2460937500, ..., -1.3691406250, -1.5947265625, 2.9140625000], ..., [-0.0516662598, 0.1862792969, -1.1074218750, ..., -0.0699462891, -0.6494140625, -0.6762695312], [ 0.1136474609, 0.1190795898, -0.8393554688, ..., 0.0206298828, -0.4279785156, -0.7138671875], [-0.0467529297, 0.0042724609, -0.4392089844, ..., 0.0253906250, -1.0048828125, -0.5703125000]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [-4.9743652344e-03, -3.2424926758e-04, 1.1108398438e-02, ..., 1.6326904297e-03, 3.7536621094e-03, 2.4414062500e-02], ..., [ 3.3935546875e-02, 1.9683837891e-03, 2.2460937500e-02, ..., 2.2705078125e-02, -7.2021484375e-03, -1.0498046875e-02], [ 3.4667968750e-02, 1.1230468750e-02, 1.6723632812e-02, ..., -4.3106079102e-04, -1.2111663818e-04, 1.4221191406e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [-4.9743652344e-03, -3.2424926758e-04, 1.1108398438e-02, ..., 1.6326904297e-03, 3.7536621094e-03, 2.4414062500e-02], ..., [ 3.3935546875e-02, 1.9683837891e-03, 2.2460937500e-02, ..., 2.2705078125e-02, -7.2021484375e-03, -1.0498046875e-02], [ 3.4667968750e-02, 1.1230468750e-02, 1.6723632812e-02, ..., -4.3106079102e-04, -1.2111663818e-04, 1.4221191406e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[ 1.1210937500e+00, 1.6113281250e-02, -3.7231445312e-01, ..., -1.3955078125e+00, -8.6376953125e-01, -9.4531250000e-01], [-2.8637695312e-01, -2.5664062500e+00, 3.5195312500e+00, ..., 7.0678710938e-02, -1.6103515625e+00, 2.0000000000e+00], [-9.9169921875e-01, -5.3281250000e+00, 4.3125000000e+00, ..., 2.4921875000e+00, -2.5585937500e+00, 9.6435546875e-03], ..., [ 6.1279296875e-02, 3.4228515625e-01, -1.7089843750e-03, ..., -1.9860839844e-01, 2.6074218750e-01, -8.6572265625e-01], [-3.7353515625e-01, 1.9494628906e-01, -7.9882812500e-01, ..., -5.1904296875e-01, 3.6596679688e-01, -4.6533203125e-01], [-3.8110351562e-01, 8.8989257812e-02, -3.1298828125e-01, ..., -5.4150390625e-01, -5.3613281250e-01, 1.9836425781e-01]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-0.0119018555, 0.0015945435, -0.0010452271, ..., -0.0003910065, 0.0015029907, -0.0015106201], [-0.0142822266, -0.0049438477, 0.0106201172, ..., -0.0164794922, 0.0082397461, -0.0002498627], [ 0.0161132812, 0.0095825195, 0.0114135742, ..., 0.0008850098, -0.0012664795, 0.0119018555], ..., [ 0.0383300781, 0.0007209778, -0.0052185059, ..., 0.0031585693, 0.0004558563, 0.0014114380], [-0.0084228516, 0.0233154297, 0.0415039062, ..., -0.0336914062, -0.0076904297, 0.0037841797], [ 0.0094604492, -0.0095825195, -0.0241699219, ..., -0.0134887695, -0.0072326660, 0.0074157715]]], dtype=torch.float16) ------- tensor([[[-0.0119018555, 0.0015945435, -0.0010452271, ..., -0.0003910065, 0.0015029907, -0.0015106201], [-0.0142822266, -0.0049438477, 0.0106201172, ..., -0.0164794922, 0.0082397461, -0.0002498627], [ 0.0161132812, 0.0095825195, 0.0114135742, ..., 0.0008850098, -0.0012664795, 0.0119018555], ..., [ 0.0383300781, 0.0007209778, -0.0052185059, ..., 0.0031585693, 0.0004558563, 0.0014114380], [-0.0084228516, 0.0233154297, 0.0415039062, ..., -0.0336914062, -0.0076904297, 0.0037841797], [ 0.0094604492, -0.0095825195, -0.0241699219, ..., -0.0134887695, -0.0072326660, 0.0074157715]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[ 0.1987304688, 1.3876953125, 1.5214843750, ..., 1.3701171875, -1.9970703125, 0.5888671875], [-0.4895019531, -1.8574218750, -0.3186035156, ..., 0.6171875000, 1.3310546875, 2.0878906250], [ 1.4628906250, 0.3071289062, 2.8847656250, ..., 0.8242187500, -0.8891601562, 0.7475585938], ..., [-0.0937500000, -0.2207031250, 0.0635986328, ..., -0.1618652344, 0.0372924805, 0.0671997070], [ 0.3461914062, -0.6416015625, -0.5668945312, ..., -0.2702636719, 0.0480957031, -0.1885986328], [ 0.1304931641, -0.4709472656, -0.0251464844, ..., 0.1936035156, -0.6826171875, -0.3808593750]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[-65504., -65504., -65504., ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], ..., [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [-5.6152343750e-03, -2.3437500000e-02, -4.7851562500e-02, ..., 1.2145996094e-02, -1.2023925781e-02, -6.4697265625e-03], ..., [-1.0131835938e-02, -1.1413574219e-02, 4.9133300781e-03, ..., 2.1972656250e-02, -2.7587890625e-02, 3.5644531250e-02], [ 5.9204101562e-03, -3.8909912109e-03, -1.3671875000e-02, ..., 4.7302246094e-03, -1.9989013672e-03, -3.1250000000e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [-5.6152343750e-03, -2.3437500000e-02, -4.7851562500e-02, ..., 1.2145996094e-02, -1.2023925781e-02, -6.4697265625e-03], ..., [-1.0131835938e-02, -1.1413574219e-02, 4.9133300781e-03, ..., 2.1972656250e-02, -2.7587890625e-02, 3.5644531250e-02], [ 5.9204101562e-03, -3.8909912109e-03, -1.3671875000e-02, ..., 4.7302246094e-03, -1.9989013672e-03, -3.1250000000e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[-0.2424316406, -0.0531311035, 0.6923828125, ..., -0.1911621094, -0.5605468750, 0.6718750000], [-0.0639648438, -0.0075683594, 1.5800781250, ..., -0.8359375000, 0.3369140625, 1.6279296875], [ 0.3081054688, 0.7089843750, 3.6191406250, ..., -0.6899414062, 0.5976562500, 1.0683593750], ..., [-0.9428710938, -0.1617431641, -0.2257080078, ..., -0.0220336914, -0.2680664062, -0.5678710938], [ 0.0424804688, -0.2014160156, 0.2058105469, ..., 0.3264160156, -0.0439453125, 0.1564941406], [ 0.0717773438, -0.0466308594, 0.5312500000, ..., -0.1549072266, 0.0742187500, 0.2705078125]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[-65504., -65504., -65504., ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], ..., [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 3.0639648438e-02, 5.9509277344e-03, 2.9373168945e-04, ..., -2.4902343750e-02, 4.6081542969e-03, -2.0019531250e-02], ..., [-1.4282226562e-02, -8.2778930664e-04, 5.7067871094e-03, ..., 1.0803222656e-02, 1.0986328125e-02, -1.8310546875e-04], [ 2.0751953125e-02, 1.8554687500e-02, -4.9209594727e-04, ..., 8.9111328125e-03, 5.7678222656e-03, 2.2338867188e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 3.0639648438e-02, 5.9509277344e-03, 2.9373168945e-04, ..., -2.4902343750e-02, 4.6081542969e-03, -2.0019531250e-02], ..., [-1.4282226562e-02, -8.2778930664e-04, 5.7067871094e-03, ..., 1.0803222656e-02, 1.0986328125e-02, -1.8310546875e-04], [ 2.0751953125e-02, 1.8554687500e-02, -4.9209594727e-04, ..., 8.9111328125e-03, 5.7678222656e-03, 2.2338867188e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[ 0.0787963867, 0.0902099609, 0.3061523438, ..., -0.0366210938, -1.1044921875, -1.0576171875], [ 0.1137695312, -2.2460937500, 0.1201171875, ..., -0.5234375000, 0.6352539062, -0.2207031250], [-1.4560546875, -2.3515625000, 1.3271484375, ..., -0.6582031250, 0.9003906250, 0.5737304688], ..., [ 0.1365966797, -0.0849609375, -0.1791992188, ..., -0.2355957031, -0.0215454102, -0.1899414062], [-0.1916503906, 0.4147949219, 0.4404296875, ..., -0.0275878906, -0.4833984375, -0.5273437500], [ 1.0146484375, 0.2800292969, 0.3503417969, ..., -0.2944335938, -0.5449218750, -0.7490234375]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[-65504., -65504., -65504., ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], ..., [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 1.8188476562e-02, -3.4667968750e-02, 2.1850585938e-02, ..., -1.2817382812e-02, -1.7578125000e-02, 1.5869140625e-02], ..., [-3.1127929688e-03, 9.3994140625e-03, 1.1413574219e-02, ..., 4.8522949219e-03, -6.0729980469e-03, -1.4648437500e-02], [-1.7929077148e-03, 8.0566406250e-03, 2.0996093750e-02, ..., 1.4831542969e-02, -7.7209472656e-03, 1.2207031250e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], dtype=torch.float16) ------- tensor([[[-1.1901855469e-02, 1.5945434570e-03, -1.0452270508e-03, ..., -3.9100646973e-04, 1.5029907227e-03, -1.5106201172e-03], [-1.4282226562e-02, -4.9438476562e-03, 1.0620117188e-02, ..., -1.6479492188e-02, 8.2397460938e-03, -2.4986267090e-04], [ 1.8188476562e-02, -3.4667968750e-02, 2.1850585938e-02, ..., -1.2817382812e-02, -1.7578125000e-02, 1.5869140625e-02], ..., [-3.1127929688e-03, 9.3994140625e-03, 1.1413574219e-02, ..., 4.8522949219e-03, -6.0729980469e-03, -1.4648437500e-02], [-1.7929077148e-03, 8.0566406250e-03, 2.0996093750e-02, ..., 1.4831542969e-02, -7.7209472656e-03, 1.2207031250e-02], [ 6.7138671875e-03, -4.3487548828e-04, 7.4005126953e-04, ..., -1.7547607422e-04, -8.1062316895e-05, 4.5204162598e-04]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[ 0.2829589844, 0.2500000000, -0.1588134766, ..., -0.2519531250, -0.6625976562, -0.0248870850], [ 0.1928710938, -0.6953125000, 1.4423828125, ..., 0.9155273438, -0.5004882812, 0.9707031250], [ 0.0472717285, -2.1621093750, 0.7460937500, ..., -0.4575195312, 0.9902343750, 1.4794921875], ..., [ 0.2814941406, -0.3686523438, -0.1894531250, ..., -0.1846923828, 0.3049316406, 0.7558593750], [ 0.4973144531, -0.1473388672, -0.1180419922, ..., -0.1779785156, 0.1027221680, 0.2939453125], [ 0.4245605469, 0.0838623047, -0.2998046875, ..., 0.1303710938, 0.1687011719, 0.3686523438]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- . ** Perplexity (quant_only): nan ------- tensor([[[-0.0056457520, -0.0036468506, 0.0032806396, ..., -0.0206298828, 0.0004711151, 0.0296630859], [ 0.0644531250, -0.0043640137, -0.0190429688, ..., 0.0290527344, -0.0029144287, 0.0010223389], [-0.0108032227, -0.0004615784, 0.0052795410, ..., -0.0036926270, -0.0133056641, -0.0439453125], ..., [ 0.0100708008, 0.0008316040, 0.0003833771, ..., 0.0003814697, 0.0011749268, 0.0005722046], [-0.0262451172, 0.0014343262, -0.0203857422, ..., 0.0058593750, -0.0034942627, 0.0032653809], [ 0.0075378418, 0.0039672852, 0.0439453125, ..., -0.0110473633, 0.0025787354, -0.0101928711]]], dtype=torch.float16) ------- tensor([[[-0.0056457520, -0.0036468506, 0.0032806396, ..., -0.0206298828, 0.0004711151, 0.0296630859], [ 0.0644531250, -0.0043640137, -0.0190429688, ..., 0.0290527344, -0.0029144287, 0.0010223389], [-0.0108032227, -0.0004615784, 0.0052795410, ..., -0.0036926270, -0.0133056641, -0.0439453125], ..., [ 0.0100708008, 0.0008316040, 0.0003833771, ..., 0.0003814697, 0.0011749268, 0.0005722046], [-0.0262451172, 0.0014343262, -0.0203857422, ..., 0.0058593750, -0.0034942627, 0.0032653809], [ 0.0075378418, 0.0039672852, 0.0439453125, ..., -0.0110473633, 0.0025787354, -0.0101928711]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[ 0.5556640625, 0.0631713867, 0.0404052734, ..., -0.4077148438, -0.1899414062, 0.2053222656], [ 0.2298583984, 0.7314453125, -0.1241455078, ..., 0.4804687500, -0.6445312500, -1.0175781250], [ 0.0356445312, -0.2668457031, -0.0814208984, ..., 0.3007812500, -2.0156250000, -1.5292968750], ..., [-0.4628906250, 0.4658203125, 0.0866699219, ..., 1.8544921875, 0.0871582031, 0.2482910156], [ 0.5126953125, 0.3896484375, -0.0450439453, ..., 0.0070800781, -1.3701171875, -0.6748046875], [ 0.4973144531, 0.5654296875, 0.0598144531, ..., 0.2492675781, 0.2214355469, 0.2408447266]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[ 0., -65504., -65504., ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], ..., [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- ------- tensor([[[-4.9072265625e-02, -2.9683113098e-05, -5.1574707031e-03, ..., 4.0054321289e-04, 3.2806396484e-04, -2.8381347656e-03]]], dtype=torch.float16) ------- tensor([[[-4.9072265625e-02, -2.9683113098e-05, -5.1574707031e-03, ..., 4.0054321289e-04, 3.2806396484e-04, -2.8381347656e-03]]], device='cuda:0', dtype=torch.float16) ------- ------- tensor([[[ 0.6254882812, -0.0958862305, 0.2409667969, ..., -0.5498046875, -0.5815429688, -0.5849609375]]], device='cuda:0', dtype=torch.float16) ------- tensor([[[nan, nan, nan, ..., nan, nan, nan]]], device='cuda:1', dtype=torch.float16) ------- Traceback (most recent call last): File "/home/john/Projects/Python/GLaDOS/exllama/test_benchmark_inference.py", line 294, in text = generator.generate_simple("To be or not to be, that is the", max_new_tokens = 20) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/john/Projects/Python/GLaDOS/exllama/generator.py", line 179, in generate_simple token = self.gen_single_token() ^^^^^^^^^^^^^^^^^^^^^^^ File "/home/john/Projects/Python/GLaDOS/exllama/generator.py", line 202, in gen_single_token token, _ = self.sample(logits, ^^^^^^^^^^^^^^^^^^^ File "/home/john/Projects/Python/GLaDOS/exllama/generator.py", line 77, in sample sampled_ind = torch.multinomial(norm_probs, norm_probs.shape[-1] if num == -1 else min(num, norm_probs.shape[-1])) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ RuntimeError: probability tensor contains either `inf`, `nan` or element < 0