// // Generated by NVIDIA NVVM Compiler // // Compiler Build ID: CL-34431801 // Cuda compilation tools, release 12.6, V12.6.20 // Based on NVVM 7.0.1 // .version 8.5 .target sm_90a .address_size 64 // _ZZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_37050442nvfuser_inner_outer_persistent_f0_c1_r0_g0ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IS1_Li1ELi1EEES2_S5_S5_S4_S4_NS0_IxLi1ELi1EEEE14nvfuser_zero_s has been demoted .global .align 1 .u8 _ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_3705043std17integral_constantIbLb0EE5valueE; .global .align 1 .u8 _ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_3705043std17integral_constantIbLb1EE5valueE = 1; .global .align 1 .u8 _ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_3705043std14__numeric_typeIvE5valueE = 1; .extern .shared .align 16 .b8 _ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_3705045arrayE[]; .entry _ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_37050442nvfuser_inner_outer_persistent_f0_c1_r0_g0ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IS1_Li1ELi1EEES2_S5_S5_S4_S4_NS0_IxLi1ELi1EEE( .param .align 8 .b8 _ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_37050442nvfuser_inner_outer_persistent_f0_c1_r0_g0ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IS1_Li1ELi1EEES2_S5_S5_S4_S4_NS0_IxLi1ELi1EEE_param_0[24], .param .align 8 .b8 _ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_37050442nvfuser_inner_outer_persistent_f0_c1_r0_g0ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IS1_Li1ELi1EEES2_S5_S5_S4_S4_NS0_IxLi1ELi1EEE_param_1[24], .param .align 8 .b8 _ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_37050442nvfuser_inner_outer_persistent_f0_c1_r0_g0ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IS1_Li1ELi1EEES2_S5_S5_S4_S4_NS0_IxLi1ELi1EEE_param_2[16], .param .align 8 .b8 _ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_37050442nvfuser_inner_outer_persistent_f0_c1_r0_g0ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IS1_Li1ELi1EEES2_S5_S5_S4_S4_NS0_IxLi1ELi1EEE_param_3[24], .param .align 8 .b8 _ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_37050442nvfuser_inner_outer_persistent_f0_c1_r0_g0ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IS1_Li1ELi1EEES2_S5_S5_S4_S4_NS0_IxLi1ELi1EEE_param_4[16], .param .align 8 .b8 _ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_37050442nvfuser_inner_outer_persistent_f0_c1_r0_g0ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IS1_Li1ELi1EEES2_S5_S5_S4_S4_NS0_IxLi1ELi1EEE_param_5[24], .param .align 8 .b8 _ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_37050442nvfuser_inner_outer_persistent_f0_c1_r0_g0ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IS1_Li1ELi1EEES2_S5_S5_S4_S4_NS0_IxLi1ELi1EEE_param_6[16], .param .align 8 .b8 _ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_37050442nvfuser_inner_outer_persistent_f0_c1_r0_g0ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IS1_Li1ELi1EEES2_S5_S5_S4_S4_NS0_IxLi1ELi1EEE_param_7[16], .param .align 8 .b8 _ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_37050442nvfuser_inner_outer_persistent_f0_c1_r0_g0ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IS1_Li1ELi1EEES2_S5_S5_S4_S4_NS0_IxLi1ELi1EEE_param_8[24], .param .align 8 .b8 _ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_37050442nvfuser_inner_outer_persistent_f0_c1_r0_g0ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IS1_Li1ELi1EEES2_S5_S5_S4_S4_NS0_IxLi1ELi1EEE_param_9[24], .param .align 8 .b8 _ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_37050442nvfuser_inner_outer_persistent_f0_c1_r0_g0ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IS1_Li1ELi1EEES2_S5_S5_S4_S4_NS0_IxLi1ELi1EEE_param_10[16] ) { .reg .pred %p<348>; .reg .b16 %rs<1013>; .reg .f32 %f<2040>; .reg .b32 %r<2636>; .reg .b64 %rd<330>; // demoted variable .shared .align 4 .u32 _ZZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_37050442nvfuser_inner_outer_persistent_f0_c1_r0_g0ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IS1_Li1ELi1EEES2_S5_S5_S4_S4_NS0_IxLi1ELi1EEEE14nvfuser_zero_s; ld.param.v2.u32 {%r919, %r920}, [_ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_37050442nvfuser_inner_outer_persistent_f0_c1_r0_g0ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IS1_Li1ELi1EEES2_S5_S5_S4_S4_NS0_IxLi1ELi1EEE_param_0+8]; ld.param.u64 %rd25, [_ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_37050442nvfuser_inner_outer_persistent_f0_c1_r0_g0ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IS1_Li1ELi1EEES2_S5_S5_S4_S4_NS0_IxLi1ELi1EEE_param_9]; ld.param.u64 %rd24, [_ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_37050442nvfuser_inner_outer_persistent_f0_c1_r0_g0ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IS1_Li1ELi1EEES2_S5_S5_S4_S4_NS0_IxLi1ELi1EEE_param_8]; ld.param.u64 %rd21, [_ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_37050442nvfuser_inner_outer_persistent_f0_c1_r0_g0ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IS1_Li1ELi1EEES2_S5_S5_S4_S4_NS0_IxLi1ELi1EEE_param_5]; ld.param.u64 %rd20, [_ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_37050442nvfuser_inner_outer_persistent_f0_c1_r0_g0ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IS1_Li1ELi1EEES2_S5_S5_S4_S4_NS0_IxLi1ELi1EEE_param_4]; ld.param.u64 %rd19, [_ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_37050442nvfuser_inner_outer_persistent_f0_c1_r0_g0ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IS1_Li1ELi1EEES2_S5_S5_S4_S4_NS0_IxLi1ELi1EEE_param_3]; ld.param.u64 %rd18, [_ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_37050442nvfuser_inner_outer_persistent_f0_c1_r0_g0ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IS1_Li1ELi1EEES2_S5_S5_S4_S4_NS0_IxLi1ELi1EEE_param_2]; ld.param.u64 %rd17, [_ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_37050442nvfuser_inner_outer_persistent_f0_c1_r0_g0ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IS1_Li1ELi1EEES2_S5_S5_S4_S4_NS0_IxLi1ELi1EEE_param_1]; ld.param.u64 %rd16, [_ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_37050442nvfuser_inner_outer_persistent_f0_c1_r0_g0ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IS1_Li1ELi1EEES2_S5_S5_S4_S4_NS0_IxLi1ELi1EEE_param_0]; mov.u32 %r1, %tid.x; setp.ne.s32 %p3, %r1, 0; @%p3 bra $L__BB0_2; mov.u32 %r953, 0; st.shared.u32 [_ZZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_37050442nvfuser_inner_outer_persistent_f0_c1_r0_g0ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IS1_Li1ELi1EEES2_S5_S5_S4_S4_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r953; $L__BB0_2: bar.sync 0; mov.u64 %rd27, _ZZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_37050442nvfuser_inner_outer_persistent_f0_c1_r0_g0ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IS1_Li1ELi1EEES2_S5_S5_S4_S4_NS0_IxLi1ELi1EEEE14nvfuser_zero_s; atom.shared.min.s32 %r954, [%rd27], %r1; ld.shared.u32 %r4, [_ZZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_37050442nvfuser_inner_outer_persistent_f0_c1_r0_g0ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IS1_Li1ELi1EEES2_S5_S5_S4_S4_NS0_IxLi1ELi1EEEE14nvfuser_zero_s]; mov.u32 %r955, %nctaid.y; add.s32 %r956, %r955, -1; add.s32 %r957, %r956, %r919; div.s32 %r5, %r957, %r955; add.s32 %r958, %r920, 3; shr.s32 %r959, %r958, 31; shr.u32 %r960, %r959, 30; add.s32 %r961, %r958, %r960; shr.s32 %r962, %r961, 2; mov.u32 %r6, %ntid.x; add.s32 %r963, %r6, -1; add.s32 %r964, %r963, %r962; div.s32 %r965, %r964, %r6; add.s32 %r966, %r956, %r965; mov.u32 %r7, %ntid.y; add.s32 %r967, %r956, %r7; shl.b32 %r968, %r6, 3; mov.u32 %r8, %tid.y; shl.b32 %r969, %r1, 3; mad.lo.s32 %r9, %r968, %r8, %r969; add.s32 %r970, %r920, 7; shr.s32 %r971, %r970, 31; shr.u32 %r972, %r971, 29; add.s32 %r973, %r970, %r972; shr.s32 %r974, %r973, 3; add.s32 %r975, %r963, %r974; div.s32 %r976, %r975, %r6; add.s32 %r977, %r976, 7; shr.s32 %r978, %r977, 31; shr.u32 %r979, %r978, 29; add.s32 %r980, %r977, %r979; shr.s32 %r10, %r980, 3; or.b32 %r981, %r9, 7; mov.f32 %f109, 0f00000000; // begin inline asm { cvt.rn.bf16.f32 %rs132, %f109;} // end inline asm mov.b32 %r2156, {%rs132, %rs132}; div.s32 %r12, %r966, %r955; div.s32 %r13, %r967, %r7; mul.lo.s32 %r14, %r10, %r968; setp.ge.s32 %p4, %r8, %r10; sub.s32 %r15, %r981, %r920; cvt.rn.f32.s32 %f110, %r920; rcp.rn.f32 %f1, %f110; mul.lo.s32 %r982, %r4, %r14; shl.b32 %r16, %r982, 1; neg.s32 %r983, %r16; setp.ge.s32 %p5, %r15, %r983; or.pred %p6, %p4, %p5; mov.u32 %r2128, %r2156; mov.u32 %r2129, %r2156; mov.u32 %r2130, %r2156; mov.u32 %r2131, %r2156; @%p6 bra $L__BB0_4; add.s32 %r988, %r16, %r9; mul.wide.s32 %rd29, %r988, 2; add.s64 %rd28, %rd20, %rd29; // begin inline asm ld.global.cs.v4.u32 {%r2131,%r2130,%r2129,%r2128}, [%rd28]; // end inline asm $L__BB0_4: add.s32 %r25, %r16, %r14; neg.s32 %r989, %r25; setp.ge.s32 %p7, %r15, %r989; or.pred %p9, %p4, %p7; mov.u32 %r2132, %r2156; mov.u32 %r2133, %r2156; mov.u32 %r2134, %r2156; mov.u32 %r2135, %r2156; @%p9 bra $L__BB0_6; add.s32 %r994, %r25, %r9; mul.wide.s32 %rd31, %r994, 2; add.s64 %rd30, %rd20, %rd31; // begin inline asm ld.global.cs.v4.u32 {%r2135,%r2134,%r2133,%r2132}, [%rd30]; // end inline asm $L__BB0_6: add.s32 %r34, %r25, %r14; neg.s32 %r995, %r34; setp.ge.s32 %p10, %r15, %r995; or.pred %p12, %p4, %p10; mov.u32 %r2136, %r2156; mov.u32 %r2137, %r2156; mov.u32 %r2138, %r2156; mov.u32 %r2139, %r2156; @%p12 bra $L__BB0_8; add.s32 %r1000, %r34, %r9; mul.wide.s32 %rd33, %r1000, 2; add.s64 %rd32, %rd20, %rd33; // begin inline asm ld.global.cs.v4.u32 {%r2139,%r2138,%r2137,%r2136}, [%rd32]; // end inline asm $L__BB0_8: add.s32 %r43, %r34, %r14; neg.s32 %r1001, %r43; setp.ge.s32 %p13, %r15, %r1001; or.pred %p15, %p4, %p13; mov.u32 %r2140, %r2156; mov.u32 %r2141, %r2156; mov.u32 %r2142, %r2156; mov.u32 %r2143, %r2156; @%p15 bra $L__BB0_10; add.s32 %r1006, %r43, %r9; mul.wide.s32 %rd35, %r1006, 2; add.s64 %rd34, %rd20, %rd35; // begin inline asm ld.global.cs.v4.u32 {%r2143,%r2142,%r2141,%r2140}, [%rd34]; // end inline asm $L__BB0_10: add.s32 %r52, %r43, %r14; neg.s32 %r1007, %r52; setp.ge.s32 %p16, %r15, %r1007; or.pred %p18, %p4, %p16; mov.u32 %r2144, %r2156; mov.u32 %r2145, %r2156; mov.u32 %r2146, %r2156; mov.u32 %r2147, %r2156; @%p18 bra $L__BB0_12; add.s32 %r1012, %r52, %r9; mul.wide.s32 %rd37, %r1012, 2; add.s64 %rd36, %rd20, %rd37; // begin inline asm ld.global.cs.v4.u32 {%r2147,%r2146,%r2145,%r2144}, [%rd36]; // end inline asm $L__BB0_12: add.s32 %r61, %r52, %r14; neg.s32 %r1013, %r61; setp.ge.s32 %p19, %r15, %r1013; or.pred %p21, %p4, %p19; mov.u32 %r2148, %r2156; mov.u32 %r2149, %r2156; mov.u32 %r2150, %r2156; mov.u32 %r2151, %r2156; @%p21 bra $L__BB0_14; add.s32 %r1018, %r61, %r9; mul.wide.s32 %rd39, %r1018, 2; add.s64 %rd38, %rd20, %rd39; // begin inline asm ld.global.cs.v4.u32 {%r2151,%r2150,%r2149,%r2148}, [%rd38]; // end inline asm $L__BB0_14: add.s32 %r70, %r61, %r14; neg.s32 %r1019, %r70; setp.ge.s32 %p22, %r15, %r1019; or.pred %p24, %p4, %p22; mov.u32 %r2152, %r2156; mov.u32 %r2153, %r2156; mov.u32 %r2154, %r2156; mov.u32 %r2155, %r2156; @%p24 bra $L__BB0_16; add.s32 %r1024, %r70, %r9; mul.wide.s32 %rd41, %r1024, 2; add.s64 %rd40, %rd20, %rd41; // begin inline asm ld.global.cs.v4.u32 {%r2155,%r2154,%r2153,%r2152}, [%rd40]; // end inline asm $L__BB0_16: add.s32 %r79, %r70, %r14; neg.s32 %r1025, %r79; setp.ge.s32 %p25, %r15, %r1025; or.pred %p27, %p4, %p25; mov.u32 %r2157, %r2156; mov.u32 %r2158, %r2156; mov.u32 %r2159, %r2156; @%p27 bra $L__BB0_18; add.s32 %r1030, %r79, %r9; mul.wide.s32 %rd43, %r1030, 2; add.s64 %rd42, %rd20, %rd43; // begin inline asm ld.global.cs.v4.u32 {%r2159,%r2158,%r2157,%r2156}, [%rd42]; // end inline asm $L__BB0_18: shl.b32 %r2612, %r4, 4; setp.gt.s32 %p28, %r5, 0; @%p28 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: mul.lo.s32 %r1288, %r6, %r7; clz.b32 %r1289, %r1288; mov.u32 %r1290, 31; sub.s32 %r1291, %r1290, %r1289; mov.u32 %r1292, 1; shl.b32 %r89, %r1292, %r1291; shr.u32 %r1293, %r89, 31; add.s32 %r1294, %r89, %r1293; shr.s32 %r90, %r1294, 1; mov.b32 {%rs1, %rs2}, %r2131; mov.b32 {%rs3, %rs4}, %r2130; mov.b32 {%rs5, %rs6}, %r2129; mov.b32 {%rs7, %rs8}, %r2128; mov.b32 {%rs9, %rs10}, %r2135; mov.b32 {%rs11, %rs12}, %r2134; mov.b32 {%rs13, %rs14}, %r2133; mov.b32 {%rs15, %rs16}, %r2132; mov.b32 {%rs17, %rs18}, %r2139; mov.b32 {%rs19, %rs20}, %r2138; mov.b32 {%rs21, %rs22}, %r2137; mov.b32 {%rs23, %rs24}, %r2136; mov.b32 {%rs25, %rs26}, %r2143; mov.b32 {%rs27, %rs28}, %r2142; mov.b32 {%rs29, %rs30}, %r2141; mov.b32 {%rs31, %rs32}, %r2140; mov.b32 {%rs33, %rs34}, %r2147; mov.b32 {%rs35, %rs36}, %r2146; mov.b32 {%rs37, %rs38}, %r2145; mov.b32 {%rs39, %rs40}, %r2144; mov.b32 {%rs41, %rs42}, %r2151; mov.b32 {%rs43, %rs44}, %r2150; mov.b32 {%rs45, %rs46}, %r2149; mov.b32 {%rs47, %rs48}, %r2148; mov.b32 {%rs49, %rs50}, %r2155; mov.b32 {%rs51, %rs52}, %r2154; mov.b32 {%rs53, %rs54}, %r2153; mov.b32 {%rs55, %rs56}, %r2152; mov.b32 {%rs57, %rs58}, %r2159; mov.b32 {%rs59, %rs60}, %r2158; mov.b32 {%rs61, %rs62}, %r2157; mov.b32 {%rs63, %rs64}, %r2156; cvta.to.global.u64 %rd5, %rd18; cvta.to.global.u64 %rd6, %rd19; mov.u32 %r2466, 0; // begin inline asm { mov.b32 %f155, {0,%rs1};} // end inline asm // begin inline asm { mov.b32 %f158, {0,%rs2};} // end inline asm // begin inline asm { mov.b32 %f161, {0,%rs3};} // end inline asm // begin inline asm { mov.b32 %f164, {0,%rs4};} // end inline asm // begin inline asm { mov.b32 %f167, {0,%rs5};} // end inline asm // begin inline asm { mov.b32 %f170, {0,%rs6};} // end inline asm // begin inline asm { mov.b32 %f173, {0,%rs7};} // end inline asm // begin inline asm { mov.b32 %f176, {0,%rs8};} // end inline asm // begin inline asm { mov.b32 %f281, {0,%rs9};} // end inline asm // begin inline asm { mov.b32 %f284, {0,%rs10};} // end inline asm // begin inline asm { mov.b32 %f287, {0,%rs11};} // end inline asm // begin inline asm { mov.b32 %f290, {0,%rs12};} // end inline asm // begin inline asm { mov.b32 %f293, {0,%rs13};} // end inline asm // begin inline asm { mov.b32 %f296, {0,%rs14};} // end inline asm // begin inline asm { mov.b32 %f299, {0,%rs15};} // end inline asm // begin inline asm { mov.b32 %f302, {0,%rs16};} // end inline asm // begin inline asm { mov.b32 %f406, {0,%rs17};} // end inline asm // begin inline asm { mov.b32 %f409, {0,%rs18};} // end inline asm // begin inline asm { mov.b32 %f412, {0,%rs19};} // end inline asm // begin inline asm { mov.b32 %f415, {0,%rs20};} // end inline asm // begin inline asm { mov.b32 %f418, {0,%rs21};} // end inline asm // begin inline asm { mov.b32 %f421, {0,%rs22};} // end inline asm // begin inline asm { mov.b32 %f424, {0,%rs23};} // end inline asm // begin inline asm { mov.b32 %f427, {0,%rs24};} // end inline asm // begin inline asm { mov.b32 %f531, {0,%rs25};} // end inline asm // begin inline asm { mov.b32 %f534, {0,%rs26};} // end inline asm // begin inline asm { mov.b32 %f537, {0,%rs27};} // end inline asm // begin inline asm { mov.b32 %f540, {0,%rs28};} // end inline asm // begin inline asm { mov.b32 %f543, {0,%rs29};} // end inline asm // begin inline asm { mov.b32 %f546, {0,%rs30};} // end inline asm // begin inline asm { mov.b32 %f549, {0,%rs31};} // end inline asm // begin inline asm { mov.b32 %f552, {0,%rs32};} // end inline asm // begin inline asm { mov.b32 %f656, {0,%rs33};} // end inline asm // begin inline asm { mov.b32 %f659, {0,%rs34};} // end inline asm // begin inline asm { mov.b32 %f662, {0,%rs35};} // end inline asm // begin inline asm { mov.b32 %f665, {0,%rs36};} // end inline asm // begin inline asm { mov.b32 %f668, {0,%rs37};} // end inline asm // begin inline asm { mov.b32 %f671, {0,%rs38};} // end inline asm // begin inline asm { mov.b32 %f674, {0,%rs39};} // end inline asm // begin inline asm { mov.b32 %f677, {0,%rs40};} // end inline asm // begin inline asm { mov.b32 %f781, {0,%rs41};} // end inline asm // begin inline asm { mov.b32 %f784, {0,%rs42};} // end inline asm // begin inline asm { mov.b32 %f787, {0,%rs43};} // end inline asm // begin inline asm { mov.b32 %f790, {0,%rs44};} // end inline asm // begin inline asm { mov.b32 %f793, {0,%rs45};} // end inline asm // begin inline asm { mov.b32 %f796, {0,%rs46};} // end inline asm // begin inline asm { mov.b32 %f799, {0,%rs47};} // end inline asm // begin inline asm { mov.b32 %f802, {0,%rs48};} // end inline asm // begin inline asm { mov.b32 %f906, {0,%rs49};} // end inline asm // begin inline asm { mov.b32 %f909, {0,%rs50};} // end inline asm // begin inline asm { mov.b32 %f912, {0,%rs51};} // end inline asm // begin inline asm { mov.b32 %f915, {0,%rs52};} // end inline asm // begin inline asm { mov.b32 %f918, {0,%rs53};} // end inline asm // begin inline asm { mov.b32 %f921, {0,%rs54};} // end inline asm // begin inline asm { mov.b32 %f924, {0,%rs55};} // end inline asm // begin inline asm { mov.b32 %f927, {0,%rs56};} // end inline asm // begin inline asm { mov.b32 %f1031, {0,%rs57};} // end inline asm // begin inline asm { mov.b32 %f1034, {0,%rs58};} // end inline asm // begin inline asm { mov.b32 %f1037, {0,%rs59};} // end inline asm // begin inline asm { mov.b32 %f1040, {0,%rs60};} // end inline asm // begin inline asm { mov.b32 %f1043, {0,%rs61};} // end inline asm // begin inline asm { mov.b32 %f1046, {0,%rs62};} // end inline asm // begin inline asm { mov.b32 %f1049, {0,%rs63};} // end inline asm // begin inline asm { mov.b32 %f1052, {0,%rs64};} // end inline asm mov.u32 %r2467, %r2466; mov.u32 %r2468, %r2466; mov.u32 %r2469, %r2466; mov.u32 %r2470, %r2466; mov.u32 %r2471, %r2466; mov.u32 %r2472, %r2466; mov.u32 %r2473, %r2466; mov.u32 %r2450, %r2466; mov.u32 %r2451, %r2466; mov.u32 %r2452, %r2466; mov.u32 %r2453, %r2466; mov.u32 %r2454, %r2466; mov.u32 %r2455, %r2466; mov.u32 %r2456, %r2466; mov.u32 %r2457, %r2466; mov.u32 %r2434, %r2466; mov.u32 %r2435, %r2466; mov.u32 %r2436, %r2466; mov.u32 %r2437, %r2466; mov.u32 %r2438, %r2466; mov.u32 %r2439, %r2466; mov.u32 %r2440, %r2466; mov.u32 %r2441, %r2466; mov.u32 %r2418, %r2466; mov.u32 %r2419, %r2466; mov.u32 %r2420, %r2466; mov.u32 %r2421, %r2466; mov.u32 %r2422, %r2466; mov.u32 %r2423, %r2466; mov.u32 %r2424, %r2466; mov.u32 %r2425, %r2466; mov.u32 %r2402, %r2466; mov.u32 %r2403, %r2466; mov.u32 %r2404, %r2466; mov.u32 %r2405, %r2466; mov.u32 %r2406, %r2466; mov.u32 %r2407, %r2466; mov.u32 %r2408, %r2466; mov.u32 %r2409, %r2466; mov.u32 %r2386, %r2466; mov.u32 %r2387, %r2466; mov.u32 %r2388, %r2466; mov.u32 %r2389, %r2466; mov.u32 %r2390, %r2466; mov.u32 %r2391, %r2466; mov.u32 %r2392, %r2466; mov.u32 %r2393, %r2466; mov.u32 %r2370, %r2466; mov.u32 %r2371, %r2466; mov.u32 %r2372, %r2466; mov.u32 %r2373, %r2466; mov.u32 %r2374, %r2466; mov.u32 %r2375, %r2466; mov.u32 %r2376, %r2466; mov.u32 %r2377, %r2466; mov.u32 %r2216, %r2466; mov.u32 %r2217, %r2466; mov.u32 %r2218, %r2466; mov.u32 %r2219, %r2466; mov.u32 %r2220, %r2466; mov.u32 %r2221, %r2466; mov.u32 %r2222, %r2466; mov.u32 %r2223, %r2466; mov.u32 %r2474, %r2466; mov.u32 %r2475, %r2466; mov.u32 %r2476, %r2466; mov.u32 %r2477, %r2466; mov.u32 %r2478, %r2466; mov.u32 %r2479, %r2466; mov.u32 %r2480, %r2466; mov.u32 %r2481, %r2466; mov.u32 %r2458, %r2466; mov.u32 %r2459, %r2466; mov.u32 %r2460, %r2466; mov.u32 %r2461, %r2466; mov.u32 %r2462, %r2466; mov.u32 %r2463, %r2466; mov.u32 %r2464, %r2466; mov.u32 %r2465, %r2466; mov.u32 %r2442, %r2466; mov.u32 %r2443, %r2466; mov.u32 %r2444, %r2466; mov.u32 %r2445, %r2466; mov.u32 %r2446, %r2466; mov.u32 %r2447, %r2466; mov.u32 %r2448, %r2466; mov.u32 %r2449, %r2466; mov.u32 %r2426, %r2466; mov.u32 %r2427, %r2466; mov.u32 %r2428, %r2466; mov.u32 %r2429, %r2466; mov.u32 %r2430, %r2466; mov.u32 %r2431, %r2466; mov.u32 %r2432, %r2466; mov.u32 %r2433, %r2466; mov.u32 %r2410, %r2466; mov.u32 %r2411, %r2466; mov.u32 %r2412, %r2466; mov.u32 %r2413, %r2466; mov.u32 %r2414, %r2466; mov.u32 %r2415, %r2466; mov.u32 %r2416, %r2466; mov.u32 %r2417, %r2466; mov.u32 %r2394, %r2466; mov.u32 %r2395, %r2466; mov.u32 %r2396, %r2466; mov.u32 %r2397, %r2466; mov.u32 %r2398, %r2466; mov.u32 %r2399, %r2466; mov.u32 %r2400, %r2466; mov.u32 %r2401, %r2466; mov.u32 %r2378, %r2466; mov.u32 %r2379, %r2466; mov.u32 %r2380, %r2466; mov.u32 %r2381, %r2466; mov.u32 %r2382, %r2466; mov.u32 %r2383, %r2466; mov.u32 %r2384, %r2466; mov.u32 %r2385, %r2466; mov.u32 %r2280, %r2466; mov.u32 %r2281, %r2466; mov.u32 %r2282, %r2466; mov.u32 %r2283, %r2466; mov.u32 %r2284, %r2466; mov.u32 %r2285, %r2466; mov.u32 %r2286, %r2466; mov.u32 %r2287, %r2466; mov.u32 %r2289, %r2466; $L__BB0_21: .pragma "nounroll"; mov.u32 %r1295, %tid.y; setp.lt.s32 %p29, %r1295, %r10; mov.u32 %r1296, %ctaid.y; mad.lo.s32 %r221, %r5, %r1296, %r2289; setp.lt.s32 %p30, %r221, %r919; mov.f32 %f118, 0f00000000; // begin inline asm { cvt.rn.bf16.f32 %rs140, %f118;} // end inline asm mov.b32 %r2318, {%rs140, %rs140}; mad.lo.s32 %r1301, %r968, %r1295, %r969; mad.lo.s32 %r223, %r221, %r920, %r1301; and.pred %p1, %p29, %p30; not.pred %p31, %p1; mul.lo.s32 %r1302, %r2612, %r14; shl.b32 %r224, %r1302, 1; neg.s32 %r1303, %r224; setp.ge.s32 %p32, %r15, %r1303; or.pred %p33, %p31, %p32; mov.u32 %r2290, %r2318; mov.u32 %r2291, %r2318; mov.u32 %r2292, %r2318; mov.u32 %r2293, %r2318; @%p33 bra $L__BB0_23; add.s32 %r1308, %r224, %r223; mul.wide.s32 %rd45, %r1308, 2; add.s64 %rd44, %rd16, %rd45; // begin inline asm ld.global.cs.v4.u32 {%r2293,%r2292,%r2291,%r2290}, [%rd44]; // end inline asm $L__BB0_23: add.s32 %r233, %r224, %r14; neg.s32 %r1309, %r233; setp.ge.s32 %p34, %r15, %r1309; or.pred %p36, %p31, %p34; mov.u32 %r2294, %r2318; mov.u32 %r2295, %r2318; mov.u32 %r2296, %r2318; mov.u32 %r2297, %r2318; @%p36 bra $L__BB0_25; add.s32 %r1314, %r233, %r223; mul.wide.s32 %rd47, %r1314, 2; add.s64 %rd46, %rd16, %rd47; // begin inline asm ld.global.cs.v4.u32 {%r2297,%r2296,%r2295,%r2294}, [%rd46]; // end inline asm $L__BB0_25: add.s32 %r242, %r233, %r14; neg.s32 %r1315, %r242; setp.ge.s32 %p37, %r15, %r1315; or.pred %p39, %p31, %p37; mov.u32 %r2298, %r2318; mov.u32 %r2299, %r2318; mov.u32 %r2300, %r2318; mov.u32 %r2301, %r2318; @%p39 bra $L__BB0_27; add.s32 %r1320, %r242, %r223; mul.wide.s32 %rd49, %r1320, 2; add.s64 %rd48, %rd16, %rd49; // begin inline asm ld.global.cs.v4.u32 {%r2301,%r2300,%r2299,%r2298}, [%rd48]; // end inline asm $L__BB0_27: add.s32 %r251, %r242, %r14; neg.s32 %r1321, %r251; setp.ge.s32 %p40, %r15, %r1321; or.pred %p42, %p31, %p40; mov.u32 %r2302, %r2318; mov.u32 %r2303, %r2318; mov.u32 %r2304, %r2318; mov.u32 %r2305, %r2318; @%p42 bra $L__BB0_29; add.s32 %r1326, %r251, %r223; mul.wide.s32 %rd51, %r1326, 2; add.s64 %rd50, %rd16, %rd51; // begin inline asm ld.global.cs.v4.u32 {%r2305,%r2304,%r2303,%r2302}, [%rd50]; // end inline asm $L__BB0_29: add.s32 %r260, %r251, %r14; neg.s32 %r1327, %r260; setp.ge.s32 %p43, %r15, %r1327; or.pred %p45, %p31, %p43; mov.u32 %r2306, %r2318; mov.u32 %r2307, %r2318; mov.u32 %r2308, %r2318; mov.u32 %r2309, %r2318; @%p45 bra $L__BB0_31; add.s32 %r1332, %r260, %r223; mul.wide.s32 %rd53, %r1332, 2; add.s64 %rd52, %rd16, %rd53; // begin inline asm ld.global.cs.v4.u32 {%r2309,%r2308,%r2307,%r2306}, [%rd52]; // end inline asm $L__BB0_31: add.s32 %r269, %r260, %r14; neg.s32 %r1333, %r269; setp.ge.s32 %p46, %r15, %r1333; or.pred %p48, %p31, %p46; mov.u32 %r2310, %r2318; mov.u32 %r2311, %r2318; mov.u32 %r2312, %r2318; mov.u32 %r2313, %r2318; @%p48 bra $L__BB0_33; add.s32 %r1338, %r269, %r223; mul.wide.s32 %rd55, %r1338, 2; add.s64 %rd54, %rd16, %rd55; // begin inline asm ld.global.cs.v4.u32 {%r2313,%r2312,%r2311,%r2310}, [%rd54]; // end inline asm $L__BB0_33: add.s32 %r278, %r269, %r14; neg.s32 %r1339, %r278; setp.ge.s32 %p49, %r15, %r1339; or.pred %p51, %p31, %p49; mov.u32 %r2314, %r2318; mov.u32 %r2315, %r2318; mov.u32 %r2316, %r2318; mov.u32 %r2317, %r2318; @%p51 bra $L__BB0_35; add.s32 %r1344, %r278, %r223; mul.wide.s32 %rd57, %r1344, 2; add.s64 %rd56, %rd16, %rd57; // begin inline asm ld.global.cs.v4.u32 {%r2317,%r2316,%r2315,%r2314}, [%rd56]; // end inline asm $L__BB0_35: add.s32 %r287, %r278, %r14; neg.s32 %r1345, %r287; setp.ge.s32 %p52, %r15, %r1345; or.pred %p54, %p31, %p52; mov.u32 %r2319, %r2318; mov.u32 %r2320, %r2318; mov.u32 %r2321, %r2318; @%p54 bra $L__BB0_37; add.s32 %r1350, %r287, %r223; mul.wide.s32 %rd59, %r1350, 2; add.s64 %rd58, %rd16, %rd59; // begin inline asm ld.global.cs.v4.u32 {%r2321,%r2320,%r2319,%r2318}, [%rd58]; // end inline asm $L__BB0_37: mul.lo.s32 %r2124, %r2612, %r14; mov.f32 %f126, 0f00000000; // begin inline asm { cvt.rn.bf16.f32 %rs148, %f126;} // end inline asm mov.b32 %r2350, {%rs148, %rs148}; shl.b32 %r297, %r2124, 3; neg.s32 %r1352, %r297; setp.ge.s32 %p55, %r15, %r1352; or.pred %p57, %p31, %p55; mov.u32 %r2322, %r2350; mov.u32 %r2323, %r2350; mov.u32 %r2324, %r2350; mov.u32 %r2325, %r2350; @%p57 bra $L__BB0_39; add.s32 %r1357, %r297, %r223; mul.wide.s32 %rd61, %r1357, 2; add.s64 %rd60, %rd17, %rd61; // begin inline asm ld.global.cs.v4.u32 {%r2325,%r2324,%r2323,%r2322}, [%rd60]; // end inline asm $L__BB0_39: add.s32 %r306, %r297, %r14; neg.s32 %r1358, %r306; setp.ge.s32 %p58, %r15, %r1358; or.pred %p60, %p31, %p58; mov.u32 %r2326, %r2350; mov.u32 %r2327, %r2350; mov.u32 %r2328, %r2350; mov.u32 %r2329, %r2350; @%p60 bra $L__BB0_41; add.s32 %r1363, %r306, %r223; mul.wide.s32 %rd63, %r1363, 2; add.s64 %rd62, %rd17, %rd63; // begin inline asm ld.global.cs.v4.u32 {%r2329,%r2328,%r2327,%r2326}, [%rd62]; // end inline asm $L__BB0_41: add.s32 %r315, %r306, %r14; neg.s32 %r1364, %r315; setp.ge.s32 %p61, %r15, %r1364; or.pred %p63, %p31, %p61; mov.u32 %r2330, %r2350; mov.u32 %r2331, %r2350; mov.u32 %r2332, %r2350; mov.u32 %r2333, %r2350; @%p63 bra $L__BB0_43; add.s32 %r1369, %r315, %r223; mul.wide.s32 %rd65, %r1369, 2; add.s64 %rd64, %rd17, %rd65; // begin inline asm ld.global.cs.v4.u32 {%r2333,%r2332,%r2331,%r2330}, [%rd64]; // end inline asm $L__BB0_43: add.s32 %r324, %r315, %r14; neg.s32 %r1370, %r324; setp.ge.s32 %p64, %r15, %r1370; or.pred %p66, %p31, %p64; mov.u32 %r2334, %r2350; mov.u32 %r2335, %r2350; mov.u32 %r2336, %r2350; mov.u32 %r2337, %r2350; @%p66 bra $L__BB0_45; add.s32 %r1375, %r324, %r223; mul.wide.s32 %rd67, %r1375, 2; add.s64 %rd66, %rd17, %rd67; // begin inline asm ld.global.cs.v4.u32 {%r2337,%r2336,%r2335,%r2334}, [%rd66]; // end inline asm $L__BB0_45: add.s32 %r333, %r324, %r14; neg.s32 %r1376, %r333; setp.ge.s32 %p67, %r15, %r1376; or.pred %p69, %p31, %p67; mov.u32 %r2338, %r2350; mov.u32 %r2339, %r2350; mov.u32 %r2340, %r2350; mov.u32 %r2341, %r2350; @%p69 bra $L__BB0_47; add.s32 %r1381, %r333, %r223; mul.wide.s32 %rd69, %r1381, 2; add.s64 %rd68, %rd17, %rd69; // begin inline asm ld.global.cs.v4.u32 {%r2341,%r2340,%r2339,%r2338}, [%rd68]; // end inline asm $L__BB0_47: add.s32 %r342, %r333, %r14; neg.s32 %r1382, %r342; setp.ge.s32 %p70, %r15, %r1382; or.pred %p72, %p31, %p70; mov.u32 %r2342, %r2350; mov.u32 %r2343, %r2350; mov.u32 %r2344, %r2350; mov.u32 %r2345, %r2350; @%p72 bra $L__BB0_49; add.s32 %r1387, %r342, %r223; mul.wide.s32 %rd71, %r1387, 2; add.s64 %rd70, %rd17, %rd71; // begin inline asm ld.global.cs.v4.u32 {%r2345,%r2344,%r2343,%r2342}, [%rd70]; // end inline asm $L__BB0_49: add.s32 %r351, %r342, %r14; neg.s32 %r1388, %r351; setp.ge.s32 %p73, %r15, %r1388; or.pred %p75, %p31, %p73; mov.u32 %r2346, %r2350; mov.u32 %r2347, %r2350; mov.u32 %r2348, %r2350; mov.u32 %r2349, %r2350; @%p75 bra $L__BB0_51; add.s32 %r1393, %r351, %r223; mul.wide.s32 %rd73, %r1393, 2; add.s64 %rd72, %rd17, %rd73; // begin inline asm ld.global.cs.v4.u32 {%r2349,%r2348,%r2347,%r2346}, [%rd72]; // end inline asm $L__BB0_51: add.s32 %r360, %r351, %r14; neg.s32 %r1394, %r360; setp.ge.s32 %p76, %r15, %r1394; or.pred %p78, %p31, %p76; mov.u32 %r2351, %r2350; mov.u32 %r2352, %r2350; mov.u32 %r2353, %r2350; @%p78 bra $L__BB0_53; add.s32 %r1399, %r360, %r223; mul.wide.s32 %rd75, %r1399, 2; add.s64 %rd74, %rd17, %rd75; // begin inline asm ld.global.cs.v4.u32 {%r2353,%r2352,%r2351,%r2350}, [%rd74]; // end inline asm $L__BB0_53: setp.ge.s32 %p79, %r221, %r919; mov.f32 %f1996, 0f00000000; mov.f32 %f1997, %f1996; @%p79 bra $L__BB0_55; mul.wide.s32 %rd76, %r221, 4; add.s64 %rd77, %rd5, %rd76; ld.global.f32 %f1996, [%rd77]; add.s64 %rd78, %rd6, %rd76; ld.global.f32 %f1997, [%rd78]; $L__BB0_55: neg.s32 %r1400, %r14; mul.lo.s32 %r1401, %r2612, %r1400; shl.b32 %r369, %r1401, 4; setp.lt.s32 %p80, %r15, %r369; and.pred %p81, %p1, %p80; mov.b32 {%rs149, %rs66}, %r2325; // begin inline asm { mov.b32 %f129, {0,%rs149};} // end inline asm @%p81 bra $L__BB0_57; bra.uni $L__BB0_56; $L__BB0_57: mov.b32 %f178, %r2223; add.f32 %f179, %f129, %f178; mov.b32 %r2223, %f179; mul.f32 %f180, %f129, %f155; mul.f32 %f181, %f1997, %f180; mov.f32 %f182, 0f00000000; sub.f32 %f183, %f182, %f181; mov.b32 {%rs174, %rs177}, %r2293; // begin inline asm { mov.b32 %f156, {0,%rs174};} // end inline asm sub.f32 %f184, %f156, %f1996; mul.f32 %f185, %f1997, %f184; fma.rn.f32 %f186, %f180, %f184, 0f00000000; mov.b32 %f187, %r2287; fma.rn.f32 %f188, %f129, %f185, %f187; mov.b32 %r2287, %f188; // begin inline asm { mov.b32 %f157, {0,%rs66};} // end inline asm mov.b32 %f189, %r2222; add.f32 %f190, %f157, %f189; mov.b32 %r2222, %f190; mul.f32 %f191, %f157, %f158; mul.f32 %f192, %f1997, %f191; sub.f32 %f193, %f183, %f192; // begin inline asm { mov.b32 %f159, {0,%rs177};} // end inline asm sub.f32 %f194, %f159, %f1996; mul.f32 %f195, %f1997, %f194; fma.rn.f32 %f196, %f191, %f194, %f186; mov.b32 %f197, %r2286; fma.rn.f32 %f198, %f157, %f195, %f197; mov.b32 %r2286, %f198; mov.b32 {%rs178, %rs181}, %r2324; // begin inline asm { mov.b32 %f160, {0,%rs178};} // end inline asm mov.b32 %f199, %r2221; add.f32 %f200, %f160, %f199; mov.b32 %r2221, %f200; mul.f32 %f201, %f160, %f161; mul.f32 %f202, %f1997, %f201; sub.f32 %f203, %f193, %f202; mov.b32 {%rs180, %rs183}, %r2292; // begin inline asm { mov.b32 %f162, {0,%rs180};} // end inline asm sub.f32 %f204, %f162, %f1996; mul.f32 %f205, %f1997, %f204; fma.rn.f32 %f206, %f201, %f204, %f196; mov.b32 %f207, %r2285; fma.rn.f32 %f208, %f160, %f205, %f207; mov.b32 %r2285, %f208; // begin inline asm { mov.b32 %f163, {0,%rs181};} // end inline asm mov.b32 %f209, %r2220; add.f32 %f210, %f163, %f209; mov.b32 %r2220, %f210; mul.f32 %f211, %f163, %f164; mul.f32 %f212, %f1997, %f211; sub.f32 %f213, %f203, %f212; // begin inline asm { mov.b32 %f165, {0,%rs183};} // end inline asm sub.f32 %f214, %f165, %f1996; mul.f32 %f215, %f1997, %f214; fma.rn.f32 %f216, %f211, %f214, %f206; mov.b32 %f217, %r2284; fma.rn.f32 %f218, %f163, %f215, %f217; mov.b32 %r2284, %f218; mov.b32 {%rs184, %rs187}, %r2323; // begin inline asm { mov.b32 %f166, {0,%rs184};} // end inline asm mov.b32 %f219, %r2219; add.f32 %f220, %f166, %f219; mov.b32 %r2219, %f220; mul.f32 %f221, %f166, %f167; mul.f32 %f222, %f1997, %f221; sub.f32 %f223, %f213, %f222; mov.b32 {%rs186, %rs189}, %r2291; // begin inline asm { mov.b32 %f168, {0,%rs186};} // end inline asm sub.f32 %f224, %f168, %f1996; mul.f32 %f225, %f1997, %f224; fma.rn.f32 %f226, %f221, %f224, %f216; mov.b32 %f227, %r2283; fma.rn.f32 %f228, %f166, %f225, %f227; mov.b32 %r2283, %f228; // begin inline asm { mov.b32 %f169, {0,%rs187};} // end inline asm mov.b32 %f229, %r2218; add.f32 %f230, %f169, %f229; mov.b32 %r2218, %f230; mul.f32 %f231, %f169, %f170; mul.f32 %f232, %f1997, %f231; sub.f32 %f233, %f223, %f232; // begin inline asm { mov.b32 %f171, {0,%rs189};} // end inline asm sub.f32 %f234, %f171, %f1996; mul.f32 %f235, %f1997, %f234; fma.rn.f32 %f236, %f231, %f234, %f226; mov.b32 %f237, %r2282; fma.rn.f32 %f238, %f169, %f235, %f237; mov.b32 %r2282, %f238; mov.b32 {%rs190, %rs193}, %r2322; // begin inline asm { mov.b32 %f172, {0,%rs190};} // end inline asm mov.b32 %f239, %r2217; add.f32 %f240, %f172, %f239; mov.b32 %r2217, %f240; mul.f32 %f241, %f172, %f173; mul.f32 %f242, %f1997, %f241; sub.f32 %f243, %f233, %f242; mov.b32 {%rs192, %rs195}, %r2290; // begin inline asm { mov.b32 %f174, {0,%rs192};} // end inline asm sub.f32 %f244, %f174, %f1996; mul.f32 %f245, %f1997, %f244; fma.rn.f32 %f246, %f241, %f244, %f236; mov.b32 %f247, %r2281; fma.rn.f32 %f248, %f172, %f245, %f247; mov.b32 %r2281, %f248; // begin inline asm { mov.b32 %f175, {0,%rs193};} // end inline asm mov.b32 %f249, %r2216; add.f32 %f250, %f175, %f249; mov.b32 %r2216, %f250; mul.f32 %f251, %f175, %f176; mul.f32 %f252, %f1997, %f251; sub.f32 %f2001, %f243, %f252; // begin inline asm { mov.b32 %f177, {0,%rs195};} // end inline asm sub.f32 %f253, %f177, %f1996; mul.f32 %f254, %f1997, %f253; fma.rn.f32 %f2000, %f251, %f253, %f246; mov.b32 %f255, %r2280; fma.rn.f32 %f256, %f175, %f254, %f255; mov.b32 %r2280, %f256; bra.uni $L__BB0_58; $L__BB0_56: mov.f32 %f2000, 0f00000000; mov.f32 %f2001, %f2000; $L__BB0_58: sub.s32 %r402, %r369, %r14; setp.lt.s32 %p82, %r15, %r402; and.pred %p83, %p1, %p82; mov.b32 {%rs196, %rs69}, %r2329; // begin inline asm { mov.b32 %f257, {0,%rs196};} // end inline asm @%p83 bra $L__BB0_59; bra.uni $L__BB0_60; $L__BB0_59: mov.b32 %f304, %r2377; add.f32 %f305, %f257, %f304; mov.b32 %r2377, %f305; mul.f32 %f306, %f257, %f281; mul.f32 %f307, %f1997, %f306; sub.f32 %f308, %f2001, %f307; mov.b32 {%rs221, %rs224}, %r2297; // begin inline asm { mov.b32 %f282, {0,%rs221};} // end inline asm sub.f32 %f309, %f282, %f1996; mul.f32 %f310, %f1997, %f309; fma.rn.f32 %f311, %f306, %f309, %f2000; mov.b32 %f312, %r2385; fma.rn.f32 %f313, %f257, %f310, %f312; mov.b32 %r2385, %f313; // begin inline asm { mov.b32 %f283, {0,%rs69};} // end inline asm mov.b32 %f314, %r2376; add.f32 %f315, %f283, %f314; mov.b32 %r2376, %f315; mul.f32 %f316, %f283, %f284; mul.f32 %f317, %f1997, %f316; sub.f32 %f318, %f308, %f317; // begin inline asm { mov.b32 %f285, {0,%rs224};} // end inline asm sub.f32 %f319, %f285, %f1996; mul.f32 %f320, %f1997, %f319; fma.rn.f32 %f321, %f316, %f319, %f311; mov.b32 %f322, %r2384; fma.rn.f32 %f323, %f283, %f320, %f322; mov.b32 %r2384, %f323; mov.b32 {%rs225, %rs228}, %r2328; // begin inline asm { mov.b32 %f286, {0,%rs225};} // end inline asm mov.b32 %f324, %r2375; add.f32 %f325, %f286, %f324; mov.b32 %r2375, %f325; mul.f32 %f326, %f286, %f287; mul.f32 %f327, %f1997, %f326; sub.f32 %f328, %f318, %f327; mov.b32 {%rs227, %rs230}, %r2296; // begin inline asm { mov.b32 %f288, {0,%rs227};} // end inline asm sub.f32 %f329, %f288, %f1996; mul.f32 %f330, %f1997, %f329; fma.rn.f32 %f331, %f326, %f329, %f321; mov.b32 %f332, %r2383; fma.rn.f32 %f333, %f286, %f330, %f332; mov.b32 %r2383, %f333; // begin inline asm { mov.b32 %f289, {0,%rs228};} // end inline asm mov.b32 %f334, %r2374; add.f32 %f335, %f289, %f334; mov.b32 %r2374, %f335; mul.f32 %f336, %f289, %f290; mul.f32 %f337, %f1997, %f336; sub.f32 %f338, %f328, %f337; // begin inline asm { mov.b32 %f291, {0,%rs230};} // end inline asm sub.f32 %f339, %f291, %f1996; mul.f32 %f340, %f1997, %f339; fma.rn.f32 %f341, %f336, %f339, %f331; mov.b32 %f342, %r2382; fma.rn.f32 %f343, %f289, %f340, %f342; mov.b32 %r2382, %f343; mov.b32 {%rs231, %rs234}, %r2327; // begin inline asm { mov.b32 %f292, {0,%rs231};} // end inline asm mov.b32 %f344, %r2373; add.f32 %f345, %f292, %f344; mov.b32 %r2373, %f345; mul.f32 %f346, %f292, %f293; mul.f32 %f347, %f1997, %f346; sub.f32 %f348, %f338, %f347; mov.b32 {%rs233, %rs236}, %r2295; // begin inline asm { mov.b32 %f294, {0,%rs233};} // end inline asm sub.f32 %f349, %f294, %f1996; mul.f32 %f350, %f1997, %f349; fma.rn.f32 %f351, %f346, %f349, %f341; mov.b32 %f352, %r2381; fma.rn.f32 %f353, %f292, %f350, %f352; mov.b32 %r2381, %f353; // begin inline asm { mov.b32 %f295, {0,%rs234};} // end inline asm mov.b32 %f354, %r2372; add.f32 %f355, %f295, %f354; mov.b32 %r2372, %f355; mul.f32 %f356, %f295, %f296; mul.f32 %f357, %f1997, %f356; sub.f32 %f358, %f348, %f357; // begin inline asm { mov.b32 %f297, {0,%rs236};} // end inline asm sub.f32 %f359, %f297, %f1996; mul.f32 %f360, %f1997, %f359; fma.rn.f32 %f361, %f356, %f359, %f351; mov.b32 %f362, %r2380; fma.rn.f32 %f363, %f295, %f360, %f362; mov.b32 %r2380, %f363; mov.b32 {%rs237, %rs240}, %r2326; // begin inline asm { mov.b32 %f298, {0,%rs237};} // end inline asm mov.b32 %f364, %r2371; add.f32 %f365, %f298, %f364; mov.b32 %r2371, %f365; mul.f32 %f366, %f298, %f299; mul.f32 %f367, %f1997, %f366; sub.f32 %f368, %f358, %f367; mov.b32 {%rs239, %rs242}, %r2294; // begin inline asm { mov.b32 %f300, {0,%rs239};} // end inline asm sub.f32 %f369, %f300, %f1996; mul.f32 %f370, %f1997, %f369; fma.rn.f32 %f371, %f366, %f369, %f361; mov.b32 %f372, %r2379; fma.rn.f32 %f373, %f298, %f370, %f372; mov.b32 %r2379, %f373; // begin inline asm { mov.b32 %f301, {0,%rs240};} // end inline asm mov.b32 %f374, %r2370; add.f32 %f375, %f301, %f374; mov.b32 %r2370, %f375; mul.f32 %f376, %f301, %f302; mul.f32 %f377, %f1997, %f376; sub.f32 %f2001, %f368, %f377; // begin inline asm { mov.b32 %f303, {0,%rs242};} // end inline asm sub.f32 %f378, %f303, %f1996; mul.f32 %f379, %f1997, %f378; fma.rn.f32 %f2000, %f376, %f378, %f371; mov.b32 %f380, %r2378; fma.rn.f32 %f381, %f301, %f379, %f380; mov.b32 %r2378, %f381; $L__BB0_60: sub.s32 %r435, %r402, %r14; setp.lt.s32 %p84, %r15, %r435; and.pred %p85, %p1, %p84; mov.b32 {%rs243, %rs72}, %r2333; // begin inline asm { mov.b32 %f382, {0,%rs243};} // end inline asm @%p85 bra $L__BB0_61; bra.uni $L__BB0_62; $L__BB0_61: mov.b32 %f429, %r2393; add.f32 %f430, %f382, %f429; mov.b32 %r2393, %f430; mul.f32 %f431, %f382, %f406; mul.f32 %f432, %f1997, %f431; sub.f32 %f433, %f2001, %f432; mov.b32 {%rs268, %rs271}, %r2301; // begin inline asm { mov.b32 %f407, {0,%rs268};} // end inline asm sub.f32 %f434, %f407, %f1996; mul.f32 %f435, %f1997, %f434; fma.rn.f32 %f436, %f431, %f434, %f2000; mov.b32 %f437, %r2401; fma.rn.f32 %f438, %f382, %f435, %f437; mov.b32 %r2401, %f438; // begin inline asm { mov.b32 %f408, {0,%rs72};} // end inline asm mov.b32 %f439, %r2392; add.f32 %f440, %f408, %f439; mov.b32 %r2392, %f440; mul.f32 %f441, %f408, %f409; mul.f32 %f442, %f1997, %f441; sub.f32 %f443, %f433, %f442; // begin inline asm { mov.b32 %f410, {0,%rs271};} // end inline asm sub.f32 %f444, %f410, %f1996; mul.f32 %f445, %f1997, %f444; fma.rn.f32 %f446, %f441, %f444, %f436; mov.b32 %f447, %r2400; fma.rn.f32 %f448, %f408, %f445, %f447; mov.b32 %r2400, %f448; mov.b32 {%rs272, %rs275}, %r2332; // begin inline asm { mov.b32 %f411, {0,%rs272};} // end inline asm mov.b32 %f449, %r2391; add.f32 %f450, %f411, %f449; mov.b32 %r2391, %f450; mul.f32 %f451, %f411, %f412; mul.f32 %f452, %f1997, %f451; sub.f32 %f453, %f443, %f452; mov.b32 {%rs274, %rs277}, %r2300; // begin inline asm { mov.b32 %f413, {0,%rs274};} // end inline asm sub.f32 %f454, %f413, %f1996; mul.f32 %f455, %f1997, %f454; fma.rn.f32 %f456, %f451, %f454, %f446; mov.b32 %f457, %r2399; fma.rn.f32 %f458, %f411, %f455, %f457; mov.b32 %r2399, %f458; // begin inline asm { mov.b32 %f414, {0,%rs275};} // end inline asm mov.b32 %f459, %r2390; add.f32 %f460, %f414, %f459; mov.b32 %r2390, %f460; mul.f32 %f461, %f414, %f415; mul.f32 %f462, %f1997, %f461; sub.f32 %f463, %f453, %f462; // begin inline asm { mov.b32 %f416, {0,%rs277};} // end inline asm sub.f32 %f464, %f416, %f1996; mul.f32 %f465, %f1997, %f464; fma.rn.f32 %f466, %f461, %f464, %f456; mov.b32 %f467, %r2398; fma.rn.f32 %f468, %f414, %f465, %f467; mov.b32 %r2398, %f468; mov.b32 {%rs278, %rs281}, %r2331; // begin inline asm { mov.b32 %f417, {0,%rs278};} // end inline asm mov.b32 %f469, %r2389; add.f32 %f470, %f417, %f469; mov.b32 %r2389, %f470; mul.f32 %f471, %f417, %f418; mul.f32 %f472, %f1997, %f471; sub.f32 %f473, %f463, %f472; mov.b32 {%rs280, %rs283}, %r2299; // begin inline asm { mov.b32 %f419, {0,%rs280};} // end inline asm sub.f32 %f474, %f419, %f1996; mul.f32 %f475, %f1997, %f474; fma.rn.f32 %f476, %f471, %f474, %f466; mov.b32 %f477, %r2397; fma.rn.f32 %f478, %f417, %f475, %f477; mov.b32 %r2397, %f478; // begin inline asm { mov.b32 %f420, {0,%rs281};} // end inline asm mov.b32 %f479, %r2388; add.f32 %f480, %f420, %f479; mov.b32 %r2388, %f480; mul.f32 %f481, %f420, %f421; mul.f32 %f482, %f1997, %f481; sub.f32 %f483, %f473, %f482; // begin inline asm { mov.b32 %f422, {0,%rs283};} // end inline asm sub.f32 %f484, %f422, %f1996; mul.f32 %f485, %f1997, %f484; fma.rn.f32 %f486, %f481, %f484, %f476; mov.b32 %f487, %r2396; fma.rn.f32 %f488, %f420, %f485, %f487; mov.b32 %r2396, %f488; mov.b32 {%rs284, %rs287}, %r2330; // begin inline asm { mov.b32 %f423, {0,%rs284};} // end inline asm mov.b32 %f489, %r2387; add.f32 %f490, %f423, %f489; mov.b32 %r2387, %f490; mul.f32 %f491, %f423, %f424; mul.f32 %f492, %f1997, %f491; sub.f32 %f493, %f483, %f492; mov.b32 {%rs286, %rs289}, %r2298; // begin inline asm { mov.b32 %f425, {0,%rs286};} // end inline asm sub.f32 %f494, %f425, %f1996; mul.f32 %f495, %f1997, %f494; fma.rn.f32 %f496, %f491, %f494, %f486; mov.b32 %f497, %r2395; fma.rn.f32 %f498, %f423, %f495, %f497; mov.b32 %r2395, %f498; // begin inline asm { mov.b32 %f426, {0,%rs287};} // end inline asm mov.b32 %f499, %r2386; add.f32 %f500, %f426, %f499; mov.b32 %r2386, %f500; mul.f32 %f501, %f426, %f427; mul.f32 %f502, %f1997, %f501; sub.f32 %f2001, %f493, %f502; // begin inline asm { mov.b32 %f428, {0,%rs289};} // end inline asm sub.f32 %f503, %f428, %f1996; mul.f32 %f504, %f1997, %f503; fma.rn.f32 %f2000, %f501, %f503, %f496; mov.b32 %f505, %r2394; fma.rn.f32 %f506, %f426, %f504, %f505; mov.b32 %r2394, %f506; $L__BB0_62: sub.s32 %r468, %r435, %r14; setp.lt.s32 %p86, %r15, %r468; and.pred %p87, %p1, %p86; mov.b32 {%rs290, %rs75}, %r2337; // begin inline asm { mov.b32 %f507, {0,%rs290};} // end inline asm @%p87 bra $L__BB0_63; bra.uni $L__BB0_64; $L__BB0_63: mov.b32 %f554, %r2409; add.f32 %f555, %f507, %f554; mov.b32 %r2409, %f555; mul.f32 %f556, %f507, %f531; mul.f32 %f557, %f1997, %f556; sub.f32 %f558, %f2001, %f557; mov.b32 {%rs315, %rs318}, %r2305; // begin inline asm { mov.b32 %f532, {0,%rs315};} // end inline asm sub.f32 %f559, %f532, %f1996; mul.f32 %f560, %f1997, %f559; fma.rn.f32 %f561, %f556, %f559, %f2000; mov.b32 %f562, %r2417; fma.rn.f32 %f563, %f507, %f560, %f562; mov.b32 %r2417, %f563; // begin inline asm { mov.b32 %f533, {0,%rs75};} // end inline asm mov.b32 %f564, %r2408; add.f32 %f565, %f533, %f564; mov.b32 %r2408, %f565; mul.f32 %f566, %f533, %f534; mul.f32 %f567, %f1997, %f566; sub.f32 %f568, %f558, %f567; // begin inline asm { mov.b32 %f535, {0,%rs318};} // end inline asm sub.f32 %f569, %f535, %f1996; mul.f32 %f570, %f1997, %f569; fma.rn.f32 %f571, %f566, %f569, %f561; mov.b32 %f572, %r2416; fma.rn.f32 %f573, %f533, %f570, %f572; mov.b32 %r2416, %f573; mov.b32 {%rs319, %rs322}, %r2336; // begin inline asm { mov.b32 %f536, {0,%rs319};} // end inline asm mov.b32 %f574, %r2407; add.f32 %f575, %f536, %f574; mov.b32 %r2407, %f575; mul.f32 %f576, %f536, %f537; mul.f32 %f577, %f1997, %f576; sub.f32 %f578, %f568, %f577; mov.b32 {%rs321, %rs324}, %r2304; // begin inline asm { mov.b32 %f538, {0,%rs321};} // end inline asm sub.f32 %f579, %f538, %f1996; mul.f32 %f580, %f1997, %f579; fma.rn.f32 %f581, %f576, %f579, %f571; mov.b32 %f582, %r2415; fma.rn.f32 %f583, %f536, %f580, %f582; mov.b32 %r2415, %f583; // begin inline asm { mov.b32 %f539, {0,%rs322};} // end inline asm mov.b32 %f584, %r2406; add.f32 %f585, %f539, %f584; mov.b32 %r2406, %f585; mul.f32 %f586, %f539, %f540; mul.f32 %f587, %f1997, %f586; sub.f32 %f588, %f578, %f587; // begin inline asm { mov.b32 %f541, {0,%rs324};} // end inline asm sub.f32 %f589, %f541, %f1996; mul.f32 %f590, %f1997, %f589; fma.rn.f32 %f591, %f586, %f589, %f581; mov.b32 %f592, %r2414; fma.rn.f32 %f593, %f539, %f590, %f592; mov.b32 %r2414, %f593; mov.b32 {%rs325, %rs328}, %r2335; // begin inline asm { mov.b32 %f542, {0,%rs325};} // end inline asm mov.b32 %f594, %r2405; add.f32 %f595, %f542, %f594; mov.b32 %r2405, %f595; mul.f32 %f596, %f542, %f543; mul.f32 %f597, %f1997, %f596; sub.f32 %f598, %f588, %f597; mov.b32 {%rs327, %rs330}, %r2303; // begin inline asm { mov.b32 %f544, {0,%rs327};} // end inline asm sub.f32 %f599, %f544, %f1996; mul.f32 %f600, %f1997, %f599; fma.rn.f32 %f601, %f596, %f599, %f591; mov.b32 %f602, %r2413; fma.rn.f32 %f603, %f542, %f600, %f602; mov.b32 %r2413, %f603; // begin inline asm { mov.b32 %f545, {0,%rs328};} // end inline asm mov.b32 %f604, %r2404; add.f32 %f605, %f545, %f604; mov.b32 %r2404, %f605; mul.f32 %f606, %f545, %f546; mul.f32 %f607, %f1997, %f606; sub.f32 %f608, %f598, %f607; // begin inline asm { mov.b32 %f547, {0,%rs330};} // end inline asm sub.f32 %f609, %f547, %f1996; mul.f32 %f610, %f1997, %f609; fma.rn.f32 %f611, %f606, %f609, %f601; mov.b32 %f612, %r2412; fma.rn.f32 %f613, %f545, %f610, %f612; mov.b32 %r2412, %f613; mov.b32 {%rs331, %rs334}, %r2334; // begin inline asm { mov.b32 %f548, {0,%rs331};} // end inline asm mov.b32 %f614, %r2403; add.f32 %f615, %f548, %f614; mov.b32 %r2403, %f615; mul.f32 %f616, %f548, %f549; mul.f32 %f617, %f1997, %f616; sub.f32 %f618, %f608, %f617; mov.b32 {%rs333, %rs336}, %r2302; // begin inline asm { mov.b32 %f550, {0,%rs333};} // end inline asm sub.f32 %f619, %f550, %f1996; mul.f32 %f620, %f1997, %f619; fma.rn.f32 %f621, %f616, %f619, %f611; mov.b32 %f622, %r2411; fma.rn.f32 %f623, %f548, %f620, %f622; mov.b32 %r2411, %f623; // begin inline asm { mov.b32 %f551, {0,%rs334};} // end inline asm mov.b32 %f624, %r2402; add.f32 %f625, %f551, %f624; mov.b32 %r2402, %f625; mul.f32 %f626, %f551, %f552; mul.f32 %f627, %f1997, %f626; sub.f32 %f2001, %f618, %f627; // begin inline asm { mov.b32 %f553, {0,%rs336};} // end inline asm sub.f32 %f628, %f553, %f1996; mul.f32 %f629, %f1997, %f628; fma.rn.f32 %f2000, %f626, %f628, %f621; mov.b32 %f630, %r2410; fma.rn.f32 %f631, %f551, %f629, %f630; mov.b32 %r2410, %f631; $L__BB0_64: sub.s32 %r501, %r468, %r14; setp.lt.s32 %p88, %r15, %r501; and.pred %p89, %p1, %p88; mov.b32 {%rs337, %rs78}, %r2341; // begin inline asm { mov.b32 %f632, {0,%rs337};} // end inline asm @%p89 bra $L__BB0_65; bra.uni $L__BB0_66; $L__BB0_65: mov.b32 %f679, %r2425; add.f32 %f680, %f632, %f679; mov.b32 %r2425, %f680; mul.f32 %f681, %f632, %f656; mul.f32 %f682, %f1997, %f681; sub.f32 %f683, %f2001, %f682; mov.b32 {%rs362, %rs365}, %r2309; // begin inline asm { mov.b32 %f657, {0,%rs362};} // end inline asm sub.f32 %f684, %f657, %f1996; mul.f32 %f685, %f1997, %f684; fma.rn.f32 %f686, %f681, %f684, %f2000; mov.b32 %f687, %r2433; fma.rn.f32 %f688, %f632, %f685, %f687; mov.b32 %r2433, %f688; // begin inline asm { mov.b32 %f658, {0,%rs78};} // end inline asm mov.b32 %f689, %r2424; add.f32 %f690, %f658, %f689; mov.b32 %r2424, %f690; mul.f32 %f691, %f658, %f659; mul.f32 %f692, %f1997, %f691; sub.f32 %f693, %f683, %f692; // begin inline asm { mov.b32 %f660, {0,%rs365};} // end inline asm sub.f32 %f694, %f660, %f1996; mul.f32 %f695, %f1997, %f694; fma.rn.f32 %f696, %f691, %f694, %f686; mov.b32 %f697, %r2432; fma.rn.f32 %f698, %f658, %f695, %f697; mov.b32 %r2432, %f698; mov.b32 {%rs366, %rs369}, %r2340; // begin inline asm { mov.b32 %f661, {0,%rs366};} // end inline asm mov.b32 %f699, %r2423; add.f32 %f700, %f661, %f699; mov.b32 %r2423, %f700; mul.f32 %f701, %f661, %f662; mul.f32 %f702, %f1997, %f701; sub.f32 %f703, %f693, %f702; mov.b32 {%rs368, %rs371}, %r2308; // begin inline asm { mov.b32 %f663, {0,%rs368};} // end inline asm sub.f32 %f704, %f663, %f1996; mul.f32 %f705, %f1997, %f704; fma.rn.f32 %f706, %f701, %f704, %f696; mov.b32 %f707, %r2431; fma.rn.f32 %f708, %f661, %f705, %f707; mov.b32 %r2431, %f708; // begin inline asm { mov.b32 %f664, {0,%rs369};} // end inline asm mov.b32 %f709, %r2422; add.f32 %f710, %f664, %f709; mov.b32 %r2422, %f710; mul.f32 %f711, %f664, %f665; mul.f32 %f712, %f1997, %f711; sub.f32 %f713, %f703, %f712; // begin inline asm { mov.b32 %f666, {0,%rs371};} // end inline asm sub.f32 %f714, %f666, %f1996; mul.f32 %f715, %f1997, %f714; fma.rn.f32 %f716, %f711, %f714, %f706; mov.b32 %f717, %r2430; fma.rn.f32 %f718, %f664, %f715, %f717; mov.b32 %r2430, %f718; mov.b32 {%rs372, %rs375}, %r2339; // begin inline asm { mov.b32 %f667, {0,%rs372};} // end inline asm mov.b32 %f719, %r2421; add.f32 %f720, %f667, %f719; mov.b32 %r2421, %f720; mul.f32 %f721, %f667, %f668; mul.f32 %f722, %f1997, %f721; sub.f32 %f723, %f713, %f722; mov.b32 {%rs374, %rs377}, %r2307; // begin inline asm { mov.b32 %f669, {0,%rs374};} // end inline asm sub.f32 %f724, %f669, %f1996; mul.f32 %f725, %f1997, %f724; fma.rn.f32 %f726, %f721, %f724, %f716; mov.b32 %f727, %r2429; fma.rn.f32 %f728, %f667, %f725, %f727; mov.b32 %r2429, %f728; // begin inline asm { mov.b32 %f670, {0,%rs375};} // end inline asm mov.b32 %f729, %r2420; add.f32 %f730, %f670, %f729; mov.b32 %r2420, %f730; mul.f32 %f731, %f670, %f671; mul.f32 %f732, %f1997, %f731; sub.f32 %f733, %f723, %f732; // begin inline asm { mov.b32 %f672, {0,%rs377};} // end inline asm sub.f32 %f734, %f672, %f1996; mul.f32 %f735, %f1997, %f734; fma.rn.f32 %f736, %f731, %f734, %f726; mov.b32 %f737, %r2428; fma.rn.f32 %f738, %f670, %f735, %f737; mov.b32 %r2428, %f738; mov.b32 {%rs378, %rs381}, %r2338; // begin inline asm { mov.b32 %f673, {0,%rs378};} // end inline asm mov.b32 %f739, %r2419; add.f32 %f740, %f673, %f739; mov.b32 %r2419, %f740; mul.f32 %f741, %f673, %f674; mul.f32 %f742, %f1997, %f741; sub.f32 %f743, %f733, %f742; mov.b32 {%rs380, %rs383}, %r2306; // begin inline asm { mov.b32 %f675, {0,%rs380};} // end inline asm sub.f32 %f744, %f675, %f1996; mul.f32 %f745, %f1997, %f744; fma.rn.f32 %f746, %f741, %f744, %f736; mov.b32 %f747, %r2427; fma.rn.f32 %f748, %f673, %f745, %f747; mov.b32 %r2427, %f748; // begin inline asm { mov.b32 %f676, {0,%rs381};} // end inline asm mov.b32 %f749, %r2418; add.f32 %f750, %f676, %f749; mov.b32 %r2418, %f750; mul.f32 %f751, %f676, %f677; mul.f32 %f752, %f1997, %f751; sub.f32 %f2001, %f743, %f752; // begin inline asm { mov.b32 %f678, {0,%rs383};} // end inline asm sub.f32 %f753, %f678, %f1996; mul.f32 %f754, %f1997, %f753; fma.rn.f32 %f2000, %f751, %f753, %f746; mov.b32 %f755, %r2426; fma.rn.f32 %f756, %f676, %f754, %f755; mov.b32 %r2426, %f756; $L__BB0_66: sub.s32 %r534, %r501, %r14; setp.lt.s32 %p90, %r15, %r534; and.pred %p91, %p1, %p90; mov.b32 {%rs384, %rs81}, %r2345; // begin inline asm { mov.b32 %f757, {0,%rs384};} // end inline asm @%p91 bra $L__BB0_67; bra.uni $L__BB0_68; $L__BB0_67: mov.b32 %f804, %r2441; add.f32 %f805, %f757, %f804; mov.b32 %r2441, %f805; mul.f32 %f806, %f757, %f781; mul.f32 %f807, %f1997, %f806; sub.f32 %f808, %f2001, %f807; mov.b32 {%rs409, %rs412}, %r2313; // begin inline asm { mov.b32 %f782, {0,%rs409};} // end inline asm sub.f32 %f809, %f782, %f1996; mul.f32 %f810, %f1997, %f809; fma.rn.f32 %f811, %f806, %f809, %f2000; mov.b32 %f812, %r2449; fma.rn.f32 %f813, %f757, %f810, %f812; mov.b32 %r2449, %f813; // begin inline asm { mov.b32 %f783, {0,%rs81};} // end inline asm mov.b32 %f814, %r2440; add.f32 %f815, %f783, %f814; mov.b32 %r2440, %f815; mul.f32 %f816, %f783, %f784; mul.f32 %f817, %f1997, %f816; sub.f32 %f818, %f808, %f817; // begin inline asm { mov.b32 %f785, {0,%rs412};} // end inline asm sub.f32 %f819, %f785, %f1996; mul.f32 %f820, %f1997, %f819; fma.rn.f32 %f821, %f816, %f819, %f811; mov.b32 %f822, %r2448; fma.rn.f32 %f823, %f783, %f820, %f822; mov.b32 %r2448, %f823; mov.b32 {%rs413, %rs416}, %r2344; // begin inline asm { mov.b32 %f786, {0,%rs413};} // end inline asm mov.b32 %f824, %r2439; add.f32 %f825, %f786, %f824; mov.b32 %r2439, %f825; mul.f32 %f826, %f786, %f787; mul.f32 %f827, %f1997, %f826; sub.f32 %f828, %f818, %f827; mov.b32 {%rs415, %rs418}, %r2312; // begin inline asm { mov.b32 %f788, {0,%rs415};} // end inline asm sub.f32 %f829, %f788, %f1996; mul.f32 %f830, %f1997, %f829; fma.rn.f32 %f831, %f826, %f829, %f821; mov.b32 %f832, %r2447; fma.rn.f32 %f833, %f786, %f830, %f832; mov.b32 %r2447, %f833; // begin inline asm { mov.b32 %f789, {0,%rs416};} // end inline asm mov.b32 %f834, %r2438; add.f32 %f835, %f789, %f834; mov.b32 %r2438, %f835; mul.f32 %f836, %f789, %f790; mul.f32 %f837, %f1997, %f836; sub.f32 %f838, %f828, %f837; // begin inline asm { mov.b32 %f791, {0,%rs418};} // end inline asm sub.f32 %f839, %f791, %f1996; mul.f32 %f840, %f1997, %f839; fma.rn.f32 %f841, %f836, %f839, %f831; mov.b32 %f842, %r2446; fma.rn.f32 %f843, %f789, %f840, %f842; mov.b32 %r2446, %f843; mov.b32 {%rs419, %rs422}, %r2343; // begin inline asm { mov.b32 %f792, {0,%rs419};} // end inline asm mov.b32 %f844, %r2437; add.f32 %f845, %f792, %f844; mov.b32 %r2437, %f845; mul.f32 %f846, %f792, %f793; mul.f32 %f847, %f1997, %f846; sub.f32 %f848, %f838, %f847; mov.b32 {%rs421, %rs424}, %r2311; // begin inline asm { mov.b32 %f794, {0,%rs421};} // end inline asm sub.f32 %f849, %f794, %f1996; mul.f32 %f850, %f1997, %f849; fma.rn.f32 %f851, %f846, %f849, %f841; mov.b32 %f852, %r2445; fma.rn.f32 %f853, %f792, %f850, %f852; mov.b32 %r2445, %f853; // begin inline asm { mov.b32 %f795, {0,%rs422};} // end inline asm mov.b32 %f854, %r2436; add.f32 %f855, %f795, %f854; mov.b32 %r2436, %f855; mul.f32 %f856, %f795, %f796; mul.f32 %f857, %f1997, %f856; sub.f32 %f858, %f848, %f857; // begin inline asm { mov.b32 %f797, {0,%rs424};} // end inline asm sub.f32 %f859, %f797, %f1996; mul.f32 %f860, %f1997, %f859; fma.rn.f32 %f861, %f856, %f859, %f851; mov.b32 %f862, %r2444; fma.rn.f32 %f863, %f795, %f860, %f862; mov.b32 %r2444, %f863; mov.b32 {%rs425, %rs428}, %r2342; // begin inline asm { mov.b32 %f798, {0,%rs425};} // end inline asm mov.b32 %f864, %r2435; add.f32 %f865, %f798, %f864; mov.b32 %r2435, %f865; mul.f32 %f866, %f798, %f799; mul.f32 %f867, %f1997, %f866; sub.f32 %f868, %f858, %f867; mov.b32 {%rs427, %rs430}, %r2310; // begin inline asm { mov.b32 %f800, {0,%rs427};} // end inline asm sub.f32 %f869, %f800, %f1996; mul.f32 %f870, %f1997, %f869; fma.rn.f32 %f871, %f866, %f869, %f861; mov.b32 %f872, %r2443; fma.rn.f32 %f873, %f798, %f870, %f872; mov.b32 %r2443, %f873; // begin inline asm { mov.b32 %f801, {0,%rs428};} // end inline asm mov.b32 %f874, %r2434; add.f32 %f875, %f801, %f874; mov.b32 %r2434, %f875; mul.f32 %f876, %f801, %f802; mul.f32 %f877, %f1997, %f876; sub.f32 %f2001, %f868, %f877; // begin inline asm { mov.b32 %f803, {0,%rs430};} // end inline asm sub.f32 %f878, %f803, %f1996; mul.f32 %f879, %f1997, %f878; fma.rn.f32 %f2000, %f876, %f878, %f871; mov.b32 %f880, %r2442; fma.rn.f32 %f881, %f801, %f879, %f880; mov.b32 %r2442, %f881; $L__BB0_68: sub.s32 %r567, %r534, %r14; setp.lt.s32 %p92, %r15, %r567; and.pred %p93, %p1, %p92; mov.b32 {%rs431, %rs84}, %r2349; // begin inline asm { mov.b32 %f882, {0,%rs431};} // end inline asm @%p93 bra $L__BB0_69; bra.uni $L__BB0_70; $L__BB0_69: mov.b32 %f929, %r2457; add.f32 %f930, %f882, %f929; mov.b32 %r2457, %f930; mul.f32 %f931, %f882, %f906; mul.f32 %f932, %f1997, %f931; sub.f32 %f933, %f2001, %f932; mov.b32 {%rs456, %rs459}, %r2317; // begin inline asm { mov.b32 %f907, {0,%rs456};} // end inline asm sub.f32 %f934, %f907, %f1996; mul.f32 %f935, %f1997, %f934; fma.rn.f32 %f936, %f931, %f934, %f2000; mov.b32 %f937, %r2465; fma.rn.f32 %f938, %f882, %f935, %f937; mov.b32 %r2465, %f938; // begin inline asm { mov.b32 %f908, {0,%rs84};} // end inline asm mov.b32 %f939, %r2456; add.f32 %f940, %f908, %f939; mov.b32 %r2456, %f940; mul.f32 %f941, %f908, %f909; mul.f32 %f942, %f1997, %f941; sub.f32 %f943, %f933, %f942; // begin inline asm { mov.b32 %f910, {0,%rs459};} // end inline asm sub.f32 %f944, %f910, %f1996; mul.f32 %f945, %f1997, %f944; fma.rn.f32 %f946, %f941, %f944, %f936; mov.b32 %f947, %r2464; fma.rn.f32 %f948, %f908, %f945, %f947; mov.b32 %r2464, %f948; mov.b32 {%rs460, %rs463}, %r2348; // begin inline asm { mov.b32 %f911, {0,%rs460};} // end inline asm mov.b32 %f949, %r2455; add.f32 %f950, %f911, %f949; mov.b32 %r2455, %f950; mul.f32 %f951, %f911, %f912; mul.f32 %f952, %f1997, %f951; sub.f32 %f953, %f943, %f952; mov.b32 {%rs462, %rs465}, %r2316; // begin inline asm { mov.b32 %f913, {0,%rs462};} // end inline asm sub.f32 %f954, %f913, %f1996; mul.f32 %f955, %f1997, %f954; fma.rn.f32 %f956, %f951, %f954, %f946; mov.b32 %f957, %r2463; fma.rn.f32 %f958, %f911, %f955, %f957; mov.b32 %r2463, %f958; // begin inline asm { mov.b32 %f914, {0,%rs463};} // end inline asm mov.b32 %f959, %r2454; add.f32 %f960, %f914, %f959; mov.b32 %r2454, %f960; mul.f32 %f961, %f914, %f915; mul.f32 %f962, %f1997, %f961; sub.f32 %f963, %f953, %f962; // begin inline asm { mov.b32 %f916, {0,%rs465};} // end inline asm sub.f32 %f964, %f916, %f1996; mul.f32 %f965, %f1997, %f964; fma.rn.f32 %f966, %f961, %f964, %f956; mov.b32 %f967, %r2462; fma.rn.f32 %f968, %f914, %f965, %f967; mov.b32 %r2462, %f968; mov.b32 {%rs466, %rs469}, %r2347; // begin inline asm { mov.b32 %f917, {0,%rs466};} // end inline asm mov.b32 %f969, %r2453; add.f32 %f970, %f917, %f969; mov.b32 %r2453, %f970; mul.f32 %f971, %f917, %f918; mul.f32 %f972, %f1997, %f971; sub.f32 %f973, %f963, %f972; mov.b32 {%rs468, %rs471}, %r2315; // begin inline asm { mov.b32 %f919, {0,%rs468};} // end inline asm sub.f32 %f974, %f919, %f1996; mul.f32 %f975, %f1997, %f974; fma.rn.f32 %f976, %f971, %f974, %f966; mov.b32 %f977, %r2461; fma.rn.f32 %f978, %f917, %f975, %f977; mov.b32 %r2461, %f978; // begin inline asm { mov.b32 %f920, {0,%rs469};} // end inline asm mov.b32 %f979, %r2452; add.f32 %f980, %f920, %f979; mov.b32 %r2452, %f980; mul.f32 %f981, %f920, %f921; mul.f32 %f982, %f1997, %f981; sub.f32 %f983, %f973, %f982; // begin inline asm { mov.b32 %f922, {0,%rs471};} // end inline asm sub.f32 %f984, %f922, %f1996; mul.f32 %f985, %f1997, %f984; fma.rn.f32 %f986, %f981, %f984, %f976; mov.b32 %f987, %r2460; fma.rn.f32 %f988, %f920, %f985, %f987; mov.b32 %r2460, %f988; mov.b32 {%rs472, %rs475}, %r2346; // begin inline asm { mov.b32 %f923, {0,%rs472};} // end inline asm mov.b32 %f989, %r2451; add.f32 %f990, %f923, %f989; mov.b32 %r2451, %f990; mul.f32 %f991, %f923, %f924; mul.f32 %f992, %f1997, %f991; sub.f32 %f993, %f983, %f992; mov.b32 {%rs474, %rs477}, %r2314; // begin inline asm { mov.b32 %f925, {0,%rs474};} // end inline asm sub.f32 %f994, %f925, %f1996; mul.f32 %f995, %f1997, %f994; fma.rn.f32 %f996, %f991, %f994, %f986; mov.b32 %f997, %r2459; fma.rn.f32 %f998, %f923, %f995, %f997; mov.b32 %r2459, %f998; // begin inline asm { mov.b32 %f926, {0,%rs475};} // end inline asm mov.b32 %f999, %r2450; add.f32 %f1000, %f926, %f999; mov.b32 %r2450, %f1000; mul.f32 %f1001, %f926, %f927; mul.f32 %f1002, %f1997, %f1001; sub.f32 %f2001, %f993, %f1002; // begin inline asm { mov.b32 %f928, {0,%rs477};} // end inline asm sub.f32 %f1003, %f928, %f1996; mul.f32 %f1004, %f1997, %f1003; fma.rn.f32 %f2000, %f1001, %f1003, %f996; mov.b32 %f1005, %r2458; fma.rn.f32 %f1006, %f926, %f1004, %f1005; mov.b32 %r2458, %f1006; $L__BB0_70: sub.s32 %r1402, %r567, %r14; setp.lt.s32 %p94, %r15, %r1402; and.pred %p95, %p1, %p94; mov.b32 {%rs478, %rs87}, %r2353; // begin inline asm { mov.b32 %f1007, {0,%rs478};} // end inline asm @%p95 bra $L__BB0_71; bra.uni $L__BB0_72; $L__BB0_71: mov.b32 %f1054, %r2473; add.f32 %f1055, %f1007, %f1054; mov.b32 %r2473, %f1055; mul.f32 %f1056, %f1007, %f1031; mul.f32 %f1057, %f1997, %f1056; sub.f32 %f1058, %f2001, %f1057; mov.b32 {%rs503, %rs506}, %r2321; // begin inline asm { mov.b32 %f1032, {0,%rs503};} // end inline asm sub.f32 %f1059, %f1032, %f1996; mul.f32 %f1060, %f1997, %f1059; fma.rn.f32 %f1061, %f1056, %f1059, %f2000; mov.b32 %f1062, %r2481; fma.rn.f32 %f1063, %f1007, %f1060, %f1062; mov.b32 %r2481, %f1063; // begin inline asm { mov.b32 %f1033, {0,%rs87};} // end inline asm mov.b32 %f1064, %r2472; add.f32 %f1065, %f1033, %f1064; mov.b32 %r2472, %f1065; mul.f32 %f1066, %f1033, %f1034; mul.f32 %f1067, %f1997, %f1066; sub.f32 %f1068, %f1058, %f1067; // begin inline asm { mov.b32 %f1035, {0,%rs506};} // end inline asm sub.f32 %f1069, %f1035, %f1996; mul.f32 %f1070, %f1997, %f1069; fma.rn.f32 %f1071, %f1066, %f1069, %f1061; mov.b32 %f1072, %r2480; fma.rn.f32 %f1073, %f1033, %f1070, %f1072; mov.b32 %r2480, %f1073; mov.b32 {%rs507, %rs510}, %r2352; // begin inline asm { mov.b32 %f1036, {0,%rs507};} // end inline asm mov.b32 %f1074, %r2471; add.f32 %f1075, %f1036, %f1074; mov.b32 %r2471, %f1075; mul.f32 %f1076, %f1036, %f1037; mul.f32 %f1077, %f1997, %f1076; sub.f32 %f1078, %f1068, %f1077; mov.b32 {%rs509, %rs512}, %r2320; // begin inline asm { mov.b32 %f1038, {0,%rs509};} // end inline asm sub.f32 %f1079, %f1038, %f1996; mul.f32 %f1080, %f1997, %f1079; fma.rn.f32 %f1081, %f1076, %f1079, %f1071; mov.b32 %f1082, %r2479; fma.rn.f32 %f1083, %f1036, %f1080, %f1082; mov.b32 %r2479, %f1083; // begin inline asm { mov.b32 %f1039, {0,%rs510};} // end inline asm mov.b32 %f1084, %r2470; add.f32 %f1085, %f1039, %f1084; mov.b32 %r2470, %f1085; mul.f32 %f1086, %f1039, %f1040; mul.f32 %f1087, %f1997, %f1086; sub.f32 %f1088, %f1078, %f1087; // begin inline asm { mov.b32 %f1041, {0,%rs512};} // end inline asm sub.f32 %f1089, %f1041, %f1996; mul.f32 %f1090, %f1997, %f1089; fma.rn.f32 %f1091, %f1086, %f1089, %f1081; mov.b32 %f1092, %r2478; fma.rn.f32 %f1093, %f1039, %f1090, %f1092; mov.b32 %r2478, %f1093; mov.b32 {%rs513, %rs516}, %r2351; // begin inline asm { mov.b32 %f1042, {0,%rs513};} // end inline asm mov.b32 %f1094, %r2469; add.f32 %f1095, %f1042, %f1094; mov.b32 %r2469, %f1095; mul.f32 %f1096, %f1042, %f1043; mul.f32 %f1097, %f1997, %f1096; sub.f32 %f1098, %f1088, %f1097; mov.b32 {%rs515, %rs518}, %r2319; // begin inline asm { mov.b32 %f1044, {0,%rs515};} // end inline asm sub.f32 %f1099, %f1044, %f1996; mul.f32 %f1100, %f1997, %f1099; fma.rn.f32 %f1101, %f1096, %f1099, %f1091; mov.b32 %f1102, %r2477; fma.rn.f32 %f1103, %f1042, %f1100, %f1102; mov.b32 %r2477, %f1103; // begin inline asm { mov.b32 %f1045, {0,%rs516};} // end inline asm mov.b32 %f1104, %r2468; add.f32 %f1105, %f1045, %f1104; mov.b32 %r2468, %f1105; mul.f32 %f1106, %f1045, %f1046; mul.f32 %f1107, %f1997, %f1106; sub.f32 %f1108, %f1098, %f1107; // begin inline asm { mov.b32 %f1047, {0,%rs518};} // end inline asm sub.f32 %f1109, %f1047, %f1996; mul.f32 %f1110, %f1997, %f1109; fma.rn.f32 %f1111, %f1106, %f1109, %f1101; mov.b32 %f1112, %r2476; fma.rn.f32 %f1113, %f1045, %f1110, %f1112; mov.b32 %r2476, %f1113; mov.b32 {%rs519, %rs522}, %r2350; // begin inline asm { mov.b32 %f1048, {0,%rs519};} // end inline asm mov.b32 %f1114, %r2467; add.f32 %f1115, %f1048, %f1114; mov.b32 %r2467, %f1115; mul.f32 %f1116, %f1048, %f1049; mul.f32 %f1117, %f1997, %f1116; sub.f32 %f1118, %f1108, %f1117; mov.b32 {%rs521, %rs524}, %r2318; // begin inline asm { mov.b32 %f1050, {0,%rs521};} // end inline asm sub.f32 %f1119, %f1050, %f1996; mul.f32 %f1120, %f1997, %f1119; fma.rn.f32 %f1121, %f1116, %f1119, %f1111; mov.b32 %f1122, %r2475; fma.rn.f32 %f1123, %f1048, %f1120, %f1122; mov.b32 %r2475, %f1123; // begin inline asm { mov.b32 %f1051, {0,%rs522};} // end inline asm mov.b32 %f1124, %r2466; add.f32 %f1125, %f1051, %f1124; mov.b32 %r2466, %f1125; mul.f32 %f1126, %f1051, %f1052; mul.f32 %f1127, %f1997, %f1126; sub.f32 %f2001, %f1118, %f1127; // begin inline asm { mov.b32 %f1053, {0,%rs524};} // end inline asm sub.f32 %f1128, %f1053, %f1996; mul.f32 %f1129, %f1997, %f1128; fma.rn.f32 %f2000, %f1126, %f1128, %f1121; mov.b32 %f1130, %r2474; fma.rn.f32 %f1131, %f1051, %f1129, %f1130; mov.b32 %r2474, %f1131; $L__BB0_72: mov.u32 %r1405, %tid.z; mad.lo.s32 %r1407, %r8, %r6, %r1; mad.lo.s32 %r1408, %r1288, %r1405, %r1407; mul.wide.u32 %rd79, %r1408, 4; mov.u64 %rd80, _ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_3705045arrayE; add.s64 %rd81, %rd80, %rd79; st.shared.f32 [%rd81], %f2000; bar.sync 0; setp.ge.u32 %p96, %r1407, %r89; add.s32 %r1409, %r89, %r1407; setp.ge.u32 %p97, %r1409, %r1288; or.pred %p98, %p96, %p97; @%p98 bra $L__BB0_74; mov.u64 %rd329, _ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_3705045arrayE; add.s32 %r1416, %r89, %r1408; mul.wide.s32 %rd82, %r1416, 4; add.s64 %rd84, %rd329, %rd82; ld.shared.f32 %f1132, [%rd81]; ld.shared.f32 %f1133, [%rd84]; add.f32 %f1134, %f1133, %f1132; st.shared.f32 [%rd81], %f1134; $L__BB0_74: setp.lt.s32 %p99, %r89, 4; bar.sync 0; @%p99 bra $L__BB0_79; mov.u32 %r2482, %r90; $L__BB0_76: mad.lo.s32 %r2127, %r8, %r6, %r1; setp.ge.u32 %p100, %r2127, %r2482; @%p100 bra $L__BB0_78; mov.u64 %rd328, _ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_3705045arrayE; add.s32 %r1426, %r2482, %r1408; mul.wide.s32 %rd87, %r1426, 4; add.s64 %rd89, %rd328, %rd87; ld.shared.f32 %f1135, [%rd81]; ld.shared.f32 %f1136, [%rd89]; add.f32 %f1137, %f1136, %f1135; st.shared.f32 [%rd81], %f1137; $L__BB0_78: bar.sync 0; shr.u32 %r633, %r2482, 1; setp.gt.u32 %p101, %r2482, 3; mov.u32 %r2482, %r633; @%p101 bra $L__BB0_76; $L__BB0_79: or.b32 %r1428, %r1, %r8; setp.ne.s32 %p102, %r1428, 0; mov.f32 %f2014, 0f00000000; @%p102 bra $L__BB0_82; setp.lt.u32 %p103, %r1288, 2; ld.shared.f32 %f1139, [%rd81]; add.f32 %f2014, %f1139, 0f00000000; @%p103 bra $L__BB0_82; mov.u64 %rd327, _ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_3705045arrayE; add.s32 %r1441, %r1408, 1; mul.wide.u32 %rd95, %r1441, 4; add.s64 %rd97, %rd327, %rd95; ld.shared.f32 %f1140, [%rd97]; add.f32 %f2014, %f2014, %f1140; $L__BB0_82: bar.sync 0; st.shared.f32 [%rd81], %f2001; bar.sync 0; @%p98 bra $L__BB0_84; mov.u64 %rd326, _ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_3705045arrayE; add.s32 %r1455, %r89, %r1408; mul.wide.s32 %rd101, %r1455, 4; add.s64 %rd103, %rd326, %rd101; ld.shared.f32 %f1141, [%rd81]; ld.shared.f32 %f1142, [%rd103]; add.f32 %f1143, %f1142, %f1141; st.shared.f32 [%rd81], %f1143; $L__BB0_84: setp.lt.s32 %p347, %r89, 4; bar.sync 0; @%p347 bra $L__BB0_89; mov.u32 %r2483, %r90; $L__BB0_86: mad.lo.s32 %r2126, %r8, %r6, %r1; setp.ge.u32 %p108, %r2126, %r2483; @%p108 bra $L__BB0_88; mov.u64 %rd325, _ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_3705045arrayE; add.s32 %r1465, %r2483, %r1408; mul.wide.s32 %rd106, %r1465, 4; add.s64 %rd108, %rd325, %rd106; ld.shared.f32 %f1144, [%rd81]; ld.shared.f32 %f1145, [%rd108]; add.f32 %f1146, %f1145, %f1144; st.shared.f32 [%rd81], %f1146; $L__BB0_88: bar.sync 0; shr.u32 %r635, %r2483, 1; setp.gt.u32 %p109, %r2483, 3; mov.u32 %r2483, %r635; @%p109 bra $L__BB0_86; $L__BB0_89: mov.f32 %f2015, 0f00000000; @%p102 bra $L__BB0_92; setp.lt.u32 %p111, %r1288, 2; ld.shared.f32 %f1148, [%rd81]; add.f32 %f2015, %f1148, 0f00000000; @%p111 bra $L__BB0_92; mov.u64 %rd324, _ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_3705045arrayE; add.s32 %r1480, %r1408, 1; mul.wide.u32 %rd114, %r1480, 4; add.s64 %rd116, %rd324, %rd114; ld.shared.f32 %f1149, [%rd116]; add.f32 %f2015, %f2015, %f1149; $L__BB0_92: bar.sync 0; @%p102 bra $L__BB0_94; mov.u32 %r2125, %tid.z; mov.u64 %rd323, _ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_3705045arrayE; mul.wide.s32 %rd117, %r2125, 4; add.s64 %rd119, %rd323, %rd117; st.shared.f32 [%rd119], %f2015; $L__BB0_94: mov.u32 %r2120, %tid.z; mov.u64 %rd319, _ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_3705045arrayE; bar.sync 0; mul.wide.s32 %rd120, %r2120, 4; add.s64 %rd122, %rd319, %rd120; ld.shared.f32 %f1150, [%rd122]; bar.sync 0; mul.f32 %f52, %f1, %f1150; @%p102 bra $L__BB0_96; st.shared.f32 [%rd122], %f2014; $L__BB0_96: mul.lo.s32 %r2121, %r2612, %r14; bar.sync 0; ld.shared.f32 %f1155, [%rd122]; bar.sync 0; mul.f32 %f1156, %f1155, 0fBF000000; mul.f32 %f1157, %f1997, %f1997; mul.f32 %f1158, %f1997, %f1157; mul.f32 %f1159, %f1158, %f1156; fma.rn.f32 %f53, %f1158, %f1156, %f1159; shl.b32 %r636, %r2121, 5; neg.s32 %r1490, %r636; setp.lt.s32 %p114, %r15, %r1490; and.pred %p115, %p1, %p114; mov.b32 {%rs525, %rs90}, %r2293; // begin inline asm { mov.b32 %f1151, {0,%rs525};} // end inline asm sub.f32 %f1160, %f1151, %f1996; // begin inline asm { mov.b32 %f1152, {0,%rs149};} // end inline asm mul.f32 %f1161, %f1152, %f155; mul.f32 %f1162, %f53, %f1160; fma.rn.f32 %f1163, %f1, %f1162, %f52; fma.rn.f32 %f1154, %f1997, %f1161, %f1163; // begin inline asm { cvt.rn.bf16.f32 %rs528, %f1154;} // end inline asm @%p115 bra $L__BB0_97; bra.uni $L__BB0_98; $L__BB0_97: // begin inline asm { mov.b32 %f1192, {0,%rs90};} // end inline asm sub.f32 %f1220, %f1192, %f1996; // begin inline asm { mov.b32 %f1193, {0,%rs66};} // end inline asm mul.f32 %f1221, %f1193, %f158; mul.f32 %f1222, %f53, %f1220; fma.rn.f32 %f1223, %f1, %f1222, %f52; fma.rn.f32 %f1195, %f1997, %f1221, %f1223; // begin inline asm { cvt.rn.bf16.f32 %rs560, %f1195;} // end inline asm mov.b32 %r1491, {%rs528, %rs560}; mov.b32 {%rs561, %rs565}, %r2292; // begin inline asm { mov.b32 %f1196, {0,%rs561};} // end inline asm sub.f32 %f1224, %f1196, %f1996; mov.b32 {%rs562, %rs566}, %r2324; // begin inline asm { mov.b32 %f1197, {0,%rs562};} // end inline asm mul.f32 %f1225, %f1197, %f161; mul.f32 %f1226, %f53, %f1224; fma.rn.f32 %f1227, %f1, %f1226, %f52; fma.rn.f32 %f1199, %f1997, %f1225, %f1227; // begin inline asm { mov.b32 %f1200, {0,%rs565};} // end inline asm sub.f32 %f1228, %f1200, %f1996; // begin inline asm { mov.b32 %f1201, {0,%rs566};} // end inline asm mul.f32 %f1229, %f1201, %f164; mul.f32 %f1230, %f53, %f1228; fma.rn.f32 %f1231, %f1, %f1230, %f52; fma.rn.f32 %f1203, %f1997, %f1229, %f1231; // begin inline asm { cvt.rn.bf16.f32 %rs568, %f1203;} // end inline asm // begin inline asm { cvt.rn.bf16.f32 %rs564, %f1199;} // end inline asm mov.b32 %r1492, {%rs564, %rs568}; mov.b32 {%rs569, %rs573}, %r2291; // begin inline asm { mov.b32 %f1204, {0,%rs569};} // end inline asm sub.f32 %f1232, %f1204, %f1996; mov.b32 {%rs570, %rs574}, %r2323; // begin inline asm { mov.b32 %f1205, {0,%rs570};} // end inline asm mul.f32 %f1233, %f1205, %f167; mul.f32 %f1234, %f53, %f1232; fma.rn.f32 %f1235, %f1, %f1234, %f52; fma.rn.f32 %f1207, %f1997, %f1233, %f1235; // begin inline asm { mov.b32 %f1208, {0,%rs573};} // end inline asm sub.f32 %f1236, %f1208, %f1996; // begin inline asm { mov.b32 %f1209, {0,%rs574};} // end inline asm mul.f32 %f1237, %f1209, %f170; mul.f32 %f1238, %f53, %f1236; fma.rn.f32 %f1239, %f1, %f1238, %f52; fma.rn.f32 %f1211, %f1997, %f1237, %f1239; // begin inline asm { cvt.rn.bf16.f32 %rs576, %f1211;} // end inline asm // begin inline asm { cvt.rn.bf16.f32 %rs572, %f1207;} // end inline asm mov.b32 %r1493, {%rs572, %rs576}; mov.b32 {%rs577, %rs581}, %r2290; // begin inline asm { mov.b32 %f1212, {0,%rs577};} // end inline asm sub.f32 %f1240, %f1212, %f1996; mov.b32 {%rs578, %rs582}, %r2322; // begin inline asm { mov.b32 %f1213, {0,%rs578};} // end inline asm mul.f32 %f1241, %f1213, %f173; mul.f32 %f1242, %f53, %f1240; fma.rn.f32 %f1243, %f1, %f1242, %f52; fma.rn.f32 %f1215, %f1997, %f1241, %f1243; // begin inline asm { mov.b32 %f1216, {0,%rs581};} // end inline asm sub.f32 %f1244, %f1216, %f1996; // begin inline asm { mov.b32 %f1217, {0,%rs582};} // end inline asm mul.f32 %f1245, %f1217, %f176; mul.f32 %f1246, %f53, %f1244; fma.rn.f32 %f1247, %f1, %f1246, %f52; fma.rn.f32 %f1219, %f1997, %f1245, %f1247; // begin inline asm { cvt.rn.bf16.f32 %rs584, %f1219;} // end inline asm // begin inline asm { cvt.rn.bf16.f32 %rs580, %f1215;} // end inline asm mov.b32 %r1494, {%rs580, %rs584}; add.s32 %r1495, %r636, %r223; mul.wide.s32 %rd130, %r1495, 2; add.s64 %rd129, %rd21, %rd130; // begin inline asm st.global.cs.v4.s32 [%rd129], {%r1491,%r1492,%r1493,%r1494}; // end inline asm $L__BB0_98: add.s32 %r637, %r636, %r14; neg.s32 %r1496, %r637; setp.lt.s32 %p116, %r15, %r1496; and.pred %p117, %p1, %p116; mov.b32 {%rs585, %rs93}, %r2297; // begin inline asm { mov.b32 %f1248, {0,%rs585};} // end inline asm sub.f32 %f1252, %f1248, %f1996; // begin inline asm { mov.b32 %f1249, {0,%rs196};} // end inline asm mul.f32 %f1253, %f1249, %f281; mul.f32 %f1254, %f53, %f1252; fma.rn.f32 %f1255, %f1, %f1254, %f52; fma.rn.f32 %f1251, %f1997, %f1253, %f1255; // begin inline asm { cvt.rn.bf16.f32 %rs588, %f1251;} // end inline asm @%p117 bra $L__BB0_99; bra.uni $L__BB0_100; $L__BB0_99: // begin inline asm { mov.b32 %f1284, {0,%rs93};} // end inline asm sub.f32 %f1312, %f1284, %f1996; // begin inline asm { mov.b32 %f1285, {0,%rs69};} // end inline asm mul.f32 %f1313, %f1285, %f284; mul.f32 %f1314, %f53, %f1312; fma.rn.f32 %f1315, %f1, %f1314, %f52; fma.rn.f32 %f1287, %f1997, %f1313, %f1315; // begin inline asm { cvt.rn.bf16.f32 %rs620, %f1287;} // end inline asm mov.b32 %r1497, {%rs588, %rs620}; mov.b32 {%rs621, %rs625}, %r2296; // begin inline asm { mov.b32 %f1288, {0,%rs621};} // end inline asm sub.f32 %f1316, %f1288, %f1996; mov.b32 {%rs622, %rs626}, %r2328; // begin inline asm { mov.b32 %f1289, {0,%rs622};} // end inline asm mul.f32 %f1317, %f1289, %f287; mul.f32 %f1318, %f53, %f1316; fma.rn.f32 %f1319, %f1, %f1318, %f52; fma.rn.f32 %f1291, %f1997, %f1317, %f1319; // begin inline asm { mov.b32 %f1292, {0,%rs625};} // end inline asm sub.f32 %f1320, %f1292, %f1996; // begin inline asm { mov.b32 %f1293, {0,%rs626};} // end inline asm mul.f32 %f1321, %f1293, %f290; mul.f32 %f1322, %f53, %f1320; fma.rn.f32 %f1323, %f1, %f1322, %f52; fma.rn.f32 %f1295, %f1997, %f1321, %f1323; // begin inline asm { cvt.rn.bf16.f32 %rs628, %f1295;} // end inline asm // begin inline asm { cvt.rn.bf16.f32 %rs624, %f1291;} // end inline asm mov.b32 %r1498, {%rs624, %rs628}; mov.b32 {%rs629, %rs633}, %r2295; // begin inline asm { mov.b32 %f1296, {0,%rs629};} // end inline asm sub.f32 %f1324, %f1296, %f1996; mov.b32 {%rs630, %rs634}, %r2327; // begin inline asm { mov.b32 %f1297, {0,%rs630};} // end inline asm mul.f32 %f1325, %f1297, %f293; mul.f32 %f1326, %f53, %f1324; fma.rn.f32 %f1327, %f1, %f1326, %f52; fma.rn.f32 %f1299, %f1997, %f1325, %f1327; // begin inline asm { mov.b32 %f1300, {0,%rs633};} // end inline asm sub.f32 %f1328, %f1300, %f1996; // begin inline asm { mov.b32 %f1301, {0,%rs634};} // end inline asm mul.f32 %f1329, %f1301, %f296; mul.f32 %f1330, %f53, %f1328; fma.rn.f32 %f1331, %f1, %f1330, %f52; fma.rn.f32 %f1303, %f1997, %f1329, %f1331; // begin inline asm { cvt.rn.bf16.f32 %rs636, %f1303;} // end inline asm // begin inline asm { cvt.rn.bf16.f32 %rs632, %f1299;} // end inline asm mov.b32 %r1499, {%rs632, %rs636}; mov.b32 {%rs637, %rs641}, %r2294; // begin inline asm { mov.b32 %f1304, {0,%rs637};} // end inline asm sub.f32 %f1332, %f1304, %f1996; mov.b32 {%rs638, %rs642}, %r2326; // begin inline asm { mov.b32 %f1305, {0,%rs638};} // end inline asm mul.f32 %f1333, %f1305, %f299; mul.f32 %f1334, %f53, %f1332; fma.rn.f32 %f1335, %f1, %f1334, %f52; fma.rn.f32 %f1307, %f1997, %f1333, %f1335; // begin inline asm { mov.b32 %f1308, {0,%rs641};} // end inline asm sub.f32 %f1336, %f1308, %f1996; // begin inline asm { mov.b32 %f1309, {0,%rs642};} // end inline asm mul.f32 %f1337, %f1309, %f302; mul.f32 %f1338, %f53, %f1336; fma.rn.f32 %f1339, %f1, %f1338, %f52; fma.rn.f32 %f1311, %f1997, %f1337, %f1339; // begin inline asm { cvt.rn.bf16.f32 %rs644, %f1311;} // end inline asm // begin inline asm { cvt.rn.bf16.f32 %rs640, %f1307;} // end inline asm mov.b32 %r1500, {%rs640, %rs644}; add.s32 %r1501, %r637, %r223; mul.wide.s32 %rd132, %r1501, 2; add.s64 %rd131, %rd21, %rd132; // begin inline asm st.global.cs.v4.s32 [%rd131], {%r1497,%r1498,%r1499,%r1500}; // end inline asm $L__BB0_100: add.s32 %r638, %r637, %r14; neg.s32 %r1502, %r638; setp.lt.s32 %p118, %r15, %r1502; and.pred %p119, %p1, %p118; mov.b32 {%rs645, %rs96}, %r2301; // begin inline asm { mov.b32 %f1340, {0,%rs645};} // end inline asm sub.f32 %f1344, %f1340, %f1996; // begin inline asm { mov.b32 %f1341, {0,%rs243};} // end inline asm mul.f32 %f1345, %f1341, %f406; mul.f32 %f1346, %f53, %f1344; fma.rn.f32 %f1347, %f1, %f1346, %f52; fma.rn.f32 %f1343, %f1997, %f1345, %f1347; // begin inline asm { cvt.rn.bf16.f32 %rs648, %f1343;} // end inline asm @%p119 bra $L__BB0_101; bra.uni $L__BB0_102; $L__BB0_101: // begin inline asm { mov.b32 %f1376, {0,%rs96};} // end inline asm sub.f32 %f1404, %f1376, %f1996; // begin inline asm { mov.b32 %f1377, {0,%rs72};} // end inline asm mul.f32 %f1405, %f1377, %f409; mul.f32 %f1406, %f53, %f1404; fma.rn.f32 %f1407, %f1, %f1406, %f52; fma.rn.f32 %f1379, %f1997, %f1405, %f1407; // begin inline asm { cvt.rn.bf16.f32 %rs680, %f1379;} // end inline asm mov.b32 %r1503, {%rs648, %rs680}; mov.b32 {%rs681, %rs685}, %r2300; // begin inline asm { mov.b32 %f1380, {0,%rs681};} // end inline asm sub.f32 %f1408, %f1380, %f1996; mov.b32 {%rs682, %rs686}, %r2332; // begin inline asm { mov.b32 %f1381, {0,%rs682};} // end inline asm mul.f32 %f1409, %f1381, %f412; mul.f32 %f1410, %f53, %f1408; fma.rn.f32 %f1411, %f1, %f1410, %f52; fma.rn.f32 %f1383, %f1997, %f1409, %f1411; // begin inline asm { mov.b32 %f1384, {0,%rs685};} // end inline asm sub.f32 %f1412, %f1384, %f1996; // begin inline asm { mov.b32 %f1385, {0,%rs686};} // end inline asm mul.f32 %f1413, %f1385, %f415; mul.f32 %f1414, %f53, %f1412; fma.rn.f32 %f1415, %f1, %f1414, %f52; fma.rn.f32 %f1387, %f1997, %f1413, %f1415; // begin inline asm { cvt.rn.bf16.f32 %rs688, %f1387;} // end inline asm // begin inline asm { cvt.rn.bf16.f32 %rs684, %f1383;} // end inline asm mov.b32 %r1504, {%rs684, %rs688}; mov.b32 {%rs689, %rs693}, %r2299; // begin inline asm { mov.b32 %f1388, {0,%rs689};} // end inline asm sub.f32 %f1416, %f1388, %f1996; mov.b32 {%rs690, %rs694}, %r2331; // begin inline asm { mov.b32 %f1389, {0,%rs690};} // end inline asm mul.f32 %f1417, %f1389, %f418; mul.f32 %f1418, %f53, %f1416; fma.rn.f32 %f1419, %f1, %f1418, %f52; fma.rn.f32 %f1391, %f1997, %f1417, %f1419; // begin inline asm { mov.b32 %f1392, {0,%rs693};} // end inline asm sub.f32 %f1420, %f1392, %f1996; // begin inline asm { mov.b32 %f1393, {0,%rs694};} // end inline asm mul.f32 %f1421, %f1393, %f421; mul.f32 %f1422, %f53, %f1420; fma.rn.f32 %f1423, %f1, %f1422, %f52; fma.rn.f32 %f1395, %f1997, %f1421, %f1423; // begin inline asm { cvt.rn.bf16.f32 %rs696, %f1395;} // end inline asm // begin inline asm { cvt.rn.bf16.f32 %rs692, %f1391;} // end inline asm mov.b32 %r1505, {%rs692, %rs696}; mov.b32 {%rs697, %rs701}, %r2298; // begin inline asm { mov.b32 %f1396, {0,%rs697};} // end inline asm sub.f32 %f1424, %f1396, %f1996; mov.b32 {%rs698, %rs702}, %r2330; // begin inline asm { mov.b32 %f1397, {0,%rs698};} // end inline asm mul.f32 %f1425, %f1397, %f424; mul.f32 %f1426, %f53, %f1424; fma.rn.f32 %f1427, %f1, %f1426, %f52; fma.rn.f32 %f1399, %f1997, %f1425, %f1427; // begin inline asm { mov.b32 %f1400, {0,%rs701};} // end inline asm sub.f32 %f1428, %f1400, %f1996; // begin inline asm { mov.b32 %f1401, {0,%rs702};} // end inline asm mul.f32 %f1429, %f1401, %f427; mul.f32 %f1430, %f53, %f1428; fma.rn.f32 %f1431, %f1, %f1430, %f52; fma.rn.f32 %f1403, %f1997, %f1429, %f1431; // begin inline asm { cvt.rn.bf16.f32 %rs704, %f1403;} // end inline asm // begin inline asm { cvt.rn.bf16.f32 %rs700, %f1399;} // end inline asm mov.b32 %r1506, {%rs700, %rs704}; add.s32 %r1507, %r638, %r223; mul.wide.s32 %rd134, %r1507, 2; add.s64 %rd133, %rd21, %rd134; // begin inline asm st.global.cs.v4.s32 [%rd133], {%r1503,%r1504,%r1505,%r1506}; // end inline asm $L__BB0_102: add.s32 %r639, %r638, %r14; neg.s32 %r1508, %r639; setp.lt.s32 %p120, %r15, %r1508; and.pred %p121, %p1, %p120; mov.b32 {%rs705, %rs99}, %r2305; // begin inline asm { mov.b32 %f1432, {0,%rs705};} // end inline asm sub.f32 %f1436, %f1432, %f1996; // begin inline asm { mov.b32 %f1433, {0,%rs290};} // end inline asm mul.f32 %f1437, %f1433, %f531; mul.f32 %f1438, %f53, %f1436; fma.rn.f32 %f1439, %f1, %f1438, %f52; fma.rn.f32 %f1435, %f1997, %f1437, %f1439; // begin inline asm { cvt.rn.bf16.f32 %rs708, %f1435;} // end inline asm @%p121 bra $L__BB0_103; bra.uni $L__BB0_104; $L__BB0_103: // begin inline asm { mov.b32 %f1468, {0,%rs99};} // end inline asm sub.f32 %f1496, %f1468, %f1996; // begin inline asm { mov.b32 %f1469, {0,%rs75};} // end inline asm mul.f32 %f1497, %f1469, %f534; mul.f32 %f1498, %f53, %f1496; fma.rn.f32 %f1499, %f1, %f1498, %f52; fma.rn.f32 %f1471, %f1997, %f1497, %f1499; // begin inline asm { cvt.rn.bf16.f32 %rs740, %f1471;} // end inline asm mov.b32 %r1509, {%rs708, %rs740}; mov.b32 {%rs741, %rs745}, %r2304; // begin inline asm { mov.b32 %f1472, {0,%rs741};} // end inline asm sub.f32 %f1500, %f1472, %f1996; mov.b32 {%rs742, %rs746}, %r2336; // begin inline asm { mov.b32 %f1473, {0,%rs742};} // end inline asm mul.f32 %f1501, %f1473, %f537; mul.f32 %f1502, %f53, %f1500; fma.rn.f32 %f1503, %f1, %f1502, %f52; fma.rn.f32 %f1475, %f1997, %f1501, %f1503; // begin inline asm { mov.b32 %f1476, {0,%rs745};} // end inline asm sub.f32 %f1504, %f1476, %f1996; // begin inline asm { mov.b32 %f1477, {0,%rs746};} // end inline asm mul.f32 %f1505, %f1477, %f540; mul.f32 %f1506, %f53, %f1504; fma.rn.f32 %f1507, %f1, %f1506, %f52; fma.rn.f32 %f1479, %f1997, %f1505, %f1507; // begin inline asm { cvt.rn.bf16.f32 %rs748, %f1479;} // end inline asm // begin inline asm { cvt.rn.bf16.f32 %rs744, %f1475;} // end inline asm mov.b32 %r1510, {%rs744, %rs748}; mov.b32 {%rs749, %rs753}, %r2303; // begin inline asm { mov.b32 %f1480, {0,%rs749};} // end inline asm sub.f32 %f1508, %f1480, %f1996; mov.b32 {%rs750, %rs754}, %r2335; // begin inline asm { mov.b32 %f1481, {0,%rs750};} // end inline asm mul.f32 %f1509, %f1481, %f543; mul.f32 %f1510, %f53, %f1508; fma.rn.f32 %f1511, %f1, %f1510, %f52; fma.rn.f32 %f1483, %f1997, %f1509, %f1511; // begin inline asm { mov.b32 %f1484, {0,%rs753};} // end inline asm sub.f32 %f1512, %f1484, %f1996; // begin inline asm { mov.b32 %f1485, {0,%rs754};} // end inline asm mul.f32 %f1513, %f1485, %f546; mul.f32 %f1514, %f53, %f1512; fma.rn.f32 %f1515, %f1, %f1514, %f52; fma.rn.f32 %f1487, %f1997, %f1513, %f1515; // begin inline asm { cvt.rn.bf16.f32 %rs756, %f1487;} // end inline asm // begin inline asm { cvt.rn.bf16.f32 %rs752, %f1483;} // end inline asm mov.b32 %r1511, {%rs752, %rs756}; mov.b32 {%rs757, %rs761}, %r2302; // begin inline asm { mov.b32 %f1488, {0,%rs757};} // end inline asm sub.f32 %f1516, %f1488, %f1996; mov.b32 {%rs758, %rs762}, %r2334; // begin inline asm { mov.b32 %f1489, {0,%rs758};} // end inline asm mul.f32 %f1517, %f1489, %f549; mul.f32 %f1518, %f53, %f1516; fma.rn.f32 %f1519, %f1, %f1518, %f52; fma.rn.f32 %f1491, %f1997, %f1517, %f1519; // begin inline asm { mov.b32 %f1492, {0,%rs761};} // end inline asm sub.f32 %f1520, %f1492, %f1996; // begin inline asm { mov.b32 %f1493, {0,%rs762};} // end inline asm mul.f32 %f1521, %f1493, %f552; mul.f32 %f1522, %f53, %f1520; fma.rn.f32 %f1523, %f1, %f1522, %f52; fma.rn.f32 %f1495, %f1997, %f1521, %f1523; // begin inline asm { cvt.rn.bf16.f32 %rs764, %f1495;} // end inline asm // begin inline asm { cvt.rn.bf16.f32 %rs760, %f1491;} // end inline asm mov.b32 %r1512, {%rs760, %rs764}; add.s32 %r1513, %r639, %r223; mul.wide.s32 %rd136, %r1513, 2; add.s64 %rd135, %rd21, %rd136; // begin inline asm st.global.cs.v4.s32 [%rd135], {%r1509,%r1510,%r1511,%r1512}; // end inline asm $L__BB0_104: add.s32 %r640, %r639, %r14; neg.s32 %r1514, %r640; setp.lt.s32 %p122, %r15, %r1514; and.pred %p123, %p1, %p122; mov.b32 {%rs765, %rs102}, %r2309; // begin inline asm { mov.b32 %f1524, {0,%rs765};} // end inline asm sub.f32 %f1528, %f1524, %f1996; // begin inline asm { mov.b32 %f1525, {0,%rs337};} // end inline asm mul.f32 %f1529, %f1525, %f656; mul.f32 %f1530, %f53, %f1528; fma.rn.f32 %f1531, %f1, %f1530, %f52; fma.rn.f32 %f1527, %f1997, %f1529, %f1531; // begin inline asm { cvt.rn.bf16.f32 %rs768, %f1527;} // end inline asm @%p123 bra $L__BB0_105; bra.uni $L__BB0_106; $L__BB0_105: // begin inline asm { mov.b32 %f1560, {0,%rs102};} // end inline asm sub.f32 %f1588, %f1560, %f1996; // begin inline asm { mov.b32 %f1561, {0,%rs78};} // end inline asm mul.f32 %f1589, %f1561, %f659; mul.f32 %f1590, %f53, %f1588; fma.rn.f32 %f1591, %f1, %f1590, %f52; fma.rn.f32 %f1563, %f1997, %f1589, %f1591; // begin inline asm { cvt.rn.bf16.f32 %rs800, %f1563;} // end inline asm mov.b32 %r1515, {%rs768, %rs800}; mov.b32 {%rs801, %rs805}, %r2308; // begin inline asm { mov.b32 %f1564, {0,%rs801};} // end inline asm sub.f32 %f1592, %f1564, %f1996; mov.b32 {%rs802, %rs806}, %r2340; // begin inline asm { mov.b32 %f1565, {0,%rs802};} // end inline asm mul.f32 %f1593, %f1565, %f662; mul.f32 %f1594, %f53, %f1592; fma.rn.f32 %f1595, %f1, %f1594, %f52; fma.rn.f32 %f1567, %f1997, %f1593, %f1595; // begin inline asm { mov.b32 %f1568, {0,%rs805};} // end inline asm sub.f32 %f1596, %f1568, %f1996; // begin inline asm { mov.b32 %f1569, {0,%rs806};} // end inline asm mul.f32 %f1597, %f1569, %f665; mul.f32 %f1598, %f53, %f1596; fma.rn.f32 %f1599, %f1, %f1598, %f52; fma.rn.f32 %f1571, %f1997, %f1597, %f1599; // begin inline asm { cvt.rn.bf16.f32 %rs808, %f1571;} // end inline asm // begin inline asm { cvt.rn.bf16.f32 %rs804, %f1567;} // end inline asm mov.b32 %r1516, {%rs804, %rs808}; mov.b32 {%rs809, %rs813}, %r2307; // begin inline asm { mov.b32 %f1572, {0,%rs809};} // end inline asm sub.f32 %f1600, %f1572, %f1996; mov.b32 {%rs810, %rs814}, %r2339; // begin inline asm { mov.b32 %f1573, {0,%rs810};} // end inline asm mul.f32 %f1601, %f1573, %f668; mul.f32 %f1602, %f53, %f1600; fma.rn.f32 %f1603, %f1, %f1602, %f52; fma.rn.f32 %f1575, %f1997, %f1601, %f1603; // begin inline asm { mov.b32 %f1576, {0,%rs813};} // end inline asm sub.f32 %f1604, %f1576, %f1996; // begin inline asm { mov.b32 %f1577, {0,%rs814};} // end inline asm mul.f32 %f1605, %f1577, %f671; mul.f32 %f1606, %f53, %f1604; fma.rn.f32 %f1607, %f1, %f1606, %f52; fma.rn.f32 %f1579, %f1997, %f1605, %f1607; // begin inline asm { cvt.rn.bf16.f32 %rs816, %f1579;} // end inline asm // begin inline asm { cvt.rn.bf16.f32 %rs812, %f1575;} // end inline asm mov.b32 %r1517, {%rs812, %rs816}; mov.b32 {%rs817, %rs821}, %r2306; // begin inline asm { mov.b32 %f1580, {0,%rs817};} // end inline asm sub.f32 %f1608, %f1580, %f1996; mov.b32 {%rs818, %rs822}, %r2338; // begin inline asm { mov.b32 %f1581, {0,%rs818};} // end inline asm mul.f32 %f1609, %f1581, %f674; mul.f32 %f1610, %f53, %f1608; fma.rn.f32 %f1611, %f1, %f1610, %f52; fma.rn.f32 %f1583, %f1997, %f1609, %f1611; // begin inline asm { mov.b32 %f1584, {0,%rs821};} // end inline asm sub.f32 %f1612, %f1584, %f1996; // begin inline asm { mov.b32 %f1585, {0,%rs822};} // end inline asm mul.f32 %f1613, %f1585, %f677; mul.f32 %f1614, %f53, %f1612; fma.rn.f32 %f1615, %f1, %f1614, %f52; fma.rn.f32 %f1587, %f1997, %f1613, %f1615; // begin inline asm { cvt.rn.bf16.f32 %rs824, %f1587;} // end inline asm // begin inline asm { cvt.rn.bf16.f32 %rs820, %f1583;} // end inline asm mov.b32 %r1518, {%rs820, %rs824}; add.s32 %r1519, %r640, %r223; mul.wide.s32 %rd138, %r1519, 2; add.s64 %rd137, %rd21, %rd138; // begin inline asm st.global.cs.v4.s32 [%rd137], {%r1515,%r1516,%r1517,%r1518}; // end inline asm $L__BB0_106: add.s32 %r641, %r640, %r14; neg.s32 %r1520, %r641; setp.lt.s32 %p124, %r15, %r1520; and.pred %p125, %p1, %p124; mov.b32 {%rs825, %rs105}, %r2313; // begin inline asm { mov.b32 %f1616, {0,%rs825};} // end inline asm sub.f32 %f1620, %f1616, %f1996; // begin inline asm { mov.b32 %f1617, {0,%rs384};} // end inline asm mul.f32 %f1621, %f1617, %f781; mul.f32 %f1622, %f53, %f1620; fma.rn.f32 %f1623, %f1, %f1622, %f52; fma.rn.f32 %f1619, %f1997, %f1621, %f1623; // begin inline asm { cvt.rn.bf16.f32 %rs828, %f1619;} // end inline asm @%p125 bra $L__BB0_107; bra.uni $L__BB0_108; $L__BB0_107: // begin inline asm { mov.b32 %f1652, {0,%rs105};} // end inline asm sub.f32 %f1680, %f1652, %f1996; // begin inline asm { mov.b32 %f1653, {0,%rs81};} // end inline asm mul.f32 %f1681, %f1653, %f784; mul.f32 %f1682, %f53, %f1680; fma.rn.f32 %f1683, %f1, %f1682, %f52; fma.rn.f32 %f1655, %f1997, %f1681, %f1683; // begin inline asm { cvt.rn.bf16.f32 %rs860, %f1655;} // end inline asm mov.b32 %r1521, {%rs828, %rs860}; mov.b32 {%rs861, %rs865}, %r2312; // begin inline asm { mov.b32 %f1656, {0,%rs861};} // end inline asm sub.f32 %f1684, %f1656, %f1996; mov.b32 {%rs862, %rs866}, %r2344; // begin inline asm { mov.b32 %f1657, {0,%rs862};} // end inline asm mul.f32 %f1685, %f1657, %f787; mul.f32 %f1686, %f53, %f1684; fma.rn.f32 %f1687, %f1, %f1686, %f52; fma.rn.f32 %f1659, %f1997, %f1685, %f1687; // begin inline asm { mov.b32 %f1660, {0,%rs865};} // end inline asm sub.f32 %f1688, %f1660, %f1996; // begin inline asm { mov.b32 %f1661, {0,%rs866};} // end inline asm mul.f32 %f1689, %f1661, %f790; mul.f32 %f1690, %f53, %f1688; fma.rn.f32 %f1691, %f1, %f1690, %f52; fma.rn.f32 %f1663, %f1997, %f1689, %f1691; // begin inline asm { cvt.rn.bf16.f32 %rs868, %f1663;} // end inline asm // begin inline asm { cvt.rn.bf16.f32 %rs864, %f1659;} // end inline asm mov.b32 %r1522, {%rs864, %rs868}; mov.b32 {%rs869, %rs873}, %r2311; // begin inline asm { mov.b32 %f1664, {0,%rs869};} // end inline asm sub.f32 %f1692, %f1664, %f1996; mov.b32 {%rs870, %rs874}, %r2343; // begin inline asm { mov.b32 %f1665, {0,%rs870};} // end inline asm mul.f32 %f1693, %f1665, %f793; mul.f32 %f1694, %f53, %f1692; fma.rn.f32 %f1695, %f1, %f1694, %f52; fma.rn.f32 %f1667, %f1997, %f1693, %f1695; // begin inline asm { mov.b32 %f1668, {0,%rs873};} // end inline asm sub.f32 %f1696, %f1668, %f1996; // begin inline asm { mov.b32 %f1669, {0,%rs874};} // end inline asm mul.f32 %f1697, %f1669, %f796; mul.f32 %f1698, %f53, %f1696; fma.rn.f32 %f1699, %f1, %f1698, %f52; fma.rn.f32 %f1671, %f1997, %f1697, %f1699; // begin inline asm { cvt.rn.bf16.f32 %rs876, %f1671;} // end inline asm // begin inline asm { cvt.rn.bf16.f32 %rs872, %f1667;} // end inline asm mov.b32 %r1523, {%rs872, %rs876}; mov.b32 {%rs877, %rs881}, %r2310; // begin inline asm { mov.b32 %f1672, {0,%rs877};} // end inline asm sub.f32 %f1700, %f1672, %f1996; mov.b32 {%rs878, %rs882}, %r2342; // begin inline asm { mov.b32 %f1673, {0,%rs878};} // end inline asm mul.f32 %f1701, %f1673, %f799; mul.f32 %f1702, %f53, %f1700; fma.rn.f32 %f1703, %f1, %f1702, %f52; fma.rn.f32 %f1675, %f1997, %f1701, %f1703; // begin inline asm { mov.b32 %f1676, {0,%rs881};} // end inline asm sub.f32 %f1704, %f1676, %f1996; // begin inline asm { mov.b32 %f1677, {0,%rs882};} // end inline asm mul.f32 %f1705, %f1677, %f802; mul.f32 %f1706, %f53, %f1704; fma.rn.f32 %f1707, %f1, %f1706, %f52; fma.rn.f32 %f1679, %f1997, %f1705, %f1707; // begin inline asm { cvt.rn.bf16.f32 %rs884, %f1679;} // end inline asm // begin inline asm { cvt.rn.bf16.f32 %rs880, %f1675;} // end inline asm mov.b32 %r1524, {%rs880, %rs884}; add.s32 %r1525, %r641, %r223; mul.wide.s32 %rd140, %r1525, 2; add.s64 %rd139, %rd21, %rd140; // begin inline asm st.global.cs.v4.s32 [%rd139], {%r1521,%r1522,%r1523,%r1524}; // end inline asm $L__BB0_108: add.s32 %r642, %r641, %r14; neg.s32 %r1526, %r642; setp.lt.s32 %p126, %r15, %r1526; and.pred %p127, %p1, %p126; mov.b32 {%rs885, %rs108}, %r2317; // begin inline asm { mov.b32 %f1708, {0,%rs885};} // end inline asm sub.f32 %f1712, %f1708, %f1996; // begin inline asm { mov.b32 %f1709, {0,%rs431};} // end inline asm mul.f32 %f1713, %f1709, %f906; mul.f32 %f1714, %f53, %f1712; fma.rn.f32 %f1715, %f1, %f1714, %f52; fma.rn.f32 %f1711, %f1997, %f1713, %f1715; // begin inline asm { cvt.rn.bf16.f32 %rs888, %f1711;} // end inline asm @%p127 bra $L__BB0_109; bra.uni $L__BB0_110; $L__BB0_109: // begin inline asm { mov.b32 %f1744, {0,%rs108};} // end inline asm sub.f32 %f1772, %f1744, %f1996; // begin inline asm { mov.b32 %f1745, {0,%rs84};} // end inline asm mul.f32 %f1773, %f1745, %f909; mul.f32 %f1774, %f53, %f1772; fma.rn.f32 %f1775, %f1, %f1774, %f52; fma.rn.f32 %f1747, %f1997, %f1773, %f1775; // begin inline asm { cvt.rn.bf16.f32 %rs920, %f1747;} // end inline asm mov.b32 %r1527, {%rs888, %rs920}; mov.b32 {%rs921, %rs925}, %r2316; // begin inline asm { mov.b32 %f1748, {0,%rs921};} // end inline asm sub.f32 %f1776, %f1748, %f1996; mov.b32 {%rs922, %rs926}, %r2348; // begin inline asm { mov.b32 %f1749, {0,%rs922};} // end inline asm mul.f32 %f1777, %f1749, %f912; mul.f32 %f1778, %f53, %f1776; fma.rn.f32 %f1779, %f1, %f1778, %f52; fma.rn.f32 %f1751, %f1997, %f1777, %f1779; // begin inline asm { mov.b32 %f1752, {0,%rs925};} // end inline asm sub.f32 %f1780, %f1752, %f1996; // begin inline asm { mov.b32 %f1753, {0,%rs926};} // end inline asm mul.f32 %f1781, %f1753, %f915; mul.f32 %f1782, %f53, %f1780; fma.rn.f32 %f1783, %f1, %f1782, %f52; fma.rn.f32 %f1755, %f1997, %f1781, %f1783; // begin inline asm { cvt.rn.bf16.f32 %rs928, %f1755;} // end inline asm // begin inline asm { cvt.rn.bf16.f32 %rs924, %f1751;} // end inline asm mov.b32 %r1528, {%rs924, %rs928}; mov.b32 {%rs929, %rs933}, %r2315; // begin inline asm { mov.b32 %f1756, {0,%rs929};} // end inline asm sub.f32 %f1784, %f1756, %f1996; mov.b32 {%rs930, %rs934}, %r2347; // begin inline asm { mov.b32 %f1757, {0,%rs930};} // end inline asm mul.f32 %f1785, %f1757, %f918; mul.f32 %f1786, %f53, %f1784; fma.rn.f32 %f1787, %f1, %f1786, %f52; fma.rn.f32 %f1759, %f1997, %f1785, %f1787; // begin inline asm { mov.b32 %f1760, {0,%rs933};} // end inline asm sub.f32 %f1788, %f1760, %f1996; // begin inline asm { mov.b32 %f1761, {0,%rs934};} // end inline asm mul.f32 %f1789, %f1761, %f921; mul.f32 %f1790, %f53, %f1788; fma.rn.f32 %f1791, %f1, %f1790, %f52; fma.rn.f32 %f1763, %f1997, %f1789, %f1791; // begin inline asm { cvt.rn.bf16.f32 %rs936, %f1763;} // end inline asm // begin inline asm { cvt.rn.bf16.f32 %rs932, %f1759;} // end inline asm mov.b32 %r1529, {%rs932, %rs936}; mov.b32 {%rs937, %rs941}, %r2314; // begin inline asm { mov.b32 %f1764, {0,%rs937};} // end inline asm sub.f32 %f1792, %f1764, %f1996; mov.b32 {%rs938, %rs942}, %r2346; // begin inline asm { mov.b32 %f1765, {0,%rs938};} // end inline asm mul.f32 %f1793, %f1765, %f924; mul.f32 %f1794, %f53, %f1792; fma.rn.f32 %f1795, %f1, %f1794, %f52; fma.rn.f32 %f1767, %f1997, %f1793, %f1795; // begin inline asm { mov.b32 %f1768, {0,%rs941};} // end inline asm sub.f32 %f1796, %f1768, %f1996; // begin inline asm { mov.b32 %f1769, {0,%rs942};} // end inline asm mul.f32 %f1797, %f1769, %f927; mul.f32 %f1798, %f53, %f1796; fma.rn.f32 %f1799, %f1, %f1798, %f52; fma.rn.f32 %f1771, %f1997, %f1797, %f1799; // begin inline asm { cvt.rn.bf16.f32 %rs944, %f1771;} // end inline asm // begin inline asm { cvt.rn.bf16.f32 %rs940, %f1767;} // end inline asm mov.b32 %r1530, {%rs940, %rs944}; add.s32 %r1531, %r642, %r223; mul.wide.s32 %rd142, %r1531, 2; add.s64 %rd141, %rd21, %rd142; // begin inline asm st.global.cs.v4.s32 [%rd141], {%r1527,%r1528,%r1529,%r1530}; // end inline asm $L__BB0_110: add.s32 %r1532, %r642, %r14; add.s32 %r643, %r1532, %r223; neg.s32 %r1533, %r1532; setp.lt.s32 %p128, %r15, %r1533; and.pred %p129, %p1, %p128; mov.b32 {%rs945, %rs111}, %r2321; // begin inline asm { mov.b32 %f1800, {0,%rs945};} // end inline asm sub.f32 %f1804, %f1800, %f1996; // begin inline asm { mov.b32 %f1801, {0,%rs478};} // end inline asm mul.f32 %f1805, %f1801, %f1031; mul.f32 %f1806, %f53, %f1804; fma.rn.f32 %f1807, %f1, %f1806, %f52; fma.rn.f32 %f1803, %f1997, %f1805, %f1807; // begin inline asm { cvt.rn.bf16.f32 %rs948, %f1803;} // end inline asm @%p129 bra $L__BB0_111; bra.uni $L__BB0_112; $L__BB0_111: // begin inline asm { mov.b32 %f1836, {0,%rs111};} // end inline asm sub.f32 %f1864, %f1836, %f1996; // begin inline asm { mov.b32 %f1837, {0,%rs87};} // end inline asm mul.f32 %f1865, %f1837, %f1034; mul.f32 %f1866, %f53, %f1864; fma.rn.f32 %f1867, %f1, %f1866, %f52; fma.rn.f32 %f1839, %f1997, %f1865, %f1867; // begin inline asm { cvt.rn.bf16.f32 %rs980, %f1839;} // end inline asm mov.b32 %r1534, {%rs948, %rs980}; mov.b32 {%rs981, %rs985}, %r2320; // begin inline asm { mov.b32 %f1840, {0,%rs981};} // end inline asm sub.f32 %f1868, %f1840, %f1996; mov.b32 {%rs982, %rs986}, %r2352; // begin inline asm { mov.b32 %f1841, {0,%rs982};} // end inline asm mul.f32 %f1869, %f1841, %f1037; mul.f32 %f1870, %f53, %f1868; fma.rn.f32 %f1871, %f1, %f1870, %f52; fma.rn.f32 %f1843, %f1997, %f1869, %f1871; // begin inline asm { mov.b32 %f1844, {0,%rs985};} // end inline asm sub.f32 %f1872, %f1844, %f1996; // begin inline asm { mov.b32 %f1845, {0,%rs986};} // end inline asm mul.f32 %f1873, %f1845, %f1040; mul.f32 %f1874, %f53, %f1872; fma.rn.f32 %f1875, %f1, %f1874, %f52; fma.rn.f32 %f1847, %f1997, %f1873, %f1875; // begin inline asm { cvt.rn.bf16.f32 %rs988, %f1847;} // end inline asm // begin inline asm { cvt.rn.bf16.f32 %rs984, %f1843;} // end inline asm mov.b32 %r1535, {%rs984, %rs988}; mov.b32 {%rs989, %rs993}, %r2319; // begin inline asm { mov.b32 %f1848, {0,%rs989};} // end inline asm sub.f32 %f1876, %f1848, %f1996; mov.b32 {%rs990, %rs994}, %r2351; // begin inline asm { mov.b32 %f1849, {0,%rs990};} // end inline asm mul.f32 %f1877, %f1849, %f1043; mul.f32 %f1878, %f53, %f1876; fma.rn.f32 %f1879, %f1, %f1878, %f52; fma.rn.f32 %f1851, %f1997, %f1877, %f1879; // begin inline asm { mov.b32 %f1852, {0,%rs993};} // end inline asm sub.f32 %f1880, %f1852, %f1996; // begin inline asm { mov.b32 %f1853, {0,%rs994};} // end inline asm mul.f32 %f1881, %f1853, %f1046; mul.f32 %f1882, %f53, %f1880; fma.rn.f32 %f1883, %f1, %f1882, %f52; fma.rn.f32 %f1855, %f1997, %f1881, %f1883; // begin inline asm { cvt.rn.bf16.f32 %rs996, %f1855;} // end inline asm // begin inline asm { cvt.rn.bf16.f32 %rs992, %f1851;} // end inline asm mov.b32 %r1536, {%rs992, %rs996}; mov.b32 {%rs997, %rs1001}, %r2318; // begin inline asm { mov.b32 %f1856, {0,%rs997};} // end inline asm sub.f32 %f1884, %f1856, %f1996; mov.b32 {%rs998, %rs1002}, %r2350; // begin inline asm { mov.b32 %f1857, {0,%rs998};} // end inline asm mul.f32 %f1885, %f1857, %f1049; mul.f32 %f1886, %f53, %f1884; fma.rn.f32 %f1887, %f1, %f1886, %f52; fma.rn.f32 %f1859, %f1997, %f1885, %f1887; // begin inline asm { mov.b32 %f1860, {0,%rs1001};} // end inline asm sub.f32 %f1888, %f1860, %f1996; // begin inline asm { mov.b32 %f1861, {0,%rs1002};} // end inline asm mul.f32 %f1889, %f1861, %f1052; mul.f32 %f1890, %f53, %f1888; fma.rn.f32 %f1891, %f1, %f1890, %f52; fma.rn.f32 %f1863, %f1997, %f1889, %f1891; // begin inline asm { cvt.rn.bf16.f32 %rs1004, %f1863;} // end inline asm // begin inline asm { cvt.rn.bf16.f32 %rs1000, %f1859;} // end inline asm mov.b32 %r1537, {%rs1000, %rs1004}; mul.wide.s32 %rd144, %r643, 2; add.s64 %rd143, %rd21, %rd144; // begin inline asm st.global.cs.v4.s32 [%rd143], {%r1534,%r1535,%r1536,%r1537}; // end inline asm $L__BB0_112: shl.b32 %r2612, %r2612, 6; add.s32 %r2289, %r2289, 1; setp.lt.s32 %p130, %r2289, %r5; @%p130 bra $L__BB0_21; bra.uni $L__BB0_113; $L__BB0_19: mov.u32 %r2466, 0; mov.u32 %r2467, %r2466; mov.u32 %r2468, %r2466; mov.u32 %r2469, %r2466; mov.u32 %r2470, %r2466; mov.u32 %r2471, %r2466; mov.u32 %r2472, %r2466; mov.u32 %r2473, %r2466; mov.u32 %r2450, %r2466; mov.u32 %r2451, %r2466; mov.u32 %r2452, %r2466; mov.u32 %r2453, %r2466; mov.u32 %r2454, %r2466; mov.u32 %r2455, %r2466; mov.u32 %r2456, %r2466; mov.u32 %r2457, %r2466; mov.u32 %r2434, %r2466; mov.u32 %r2435, %r2466; mov.u32 %r2436, %r2466; mov.u32 %r2437, %r2466; mov.u32 %r2438, %r2466; mov.u32 %r2439, %r2466; mov.u32 %r2440, %r2466; mov.u32 %r2441, %r2466; mov.u32 %r2418, %r2466; mov.u32 %r2419, %r2466; mov.u32 %r2420, %r2466; mov.u32 %r2421, %r2466; mov.u32 %r2422, %r2466; mov.u32 %r2423, %r2466; mov.u32 %r2424, %r2466; mov.u32 %r2425, %r2466; mov.u32 %r2402, %r2466; mov.u32 %r2403, %r2466; mov.u32 %r2404, %r2466; mov.u32 %r2405, %r2466; mov.u32 %r2406, %r2466; mov.u32 %r2407, %r2466; mov.u32 %r2408, %r2466; mov.u32 %r2409, %r2466; mov.u32 %r2386, %r2466; mov.u32 %r2387, %r2466; mov.u32 %r2388, %r2466; mov.u32 %r2389, %r2466; mov.u32 %r2390, %r2466; mov.u32 %r2391, %r2466; mov.u32 %r2392, %r2466; mov.u32 %r2393, %r2466; mov.u32 %r2370, %r2466; mov.u32 %r2371, %r2466; mov.u32 %r2372, %r2466; mov.u32 %r2373, %r2466; mov.u32 %r2374, %r2466; mov.u32 %r2375, %r2466; mov.u32 %r2376, %r2466; mov.u32 %r2377, %r2466; mov.u32 %r2216, %r2466; mov.u32 %r2217, %r2466; mov.u32 %r2218, %r2466; mov.u32 %r2219, %r2466; mov.u32 %r2220, %r2466; mov.u32 %r2221, %r2466; mov.u32 %r2222, %r2466; mov.u32 %r2223, %r2466; mov.u32 %r2474, %r2466; mov.u32 %r2475, %r2466; mov.u32 %r2476, %r2466; mov.u32 %r2477, %r2466; mov.u32 %r2478, %r2466; mov.u32 %r2479, %r2466; mov.u32 %r2480, %r2466; mov.u32 %r2481, %r2466; mov.u32 %r2458, %r2466; mov.u32 %r2459, %r2466; mov.u32 %r2460, %r2466; mov.u32 %r2461, %r2466; mov.u32 %r2462, %r2466; mov.u32 %r2463, %r2466; mov.u32 %r2464, %r2466; mov.u32 %r2465, %r2466; mov.u32 %r2442, %r2466; mov.u32 %r2443, %r2466; mov.u32 %r2444, %r2466; mov.u32 %r2445, %r2466; mov.u32 %r2446, %r2466; mov.u32 %r2447, %r2466; mov.u32 %r2448, %r2466; mov.u32 %r2449, %r2466; mov.u32 %r2426, %r2466; mov.u32 %r2427, %r2466; mov.u32 %r2428, %r2466; mov.u32 %r2429, %r2466; mov.u32 %r2430, %r2466; mov.u32 %r2431, %r2466; mov.u32 %r2432, %r2466; mov.u32 %r2433, %r2466; mov.u32 %r2410, %r2466; mov.u32 %r2411, %r2466; mov.u32 %r2412, %r2466; mov.u32 %r2413, %r2466; mov.u32 %r2414, %r2466; mov.u32 %r2415, %r2466; mov.u32 %r2416, %r2466; mov.u32 %r2417, %r2466; mov.u32 %r2394, %r2466; mov.u32 %r2395, %r2466; mov.u32 %r2396, %r2466; mov.u32 %r2397, %r2466; mov.u32 %r2398, %r2466; mov.u32 %r2399, %r2466; mov.u32 %r2400, %r2466; mov.u32 %r2401, %r2466; mov.u32 %r2378, %r2466; mov.u32 %r2379, %r2466; mov.u32 %r2380, %r2466; mov.u32 %r2381, %r2466; mov.u32 %r2382, %r2466; mov.u32 %r2383, %r2466; mov.u32 %r2384, %r2466; mov.u32 %r2385, %r2466; mov.u32 %r2280, %r2466; mov.u32 %r2281, %r2466; mov.u32 %r2282, %r2466; mov.u32 %r2283, %r2466; mov.u32 %r2284, %r2466; mov.u32 %r2285, %r2466; mov.u32 %r2286, %r2466; mov.u32 %r2287, %r2466; $L__BB0_113: neg.s32 %r775, %r14; mul.lo.s32 %r776, %r2612, %r775; setp.lt.s32 %p131, %r15, %r776; setp.lt.s32 %p132, %r8, %r10; and.pred %p133, %p132, %p131; shl.b32 %r777, %r2612, 2; @%p133 bra $L__BB0_118; bra.uni $L__BB0_114; $L__BB0_118: mov.u32 %r1576, %ctaid.y; mad.lo.s32 %r1582, %r920, %r1576, %r9; add.s32 %r1583, %r777, %r1582; mul.wide.s32 %rd151, %r1583, 4; add.s64 %rd149, %rd24, %rd151; // begin inline asm st.volatile.global.v4.s32 [%rd149], {%r2287,%r2286,%r2285,%r2284}; // end inline asm add.s32 %r1584, %r1583, 4; mul.wide.s32 %rd152, %r1584, 4; add.s64 %rd150, %rd24, %rd152; // begin inline asm st.volatile.global.v4.s32 [%rd150], {%r2283,%r2282,%r2281,%r2280}; // end inline asm bra.uni $L__BB0_119; $L__BB0_114: add.s32 %r1538, %r15, -4; neg.s32 %r1539, %r777; setp.ge.s32 %p135, %r1538, %r1539; or.pred %p136, %p4, %p135; @%p136 bra $L__BB0_116; mov.u32 %r1544, %ctaid.y; mad.lo.s32 %r1550, %r920, %r1544, %r9; add.s32 %r1551, %r777, %r1550; mul.wide.s32 %rd146, %r1551, 4; add.s64 %rd145, %rd24, %rd146; // begin inline asm st.volatile.global.v4.s32 [%rd145], {%r2287,%r2286,%r2285,%r2284}; // end inline asm $L__BB0_116: mov.u32 %r1553, -4; sub.s32 %r1554, %r1553, %r777; setp.ge.s32 %p138, %r1538, %r1554; or.pred %p139, %p4, %p138; @%p139 bra $L__BB0_119; mov.u32 %r1559, %ctaid.y; mad.lo.s32 %r1565, %r920, %r1559, %r9; add.s32 %r1566, %r777, %r1565; add.s32 %r1567, %r1566, 4; mul.wide.s32 %rd148, %r1567, 4; add.s64 %rd147, %rd24, %rd148; // begin inline asm st.volatile.global.v4.s32 [%rd147], {%r2283,%r2282,%r2281,%r2280}; // end inline asm $L__BB0_119: mov.u32 %r1585, %ctaid.y; mad.lo.s32 %r1591, %r920, %r1585, %r9; add.s32 %r778, %r14, %r1591; sub.s32 %r779, %r776, %r14; setp.lt.s32 %p141, %r15, %r779; and.pred %p142, %p132, %p141; @%p142 bra $L__BB0_124; bra.uni $L__BB0_120; $L__BB0_124: add.s32 %r1616, %r777, %r778; mul.wide.s32 %rd159, %r1616, 4; add.s64 %rd157, %rd24, %rd159; // begin inline asm st.volatile.global.v4.s32 [%rd157], {%r2385,%r2384,%r2383,%r2382}; // end inline asm add.s32 %r1617, %r1616, 4; mul.wide.s32 %rd160, %r1617, 4; add.s64 %rd158, %rd24, %rd160; // begin inline asm st.volatile.global.v4.s32 [%rd158], {%r2381,%r2380,%r2379,%r2378}; // end inline asm bra.uni $L__BB0_125; $L__BB0_120: add.s32 %r1592, %r777, %r14; neg.s32 %r1593, %r1592; add.s32 %r1594, %r15, -4; setp.ge.s32 %p144, %r1594, %r1593; or.pred %p145, %p4, %p144; @%p145 bra $L__BB0_122; add.s32 %r1599, %r777, %r778; mul.wide.s32 %rd154, %r1599, 4; add.s64 %rd153, %rd24, %rd154; // begin inline asm st.volatile.global.v4.s32 [%rd153], {%r2385,%r2384,%r2383,%r2382}; // end inline asm $L__BB0_122: add.s32 %r780, %r777, 4; add.s32 %r1600, %r780, %r14; neg.s32 %r1601, %r1600; setp.ge.s32 %p147, %r1594, %r1601; or.pred %p148, %p4, %p147; @%p148 bra $L__BB0_125; add.s32 %r1607, %r780, %r778; mul.wide.s32 %rd156, %r1607, 4; add.s64 %rd155, %rd24, %rd156; // begin inline asm st.volatile.global.v4.s32 [%rd155], {%r2381,%r2380,%r2379,%r2378}; // end inline asm $L__BB0_125: shl.b32 %r781, %r14, 1; add.s32 %r782, %r778, %r14; sub.s32 %r783, %r779, %r14; setp.lt.s32 %p150, %r15, %r783; and.pred %p151, %p132, %p150; @%p151 bra $L__BB0_130; bra.uni $L__BB0_126; $L__BB0_130: add.s32 %r1642, %r777, %r782; mul.wide.s32 %rd167, %r1642, 4; add.s64 %rd165, %rd24, %rd167; // begin inline asm st.volatile.global.v4.s32 [%rd165], {%r2401,%r2400,%r2399,%r2398}; // end inline asm add.s32 %r1643, %r1642, 4; mul.wide.s32 %rd168, %r1643, 4; add.s64 %rd166, %rd24, %rd168; // begin inline asm st.volatile.global.v4.s32 [%rd166], {%r2397,%r2396,%r2395,%r2394}; // end inline asm bra.uni $L__BB0_131; $L__BB0_126: add.s32 %r1618, %r777, %r781; neg.s32 %r1619, %r1618; add.s32 %r1620, %r15, -4; setp.ge.s32 %p153, %r1620, %r1619; or.pred %p154, %p4, %p153; @%p154 bra $L__BB0_128; add.s32 %r1625, %r777, %r782; mul.wide.s32 %rd162, %r1625, 4; add.s64 %rd161, %rd24, %rd162; // begin inline asm st.volatile.global.v4.s32 [%rd161], {%r2401,%r2400,%r2399,%r2398}; // end inline asm $L__BB0_128: add.s32 %r784, %r777, 4; add.s32 %r1626, %r784, %r781; neg.s32 %r1627, %r1626; setp.ge.s32 %p156, %r1620, %r1627; or.pred %p157, %p4, %p156; @%p157 bra $L__BB0_131; add.s32 %r1633, %r784, %r782; mul.wide.s32 %rd164, %r1633, 4; add.s64 %rd163, %rd24, %rd164; // begin inline asm st.volatile.global.v4.s32 [%rd163], {%r2397,%r2396,%r2395,%r2394}; // end inline asm $L__BB0_131: mul.lo.s32 %r785, %r14, 3; add.s32 %r786, %r782, %r14; sub.s32 %r787, %r783, %r14; setp.lt.s32 %p159, %r15, %r787; and.pred %p160, %p132, %p159; @%p160 bra $L__BB0_136; bra.uni $L__BB0_132; $L__BB0_136: add.s32 %r1668, %r777, %r786; mul.wide.s32 %rd175, %r1668, 4; add.s64 %rd173, %rd24, %rd175; // begin inline asm st.volatile.global.v4.s32 [%rd173], {%r2417,%r2416,%r2415,%r2414}; // end inline asm add.s32 %r1669, %r1668, 4; mul.wide.s32 %rd176, %r1669, 4; add.s64 %rd174, %rd24, %rd176; // begin inline asm st.volatile.global.v4.s32 [%rd174], {%r2413,%r2412,%r2411,%r2410}; // end inline asm bra.uni $L__BB0_137; $L__BB0_132: add.s32 %r1644, %r777, %r785; neg.s32 %r1645, %r1644; add.s32 %r1646, %r15, -4; setp.ge.s32 %p162, %r1646, %r1645; or.pred %p163, %p4, %p162; @%p163 bra $L__BB0_134; add.s32 %r1651, %r777, %r786; mul.wide.s32 %rd170, %r1651, 4; add.s64 %rd169, %rd24, %rd170; // begin inline asm st.volatile.global.v4.s32 [%rd169], {%r2417,%r2416,%r2415,%r2414}; // end inline asm $L__BB0_134: add.s32 %r788, %r777, 4; add.s32 %r1652, %r788, %r785; neg.s32 %r1653, %r1652; setp.ge.s32 %p165, %r1646, %r1653; or.pred %p166, %p4, %p165; @%p166 bra $L__BB0_137; add.s32 %r1659, %r788, %r786; mul.wide.s32 %rd172, %r1659, 4; add.s64 %rd171, %rd24, %rd172; // begin inline asm st.volatile.global.v4.s32 [%rd171], {%r2413,%r2412,%r2411,%r2410}; // end inline asm $L__BB0_137: shl.b32 %r789, %r14, 2; add.s32 %r790, %r786, %r14; add.s32 %r791, %r2612, 4; sub.s32 %r792, %r787, %r14; setp.lt.s32 %p168, %r15, %r792; and.pred %p169, %p132, %p168; @%p169 bra $L__BB0_142; bra.uni $L__BB0_138; $L__BB0_142: add.s32 %r1694, %r777, %r790; mul.wide.s32 %rd183, %r1694, 4; add.s64 %rd181, %rd24, %rd183; // begin inline asm st.volatile.global.v4.s32 [%rd181], {%r2433,%r2432,%r2431,%r2430}; // end inline asm mad.lo.s32 %r1695, %r2612, 3, %r791; add.s32 %r1696, %r1695, %r790; mul.wide.s32 %rd184, %r1696, 4; add.s64 %rd182, %rd24, %rd184; // begin inline asm st.volatile.global.v4.s32 [%rd182], {%r2429,%r2428,%r2427,%r2426}; // end inline asm bra.uni $L__BB0_143; $L__BB0_138: add.s32 %r1670, %r777, %r789; neg.s32 %r1671, %r1670; add.s32 %r1672, %r15, -4; setp.ge.s32 %p171, %r1672, %r1671; or.pred %p172, %p4, %p171; @%p172 bra $L__BB0_140; add.s32 %r1677, %r777, %r790; mul.wide.s32 %rd178, %r1677, 4; add.s64 %rd177, %rd24, %rd178; // begin inline asm st.volatile.global.v4.s32 [%rd177], {%r2433,%r2432,%r2431,%r2430}; // end inline asm $L__BB0_140: add.s32 %r793, %r777, 4; add.s32 %r1678, %r793, %r789; neg.s32 %r1679, %r1678; setp.ge.s32 %p174, %r1672, %r1679; or.pred %p175, %p4, %p174; @%p175 bra $L__BB0_143; add.s32 %r1685, %r793, %r790; mul.wide.s32 %rd180, %r1685, 4; add.s64 %rd179, %rd24, %rd180; // begin inline asm st.volatile.global.v4.s32 [%rd179], {%r2429,%r2428,%r2427,%r2426}; // end inline asm $L__BB0_143: mul.lo.s32 %r794, %r14, 5; add.s32 %r795, %r790, %r14; sub.s32 %r796, %r792, %r14; setp.lt.s32 %p177, %r15, %r796; and.pred %p178, %p132, %p177; @%p178 bra $L__BB0_148; bra.uni $L__BB0_144; $L__BB0_148: add.s32 %r1721, %r777, %r795; mul.wide.s32 %rd191, %r1721, 4; add.s64 %rd189, %rd24, %rd191; // begin inline asm st.volatile.global.v4.s32 [%rd189], {%r2449,%r2448,%r2447,%r2446}; // end inline asm mad.lo.s32 %r1722, %r2612, 3, %r791; add.s32 %r1723, %r1722, %r795; mul.wide.s32 %rd192, %r1723, 4; add.s64 %rd190, %rd24, %rd192; // begin inline asm st.volatile.global.v4.s32 [%rd190], {%r2445,%r2444,%r2443,%r2442}; // end inline asm bra.uni $L__BB0_149; $L__BB0_144: add.s32 %r1697, %r777, %r794; neg.s32 %r1698, %r1697; add.s32 %r1699, %r15, -4; setp.ge.s32 %p180, %r1699, %r1698; or.pred %p181, %p4, %p180; @%p181 bra $L__BB0_146; add.s32 %r1704, %r777, %r795; mul.wide.s32 %rd186, %r1704, 4; add.s64 %rd185, %rd24, %rd186; // begin inline asm st.volatile.global.v4.s32 [%rd185], {%r2449,%r2448,%r2447,%r2446}; // end inline asm $L__BB0_146: add.s32 %r797, %r777, 4; add.s32 %r1705, %r797, %r794; neg.s32 %r1706, %r1705; setp.ge.s32 %p183, %r1699, %r1706; or.pred %p184, %p4, %p183; @%p184 bra $L__BB0_149; add.s32 %r1712, %r797, %r795; mul.wide.s32 %rd188, %r1712, 4; add.s64 %rd187, %rd24, %rd188; // begin inline asm st.volatile.global.v4.s32 [%rd187], {%r2445,%r2444,%r2443,%r2442}; // end inline asm $L__BB0_149: mul.lo.s32 %r798, %r14, 6; add.s32 %r799, %r795, %r14; sub.s32 %r800, %r796, %r14; setp.lt.s32 %p186, %r15, %r800; and.pred %p187, %p132, %p186; @%p187 bra $L__BB0_154; bra.uni $L__BB0_150; $L__BB0_154: add.s32 %r1748, %r777, %r799; mul.wide.s32 %rd199, %r1748, 4; add.s64 %rd197, %rd24, %rd199; // begin inline asm st.volatile.global.v4.s32 [%rd197], {%r2465,%r2464,%r2463,%r2462}; // end inline asm mad.lo.s32 %r1749, %r2612, 3, %r791; add.s32 %r1750, %r1749, %r799; mul.wide.s32 %rd200, %r1750, 4; add.s64 %rd198, %rd24, %rd200; // begin inline asm st.volatile.global.v4.s32 [%rd198], {%r2461,%r2460,%r2459,%r2458}; // end inline asm bra.uni $L__BB0_155; $L__BB0_150: add.s32 %r1724, %r777, %r798; neg.s32 %r1725, %r1724; add.s32 %r1726, %r15, -4; setp.ge.s32 %p189, %r1726, %r1725; or.pred %p190, %p4, %p189; @%p190 bra $L__BB0_152; add.s32 %r1731, %r777, %r799; mul.wide.s32 %rd194, %r1731, 4; add.s64 %rd193, %rd24, %rd194; // begin inline asm st.volatile.global.v4.s32 [%rd193], {%r2465,%r2464,%r2463,%r2462}; // end inline asm $L__BB0_152: add.s32 %r801, %r777, 4; add.s32 %r1732, %r801, %r798; neg.s32 %r1733, %r1732; setp.ge.s32 %p192, %r1726, %r1733; or.pred %p193, %p4, %p192; @%p193 bra $L__BB0_155; add.s32 %r1739, %r801, %r799; mul.wide.s32 %rd196, %r1739, 4; add.s64 %rd195, %rd24, %rd196; // begin inline asm st.volatile.global.v4.s32 [%rd195], {%r2461,%r2460,%r2459,%r2458}; // end inline asm $L__BB0_155: mul.lo.s32 %r802, %r14, 7; add.s32 %r803, %r799, %r14; sub.s32 %r1751, %r800, %r14; setp.lt.s32 %p195, %r15, %r1751; and.pred %p196, %p132, %p195; @%p196 bra $L__BB0_160; bra.uni $L__BB0_156; $L__BB0_160: add.s32 %r1776, %r777, %r803; mul.wide.s32 %rd207, %r1776, 4; add.s64 %rd205, %rd24, %rd207; // begin inline asm st.volatile.global.v4.s32 [%rd205], {%r2481,%r2480,%r2479,%r2478}; // end inline asm add.s32 %r1777, %r1776, 4; mul.wide.s32 %rd208, %r1777, 4; add.s64 %rd206, %rd24, %rd208; // begin inline asm st.volatile.global.v4.s32 [%rd206], {%r2477,%r2476,%r2475,%r2474}; // end inline asm bra.uni $L__BB0_161; $L__BB0_156: add.s32 %r1752, %r777, %r802; neg.s32 %r1753, %r1752; add.s32 %r1754, %r15, -4; setp.ge.s32 %p198, %r1754, %r1753; or.pred %p199, %p4, %p198; @%p199 bra $L__BB0_158; add.s32 %r1759, %r777, %r803; mul.wide.s32 %rd202, %r1759, 4; add.s64 %rd201, %rd24, %rd202; // begin inline asm st.volatile.global.v4.s32 [%rd201], {%r2481,%r2480,%r2479,%r2478}; // end inline asm $L__BB0_158: add.s32 %r804, %r777, 4; add.s32 %r1760, %r804, %r802; neg.s32 %r1761, %r1760; setp.ge.s32 %p201, %r1754, %r1761; or.pred %p202, %p4, %p201; @%p202 bra $L__BB0_161; add.s32 %r1767, %r804, %r803; mul.wide.s32 %rd204, %r1767, 4; add.s64 %rd203, %rd24, %rd204; // begin inline asm st.volatile.global.v4.s32 [%rd203], {%r2477,%r2476,%r2475,%r2474}; // end inline asm $L__BB0_161: shl.b32 %r805, %r2612, 1; mul.lo.s32 %r806, %r805, %r775; setp.lt.s32 %p204, %r15, %r806; and.pred %p205, %p132, %p204; shl.b32 %r807, %r2612, 3; @%p205 bra $L__BB0_166; bra.uni $L__BB0_162; $L__BB0_166: add.s32 %r1823, %r807, %r1591; mul.wide.s32 %rd215, %r1823, 4; add.s64 %rd213, %rd25, %rd215; // begin inline asm st.volatile.global.v4.s32 [%rd213], {%r2223,%r2222,%r2221,%r2220}; // end inline asm add.s32 %r1824, %r1823, 4; mul.wide.s32 %rd216, %r1824, 4; add.s64 %rd214, %rd25, %rd216; // begin inline asm st.volatile.global.v4.s32 [%rd214], {%r2219,%r2218,%r2217,%r2216}; // end inline asm bra.uni $L__BB0_167; $L__BB0_162: add.s32 %r1778, %r15, -4; neg.s32 %r1779, %r807; setp.ge.s32 %p207, %r1778, %r1779; or.pred %p208, %p4, %p207; @%p208 bra $L__BB0_164; add.s32 %r1791, %r807, %r1591; mul.wide.s32 %rd210, %r1791, 4; add.s64 %rd209, %rd25, %rd210; // begin inline asm st.volatile.global.v4.s32 [%rd209], {%r2223,%r2222,%r2221,%r2220}; // end inline asm $L__BB0_164: mov.u32 %r1793, -4; sub.s32 %r1794, %r1793, %r807; setp.ge.s32 %p210, %r1778, %r1794; or.pred %p211, %p4, %p210; @%p211 bra $L__BB0_167; add.s32 %r1806, %r807, %r1591; add.s32 %r1807, %r1806, 4; mul.wide.s32 %rd212, %r1807, 4; add.s64 %rd211, %rd25, %rd212; // begin inline asm st.volatile.global.v4.s32 [%rd211], {%r2219,%r2218,%r2217,%r2216}; // end inline asm $L__BB0_167: sub.s32 %r808, %r806, %r14; setp.lt.s32 %p213, %r15, %r808; and.pred %p214, %p132, %p213; @%p214 bra $L__BB0_172; bra.uni $L__BB0_168; $L__BB0_172: add.s32 %r1849, %r807, %r778; mul.wide.s32 %rd223, %r1849, 4; add.s64 %rd221, %rd25, %rd223; // begin inline asm st.volatile.global.v4.s32 [%rd221], {%r2377,%r2376,%r2375,%r2374}; // end inline asm add.s32 %r1850, %r1849, 4; mul.wide.s32 %rd224, %r1850, 4; add.s64 %rd222, %rd25, %rd224; // begin inline asm st.volatile.global.v4.s32 [%rd222], {%r2373,%r2372,%r2371,%r2370}; // end inline asm bra.uni $L__BB0_173; $L__BB0_168: add.s32 %r1825, %r807, %r14; neg.s32 %r1826, %r1825; add.s32 %r1827, %r15, -4; setp.ge.s32 %p216, %r1827, %r1826; or.pred %p217, %p4, %p216; @%p217 bra $L__BB0_170; add.s32 %r1832, %r807, %r778; mul.wide.s32 %rd218, %r1832, 4; add.s64 %rd217, %rd25, %rd218; // begin inline asm st.volatile.global.v4.s32 [%rd217], {%r2377,%r2376,%r2375,%r2374}; // end inline asm $L__BB0_170: add.s32 %r809, %r807, 4; add.s32 %r1833, %r809, %r14; neg.s32 %r1834, %r1833; setp.ge.s32 %p219, %r1827, %r1834; or.pred %p220, %p4, %p219; @%p220 bra $L__BB0_173; add.s32 %r1840, %r809, %r778; mul.wide.s32 %rd220, %r1840, 4; add.s64 %rd219, %rd25, %rd220; // begin inline asm st.volatile.global.v4.s32 [%rd219], {%r2373,%r2372,%r2371,%r2370}; // end inline asm $L__BB0_173: sub.s32 %r810, %r808, %r14; setp.lt.s32 %p222, %r15, %r810; and.pred %p223, %p132, %p222; @%p223 bra $L__BB0_178; bra.uni $L__BB0_174; $L__BB0_178: add.s32 %r1875, %r807, %r782; mul.wide.s32 %rd231, %r1875, 4; add.s64 %rd229, %rd25, %rd231; // begin inline asm st.volatile.global.v4.s32 [%rd229], {%r2393,%r2392,%r2391,%r2390}; // end inline asm add.s32 %r1876, %r1875, 4; mul.wide.s32 %rd232, %r1876, 4; add.s64 %rd230, %rd25, %rd232; // begin inline asm st.volatile.global.v4.s32 [%rd230], {%r2389,%r2388,%r2387,%r2386}; // end inline asm bra.uni $L__BB0_179; $L__BB0_174: add.s32 %r1851, %r807, %r781; neg.s32 %r1852, %r1851; add.s32 %r1853, %r15, -4; setp.ge.s32 %p225, %r1853, %r1852; or.pred %p226, %p4, %p225; @%p226 bra $L__BB0_176; add.s32 %r1858, %r807, %r782; mul.wide.s32 %rd226, %r1858, 4; add.s64 %rd225, %rd25, %rd226; // begin inline asm st.volatile.global.v4.s32 [%rd225], {%r2393,%r2392,%r2391,%r2390}; // end inline asm $L__BB0_176: add.s32 %r811, %r807, 4; add.s32 %r1859, %r811, %r781; neg.s32 %r1860, %r1859; setp.ge.s32 %p228, %r1853, %r1860; or.pred %p229, %p4, %p228; @%p229 bra $L__BB0_179; add.s32 %r1866, %r811, %r782; mul.wide.s32 %rd228, %r1866, 4; add.s64 %rd227, %rd25, %rd228; // begin inline asm st.volatile.global.v4.s32 [%rd227], {%r2389,%r2388,%r2387,%r2386}; // end inline asm $L__BB0_179: sub.s32 %r812, %r810, %r14; setp.lt.s32 %p231, %r15, %r812; and.pred %p232, %p132, %p231; @%p232 bra $L__BB0_184; bra.uni $L__BB0_180; $L__BB0_184: add.s32 %r1901, %r807, %r786; mul.wide.s32 %rd239, %r1901, 4; add.s64 %rd237, %rd25, %rd239; // begin inline asm st.volatile.global.v4.s32 [%rd237], {%r2409,%r2408,%r2407,%r2406}; // end inline asm add.s32 %r1902, %r1901, 4; mul.wide.s32 %rd240, %r1902, 4; add.s64 %rd238, %rd25, %rd240; // begin inline asm st.volatile.global.v4.s32 [%rd238], {%r2405,%r2404,%r2403,%r2402}; // end inline asm bra.uni $L__BB0_185; $L__BB0_180: add.s32 %r1877, %r807, %r785; neg.s32 %r1878, %r1877; add.s32 %r1879, %r15, -4; setp.ge.s32 %p234, %r1879, %r1878; or.pred %p235, %p4, %p234; @%p235 bra $L__BB0_182; add.s32 %r1884, %r807, %r786; mul.wide.s32 %rd234, %r1884, 4; add.s64 %rd233, %rd25, %rd234; // begin inline asm st.volatile.global.v4.s32 [%rd233], {%r2409,%r2408,%r2407,%r2406}; // end inline asm $L__BB0_182: add.s32 %r813, %r807, 4; add.s32 %r1885, %r813, %r785; neg.s32 %r1886, %r1885; setp.ge.s32 %p237, %r1879, %r1886; or.pred %p238, %p4, %p237; @%p238 bra $L__BB0_185; add.s32 %r1892, %r813, %r786; mul.wide.s32 %rd236, %r1892, 4; add.s64 %rd235, %rd25, %rd236; // begin inline asm st.volatile.global.v4.s32 [%rd235], {%r2405,%r2404,%r2403,%r2402}; // end inline asm $L__BB0_185: add.s32 %r814, %r805, 4; sub.s32 %r815, %r812, %r14; setp.lt.s32 %p240, %r15, %r815; and.pred %p241, %p132, %p240; @%p241 bra $L__BB0_190; bra.uni $L__BB0_186; $L__BB0_190: add.s32 %r1927, %r807, %r790; mul.wide.s32 %rd247, %r1927, 4; add.s64 %rd245, %rd25, %rd247; // begin inline asm st.volatile.global.v4.s32 [%rd245], {%r2425,%r2424,%r2423,%r2422}; // end inline asm mad.lo.s32 %r1928, %r2612, 6, %r814; add.s32 %r1929, %r1928, %r790; mul.wide.s32 %rd248, %r1929, 4; add.s64 %rd246, %rd25, %rd248; // begin inline asm st.volatile.global.v4.s32 [%rd246], {%r2421,%r2420,%r2419,%r2418}; // end inline asm bra.uni $L__BB0_191; $L__BB0_186: add.s32 %r1903, %r807, %r789; neg.s32 %r1904, %r1903; add.s32 %r1905, %r15, -4; setp.ge.s32 %p243, %r1905, %r1904; or.pred %p244, %p4, %p243; @%p244 bra $L__BB0_188; add.s32 %r1910, %r807, %r790; mul.wide.s32 %rd242, %r1910, 4; add.s64 %rd241, %rd25, %rd242; // begin inline asm st.volatile.global.v4.s32 [%rd241], {%r2425,%r2424,%r2423,%r2422}; // end inline asm $L__BB0_188: add.s32 %r816, %r807, 4; add.s32 %r1911, %r816, %r789; neg.s32 %r1912, %r1911; setp.ge.s32 %p246, %r1905, %r1912; or.pred %p247, %p4, %p246; @%p247 bra $L__BB0_191; add.s32 %r1918, %r816, %r790; mul.wide.s32 %rd244, %r1918, 4; add.s64 %rd243, %rd25, %rd244; // begin inline asm st.volatile.global.v4.s32 [%rd243], {%r2421,%r2420,%r2419,%r2418}; // end inline asm $L__BB0_191: sub.s32 %r817, %r815, %r14; setp.lt.s32 %p249, %r15, %r817; and.pred %p250, %p132, %p249; @%p250 bra $L__BB0_196; bra.uni $L__BB0_192; $L__BB0_196: add.s32 %r1954, %r807, %r795; mul.wide.s32 %rd255, %r1954, 4; add.s64 %rd253, %rd25, %rd255; // begin inline asm st.volatile.global.v4.s32 [%rd253], {%r2441,%r2440,%r2439,%r2438}; // end inline asm mad.lo.s32 %r1955, %r2612, 6, %r814; add.s32 %r1956, %r1955, %r795; mul.wide.s32 %rd256, %r1956, 4; add.s64 %rd254, %rd25, %rd256; // begin inline asm st.volatile.global.v4.s32 [%rd254], {%r2437,%r2436,%r2435,%r2434}; // end inline asm bra.uni $L__BB0_197; $L__BB0_192: add.s32 %r1930, %r807, %r794; neg.s32 %r1931, %r1930; add.s32 %r1932, %r15, -4; setp.ge.s32 %p252, %r1932, %r1931; or.pred %p253, %p4, %p252; @%p253 bra $L__BB0_194; add.s32 %r1937, %r807, %r795; mul.wide.s32 %rd250, %r1937, 4; add.s64 %rd249, %rd25, %rd250; // begin inline asm st.volatile.global.v4.s32 [%rd249], {%r2441,%r2440,%r2439,%r2438}; // end inline asm $L__BB0_194: add.s32 %r818, %r807, 4; add.s32 %r1938, %r818, %r794; neg.s32 %r1939, %r1938; setp.ge.s32 %p255, %r1932, %r1939; or.pred %p256, %p4, %p255; @%p256 bra $L__BB0_197; add.s32 %r1945, %r818, %r795; mul.wide.s32 %rd252, %r1945, 4; add.s64 %rd251, %rd25, %rd252; // begin inline asm st.volatile.global.v4.s32 [%rd251], {%r2437,%r2436,%r2435,%r2434}; // end inline asm $L__BB0_197: sub.s32 %r819, %r817, %r14; setp.lt.s32 %p258, %r15, %r819; and.pred %p259, %p132, %p258; @%p259 bra $L__BB0_202; bra.uni $L__BB0_198; $L__BB0_202: add.s32 %r1981, %r807, %r799; mul.wide.s32 %rd263, %r1981, 4; add.s64 %rd261, %rd25, %rd263; // begin inline asm st.volatile.global.v4.s32 [%rd261], {%r2457,%r2456,%r2455,%r2454}; // end inline asm mad.lo.s32 %r1982, %r2612, 6, %r814; add.s32 %r1983, %r1982, %r799; mul.wide.s32 %rd264, %r1983, 4; add.s64 %rd262, %rd25, %rd264; // begin inline asm st.volatile.global.v4.s32 [%rd262], {%r2453,%r2452,%r2451,%r2450}; // end inline asm bra.uni $L__BB0_203; $L__BB0_198: add.s32 %r1957, %r807, %r798; neg.s32 %r1958, %r1957; add.s32 %r1959, %r15, -4; setp.ge.s32 %p261, %r1959, %r1958; or.pred %p262, %p4, %p261; @%p262 bra $L__BB0_200; add.s32 %r1964, %r807, %r799; mul.wide.s32 %rd258, %r1964, 4; add.s64 %rd257, %rd25, %rd258; // begin inline asm st.volatile.global.v4.s32 [%rd257], {%r2457,%r2456,%r2455,%r2454}; // end inline asm $L__BB0_200: add.s32 %r820, %r807, 4; add.s32 %r1965, %r820, %r798; neg.s32 %r1966, %r1965; setp.ge.s32 %p264, %r1959, %r1966; or.pred %p265, %p4, %p264; @%p265 bra $L__BB0_203; add.s32 %r1972, %r820, %r799; mul.wide.s32 %rd260, %r1972, 4; add.s64 %rd259, %rd25, %rd260; // begin inline asm st.volatile.global.v4.s32 [%rd259], {%r2453,%r2452,%r2451,%r2450}; // end inline asm $L__BB0_203: sub.s32 %r1984, %r819, %r14; setp.lt.s32 %p267, %r15, %r1984; and.pred %p268, %p132, %p267; @%p268 bra $L__BB0_208; bra.uni $L__BB0_204; $L__BB0_208: add.s32 %r2009, %r807, %r803; mul.wide.s32 %rd271, %r2009, 4; add.s64 %rd269, %rd25, %rd271; // begin inline asm st.volatile.global.v4.s32 [%rd269], {%r2473,%r2472,%r2471,%r2470}; // end inline asm mad.lo.s32 %r2010, %r2612, 6, %r814; add.s32 %r2011, %r2010, %r803; mul.wide.s32 %rd272, %r2011, 4; add.s64 %rd270, %rd25, %rd272; // begin inline asm st.volatile.global.v4.s32 [%rd270], {%r2469,%r2468,%r2467,%r2466}; // end inline asm bra.uni $L__BB0_209; $L__BB0_204: add.s32 %r1985, %r807, %r802; neg.s32 %r1986, %r1985; add.s32 %r1987, %r15, -4; setp.ge.s32 %p270, %r1987, %r1986; or.pred %p271, %p4, %p270; @%p271 bra $L__BB0_206; add.s32 %r1992, %r807, %r803; mul.wide.s32 %rd266, %r1992, 4; add.s64 %rd265, %rd25, %rd266; // begin inline asm st.volatile.global.v4.s32 [%rd265], {%r2473,%r2472,%r2471,%r2470}; // end inline asm $L__BB0_206: add.s32 %r821, %r807, 4; add.s32 %r1993, %r821, %r802; neg.s32 %r1994, %r1993; setp.ge.s32 %p273, %r1987, %r1994; or.pred %p274, %p4, %p273; @%p274 bra $L__BB0_209; add.s32 %r2000, %r821, %r803; mul.wide.s32 %rd268, %r2000, 4; add.s64 %rd267, %rd25, %rd268; // begin inline asm st.volatile.global.v4.s32 [%rd267], {%r2469,%r2468,%r2467,%r2466}; // end inline asm $L__BB0_209: membar.gl; bar.sync 0; or.b32 %r2013, %r1, %r8; mov.u32 %r822, %tid.z; or.b32 %r2014, %r2013, %r822; setp.ne.s32 %p275, %r2014, 0; @%p275 bra $L__BB0_213; mov.u32 %r2123, %nctaid.y; add.s32 %r2122, %r2123, -1; ld.param.u64 %rd322, [_ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_37050442nvfuser_inner_outer_persistent_f0_c1_r0_g0ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IS1_Li1ELi1EEES2_S5_S5_S4_S4_NS0_IxLi1ELi1EEE_param_10]; cvta.to.global.u64 %rd273, %rd322; mov.u32 %r2015, %ctaid.z; mov.u32 %r2016, %nctaid.x; mov.u32 %r2017, %ctaid.x; mad.lo.s32 %r2018, %r2015, %r2016, %r2017; mul.wide.s32 %rd274, %r2018, 8; add.s64 %rd9, %rd273, %rd274; setp.eq.s32 %p276, %r1585, %r2122; cvt.s64.s32 %rd275, %r2123; mov.u64 %rd276, -9223372036854775807; sub.s64 %rd277, %rd276, %rd275; selp.b64 %rd278, %rd277, 1, %p276; atom.global.add.u64 %rd10, [%rd9], %rd278; ld.volatile.global.u64 %rd279, [%rd9]; xor.b64 %rd280, %rd279, %rd10; setp.lt.s64 %p277, %rd280, 0; @%p277 bra $L__BB0_213; mov.u32 %r2613, 8; $L__BB0_212: // begin inline asm nanosleep.u32 %r2613; // end inline asm setp.lt.u32 %p278, %r2613, 256; selp.u32 %r2024, 1, 0, %p278; shl.b32 %r2613, %r2613, %r2024; ld.volatile.global.u64 %rd281, [%rd9]; xor.b64 %rd282, %rd281, %rd10; setp.gt.s64 %p279, %rd282, -1; @%p279 bra $L__BB0_212; $L__BB0_213: bar.sync 0; setp.lt.s32 %p280, %r12, 1; @%p280 bra $L__BB0_314; mad.lo.s32 %r2026, %r7, %r822, %r8; mad.lo.s32 %r825, %r2026, %r6, %r1; mul.wide.u32 %rd283, %r825, 4; mov.u64 %rd284, _ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_3705045arrayE; add.s64 %rd11, %rd284, %rd283; clz.b32 %r2029, %r7; mov.u32 %r2030, 31; sub.s32 %r2031, %r2030, %r2029; mov.u32 %r2032, 1; shl.b32 %r826, %r2032, %r2031; setp.lt.u32 %p281, %r8, %r826; add.s32 %r2033, %r826, %r8; setp.lt.u32 %p282, %r2033, %r7; and.pred %p2, %p281, %p282; shl.b32 %r2034, %r6, %r2031; add.s32 %r2035, %r825, %r2034; mul.wide.s32 %rd285, %r2035, 4; add.s64 %rd12, %rd284, %rd285; shr.u32 %r2036, %r826, 31; add.s32 %r2037, %r826, %r2036; shr.s32 %r827, %r2037, 1; add.s32 %r2038, %r825, %r6; mul.wide.u32 %rd286, %r2038, 4; add.s64 %rd13, %rd284, %rd286; shl.b32 %r2039, %r6, 2; mul.lo.s32 %r828, %r2039, %r955; mul.lo.s32 %r2042, %r2039, %r1585; shl.b32 %r2043, %r1, 2; add.s32 %r2044, %r2042, %r2043; mad.lo.s32 %r829, %r920, %r8, %r2044; or.b32 %r2046, %r969, 3; sub.s32 %r2047, %r2046, %r2043; add.s32 %r830, %r2047, %r2042; mov.u32 %r2614, 0; not.pred %p287, %p2; $L__BB0_215: .pragma "nounroll"; mul.lo.s32 %r832, %r2614, %r828; add.s32 %r833, %r832, %r830; setp.lt.s32 %p283, %r13, 1; mov.f32 %f2020, 0f00000000; mov.f32 %f2021, %f2020; mov.f32 %f2022, %f2020; mov.f32 %f2023, %f2020; @%p283 bra $L__BB0_221; add.s32 %r834, %r832, %r829; mov.u32 %r2048, 0; mov.f32 %f2020, 0f00000000; mov.u32 %r2615, %r2048; $L__BB0_217: .pragma "nounroll"; setp.ge.s32 %p284, %r833, %r920; mov.u32 %r2616, %r2048; mov.u32 %r2617, %r2048; mov.u32 %r2618, %r2048; mov.u32 %r2619, %r2048; @%p284 bra $L__BB0_220; mad.lo.s32 %r2057, %r2615, %r7, %r8; setp.ge.s32 %p285, %r2057, %r955; mov.u32 %r2616, %r2048; mov.u32 %r2617, %r2048; mov.u32 %r2618, %r2048; mov.u32 %r2619, %r2048; @%p285 bra $L__BB0_220; mul.lo.s32 %r2063, %r920, %r7; mad.lo.s32 %r2064, %r2615, %r2063, %r834; mul.wide.s32 %rd288, %r2064, 4; add.s64 %rd287, %rd24, %rd288; // begin inline asm ld.volatile.global.v4.s32 {%r2619,%r2618,%r2617,%r2616}, [%rd287]; // end inline asm $L__BB0_220: mov.b32 %f1900, %r2619; add.f32 %f2023, %f2023, %f1900; mov.b32 %f1901, %r2618; add.f32 %f2022, %f2022, %f1901; mov.b32 %f1902, %r2617; add.f32 %f2021, %f2021, %f1902; mov.b32 %f1903, %r2616; add.f32 %f2020, %f2020, %f1903; add.s32 %r2615, %r2615, 1; setp.lt.s32 %p286, %r2615, %r13; @%p286 bra $L__BB0_217; $L__BB0_221: st.shared.f32 [%rd11], %f2023; bar.sync 0; @%p287 bra $L__BB0_223; ld.shared.f32 %f1904, [%rd12]; ld.shared.f32 %f1905, [%rd11]; add.f32 %f1906, %f1904, %f1905; st.shared.f32 [%rd11], %f1906; $L__BB0_223: setp.lt.s32 %p288, %r826, 4; bar.sync 0; @%p288 bra $L__BB0_228; mov.u32 %r2620, %r827; $L__BB0_225: setp.ge.u32 %p289, %r8, %r2620; @%p289 bra $L__BB0_227; mad.lo.s32 %r2066, %r2620, %r6, %r825; mul.wide.s32 %rd289, %r2066, 4; add.s64 %rd291, %rd284, %rd289; ld.shared.f32 %f1907, [%rd11]; ld.shared.f32 %f1908, [%rd291]; add.f32 %f1909, %f1908, %f1907; st.shared.f32 [%rd11], %f1909; $L__BB0_227: bar.sync 0; shr.u32 %r846, %r2620, 1; setp.gt.u32 %p290, %r2620, 3; mov.u32 %r2620, %r846; @%p290 bra $L__BB0_225; $L__BB0_228: setp.ne.s32 %p291, %r8, 0; mov.f32 %f2024, 0f00000000; @%p291 bra $L__BB0_231; setp.lt.u32 %p292, %r7, 2; ld.shared.f32 %f1911, [%rd11]; add.f32 %f2024, %f1911, 0f00000000; @%p292 bra $L__BB0_231; ld.shared.f32 %f1912, [%rd13]; add.f32 %f2024, %f2024, %f1912; $L__BB0_231: bar.sync 0; // begin inline asm { cvt.rn.bf16.f32 %rs1005, %f2024;} // end inline asm st.shared.f32 [%rd11], %f2022; bar.sync 0; @%p287 bra $L__BB0_233; ld.shared.f32 %f1914, [%rd12]; ld.shared.f32 %f1915, [%rd11]; add.f32 %f1916, %f1914, %f1915; st.shared.f32 [%rd11], %f1916; $L__BB0_233: bar.sync 0; @%p288 bra $L__BB0_238; mov.u32 %r2621, %r827; $L__BB0_235: setp.ge.u32 %p295, %r8, %r2621; @%p295 bra $L__BB0_237; mad.lo.s32 %r2068, %r2621, %r6, %r825; mul.wide.s32 %rd292, %r2068, 4; add.s64 %rd294, %rd284, %rd292; ld.shared.f32 %f1917, [%rd11]; ld.shared.f32 %f1918, [%rd294]; add.f32 %f1919, %f1918, %f1917; st.shared.f32 [%rd11], %f1919; $L__BB0_237: bar.sync 0; shr.u32 %r848, %r2621, 1; setp.gt.u32 %p296, %r2621, 3; mov.u32 %r2621, %r848; @%p296 bra $L__BB0_235; $L__BB0_238: mov.f32 %f2025, 0f00000000; @%p291 bra $L__BB0_241; setp.lt.u32 %p298, %r7, 2; ld.shared.f32 %f1921, [%rd11]; add.f32 %f2025, %f1921, 0f00000000; @%p298 bra $L__BB0_241; ld.shared.f32 %f1922, [%rd13]; add.f32 %f2025, %f2025, %f1922; $L__BB0_241: bar.sync 0; // begin inline asm { cvt.rn.bf16.f32 %rs1006, %f2025;} // end inline asm st.shared.f32 [%rd11], %f2021; bar.sync 0; @%p287 bra $L__BB0_243; ld.shared.f32 %f1924, [%rd12]; ld.shared.f32 %f1925, [%rd11]; add.f32 %f1926, %f1924, %f1925; st.shared.f32 [%rd11], %f1926; $L__BB0_243: bar.sync 0; @%p288 bra $L__BB0_248; mov.u32 %r2622, %r827; $L__BB0_245: setp.ge.u32 %p301, %r8, %r2622; @%p301 bra $L__BB0_247; mad.lo.s32 %r2070, %r2622, %r6, %r825; mul.wide.s32 %rd295, %r2070, 4; add.s64 %rd297, %rd284, %rd295; ld.shared.f32 %f1927, [%rd11]; ld.shared.f32 %f1928, [%rd297]; add.f32 %f1929, %f1928, %f1927; st.shared.f32 [%rd11], %f1929; $L__BB0_247: bar.sync 0; shr.u32 %r850, %r2622, 1; setp.gt.u32 %p302, %r2622, 3; mov.u32 %r2622, %r850; @%p302 bra $L__BB0_245; $L__BB0_248: mov.f32 %f2026, 0f00000000; @%p291 bra $L__BB0_251; setp.lt.u32 %p304, %r7, 2; ld.shared.f32 %f1931, [%rd11]; add.f32 %f2026, %f1931, 0f00000000; @%p304 bra $L__BB0_251; ld.shared.f32 %f1932, [%rd13]; add.f32 %f2026, %f2026, %f1932; $L__BB0_251: bar.sync 0; // begin inline asm { cvt.rn.bf16.f32 %rs1007, %f2026;} // end inline asm st.shared.f32 [%rd11], %f2020; bar.sync 0; @%p287 bra $L__BB0_253; ld.shared.f32 %f1934, [%rd12]; ld.shared.f32 %f1935, [%rd11]; add.f32 %f1936, %f1934, %f1935; st.shared.f32 [%rd11], %f1936; $L__BB0_253: bar.sync 0; @%p288 bra $L__BB0_258; mov.u32 %r2623, %r827; $L__BB0_255: setp.ge.u32 %p307, %r8, %r2623; @%p307 bra $L__BB0_257; mad.lo.s32 %r2072, %r2623, %r6, %r825; mul.wide.s32 %rd298, %r2072, 4; add.s64 %rd300, %rd284, %rd298; ld.shared.f32 %f1937, [%rd11]; ld.shared.f32 %f1938, [%rd300]; add.f32 %f1939, %f1938, %f1937; st.shared.f32 [%rd11], %f1939; $L__BB0_257: bar.sync 0; shr.u32 %r852, %r2623, 1; setp.gt.u32 %p308, %r2623, 3; mov.u32 %r2623, %r852; @%p308 bra $L__BB0_255; $L__BB0_258: mov.f32 %f2027, 0f00000000; @%p291 bra $L__BB0_261; setp.lt.u32 %p310, %r7, 2; ld.shared.f32 %f1941, [%rd11]; add.f32 %f2027, %f1941, 0f00000000; @%p310 bra $L__BB0_261; ld.shared.f32 %f1942, [%rd13]; add.f32 %f2027, %f2027, %f1942; $L__BB0_261: bar.sync 0; // begin inline asm { cvt.rn.bf16.f32 %rs1008, %f2027;} // end inline asm setp.ge.s32 %p312, %r833, %r920; or.pred %p313, %p291, %p312; @%p313 bra $L__BB0_263; ld.param.u64 %rd321, [_ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_37050442nvfuser_inner_outer_persistent_f0_c1_r0_g0ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IS1_Li1ELi1EEES2_S5_S5_S4_S4_NS0_IxLi1ELi1EEE_param_6]; mov.b32 %r2074, {%rs1007, %rs1008}; mad.lo.s32 %r2080, %r2039, %r1585, %r2043; add.s32 %r2081, %r832, %r2080; mul.wide.s32 %rd302, %r2081, 2; add.s64 %rd301, %rd321, %rd302; mov.b32 %r2073, {%rs1005, %rs1006}; // begin inline asm st.global.cs.v2.s32 [%rd301], {%r2073,%r2074}; // end inline asm $L__BB0_263: add.s32 %r2614, %r2614, 1; setp.lt.s32 %p314, %r2614, %r12; @%p314 bra $L__BB0_215; mad.lo.s32 %r854, %r920, %r8, %r2043; shl.b32 %r855, %r1585, 2; shl.b32 %r856, %r955, 2; mov.u32 %r2624, 0; $L__BB0_265: .pragma "nounroll"; mul.lo.s32 %r858, %r2624, %r828; add.s32 %r859, %r858, %r830; mov.f32 %f2032, 0f00000000; mov.f32 %f2033, %f2032; mov.f32 %f2034, %f2032; mov.f32 %f2035, %f2032; @%p283 bra $L__BB0_271; mad.lo.s32 %r2088, %r856, %r2624, %r855; mad.lo.s32 %r2625, %r6, %r2088, %r854; mul.lo.s32 %r861, %r920, %r7; mov.u32 %r2087, 0; mov.f32 %f2032, 0f00000000; mov.u32 %r2626, %r8; mov.u32 %r2627, %r2087; $L__BB0_267: .pragma "nounroll"; setp.ge.s32 %p316, %r859, %r920; mov.u32 %r2628, %r2087; mov.u32 %r2629, %r2087; mov.u32 %r2630, %r2087; mov.u32 %r2631, %r2087; @%p316 bra $L__BB0_270; setp.ge.s32 %p317, %r2626, %r955; mov.u32 %r2628, %r2087; mov.u32 %r2629, %r2087; mov.u32 %r2630, %r2087; mov.u32 %r2631, %r2087; @%p317 bra $L__BB0_270; mul.wide.s32 %rd304, %r2625, 4; add.s64 %rd303, %rd25, %rd304; // begin inline asm ld.volatile.global.v4.s32 {%r2631,%r2630,%r2629,%r2628}, [%rd303]; // end inline asm $L__BB0_270: mov.b32 %f1952, %r2631; add.f32 %f2035, %f2035, %f1952; mov.b32 %f1953, %r2630; add.f32 %f2034, %f2034, %f1953; mov.b32 %f1954, %r2629; add.f32 %f2033, %f2033, %f1954; mov.b32 %f1955, %r2628; add.f32 %f2032, %f2032, %f1955; add.s32 %r2626, %r2626, %r7; add.s32 %r2625, %r2625, %r861; add.s32 %r2627, %r2627, 1; setp.lt.s32 %p318, %r2627, %r13; @%p318 bra $L__BB0_267; $L__BB0_271: st.shared.f32 [%rd11], %f2035; bar.sync 0; @%p287 bra $L__BB0_273; ld.shared.f32 %f1956, [%rd12]; ld.shared.f32 %f1957, [%rd11]; add.f32 %f1958, %f1956, %f1957; st.shared.f32 [%rd11], %f1958; $L__BB0_273: bar.sync 0; @%p288 bra $L__BB0_278; mov.u32 %r2632, %r827; $L__BB0_275: setp.ge.u32 %p321, %r8, %r2632; @%p321 bra $L__BB0_277; mad.lo.s32 %r2104, %r2632, %r6, %r825; mul.wide.s32 %rd305, %r2104, 4; add.s64 %rd307, %rd284, %rd305; ld.shared.f32 %f1959, [%rd11]; ld.shared.f32 %f1960, [%rd307]; add.f32 %f1961, %f1960, %f1959; st.shared.f32 [%rd11], %f1961; $L__BB0_277: bar.sync 0; shr.u32 %r877, %r2632, 1; setp.gt.u32 %p322, %r2632, 3; mov.u32 %r2632, %r877; @%p322 bra $L__BB0_275; $L__BB0_278: mov.f32 %f2036, 0f00000000; @%p291 bra $L__BB0_281; setp.lt.u32 %p324, %r7, 2; ld.shared.f32 %f1963, [%rd11]; add.f32 %f2036, %f1963, 0f00000000; @%p324 bra $L__BB0_281; ld.shared.f32 %f1964, [%rd13]; add.f32 %f2036, %f2036, %f1964; $L__BB0_281: bar.sync 0; // begin inline asm { cvt.rn.bf16.f32 %rs1009, %f2036;} // end inline asm st.shared.f32 [%rd11], %f2034; bar.sync 0; @%p287 bra $L__BB0_283; ld.shared.f32 %f1966, [%rd12]; ld.shared.f32 %f1967, [%rd11]; add.f32 %f1968, %f1966, %f1967; st.shared.f32 [%rd11], %f1968; $L__BB0_283: bar.sync 0; @%p288 bra $L__BB0_288; mov.u32 %r2633, %r827; $L__BB0_285: setp.ge.u32 %p327, %r8, %r2633; @%p327 bra $L__BB0_287; mad.lo.s32 %r2106, %r2633, %r6, %r825; mul.wide.s32 %rd308, %r2106, 4; add.s64 %rd310, %rd284, %rd308; ld.shared.f32 %f1969, [%rd11]; ld.shared.f32 %f1970, [%rd310]; add.f32 %f1971, %f1970, %f1969; st.shared.f32 [%rd11], %f1971; $L__BB0_287: bar.sync 0; shr.u32 %r879, %r2633, 1; setp.gt.u32 %p328, %r2633, 3; mov.u32 %r2633, %r879; @%p328 bra $L__BB0_285; $L__BB0_288: mov.f32 %f2037, 0f00000000; @%p291 bra $L__BB0_291; setp.lt.u32 %p330, %r7, 2; ld.shared.f32 %f1973, [%rd11]; add.f32 %f2037, %f1973, 0f00000000; @%p330 bra $L__BB0_291; ld.shared.f32 %f1974, [%rd13]; add.f32 %f2037, %f2037, %f1974; $L__BB0_291: bar.sync 0; // begin inline asm { cvt.rn.bf16.f32 %rs1010, %f2037;} // end inline asm st.shared.f32 [%rd11], %f2033; bar.sync 0; @%p287 bra $L__BB0_293; ld.shared.f32 %f1976, [%rd12]; ld.shared.f32 %f1977, [%rd11]; add.f32 %f1978, %f1976, %f1977; st.shared.f32 [%rd11], %f1978; $L__BB0_293: bar.sync 0; @%p288 bra $L__BB0_298; mov.u32 %r2634, %r827; $L__BB0_295: setp.ge.u32 %p333, %r8, %r2634; @%p333 bra $L__BB0_297; mad.lo.s32 %r2108, %r2634, %r6, %r825; mul.wide.s32 %rd311, %r2108, 4; add.s64 %rd313, %rd284, %rd311; ld.shared.f32 %f1979, [%rd11]; ld.shared.f32 %f1980, [%rd313]; add.f32 %f1981, %f1980, %f1979; st.shared.f32 [%rd11], %f1981; $L__BB0_297: bar.sync 0; shr.u32 %r881, %r2634, 1; setp.gt.u32 %p334, %r2634, 3; mov.u32 %r2634, %r881; @%p334 bra $L__BB0_295; $L__BB0_298: mov.f32 %f2038, 0f00000000; @%p291 bra $L__BB0_301; setp.lt.u32 %p336, %r7, 2; ld.shared.f32 %f1983, [%rd11]; add.f32 %f2038, %f1983, 0f00000000; @%p336 bra $L__BB0_301; ld.shared.f32 %f1984, [%rd13]; add.f32 %f2038, %f2038, %f1984; $L__BB0_301: bar.sync 0; // begin inline asm { cvt.rn.bf16.f32 %rs1011, %f2038;} // end inline asm st.shared.f32 [%rd11], %f2032; bar.sync 0; @%p287 bra $L__BB0_303; ld.shared.f32 %f1986, [%rd12]; ld.shared.f32 %f1987, [%rd11]; add.f32 %f1988, %f1986, %f1987; st.shared.f32 [%rd11], %f1988; $L__BB0_303: bar.sync 0; @%p288 bra $L__BB0_308; mov.u32 %r2635, %r827; $L__BB0_305: setp.ge.u32 %p339, %r8, %r2635; @%p339 bra $L__BB0_307; mad.lo.s32 %r2110, %r2635, %r6, %r825; mul.wide.s32 %rd314, %r2110, 4; add.s64 %rd316, %rd284, %rd314; ld.shared.f32 %f1989, [%rd11]; ld.shared.f32 %f1990, [%rd316]; add.f32 %f1991, %f1990, %f1989; st.shared.f32 [%rd11], %f1991; $L__BB0_307: bar.sync 0; shr.u32 %r883, %r2635, 1; setp.gt.u32 %p340, %r2635, 3; mov.u32 %r2635, %r883; @%p340 bra $L__BB0_305; $L__BB0_308: mov.f32 %f2039, 0f00000000; @%p291 bra $L__BB0_311; setp.lt.u32 %p342, %r7, 2; ld.shared.f32 %f1993, [%rd11]; add.f32 %f2039, %f1993, 0f00000000; @%p342 bra $L__BB0_311; ld.shared.f32 %f1994, [%rd13]; add.f32 %f2039, %f2039, %f1994; $L__BB0_311: bar.sync 0; // begin inline asm { cvt.rn.bf16.f32 %rs1012, %f2039;} // end inline asm setp.ge.s32 %p344, %r859, %r920; or.pred %p345, %p291, %p344; @%p345 bra $L__BB0_313; ld.param.u64 %rd320, [_ZN90_GLOBAL__N__00000000_50___tmp_kernel_inner_outer_persistent_f0_c1_r0_g0_cu_44e693ad_37050442nvfuser_inner_outer_persistent_f0_c1_r0_g0ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IS1_Li1ELi1EEES2_S5_S5_S4_S4_NS0_IxLi1ELi1EEE_param_7]; mov.b32 %r2112, {%rs1011, %rs1012}; mad.lo.s32 %r2118, %r2039, %r1585, %r2043; add.s32 %r2119, %r858, %r2118; mul.wide.s32 %rd318, %r2119, 2; add.s64 %rd317, %rd320, %rd318; mov.b32 %r2111, {%rs1009, %rs1010}; // begin inline asm st.global.cs.v2.s32 [%rd317], {%r2111,%r2112}; // end inline asm $L__BB0_313: add.s32 %r2624, %r2624, 1; setp.lt.s32 %p346, %r2624, %r12; @%p346 bra $L__BB0_265; $L__BB0_314: ret; }