**Dependency Analysis Using LLVM**

**Output from Disassembler**

```asm
20: mov (%edi,%eax,4),%edx
23: add (%esi,%eax,4),%edx
26: mov %edx,(%ebx,%eax,4)
29: add $0x1,%eax
2c: cmp %ecx,%eax
2e: jne 20
```

Preprocessing + profile info:

```c
//From runtime profile
#define edi_prof 100000
#define eax_prof 0
#define esi_prof 200000
#define ebx_prof 300000
#define ecx_prof 1000

void loop(int* mem)
{
    int edi = edi_prof;
    int eax = eax_prof;
    int esi = esi_prof;
    int ecx = ecx_prof;
    int ebx = ebx_prof;
    int edx;
bb20:
    edx = *(mem + (edi + eax*4)/sizeof(int));
    edx += *(mem + (esi + eax*4)/sizeof(int));
    *(mem + (ebx + eax*4)/sizeof(int)) = edx;
    eax = eax+1;
    if(eax != ecx)
        goto bb20;
}
```

clang → LLVM IR:

```llvm
  br label %BB1
BB1:
  %eax.0 = phi i32 [ 0, %BB ], [ %13, %BB1 ]
  %0 = shl nsw i32 %eax.0, 2
  %1 = add nuw nsw i32 %0, 100000
  %2 = lshr exact i32 %1, 2
  %3 = getelementptr inbounds i32* %mem, i32 %2
  %4 = load i32* %3, align 4, !tbaa !1
  %5 = add nuw nsw i32 %0, 200000
  %6 = lshr exact i32 %5, 2
  %7 = getelementptr inbounds i32* %mem, i32 %6
  %8 = load i32* %7, align 4, !tbaa !1
  %9 = add nsw i32 %8, %4
  %10 = add nuw nsw i32 %0, 300000
  %11 = lshr exact i32 %10, 2
  %12 = getelementptr inbounds i32* %mem, i32 %11
  store i32 %9, i32* %12, align 4, !tbaa !1
  %13 = add nuw nsw i32 %eax.0, 1
  %14 = icmp eq i32 %13, 1000
  br i1 %14, label %BB2, label %BB1
BB2:
  ret void
```

Banerjee's Method: no alias between the two loads (Ld1, Ld2) and the store.

Separate memory ports for non-aliasing accesses, facilitating datapath synthesis; parallelizable loop:

```c
void loop(int* restrict memLd1, int* restrict memLd2, int* restrict memSt,
          int edi, int eax, int esi, int ecx, int ebx)
{
    int edx;
bb20:
    edx = *(memLd1 + (edi + eax*4)/sizeof(int));
    edx += *(memLd2 + (esi + eax*4)/sizeof(int));
    *(memSt + (ebx + eax*4)/sizeof(int)) = edx;
    eax = eax+1;
    if(eax != ecx) goto bb20;
}
```

Input to Accelerator Synthesis