-
Notifications
You must be signed in to change notification settings - Fork 0
/
at_policy.c
1068 lines (895 loc) · 31.3 KB
/
at_policy.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <fcntl.h> /* For O_* constants */
#include <sys/stat.h> /* For mode constants */
#include <semaphore.h>
#include <sys/types.h>
#include <unistd.h>
#include <string.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <errno.h>
#include <time.h>
#include <sys/syscall.h>
#include "all.h"
#include "autotuner.h"
#include "at_static_linked_list.h"
#include "autotuner_perfm.h"
#include "at_wrap.h"
#include "at_cpu_util.h"
//autotuner handle for the whole application (defined elsewhere)
extern at_handles athandle;
//number of processors; hard-coded to 4 here
int cpu_cnt = 4;
//predefined cpu allocation plans, these should be dynamically generated in the future
//each entry is a cpu affinity bit mask that ends up in a task's cpus[] array
//one-task table: row = plan, column = thread position (see run_as_plan_one_task)
//NOTE(review): the table has 3 rows but one_tasks_cpu_plan_cnt is 2, and at_sampling_one_task
//only samples rows 0 and 1 -- confirm whether the third row is meant to be used
int one_task_cpu_plans[3][4] = {{0x3,0x3,0xc,0xc},{0x5,0x5,0xa,0xa},{0x1,0x2,0x4,0x8}};
int one_tasks_cpu_plan_cnt = 2;
//two-task table: row = plan, column = position of the task in the task list
int two_tasks_cpu_plans[2][2] = {{0x5,0xa},{0x3,0xc}};
int two_tasks_cpu_plan_cnt = 2;
//perfmon counter results, indexed by thread table slot
//at_collect_perfm_results computes IPC as counterValue3 / counterValue1, so presumably
//counterValue3 holds retired instructions and counterValue1 cycles -- TODO confirm
extern long long counterValue1[64];
extern long long counterValue2[64];
extern long long counterValue3[64];
//do we want to enable the perfmon profiling module, or just use load balancing
extern int AT_ENABLE_PROFILING;
//do we want to enable the whole perfmon policy
extern int AT_ENABLED;
//forward declarations
//pin all tasks' threads from the master
int at_pin_all_threads(at_handles * ath);
int at_run_os_sched_mode(at_handles * ath);
//send signal `sig` to one particular thread, identified by its thread id `tid`
//thin wrapper around the raw SYS_tkill system call; the syscall() result is returned as int
int tkill(int tid, int sig)
{
    const long rc = syscall(SYS_tkill, tid, sig);
    return (int)rc;
}
//send the perfmon start signal to every active thread of this task
//walks the task's thread list from thread_table_head until the -1 terminator
//always returns 0; the result of tkill() is not checked
int at_start_perfm(at_handles * ath)
{
    shared_mem * shm = ath->shmem;
    task_list_item * task = &shm->tl[ath->index];
    int slot;

    for (slot = task->thread_table_head; slot != -1; slot = task->threads_table[slot].next)
    {
        //inactive threads are not signalled
        if (!task->threads_table[slot].active)
            continue;

        STRATA_LOG("autotunerd", "Task %d: sending perfmon start signal to thread %d\n", ath->index, task->threads_table[slot].tid);
        tkill(task->threads_table[slot].tid, PERFMON_START_SIGNAL);
    }
    return 0;
}
//send the perfmon stop signal to every active thread of this task
//walks the task's thread list from thread_table_head until the -1 terminator
//always returns 0; the result of tkill() is not checked
int at_stop_perfm(at_handles * ath)
{
    shared_mem * shm = ath->shmem;
    task_list_item * task = &shm->tl[ath->index];
    int slot;

    for (slot = task->thread_table_head; slot != -1; slot = task->threads_table[slot].next)
    {
        //inactive threads are not signalled
        if (!task->threads_table[slot].active)
            continue;

        STRATA_LOG("autotunerd", "Task %d: sending perfmon stop signal to thread %d\n", ath->index, task->threads_table[slot].tid);
        tkill(task->threads_table[slot].tid, PERFMON_STOP_SIGNAL);
    }
    return 0;
}
//run one sampling window on this task's threads and collect the results
//sequence: signal start, sleep 1 second, signal stop, wait 0.5 seconds, then read the counters
//avg_ipc receives the mean of counterValue3/counterValue1 over the active threads whose
//counterValue1 is non-zero, or 0 when no thread qualified
//the summed counterValue3 of those threads is stored in the task's insn_retired field
//always returns 0
int at_collect_perfm_results(at_handles * ath, double * avg_ipc)
{
    shared_mem * shm = ath->shmem;
    task_list_item * task = &shm->tl[ath->index];

    //tell all threads to start sampling, let them run for 1 second, then tell them to stop
    at_start_perfm(ath);
    sleep(1);
    at_stop_perfm(ath);

    //give the threads 0.5 seconds to respond before reading their counters
    //NOTE: this is not precise, sometimes not all results are in yet; but it has little overhead
    usleep(500000);

    double ipc_sum = 0;
    long long insn_sum = 0;
    int sampled = 0;
    int slot;
    for (slot = task->thread_table_head; slot != -1; slot = task->threads_table[slot].next)
    {
        //skip inactive threads and threads whose divisor counter is still zero
        if (!task->threads_table[slot].active || counterValue1[slot] == 0)
            continue;

        double thread_ipc = ((double)counterValue3[slot]) / ((double)counterValue1[slot]);
        insn_sum += counterValue3[slot];
        ipc_sum = ipc_sum + thread_ipc;
        STRATA_LOG("autotunerd", "Computing IPC from %lld / %lld = %f, accmulated IPC %f\n", counterValue3[slot], counterValue1[slot], thread_ipc, ipc_sum);
        sampled++;
    }

    *avg_ipc = (sampled != 0) ? ipc_sum / sampled : 0;
    task->insn_retired = insn_sum;
    STRATA_LOG("autotunerd", "Returning IPC %f\n", *avg_ipc);
    return 0;
}
//sample the first two single-task cpu plans and pick the one with the higher average IPC
//returns -2 without sampling unless the task has exactly 2 active threads: with any other
//thread count, using all available cores is considered the only sensible schedule
//otherwise returns 0 or 1, the index of the better plan (plan 0 wins ties)
int at_sampling_one_task(at_handles * ath)
{
    shared_mem * shm = ath->shmem;
    task_list_item * task = &shm->tl[ath->index];
    double ipc_plan1, ipc_plan2;
    int c;

    STRATA_LOG("autotunerd", "There is only one task for sampling\n");
    if (task->active_thread_count != 2)
        return -2;

    //first plan: install its four masks, pin, measure
    STRATA_LOG("autotunerd", "Sampling Plan 1\n");
    for (c = 0; c < 4; c++)
        task->cpus[c] = one_task_cpu_plans[0][c];
    pin_threads_as_plan(ath);
    at_collect_perfm_results(ath, &ipc_plan1);
    STRATA_LOG("autotunerd", "Sampled Plan 1, with avg ipc %f\n", ipc_plan1);

    //second plan: same procedure
    STRATA_LOG("autotunerd", "Sampling Plan 2\n");
    for (c = 0; c < 4; c++)
        task->cpus[c] = one_task_cpu_plans[1][c];
    pin_threads_as_plan(ath);
    at_collect_perfm_results(ath, &ipc_plan2);
    STRATA_LOG("autotunerd", "Sampled Plan 2, with avg ipc %f\n", ipc_plan2);

    int winner = (ipc_plan1 >= ipc_plan2) ? 0 : 1;
    STRATA_LOG("autotunerd", "New Best Plan %d for Single Task\n", winner);
    return winner;
}
//master side: sample the two predefined two-task cpu plans and report the best one
//for every plan, each task in the list gets its cpu masks written into shared memory, every
//task other than this one is sent AT_MSG_TYPE_TEST, the master samples itself and then waits
//for one AT_MSG_TYPE_DATA_READY reply per other task
//a plan's score is the mean of the head and tail tasks' avg_ipc (or their summed
//insn_retired when use_insn_retired is set, which is currently never the case)
//when the two tasks' active thread counts differ by more than 1, or cpu usage priority mode
//is on, plan 0 is not written: the masks already in place are sampled as the baseline
//returns the best plan index (0 or 1)
//returns -1 if a quit message arrived while waiting for replies
//returns -3 if a thread count change arrived, or if the untouched baseline scored best
int at_sampling_two_tasks(at_handles * ath)
{
    shared_mem * m = ath->shmem;
    task_list_item * t = (m->tl) + ath->index;
    //only sample plan 0 when the two tasks have nearly the same number of active threads,
    //i.e. |threadcountA - threadcountB| <= 1; otherwise we test whether isolation is better
    //than load balancing or cpu usage priority mode
    int ignore_first_plan = 0;
    int use_insn_retired = 0;
    if( (abs(m->tl[m->head].active_thread_count - m->tl[m->tail].active_thread_count) > 1) ||
        m->cpu_usage_priority_mode )
    {
        ignore_first_plan = 1;
        //TODO:this is just for testing
        //use_insn_retired = 1;
    }
    STRATA_LOG("autotunerd", "Sampling 2 tasks:CPU Usage Mode %d, Ignore First Plan %d, Use_insn_retired %d\n", m->cpu_usage_priority_mode, ignore_first_plan, use_insn_retired);

    int i;
    int j,k;
    int cur_plan_index = 0;
    double best_value = 0;
    long long best_value_ir = 0;
    int best_plan_index = 0;
    task_list_item * tmp;

    //2 == number of rows in two_tasks_cpu_plans
    while(cur_plan_index < 2)
    {
        STRATA_LOG("autotunerd", "Sampling 2 tasks with plan %d\n", cur_plan_index);
        i = m->head;
        k = 0; //task position in the plans array
        while(i != -1)
        {
            tmp = (m->tl) + i;
            if( !(cur_plan_index == 0 && ignore_first_plan) )
            {
                for(j = 0; j < tmp->active_thread_count; j++)
                    tmp->cpus[j] = two_tasks_cpu_plans[cur_plan_index][k];
                STRATA_LOG("autotunerd", "Setting up sampling plan for task %d with plan %d,%d,%d,%d\n", i, m->tl[i].cpus[0], m->tl[i].cpus[1], m->tl[i].cpus[2], m->tl[i].cpus[3]);
            }
            STRATA_LOG("autotunerd", "Finished setting up sampling plan for task %d\n", i);
            //send message out if they are not this task (aka master)
            if( i != ath->index)
            {
                STRATA_LOG("autotunerd", "Sending message TYPE_TEST to task %d\n", i);
                at_message msg;
                msg.sender_id = ath->index;
                msg.type = AT_MSG_TYPE_TEST;
                msg.value = 0;
                at_send_message(m, &msg, i);
            }
            i = tmp->next;
            k++;
        }

        //do self sampling
        {
            double avg_ipc = 0;
            pin_threads_as_plan(ath);
            at_collect_perfm_results(ath, &avg_ipc);
            t->avg_ipc = avg_ipc;
            STRATA_LOG("autotunerd", "Master self sampling, result is %f\n", avg_ipc);
        }

        //wait for one DATA_READY reply from every other task; i counts the replies
        at_message recv_msg;
        int type;
        i = 0;
        while(i < (m->task_cnt - 1))
        {
            STRATA_LOG("autotunerd", "Master waiting for sampling replying\n");
            sem_wait((sem_t*)ath->comm_sem);
            //process the message, it could be any kind of message
            type = at_receive_message(ath, &recv_msg);
            if(type == AT_MSG_TYPE_QUIT)
            {
                //the whole application is going to quit
                STRATA_LOG("autotuner", "Got quit message while master waiting for test result\n");
                return -1;
            }
            else if(type == AT_MSG_TYPE_DATA_READY)
            {
                STRATA_LOG("autotunerd", "Master got one sampling reply from %d\n", recv_msg.sender_id);
                i++;
            }
            else if(type == AT_MSG_TYPE_THREAD_COUNT_CHANGED)
            {
                //the format has a %d, so the master index must be passed (it was missing before)
                STRATA_LOG("autotunerd", "Master %d: Waiting for sampling results but got thread count change message, stop sampling and go for load balancing and cpu usage test\n", ath->index);
                return -3;
            }
            //all other messages are ignored... this is not so good, we need to fix it
        }

        if( !use_insn_retired )
        {
            STRATA_LOG("autotunerd", "Master got all sampling replies with ipc %f and ipc %f\n", m->tl[m->head].avg_ipc, m->tl[m->tail].avg_ipc);
            double avg_ipc = (m->tl[m->head].avg_ipc + m->tl[m->tail].avg_ipc)/2;
            STRATA_LOG("autotunerd", "Master got new avg ipc %f\n", avg_ipc);
            if( avg_ipc > best_value )
            {
                best_value = avg_ipc;
                best_plan_index = cur_plan_index;
                STRATA_LOG("autotunerd", "New best plan is %d\n", best_plan_index);
            }
        }
        else
        {
            STRATA_LOG("autotunerd", "Master got all sampling replies with insn retired %lld and %lld\n", m->tl[m->head].insn_retired, m->tl[m->tail].insn_retired);
            long long total_insn = m->tl[m->head].insn_retired + m->tl[m->tail].insn_retired;
            STRATA_LOG("autotunerd", "Master got new total insn retired %lld\n", total_insn);
            if( total_insn > best_value_ir )
            {
                best_value_ir = total_insn;
                best_plan_index = cur_plan_index;
                STRATA_LOG("autotunerd", "New best plan is %d\n", best_plan_index);
            }
        }
        //test next plan
        cur_plan_index++;
    }

    //plan 0 was only the baseline in this mode, so a win for it means "keep load balancing"
    if(best_plan_index == 0 && ignore_first_plan)
        best_plan_index = -3;
    return best_plan_index;
}
//entry point for sampling; dispatches on the number of registered tasks
//the caller is expected to hold the shared memory lock, and nobody may quit during sampling
//return value is whatever the one-task / two-task sampler returns:
//  >= 0 : index of the best plan
//  -1   : this master task is going to quit
//  -2   : there is no plan for the currently running tasks
//  -3   : use load balancing mode or cpu usage priority mode
//any task count other than 1 or 2 has no implementation yet and yields -2
int at_start_sampling(at_handles * ath)
{
    shared_mem * shm = ath->shmem;

    switch (shm->task_cnt)
    {
    case 1:
        return at_sampling_one_task(ath);
    case 2:
        return at_sampling_two_tasks(ath);
    default:
        return -2;
    }
}
//pin the active threads of this task according to the task's cpus[] plan
//the n-th active thread in the thread list gets affinity mask cpus[n]; inactive threads are
//skipped and do not consume a cpus[] entry
//a failing sched_setaffinity is logged and otherwise ignored; always returns 0
int pin_threads_as_plan(at_handles * ath)
{
    shared_mem * m = ath->shmem;
    task_list_item * t = (m->tl) + ath->index;
    int j;
    int k = 0; //index into t->cpus, advances only for active threads
    int iRet = 0;

    for (j = t->thread_table_head; j != -1; j = t->threads_table[j].next)
    {
        if (!t->threads_table[j].active) //only pin active threads
            continue;

        //translate the integer bit mask into a cpu_set_t with the CPU_* macros instead of
        //writing through an int pointer into the structure: that cast violates strict
        //aliasing and only lands on the right bits on little-endian machines
        cpu_set_t cpus;
        unsigned int mask = (unsigned int)t->cpus[k];
        unsigned int bit;
        CPU_ZERO(&cpus);
        for (bit = 0; bit < sizeof(mask) * 8; bit++)
        {
            if (mask & (1u << bit))
                CPU_SET(bit, &cpus);
        }

        STRATA_LOG("autotunerd", "Task %d: Self assigning thread %d to core map 0x%x\n", ath->index, t->threads_table[j].tid, t->cpus[k]);
        iRet = sched_setaffinity(t->threads_table[j].tid, sizeof(cpu_set_t), &cpus);
        if (iRet != 0)
        {
            STRATA_LOG("autotuner", "Failed set affinity of thread %d to core 0x%x\n", t->threads_table[j].tid, t->cpus[k]);
        }
        k++;
    }
    return 0;
}
//apply single-task plan `plan_no` to this task and pin its threads accordingly
//the plan row has 4 masks; they are reused cyclically when there are more active threads
//plan_no is not range checked here; always returns 0
int run_as_plan_one_task(at_handles * ath, int plan_no)
{
    shared_mem * shm = ath->shmem;
    task_list_item * task = &shm->tl[ath->index];
    int th = 0;

    //fill in one mask per active thread
    while (th < task->active_thread_count)
    {
        task->cpus[th] = one_task_cpu_plans[plan_no][th % 4];
        th++;
    }

    //pin threads following this plan
    pin_threads_as_plan(ath);
    return 0;
}
//apply two-task plan `plan_no` to every task in the task list
//the task at list position p gets mask two_tasks_cpu_plans[plan_no][p] for each of its active
//threads; every task other than this one is then told to pin itself with
//AT_MSG_TYPE_RUN_AS_PLAN, and finally this task's own threads are pinned
//always returns 0
int run_as_plan_two_tasks(at_handles * ath, int plan_no)
{
    shared_mem * shm = ath->shmem;
    int task_id;
    int pos; //position of the task in the list == column in the plan table

    for (task_id = shm->head, pos = 0; task_id != -1; task_id = shm->tl[task_id].next, pos++)
    {
        task_list_item * cur = &shm->tl[task_id];
        int th;

        for (th = 0; th < cur->active_thread_count; th++)
            cur->cpus[th] = two_tasks_cpu_plans[plan_no][pos];
        STRATA_LOG("autotunerd", "Setting up sampling plan for task %d with plan %d,%d,%d,%d\n", task_id, shm->tl[task_id].cpus[0], shm->tl[task_id].cpus[1], shm->tl[task_id].cpus[2], shm->tl[task_id].cpus[3]);

        //this task (the master) pins itself below, everybody else gets a message
        if (task_id == ath->index)
            continue;

        STRATA_LOG("autotunerd", "Sending message TYPE_RUN_AS_PLAN to task %d\n", task_id);
        at_message msg;
        msg.type = AT_MSG_TYPE_RUN_AS_PLAN;
        msg.sender_id = ath->index;
        msg.value = 0;
        at_send_message(shm, &msg, task_id);
    }

    //pin master's threads as plan
    pin_threads_as_plan(ath);
    return 0;
}
//make the running tasks follow plan `plan_no`
//only implemented for one or two registered tasks; any other task count is a no-op
//always returns 0
int run_as_plan(at_handles * ath, int plan_no)
{
    shared_mem * shm = ath->shmem;

    switch (shm->task_cnt)
    {
    case 1:
        run_as_plan_one_task(ath, plan_no);
        break;
    case 2:
        run_as_plan_two_tasks(ath, plan_no);
        break;
    default:
        break;
    }
    return 0;
}
//master reaction to a slave task quitting
//logs msg->value as the quitting task, then recomputes the load balance and re-runs the
//cpu usage improvement step; always returns 0
int at_msg_slave_quit_mst(at_handles * ath, at_message * msg)
{
    STRATA_LOG("autotuner", "Slave task %d is quitting\n", msg->value);

    //rebalance the system without the quitting slave
    at_load_balance_2L2(ath);
    //re-detect processor usage
    at_improve_cpu_usage(ath);
    return 0;
}
//presumably the master-side handler for a quit message (inferred from the name)
//currently a stub: both arguments are ignored and 0 is returned
int at_msg_quit_mst(at_handles * ath, at_message * msg)
{
    (void)ath;
    (void)msg;
    return 0;
}
//master reaction to a timer time-out: run a new sampling round and act on its result
//nothing happens while OS scheduling mode is on
//otherwise the shared memory semaphore is held and slave_not_allowed_to_quit is raised for the
//whole round, so no task can join or quit while sampling is in progress
//sampling result handling:
//  -1   : quit was requested; unregister this task and set *quit to 1
//  -3   : fall back to load balancing followed by the cpu usage improvement step
//  >= 0 : a best plan was found; leave cpu usage priority mode and run that plan
//  other: nothing to apply
//always returns 0
int at_msg_timeout_mst(at_handles * ath, int * quit)
{
    STRATA_LOG("autotunerd", "Master %d wake up due to time out\n", ath->pid);
    shared_mem * shm = ath->shmem;

    //in OS scheduling mode there is no need to sample
    if (shm->os_sched_mode)
        return 0;

    //lock the shared memory so no task can join and no task can quit
    sem_wait((sem_t*)ath->shmem_sem);
    shm->slave_not_allowed_to_quit = 1;

    const int plan = at_start_sampling(ath);

    if (plan == -1)
    {
        //got a quit message while sampling
        STRATA_LOG("autotunerd", "Master %d got quit message will sampling, now going to quit\n", ath->pid);
        shm->slave_not_allowed_to_quit = 0;
        sem_post((sem_t*)ath->shmem_sem); //release this lock so other tasks can quit as well
        task_unregister(ath, 1);
        *quit = 1;
        return 0;
    }

    if (plan == -3)
    {
        //load balancing first, then cpu usage priority mode
        STRATA_LOG("autotunerd", "Master %d: We are going to follow load balancing and cpu usage priority mode %d\n", ath->index, plan);
        at_load_balance_2L2(ath);
        at_pin_all_threads(ath);
        at_improve_cpu_usage(ath);
    }
    else if (plan >= 0)
    {
        //we have a best plan, tell slaves to follow it
        STRATA_LOG("autotunerd", "Master %d: We have found a good plan %d\n", ath->index, plan);
        shm->cpu_usage_priority_mode = 0;
        run_as_plan(ath, plan);
    }

    //slaves are free to quit again
    shm->slave_not_allowed_to_quit = 0;
    sem_post((sem_t*)ath->shmem_sem);
    return 0;
}
//slave reaction to AT_MSG_TYPE_TEST: sample with the plan already stored in this task's cpus[]
//pins this task's threads, runs one sampling window, stores the result in the task's avg_ipc
//and then notifies the master with AT_MSG_TYPE_DATA_READY
//the incoming message itself is not inspected; always returns 0
int at_msg_test_slv(at_handles * ath, at_message * msg)
{
    shared_mem * shm = ath->shmem;
    task_list_item * self = &shm->tl[ath->index];

    STRATA_LOG("autotunerd", "Slave %d got msg TYPE_TEST, with plan %d,%d,%d,%d\n", ath->index, self->cpus[0], self->cpus[1], self->cpus[2], self->cpus[3]);

    //do the sampling
    pin_threads_as_plan(ath);
    at_collect_perfm_results(ath, &(self->avg_ipc));
    STRATA_LOG("autotunerd", "Slave got sampling result %f\n", self->avg_ipc);

    //tell the master the result is ready in shared memory
    STRATA_LOG("autotunerd", "Slave sending sampling results back \n");
    at_message reply;
    reply.sender_id = ath->index;
    reply.type = AT_MSG_TYPE_DATA_READY;
    reply.value = 0;
    at_send_message(shm, &reply, shm->master_id);
    return 0;
}
//presumably the slave-side handler for a quit message (inferred from the name)
//currently a stub: both arguments are ignored and 0 is returned
int at_msg_quit_slv(at_handles * ath, at_message * msg)
{
    (void)ath;
    (void)msg;
    return 0;
}
//presumably the slave-side handler for "master is quitting" (inferred from the name)
//currently a stub: both arguments are ignored and 0 is returned
int at_msg_mst_quit_slv(at_handles * ath, at_message * msg)
{
    (void)ath;
    (void)msg;
    return 0;
}
//slave reaction to AT_MSG_TYPE_RUN_AS_PLAN: pin this task's threads per its cpus[] plan
//the plan is read from the task's shared memory entry, not from the message; always returns 0
int at_msg_run_plan_slv(at_handles * ath, at_message * msg)
{
    (void)msg; //the message content is not used
    STRATA_LOG("autotunerd", "Slave %d got msg TYPE_RUN_AS_PLAN\n", ath->index);
    pin_threads_as_plan(ath);
    return 0;
}
//find a free cell in the task's thread table
//returns the lowest index whose valid flag is 0, or -1 when all THREAD_TABLE_LENGTH cells
//are in use
int at_find_thread_table_cell(task_list_item * t)
{
    int slot = 0;

    while (slot < THREAD_TABLE_LENGTH)
    {
        if (t->threads_table[slot].valid == 0)
            return slot;
        slot++;
    }
    return -1;
}
//add current application thread to thread table
int autotuner_add_this_thread_to_table()
{
if(!AT_ENABLED)
return 0;
shared_mem * m = athandle.shmem;
task_list_item * t = (m->tl) + athandle.index;
//no need to use lock by far since Strata provides a lock during thread startup
pthread_mutex_lock(&(t->thread_table_lock));
pid_t tid = syscall(SYS_gettid);
int index = at_find_thread_table_cell(t);
STRATA_LOG("autotunerd", "Adding new thread to slot %d, id is %d\n", index, tid);
if(index != -1 )
{
t->threads_table[index].tid = tid;
t->threads_table[index].valid = 1;
t->threads_table[index].pthread_handle = (void*)pthread_self();
t->threads_table[index].active = 1;
at_slist_insert(&(t->thread_table_head), &(t->thread_table_tail), t->threads_table, sizeof(thread_table_item), index);
}
//perfmon initialization for each thread
if(t->tbl_index <= THREAD_TABLE_LENGTH)
init_reeact_perfm_perthread(index);
t->thread_count++;
t->active_thread_count++;
t->tbl_index++;
pthread_mutex_unlock(&(t->thread_table_lock));
if(t->tbl_index <= THREAD_TABLE_LENGTH)
{
//notify master that this task's thread number changed
at_message msg;
msg.sender_id = athandle.index;
msg.type = AT_MSG_TYPE_THREAD_COUNT_CHANGED;
msg.value = 0;
STRATA_LOG("autotunerd", "Sending thread count changed message to master %d\n", m->master_id);
at_send_message(m, &msg, m->master_id);//there may be a problem, if the master is quitting then the message is acutally lost, so every new master must rebalance the system
//tell the master to do sampling and find best scheduling plan, since we are probably in ROI now
msg.sender_id = athandle.index;
msg.type = AT_MSG_TYPE_RESET_TIMER;
msg.value = 3; //TODO: this number should be changed to 3 seconds, since we are now doing another 2 seconds cpu usage test
STRATA_LOG("autotunerd", "Sending thread count changed message to master %d\n", m->master_id);
at_send_message(m, &msg, m->master_id);
}
else
{
//TODO:possible race condition
m->os_sched_mode = 1;
//this is the application that requested OS scheduling mode
t->os_sched_mode_req = 1;
}
return 0;
}
//remove the calling thread from this task's thread table
//does nothing and returns 0 when the autotuner is disabled (AT_ENABLED == 0)
//no re-load-balancing is triggered here since usually at this moment the application is going
//to quit or new threads will be spawned
//under the table lock: the entry whose tid matches the caller is unlinked from the thread
//list, its cell is invalidated and both thread counters are decremented
//per-thread perfmon is closed on every path; previously the early return on the success
//path skipped it, so cleanup only ran for threads that were not in the table
//returns 0 when the thread was found and removed, -1 otherwise
int autotuner_rm_this_thread_from_table()
{
    if(!AT_ENABLED)
        return 0;
    shared_mem * m = athandle.shmem;
    task_list_item * t = (m->tl) + athandle.index;
    int i;
    int removed = 0;
    pid_t tid = (pid_t) syscall(SYS_gettid);
    STRATA_LOG("autotunerd", "Removing thread, id is %d\n", tid);

    pthread_mutex_lock(&(t->thread_table_lock));
    for(i = t->thread_table_head; i != -1; i = (t->threads_table[i]).next)
    {
        if( tid == t->threads_table[i].tid )
        {
            at_slist_remove(&(t->thread_table_head), &(t->thread_table_tail), t->threads_table, sizeof(thread_table_item), i);
            t->threads_table[i].tid = -1;
            t->threads_table[i].active = 0;
            t->threads_table[i].valid = 0;
            t->thread_count--;
            t->active_thread_count--;
            removed = 1;
            break; //leave before the loop step reads the cell we just unlinked
        }
    }
    pthread_mutex_unlock(&(t->thread_table_lock));

    //perfmon per thread cleaning up
    close_reeact_perfm_perthread();

    if(!removed)
        return -1;
    STRATA_LOG("autotunerd", "Thread %d Removed\n", tid);
    return 0;
}
//redistribute all tasks' threads for a processor with 2 L2 caches, to balance the load
//only the cpus[] masks in shared memory are rewritten here; no thread is actually pinned
//cpu usage priority mode is switched off as a side effect
//with a single task its threads get the masks 0x1,0x2,0x4,0x8 in rotation
//with several tasks the total thread count is split in two halves: the task with the fewest
//active threads goes to mask 0x3 first, the remaining threads fill up 0x3, then 0xc, and
//anything beyond the counted total gets 0xf
//returns 0 in all cases, including when there is no task to balance
int at_load_balance_2L2(at_handles * ath)
{
    shared_mem * m = ath->shmem;
    m->cpu_usage_priority_mode = 0;

    //first get the total number of threads
    int i = m->head;
    int j;
    int th_cnt = 0;
    int min_th_cnt = 1000;
    int min_th_task = -1;
    int task_cnt = 0;
    while(i != -1)
    {
        th_cnt += m->tl[i].active_thread_count;
        //find the task with the minimal thread count, it is pinned first to ensure its processor time
        if(min_th_cnt > m->tl[i].active_thread_count)
        {
            min_th_cnt = m->tl[i].active_thread_count;
            min_th_task = i;
            STRATA_LOG("autotunerd", "New Smallest Task is %d with %d threads\n", min_th_task, min_th_cnt);
        }
        task_cnt++;
        i = m->tl[i].next;
        STRATA_LOG("autotunerd", "Load balancing reading next application %d's thread count\n", i);
    }
    STRATA_LOG("autotunerd", "Total Thread count is %d\n", th_cnt);

    //no smallest task was found (empty task list, or every task at or above the initial
    //minimum of 1000 threads): min_th_task is still -1 and must not be used as an index
    if(min_th_task == -1)
        return 0;

    if(task_cnt == 1) //only one task: just pin its threads to the cores one by one
    {
        int tmpcpus[4] = {0x1, 0x2, 0x4, 0x8};
        for( j = 0; j < m->tl[min_th_task].active_thread_count; j++)
        {
            m->tl[min_th_task].cpus[j] = tmpcpus[j%4];
        }
        return 0;
    }

    //second, compute how many threads per L2 cache
    int fst_L2_th_cnt = th_cnt/2;
    int snd_L2_th_cnt = th_cnt - fst_L2_th_cnt;
    STRATA_LOG("autotunerd", "First L2 run %d threads, second L2 run %d threads\n", fst_L2_th_cnt, snd_L2_th_cnt);

    //allocate processors to the smallest task first
    for(j = 0; j < m->tl[min_th_task].active_thread_count;j++)
    {
        STRATA_LOG("autotunerd", "Assigning Task %d to 0x3\n", min_th_task);
        m->tl[min_th_task].cpus[j]=0x3;
        fst_L2_th_cnt--;
    }

    //go over every other task and assign an L2 cache to each of its threads
    i = m->head;
    while(i != -1)
    {
        if( i != min_th_task )
        {
            for(j = 0; j < m->tl[i].active_thread_count; j++)
            {
                if(fst_L2_th_cnt > 0)
                {
                    STRATA_LOG("autotunerd", "Assigning Task %d to 0x3\n", i);
                    m->tl[i].cpus[j] = 0x3; //assign to core 0 and 1
                    fst_L2_th_cnt--;
                }
                else if(snd_L2_th_cnt > 0)
                {
                    STRATA_LOG("autotunerd", "Assigning Task %d to 0xc\n", i);
                    m->tl[i].cpus[j] = 0xc; //assign to core 2 and 3
                    snd_L2_th_cnt--;
                }
                else
                {
                    //some threads were created after the counting pass; give them all 4 cores
                    //for now, they will be handled by a later rebalance anyway
                    STRATA_LOG("autotunerd", "Assigning Task %d to 0xf\n", i);
                    m->tl[i].cpus[j] = 0xf;
                }
            }
        }
        i = m->tl[i].next;
    }
    return 0;
}
//pin all tasks' threads from the master
//for every task in the task list, the n-th active thread gets affinity mask cpus[n];
//a zero mask is replaced by 0xf (all four cores) so a thread is never left without a cpu
//a failing sched_setaffinity is logged and otherwise ignored; always returns 0
int at_pin_all_threads(at_handles * ath)
{
    shared_mem * m = ath->shmem;
    int i; //current task
    int j; //walks the task's thread list
    int k; //walks the task's cpu masks, advances only for active threads
    task_list_item * t = NULL; //pointer to current task
    int iRet = 0;

    for (i = m->head; i != -1; i = t->next)
    {
        t = (m->tl) + i;
        k = 0;
        for (j = t->thread_table_head; j != -1; j = t->threads_table[j].next)
        {
            if (!t->threads_table[j].active) //only pin active threads
                continue;

            //translate the integer bit mask into a cpu_set_t with the CPU_* macros instead
            //of writing through an int pointer into the structure: that cast violates strict
            //aliasing and only lands on the right bits on little-endian machines
            cpu_set_t cpus;
            unsigned int mask = (t->cpus[k] != 0x0) ? (unsigned int)t->cpus[k] : 0xfu;
            unsigned int bit;
            CPU_ZERO(&cpus);
            for (bit = 0; bit < sizeof(mask) * 8; bit++)
            {
                if (mask & (1u << bit))
                    CPU_SET(bit, &cpus);
            }

            STRATA_LOG("autotunerd", "Task %d: Assigning thread %d to core map 0x%x\n", i, t->threads_table[j].tid, t->cpus[k]);
            iRet = sched_setaffinity(t->threads_table[j].tid, sizeof(cpu_set_t), &cpus);
            if (iRet != 0)
            {
                STRATA_LOG("autotuner", "Failed set affinity of thread %d to core 0x%x\n", t->threads_table[j].tid, t->cpus[k]);
            }
            k++;
        }
    }
    return 0;
}
//master message function for AT_MSG_TYPE_THREAD_COUNT_CHANGED
//recomputes the load balance and re-pins every task's threads; the message content is not used
//always returns 0 (the function used to end in a bare `return;` although it returns int)
int at_msg_th_cnt_chg_mst(at_handles * ath, at_message * msg)
{
    STRATA_LOG("autotunerd", "Rebalancing the system\n");
    //load balance the system
    at_load_balance_2L2(ath);
    //pin all task threads
    at_pin_all_threads(ath);
    return 0;
}
//pthread_join calls are redirected here
//with the autotuner disabled this is a plain at_join_thread()
//otherwise we guess whether the caller is going to block: it is assumed to block unless the
//target thread is found in the thread list with tid == -1
//a blocking caller that is registered in the thread table is marked inactive for the
//duration of the join, has its affinity widened to cores 0-3 (so threads it creates later
//may run anywhere) and the master is told about the thread count change before and after
//a caller that is not in the thread table is joined without any bookkeeping
//returns the result of at_join_thread()
int at_pthread_join(pthread_t thread, void **value_ptr)
{
    if(!AT_ENABLED)
        return at_join_thread(thread, value_ptr);
    shared_mem * m = athandle.shmem;
    task_list_item * t = (m->tl) + athandle.index;
    STRATA_LOG("autotunerd", "Pthread join called\n");
    pid_t tid = (pid_t) syscall(SYS_gettid);
    int caller_id = -1;
    int caller_blocked = 1;
    at_message msg;
    int i;

    //go over the thread list, look for the join target and for the caller's own cell
    for(i = t->thread_table_head; i != -1; i = t->threads_table[i].next)
    {
        STRATA_LOG("autotunerd", "Checking thread %d status\n", t->threads_table[i].tid );
        if(pthread_equal((pthread_t)t->threads_table[i].pthread_handle, thread))
        {
            if(t->threads_table[i].tid == -1)
            {
                //if this thread is dead, then the calling thread is probably not going to be blocked
                caller_blocked = 0;
            }
        }
        else if(t->threads_table[i].tid == tid )
            caller_id = i;
    }

    //bookkeeping is only possible for a caller that has a cell in the table; with
    //caller_id == -1 the table would be indexed out of bounds and the active thread count
    //would be changed for a thread that was never counted
    int track_caller = caller_blocked && (caller_id != -1);

    if(track_caller)
    {
        STRATA_LOG("autotunerd", "Pthread join: Caller thread will be blocked\n");
        t->threads_table[caller_id].active = 0;
        t->active_thread_count--;
        //Set this thread's affinity to any core; later any other thread created by this
        //thread will be allowed to run on any processor; this is necessary for threads
        //that create further threads
        cpu_set_t cpuset;
        CPU_ZERO(&cpuset);
        CPU_SET(0, &cpuset);
        CPU_SET(1, &cpuset);
        CPU_SET(2, &cpuset);
        CPU_SET(3, &cpuset);
        sched_setaffinity(t->threads_table[caller_id].tid, sizeof(cpu_set_t), &cpuset);
        //tell the master to rebalance the system
        if(t->tbl_index <= THREAD_TABLE_LENGTH)
        {
            msg.sender_id = athandle.index;
            msg.type = AT_MSG_TYPE_THREAD_COUNT_CHANGED;
            msg.value = 0;
            STRATA_LOG("autotunerd", "Sending thread count changed message to master %d\n", m->master_id);
            at_send_message(m, &msg, m->master_id);
        }
    }

    int iRet = at_join_thread(thread, value_ptr);

    //if the caller was taken out of the active set, put it back and have processors
    //reallocated to it
    if(track_caller)
    {
        STRATA_LOG("autotunerd", "Pthread join: Caller thread is released\n");
        t->threads_table[caller_id].active = 1;
        t->active_thread_count++;
        if(t->tbl_index <= THREAD_TABLE_LENGTH)
        {
            msg.sender_id = athandle.index;
            msg.type = AT_MSG_TYPE_THREAD_COUNT_CHANGED;
            msg.value = 0;
            STRATA_LOG("autotunerd", "Sending thread count changed message to master %d\n", m->master_id);
            at_send_message(m, &msg, m->master_id);
        }
    }
    return iRet;
}
//return the sum of active_thread_count over every task in the task list
int at_get_total_thread_cnt(at_handles * ath)
{
    shared_mem * shm = ath->shmem;
    int total = 0;
    int task_id;

    for (task_id = shm->head; task_id != -1; task_id = shm->tl[task_id].next)
        total += shm->tl[task_id].active_thread_count;

    return total;
}
//make the processors flagged in cpulist available to every active thread of every task
//cpulist holds `number` entries; entry i == 1 means processor i is released
//the released processors are OR-ed into each cpus[] mask in shared memory, nothing is pinned
//here; always returns 0
int at_release_under_used_cpu(at_handles * ath, int * cpulist, int number)
{
    shared_mem * shm = ath->shmem;
    int released_mask = 0;
    int cpu;

    //build the bitmap of released processors
    for (cpu = 0; cpu < number; cpu++)
        released_mask |= (cpulist[cpu] == 1) ? (1 << cpu) : 0;
    STRATA_LOG("autotuner", "CPU Usage Mode: Releasing Processors Map 0x%x\n", released_mask);

    //widen the mask of every active thread slot of every task
    int task_id;
    int th;
    for (task_id = shm->head; task_id != -1; task_id = shm->tl[task_id].next)
    {
        for (th = 0; th < shm->tl[task_id].active_thread_count; th++)
            shm->tl[task_id].cpus[th] |= released_mask;
    }
    return 0;
}
//use this function after isolation to see if load balancing is cause significant processor time wasting
//however, wasting processor time is not always bad, as in the case of Streamcluster and Bodytrack, so
//we still have to do sampling to test
int at_improve_cpu_usage(at_handles * ath)
{
shared_mem * m = ath->shmem;
m->cpu_usage_priority_mode = 0;
//if application has requested OS scheduling mode, then we has to do OS scheduling
if(m->os_sched_mode)
at_run_os_sched_mode(ath);
//if there is only one application or there are less threads than processors than we should quit
if(m->task_cnt == 1)