// boost/graph/distributed/triangle_counting_sucsuc.hpp

// Copyright (C) 2018 Thejaka Amila Kanewala, Marcin Zalewski, Andrew Lumsdaine.

// Boost Software License - Version 1.0 - August 17th, 2003

// Permission is hereby granted, free of charge, to any person or organization
// obtaining a copy of the software and accompanying documentation covered by
// this license (the "Software") to use, reproduce, display, distribute,
// execute, and transmit the Software, and to prepare derivative works of the
// Software, and to permit third-parties to whom the Software is furnished to
// do so, all subject to the following:

// The copyright notices in the Software and this entire statement, including
// the above license grant, this restriction and the following disclaimer,
// must be included in all copies of the Software, in whole or in part, and
// all derivative works of the Software, unless such copies or derivative
// works are solely in the form of machine-executable object code generated by
// a source language processor.

// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
// SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
// FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.

//  Authors: Thejaka Kanewala
//           Andrew Lumsdaine

//======== Triangle Counting Algorithm ================//
//===========================================================//


#ifndef BOOST_GRAPH_TC_SUCSUC
#define BOOST_GRAPH_TC_SUCSUC

#ifndef BOOST_GRAPH_USE_MPI
#error "Parallel BGL files should not be included unless <boost/graph/use_mpi.hpp> has been included"
#endif

#include <am++/detail/thread_support.hpp>

#include <boost/parallel/append_buffer.hpp>
#include <boost/graph/graph_traits.hpp>
#include <boost/property_map/property_map.hpp>
#include <boost/graph/iteration_macros.hpp>
#include <boost/graph/parallel/algorithm.hpp> // for all_reduce
#include <boost/graph/parallel/iteration_macros.hpp> // for all_reduce
#include <boost/graph/parallel/thread_support.hpp> // for compare_and_swap
#include <algorithm> // for std::min, std::max
#include <boost/format.hpp>
#include <iostream>
#include <atomic>
#include "boost/tuple/tuple.hpp"
#include "thread_pq_def.hpp"
#include <boost/graph/distributed/owner_defs.hpp>
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/discrete_distribution.hpp>
#include <am++/size_coalesced_message_type.hpp>
#include <chrono>

//for profiling
#ifdef CRAYPAT
#include <pat_api.h>
#endif


namespace boost { namespace graph { namespace distributed {

template<typename Graph, 
	 typename IdDistribution, 
	 typename NeighborMap,
         typename MessageGenerator = 
	 amplusplus::simple_generator<amplusplus::counter_coalesced_message_type_gen> >
class triangle_counting_sucsuc {

  // Alias for this exact instantiation of the algorithm class.
  typedef triangle_counting_sucsuc<Graph, IdDistribution, NeighborMap, MessageGenerator> self_type;

  // Read-only property map used with get(owner, v) to find the rank owning a vertex.
  typedef typename boost::property_map<Graph, vertex_owner_t>::const_type OwnerMap;

  typedef typename graph_traits<Graph>::vertex_descriptor Vertex;
  typedef typename graph_traits<Graph>::degree_size_type Degree;

  // [begin, end) pair over a std::vector<Vertex>.
  // Note: many_set_intersection() declares its own local typedef of the same name.
  typedef typename std::pair< typename std::vector<Vertex>::iterator,
			      typename std::vector<Vertex>::iterator > IteratorPair_t;

  // Both are defined later: block_msg inside the class, processing_function
  // after the class body.
  struct block_msg;
  struct processing_function;
  // AM++ message type: size-coalesced messages carrying a block_msg payload,
  // delivered to processing_function.
  //  typedef amplusplus::message_type<Vertex> tm_type;
  typedef typename amplusplus::size_coalesced_message_type_gen::inner<block_msg, processing_function>::type tm_type;


  // Handler for the degree-exchange messages (defined after the class body).
  struct degree_processing_function;
  // Degree-exchange work item. run() sends (target u, (source v, out_degree(v)))
  // and populate_successors() consumes it on the receiving side.
  typedef std::pair<Vertex, std::pair<Vertex, uint32_t > > work_item_t;
  // Combination functor handed to amplusplus::idempotent_combination: of two
  // pair-like values it keeps the one whose .first is strictly smaller, and
  // otherwise (including ties) the second argument.
  struct minimum_pair_first
  {
    template<typename Item>
    const Item& operator()(const Item& lhs, const Item& rhs) const {
      if (lhs.first < rhs.first)
        return lhs;
      return rhs;
    }

    // Result-type metafunction: the type of the first argument of signature Signature.
    template<typename Signature>
    struct result {
      typedef typename boost::function_traits<Signature>::arg1_type type;
    };
  };


  // Message type produced by MessageGenerator for the degree-exchange phase:
  // carries work_item_t, is handled by degree_processing_function, is routed
  // with owner_from_pair over the OwnerMap, and combines items with
  // minimum_pair_first as an idempotent combination.
  typedef typename MessageGenerator::template call_result<work_item_t, 
							  degree_processing_function, 
							  owner_from_pair<OwnerMap, work_item_t>, 
							  amplusplus::idempotent_combination_t<minimum_pair_first > >::type RelaxMessage;


public:
  /**
   * Builds the algorithm object and then calls initialize().
   *
   * @param g           graph to process (kept by reference)
   * @param t           AM++ transport (kept by reference); its thread count at
   *                    construction time is cached in `nthreads`
   * @param idd         id distribution used by logical_id()/owner_for_logical_id()
   *                    (kept by reference)
   * @param offset      added to the thread id when pinning threads in run()
   * @param bz          block size used to split the first set in run() (copied)
   * @param sucbz       block size used to split the successor set in run() (copied)
   * @param csz         coalescing size handed to the size-coalesced message type (copied)
   * @param succ        per-vertex neighbour lists that the algorithm fills and reads
   *                    (kept by reference)
   * @param message_gen generator for the degree-exchange message type
   *
   * The mem-initializers are listed in member declaration order, which is the
   * order in which they actually run. dummy_first_member_for_init_order exists
   * only so that register_mpi_datatype<work_item_t>() runs before any other
   * member is constructed. msg_type reads coalescing_size, which is declared
   * (and therefore initialized) before it.
   */
  triangle_counting_sucsuc(Graph& g,
                           amplusplus::transport &t,
                           const IdDistribution& idd,
                           int offset,
                           uint64_t& bz,
                           uint64_t& sucbz,
                           uint64_t& csz,
                           NeighborMap& succ,
                           MessageGenerator message_gen =
                           MessageGenerator(amplusplus::counter_coalesced_message_type_gen(1 << 17)))
    : dummy_first_member_for_init_order((amplusplus::register_mpi_datatype<work_item_t>(), 0)),
      g(g),
      transport(t),
      nthreads(t.get_nthreads()),
      id_distribution(idd),
      owner(get(vertex_owner, g)),
      core_offset(offset),
      vertex_successors(succ),
      block_size(bz),
      suc_block_size(sucbz),
      coalescing_size(csz),
      msg_type(amplusplus::size_coalesced_message_type_gen(coalescing_size), t),
      relax_msg(message_gen, transport, owner_from_pair<OwnerMap, work_item_t>(owner),
                amplusplus::idempotent_combination(minimum_pair_first()))
  {
    initialize();
  }

  // Destructor: releases everything initialize() allocated with new --
  // the per-thread triangle counters, the per-vertex spinlocks and, when
  // TRIANGLE_ENUMERATE is defined, the per-thread triangle buffers.
  ~triangle_counting_sucsuc() {
#ifdef TRIANGLE_ENUMERATE
    for (int tid=0; tid < nthreads; ++tid) {
      delete [] all_triangles[tid];
    }

    delete [] all_triangles;
#endif

    delete [] threaded_triangle_indexes;

    // initialize() inserts one heap-allocated spinlock per vertex into `locks`;
    // the map only stores raw pointers, so they have to be freed here.
    for (typename std::map<Vertex, spinlock*>::iterator it = locks.begin();
         it != locks.end(); ++it) {
      delete it->second;
    }
    locks.clear();
  }

  // Function-call operator so the object can be used as a thread body:
  // simply forwards the thread id to run().
  void operator() (int tid) { 
    run(tid); 
  }

  // Executes the algorithm on behalf of thread `tid` (defined below the class).
  void run(int tid = 0);

  // Returns the timestamp run() stored in start_time just before the main
  // counting epoch.
  time_type get_start_time() {
    return start_time;
  }


  // Returns end_time - start_time, i.e. the span between the two timestamps
  // run() records around the main counting epoch.
  time_type get_elapsed_time() {
    return (end_time - start_time);
  }


#ifdef TRIANGLE_ENUMERATE
  // must be executed in a single thread
  void get_local_triangles(std::vector<work_item_t>& out) {
    for(int tid=0; tid < nthreads; ++tid) {
      out.insert(out.end(), all_triangles[tid], (all_triangles[tid]+threaded_triangle_indexes[tid]));
    }
  }
#endif

  // Sums the per-thread triangle counters of this rank.
  uint64_t get_local_triangle_counts() {
    uint64_t sum = 0;
    int remaining = nthreads;
    while (remaining-- > 0) {
      sum += threaded_triangle_indexes[remaining];
    }
    return sum;
  }

  // Writes a banner followed by one line per thread with this rank's id, the
  // thread id and that thread's triangle counter.
  void print_triangle_counts() {
    std::cout << "========== Printing triangle counts per each thread ==============" << std::endl;
    for (int t = 0; t < nthreads; ++t) {
      std::cout << "[Rank=" << transport.rank() << "TID=" << t << "] -- ";
      std::cout << threaded_triangle_indexes[t] << std::endl;
    }
  }

#ifdef TC_STATS
  void print_stats() {
    uint64_t all_send_msgs = 0;
    uint64_t all_recv_msgs = 0;
    uint64_t all_local_msgs = 0;
    uint64_t all_preds = 0;
    uint64_t all_succs = 0;
    uint64_t all_succ_preds = 0;
    uint64_t tot_avg_set_int_times = 0;
    uint64_t total_set_inters = 0;
    uint64_t all_comparisons = 0;
    uint64_t all_setint_sizes = 0;
    uint64_t all_bytes_over_nw = 0;
    uint64_t all_init_succs = 0;
    uint64_t all_init_preds = 0;
    uint64_t all_predicted_psp_bytes = 0;
    uint64_t all_predicted_ss_bytes = 0;
    uint64_t vmax_set1_block = 0;
    uint64_t vmax_set2_block = 0;

    uint64_t t_all_send_msgs = 0;
    uint64_t t_all_recv_msgs = 0;
    uint64_t t_all_local_msgs = 0;
    uint64_t t_all_preds = 0;
    uint64_t t_all_succs = 0;
    uint64_t t_all_succ_preds = 0;
    uint64_t t_tot_avg_set_int_times = 0;
    uint64_t t_total_set_inters = 0;
    uint64_t t_all_comparisons = 0;
    uint64_t t_all_setint_sizes = 0;
    uint64_t t_all_bytes_over_nw = 0;
    uint64_t t_all_init_succs = 0;
    uint64_t t_all_init_preds = 0;
    uint64_t t_all_predicted_psp_bytes = 0;
    uint64_t t_all_predicted_ss_bytes = 0;
    uint64_t t_vmax_set1_block = 0;
    uint64_t t_vmax_set2_block = 0;


    for (int i=0; i < nthreads; ++i) {
      t_all_send_msgs += send_msgs[i];
      t_all_recv_msgs += recv_msgs[i];
      t_all_local_msgs += local_msgs[i];
      t_all_preds += total_preds[i];
      t_all_succs += total_succs[i];
      t_all_succ_preds += total_succs_preds[i];
      t_tot_avg_set_int_times += tot_set_int_time[i];
      t_total_set_inters += total_setints[i];
      t_all_comparisons += tot_comparisons[i];
      std::cout << "Thread : " << i << " Comparisons : " << tot_comparisons[i] << std::endl;
      std::cout << "Thread : " << i << " Max degree : " << max_degree_vertex[i] << std::endl;
      std::cout << "Thread : " << i << " Max successors : " << max_suc_degree[i] << std::endl;
      t_all_setint_sizes += tot_setsizes[i];
      t_all_bytes_over_nw += num_bytes_sent_over_nw[i];
      t_all_init_succs += init_succs[i];
      t_all_init_preds += init_preds[i];
      t_all_predicted_psp_bytes += predicted_psp_bytes[i];
      t_all_predicted_ss_bytes += predicted_ss_bytes[i];

      if (max_set1_block[i] > t_vmax_set1_block)
	t_vmax_set1_block = max_set1_block[i];

      if (max_set2_block[i] > t_vmax_set2_block)
	t_vmax_set2_block = max_set2_block[i];

    }

    MPI_Reduce(&t_all_send_msgs, &all_send_msgs, 
	       1, MPI_LONG_LONG_INT , MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&t_all_recv_msgs, &all_recv_msgs, 
	       1, MPI_LONG_LONG_INT , MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&t_all_local_msgs, &all_local_msgs, 
	       1, MPI_LONG_LONG_INT , MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&t_all_preds, &all_preds, 
	       1, MPI_LONG_LONG_INT , MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&t_all_succs, &all_succs, 
	       1, MPI_LONG_LONG_INT , MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&t_all_succ_preds, &all_succ_preds, 
	       1, MPI_LONG_LONG_INT , MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&t_tot_avg_set_int_times, &tot_avg_set_int_times, 
	       1, MPI_LONG_LONG_INT , MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&t_total_set_inters, &total_set_inters, 
	       1, MPI_LONG_LONG_INT , MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&t_all_comparisons, &all_comparisons, 
	       1, MPI_LONG_LONG_INT , MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&t_all_setint_sizes, &all_setint_sizes, 
	       1, MPI_LONG_LONG_INT , MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&t_all_bytes_over_nw, &all_bytes_over_nw, 
	       1, MPI_LONG_LONG_INT , MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&t_all_init_succs, &all_init_succs, 
	       1, MPI_LONG_LONG_INT , MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&t_all_init_preds, &all_init_preds, 
	       1, MPI_LONG_LONG_INT , MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&t_all_predicted_psp_bytes, &all_predicted_psp_bytes, 
	       1, MPI_LONG_LONG_INT , MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&t_all_predicted_ss_bytes, &all_predicted_ss_bytes, 
	       1, MPI_LONG_LONG_INT , MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&t_vmax_set1_block, &vmax_set1_block, 
	       1, MPI_LONG_LONG_INT , MPI_MAX, 0, MPI_COMM_WORLD);
    MPI_Reduce(&t_vmax_set2_block, &vmax_set2_block, 
	       1, MPI_LONG_LONG_INT , MPI_MAX, 0, MPI_COMM_WORLD);


    auto nranks = transport.size();
    if (transport.rank() == 0) {
      std::string with_ordering = "Yes";
#ifdef TC_NO_ORDERING
      with_ordering = "No";
#endif

      std::string algorithm = "PredPred";
#ifdef TC_ALGO_SUCSUC
      algorithm = "SucSuc";
#endif

      std::cout << "[INFO][STATS] Per rank stats. Ranks : " << nranks
		<< ", Algorithm : " << algorithm
		<< ", Sent : " << (all_send_msgs/nranks)
		<< ", Received : " << (all_recv_msgs/nranks)
		<< ", Local :" << (all_local_msgs/nranks)
		<< ", Predecessors in Set Intersection : " << (all_preds/nranks)
		<< ", Successors in Set Intersection : " << (all_succs/nranks)
		<< ", Predecessors of successors in Set Intersection : " << (all_succ_preds/nranks)
		<< ", Average set intersection time : " << (tot_avg_set_int_times/total_set_inters) << " nanoseconds"
		<< ", Total set intersections : " << (total_set_inters/nranks)
		<< ", Total set comparisons : " << all_comparisons
		<< ", Total set intersection sizes (sum of two sets) : " << all_setint_sizes
		<< ", All bytes sent over : " << all_bytes_over_nw 
		<< ", Bytes sent per rank :" << (all_bytes_over_nw/nranks)
		<< ", With Ordering ? : " << with_ordering
		<< ", Predicted PSP Bytes : " << (all_predicted_psp_bytes * sizeof(Vertex))
		<< ", Predicted SS Bytes : " << (all_predicted_ss_bytes * sizeof(Vertex))
		<< ", Max set-1 block size : " << vmax_set1_block
		<< ", Max set-2 (group)block size : " << vmax_set2_block
		<< std::endl;
    }

  }
#endif


private:

  //  template<typename Vertex>
  struct block_msg {
  private:
    typename std::vector<Vertex>::iterator sucbegin;
    uint64_t succount;
    typename std::vector<Vertex>::iterator predbegin;
    uint64_t predcount;
    Vertex* successor_array;
    Vertex* predecessor_array;

  public:
    block_msg(typename std::vector<Vertex>::iterator s,
	      uint64_t sc,
	      typename std::vector<Vertex>::iterator p,
	      uint64_t pc): sucbegin(s),
			    succount(sc),
			    predbegin(p),
			    predcount(pc),
			    predecessor_array(NULL),
                            successor_array(NULL){}

    block_msg(): succount(0), predcount(0),
		 predecessor_array(NULL),
		 successor_array(NULL){}

    block_msg(const block_msg& bmsg): sucbegin(bmsg.sucbegin),
				      succount(bmsg.succount),
				      predbegin(bmsg.predbegin),
				      predcount(bmsg.predcount),
				      successor_array(bmsg.successor_array),
				      predecessor_array(bmsg.predecessor_array){}


    size_t get_size() {       
      // additional 1 is to store the number of successors
      return (sizeof(Vertex)*(succount+predcount+1)); 
    }

    void serialize(COALESCE_TYPE* buf) {
      Vertex* vbuf = (Vertex*)buf;
      vbuf[0] = (Vertex)succount; // Vertex is also uint64_t so should not be an issue
      // copy successors
      std::copy(sucbegin, sucbegin+succount, vbuf+1);
      // copy predecessors to the buffer
      std::copy(predbegin, predbegin+predcount, vbuf+succount+1);
    }

    void deserialize(COALESCE_TYPE* buf, uint64_t bytecount) {
      Vertex* array = (Vertex*)buf;
      succount = array[0];
      assert(succount != 0);
      successor_array = array+1;
      predcount = (bytecount/sizeof(Vertex)) - (1+succount);
      assert(predcount != 0);
      predecessor_array = array+(1+succount);
    }

    Vertex* get_predecessor_array() {
      assert(predecessor_array != NULL);
      return predecessor_array;
    }

    Vertex* get_successor_array() {
      assert(successor_array != NULL);
      return successor_array;
    }

    uint64_t get_pred_count() {
      return predcount;
    }

    uint64_t get_suc_count() {
      return succount;
    }
  };


  // Called from the constructor: installs the message handlers and allocates
  // the per-thread and per-vertex bookkeeping.
  void initialize();

  // Intersects the range [itepbegin, itepbegin + predcount) with the stored
  // neighbour list of each of the `succount` vertices starting at itesbegin,
  // and adds the number of common elements to thread `tid`'s triangle counter.
  template<typename ItePred, typename IteSuc>
  void many_set_intersection(ItePred itepbegin, 
			     uint64_t predcount,
			     IteSuc itesbegin,
			     uint64_t succount,
			     int tid);


  // Degree-exchange handler body: decides from the work item whether its
  // source vertex is appended to the destination vertex's neighbour list.
  void populate_successors(const work_item_t& wi);

  // Maps k through the id distribution; the result is what run() and
  // populate_successors() compare to order two vertices.
  template<typename IdType>
  IdType logical_id(IdType k) {
    return id_distribution(k);
  }

  // Asks the id distribution which rank owns k.
  template<typename IdType>
  amplusplus::transport::rank_type owner_for_logical_id(IdType k) {
    return id_distribution.owner(k);
  }

  // Forwards k to the id distribution's to_vertex_descriptor().
  template<typename IdType>
  IdType to_vertex_descriptor(IdType k) {
    return id_distribution.to_vertex_descriptor(k);
  }

  // Forwards k to the graph distribution's local().
  template<typename IdType>
  IdType local_id(IdType k) {
    return g.distribution().local(k);
  }


  // Packs (s, (c, d)) into a work item.
  work_item_t construct_wi(Vertex s,
                           Vertex c,
                           uint32_t d) {
    return work_item_t(s, std::make_pair(c, d));
  }

  // Copies all three fields of `from` into `to`.
  void copy_wi(work_item_t& to, const work_item_t& from) {
    // Pair assignment copies first, second.first and second.second.
    to = from;
  }


#ifdef TC_STATS
  // Per-thread statistics, one slot per thread (sized in initialize(), indexed
  // by thread id) and aggregated by print_stats().
  std::vector<uint64_t> send_msgs;              // block messages sent to remote ranks
  std::vector<uint64_t> recv_msgs;              // block messages handled by processing_function
  std::vector<uint64_t> local_msgs;             // blocks intersected locally in run()
  std::vector<uint64_t> total_preds;            // sum of predcount over many_set_intersection calls
  std::vector<uint64_t> total_succs;            // sum of succount over many_set_intersection calls
  std::vector<uint64_t> total_succs_preds;      // sum of the neighbour-list sizes visited
  std::vector<uint64_t> total_setints;          // number of timed many_set_intersection calls
  std::vector<uint64_t> tot_set_int_time;       // time attributed to many_set_intersection calls
  std::vector<uint64_t> tot_comparisons;        // element comparisons made by std::set_intersection
  std::vector<uint64_t> tot_setsizes;           // sum of both input sizes per intersection
  std::vector<uint64_t> num_bytes_sent_over_nw; // payload bytes of remote messages, excluding the count header
  std::vector<uint64_t> init_succs;             // neighbour-list sizes after construction
  std::vector<uint64_t> init_preds;             // only zero-initialized in this file
  std::vector<uint64_t> predicted_ss_bytes;     // estimate accumulated in run(); scaled by sizeof(Vertex) when printed
  std::vector<uint64_t> predicted_psp_bytes;    // only zero-initialized in this file
  std::vector<uint64_t> max_set1_block;         // largest neighbour list seen
  std::vector<uint64_t> max_set2_block;         // largest run of consecutive non-local neighbours on one rank
  std::vector<uint64_t> max_degree_vertex;      // largest out_degree among processed vertices
  std::vector<uint64_t> max_suc_degree;         // largest neighbour-list size among processed vertices
#endif


private:
  // NOTE: members are initialized in the order they are declared here; the
  // constructor relies on that (coalescing_size before msg_type, owner before
  // relax_msg).

  // Exists only so that its initializer (which registers the MPI datatype for
  // work_item_t) runs before every other member is constructed.
  const int dummy_first_member_for_init_order;
  const Graph& g;
  amplusplus::transport& transport;
  // Thread count of the transport at construction time; sizes the per-thread arrays.
  const int nthreads;
  const IdDistribution& id_distribution;
  // Stored by value. The constructor initializes it from get(vertex_owner, g);
  // if that call returns the map by value, a reference member bound to it
  // would dangle once the constructor returns.
  const OwnerMap owner;
  // Added to the thread id when pinning threads in run().
  int core_offset;
  // Per-vertex neighbour lists, owned by the caller.
  NeighborMap& vertex_successors;

  // Number of elements per block of the first set / of the successor set in run().
  uint64_t block_size;
  uint64_t suc_block_size;
  // Default is overridden by the constructor argument.
  uint64_t coalescing_size = 1 << 21;
  // AM++ message type for block_msg payloads
  tm_type msg_type;
  // AM++ message type for the degree-exchange work items
  RelaxMessage relax_msg;

  // Thread barrier, created by thread 0 at the start of run().
  shared_ptr<amplusplus::detail::barrier> t_bar;

  time_type start_time;
  time_type end_time;

  // One heap-allocated spinlock per local vertex (see initialize()); guards
  // appends to that vertex's neighbour list in populate_successors().
  std::map<Vertex, spinlock*> locks;
#ifdef TRIANGLE_ENUMERATE
  // Array of nthreads per-thread triangle buffers.
  work_item_t** all_triangles;
#endif
  // Array of nthreads per-thread triangle counters.
  uint64_t* threaded_triangle_indexes;

#ifdef PRINT_DEBUG
  uint64_t no_sends = 0;
  uint64_t no_receives = 0;
  Vertex lastsucc;
  uint64_t lastcount = 0;
#endif

};

// Template parameter list used by the out-of-class member definitions below.
#define TC_PARAMS_SUCSUC                                   \
      typename Graph, typename IdDistribution, typename NeighborMap, typename MessageGenerator

// The class type those definitions belong to.
#define TC_TYPE_SUCSUC                                    \
      triangle_counting_sucsuc<Graph, IdDistribution, NeighborMap, MessageGenerator>


// Only defined for triangle enumeration; not referenced elsewhere in this file.
#ifdef TRIANGLE_ENUMERATE
#define MAX_TRIANGLE_COUNT 100000000
#endif

// One-time setup, called from the constructor:
//  - installs the handlers of both message types,
//  - allocates the zeroed per-thread triangle counters,
//  - checks that the coalescing size is not smaller than the block size,
//  - creates one spinlock per local vertex,
//  - sizes the TC_STATS counters (one slot per thread),
//  - allocates the per-thread triangle buffer table (TRIANGLE_ENUMERATE).
template<TC_PARAMS_SUCSUC>
void
TC_TYPE_SUCSUC::initialize() {

  relax_msg.set_handler(degree_processing_function(*this));
  msg_type.set_handler(processing_function(*this));

  // Value-initialization zeroes every counter.
  threaded_triangle_indexes = new uint64_t[nthreads]();

  // check the coalescing size is at least upto the block size
  if (coalescing_size < block_size) {
    std::cout << "[ERROR] Coalescing size must be greater than the block size. " << std::endl;
    assert(false);
  }

  // The destructor owns and frees these locks.
  BGL_FORALL_VERTICES_T(v, g, Graph) {
    locks.insert(std::make_pair(v, new spinlock()));
  }

#ifdef TC_STATS
  send_msgs.resize(nthreads, 0);
  recv_msgs.resize(nthreads, 0);
  local_msgs.resize(nthreads, 0);
  total_preds.resize(nthreads, 0);
  total_succs.resize(nthreads, 0);
  total_succs_preds.resize(nthreads, 0);
  total_setints.resize(nthreads, 0);
  tot_set_int_time.resize(nthreads, 0);
  tot_comparisons.resize(nthreads, 0);
  tot_setsizes.resize(nthreads, 0);
  num_bytes_sent_over_nw.resize(nthreads, 0);
  init_succs.resize(nthreads, 0);
  init_preds.resize(nthreads, 0);
  predicted_ss_bytes.resize(nthreads, 0);
  predicted_psp_bytes.resize(nthreads, 0);
  max_set1_block.resize(nthreads, 0);
  max_set2_block.resize(nthreads, 0);
  max_degree_vertex.resize(nthreads, 0);
  max_suc_degree.resize(nthreads, 0);
#endif

#ifdef TRIANGLE_ENUMERATE
  // Value-initialize so every entry is a null pointer: the destructor
  // delete[]s each entry, which is only well defined for null or allocated
  // pointers.
  // NOTE(review): nothing in this file allocates the per-thread buffers
  // themselves -- confirm where they are meant to be created.
  all_triangles = new work_item_t*[nthreads]();
#endif
}


// Counts, for every vertex s in [itesbegin, itesbegin + succount), how many
// elements of the sorted range [itepbegin, itepbegin + predcount) also occur
// in vertex_successors[s], and adds the total to thread `tid`'s counter.
//
//   itepbegin / predcount : first set (sorted ascending)
//   itesbegin / succount  : vertices whose stored neighbour lists form the
//                           second sets; each must be owned by this rank
//   tid                   : index of the per-thread counter / statistics slot
//
// Both inputs are only read. If either count is zero there is nothing to
// intersect and the function returns before dereferencing itepbegin.
template<TC_PARAMS_SUCSUC>
template<typename ItePred, typename IteSuc>
void
TC_TYPE_SUCSUC::many_set_intersection(ItePred itepbegin,
                                      uint64_t predcount,
                                      IteSuc itesbegin,
                                      uint64_t succount,
                                      int tid) {

#ifdef TC_STATS
  total_preds[tid] += predcount;
  total_succs[tid] += succount;
#endif

  if (predcount == 0 || succount == 0)
    return;

  typedef typename std::vector<Vertex>::iterator SucVertexIter_t;

  const ItePred itepend = itepbegin + predcount;

  // Accumulated locally and published once, so the shared per-thread array is
  // written a single time per call.
  uint64_t triangles = 0;

  for (uint64_t i = 0; i < succount; ++i) {
    const Vertex s = *(itesbegin + i);

    assert(get(owner, s) == transport.rank());

    auto& candidates = vertex_successors[s];

    // Elements below the smallest member of the first set cannot match, so
    // start the second range at the first element not less than it.
    SucVertexIter_t first = std::lower_bound(candidates.begin(),
                                             candidates.end(),
                                             (*itepbegin));
    SucVertexIter_t last = candidates.end();

    counting_output_iterator output_ite;
#ifdef TC_STATS
    total_succs_preds[tid] += candidates.size();
    tot_setsizes[tid] += (predcount + (last - first));
    output_ite = std::set_intersection(itepbegin, itepend,
                                       first, last,
                                       output_ite,
      [&](Vertex left, Vertex right){ ++tot_comparisons[tid]; return (left < right);});
#else
    output_ite = std::set_intersection(itepbegin, itepend,
                                       first, last,
                                       output_ite);
#endif

    triangles += output_ite.get_count();
  }

  threaded_triangle_indexes[tid] += triangles;
}


template<TC_PARAMS_SUCSUC>
void
TC_TYPE_SUCSUC::populate_successors(const work_item_t& wi) {
  Vertex d = wi.first;
  Vertex s = wi.second.first;
  uint32_t deg = wi.second.second;

  if (deg > out_degree(d, g)) {
    locks[d]->lock();
    vertex_successors[d].push_back(s);
    locks[d]->unlock();
  } else {
    if (deg == out_degree(d, g)) {
      if (logical_id(d) < logical_id(s)) {
	locks[d]->lock();
	vertex_successors[d].push_back(s);
	locks[d]->unlock();
      }
    }
  }
}

// Per-thread body of the algorithm. Every thread calls run() with its own tid.
//
// Phases:
//  1. Thread 0 creates the thread barrier; each thread is pinned to core
//     (tid + core_offset).
//  2. The neighbour lists in vertex_successors are built -- either directly
//     from logical-id order (TC_NO_ORDERING) or through the degree-exchange
//     messages handled by populate_successors() -- and then sorted.
//  3. In one epoch, every vertex's sorted list is cut into blocks of
//     block_size. The blocks are dealt round-robin to the threads. For each
//     block the list is walked in chunks of suc_block_size, the chunk members
//     are grouped by owning rank, and each group is either sent to that rank
//     as a block_msg or intersected locally with many_set_intersection().
//
// NOTE(review): each rank group is recorded as (iterator to its first member,
// count), which presumes that neighbours owned by one rank are contiguous in
// the sorted list -- confirm against the vertex descriptor encoding.
// NOTE(review): start_time and end_time are stored by every thread, not just
// one.
template<TC_PARAMS_SUCSUC>
void
TC_TYPE_SUCSUC::run(int tid) {
  AMPLUSPLUS_WITH_THREAD_ID(tid) {

    // Queried from the transport at run time. Deliberately not called
    // `nthreads`, which would shadow the member cached at construction.
    int num_threads = transport.get_nthreads();
    if (0 == tid) {
      // Set the number of threads to the barrier
      t_bar.reset(new amplusplus::detail::barrier(num_threads));
    }

    { amplusplus::scoped_epoch epoch(transport); }

    // Only thread 0 executed the branch above, so wait until every thread
    // has reached this point.
    t_bar->wait();

    // if two processes are running on the same node, core_offset
    // is important to achieve thread affinity
    const int core = tid + core_offset;
    if (pin(core) != 0) {
      std::cerr << "[ERROR] Unable to pin current thread to "
                << "core : " << core << std::endl;
      assert(false);
    }

    // wait till all threads are pinned
    t_bar->wait();
    { amplusplus::scoped_epoch epoch(transport); }

    validate_thread_core_relation();

    t_bar->wait();

    time_type atcalps = get_time();

#ifdef TC_NO_ORDERING
    // Build each neighbour list directly from the logical-id order.
    {
      BGL_PARFORALL_VERTICES_T(v, g, Graph, tid, num_threads) {
        BGL_FORALL_OUTEDGES_T(v, e, g, Graph) {
          Vertex u = target(e, g);
#ifdef TC_ALGO_SUCSUC
          if (logical_id(v) < logical_id(u)) { // successors
            vertex_successors[v].push_back(u);
          }
#else
          if (logical_id(v) > logical_id(u)) { // predecessors
            vertex_successors[v].push_back(u);
          }
#endif
        }

        vertex_successors[v].shrink_to_fit();
      }
    }
#else
    // Tell every neighbour our degree; populate_successors() on the owning
    // rank decides whether we are recorded in its list. The epoch ends only
    // after all of these messages have been handled.
    {
      amplusplus::scoped_epoch epoch(transport);

      BGL_PARFORALL_VERTICES_T(v, g, Graph, tid, num_threads) {
        BGL_FORALL_OUTEDGES_T(v, e, g, Graph) {
          Vertex u = target(e, g);
          work_item_t wi = construct_wi(u, v, out_degree(v, g));
          relax_msg.send(wi);
        }
      }
    }
#endif

    // Sort every list this thread is responsible for (the intersections rely
    // on sorted input) and, with TC_STATS, gather per-vertex statistics.
    BGL_PARFORALL_VERTICES_T(v, g, Graph, tid, num_threads) {
      std::sort(vertex_successors[v].begin(), vertex_successors[v].end());

#ifdef TC_STATS
      // calculate the bytes transferred if we do ss
      const size_t s = vertex_successors[v].size();

      std::set<amplusplus::transport::rank_type> setranks;

      if (s > max_set1_block[tid])
        max_set1_block[tid] = s;

      uint64_t groupsz = 0;
      uint64_t nonlocalsuccs = 0;
      amplusplus::transport::rank_type last_rank = -1;
      for (size_t i = 0; i < s; ++i) {
        auto ite = vertex_successors[v].begin() + i;
        amplusplus::transport::rank_type dest = get(owner, (*ite));
        if (dest != transport.rank()) {
          if (last_rank != dest) {
            // A new rank group starts: account for the one that just ended.
            last_rank = dest;
            if (groupsz > max_set2_block[tid])
              max_set2_block[tid] = groupsz;

            groupsz = 0;
          }
          setranks.insert(dest);
          ++groupsz;
          ++nonlocalsuccs;
        }
      }

      // The last group is not followed by a rank change, so account for it here.
      if (groupsz > max_set2_block[tid])
        max_set2_block[tid] = groupsz;

      const int sranks = setranks.size();

      // the whole list goes to every remote rank that owns a member, plus
      // one entry per non-local member
      if (s != 0) {
        auto ssbytes = (s*sranks) + nonlocalsuccs;
        predicted_ss_bytes[tid] += ssbytes;
      }

      init_succs[tid] += s;
#endif
    }

#ifdef PRINT_DEBUG
    t_bar->wait();
    if (tid == 0) {
      size_t cap = 0;
      size_t totsz = 0;
      BGL_FORALL_VERTICES_T(v, g, Graph) {
        cap += vertex_successors[v].capacity();
        totsz += vertex_successors[v].size();
      }

      std::cout << "Rank : " << transport.rank() << " capacity : " << cap << " size : " << totsz << std::endl;
    }
#endif
    // For debugging
    { amplusplus::scoped_epoch epoch(transport); }


    // Per destination rank: iterator to the first neighbour of the current
    // chunk owned by that rank, and how many there are. The vector
    // constructor value-initializes each pair, so every count starts at 0.
    typedef std::pair<typename std::vector<Vertex>::iterator, uint64_t> SuccIterSizePair_t;
    std::vector< SuccIterSizePair_t > rank_successors(transport.size());

    time_type etcalps = get_time();
    if (tid == 0)
      std::cout << "Time to calculate predecessors and successors : " << (etcalps-atcalps) << std::endl;

    // should come before begin epoch
    start_time = get_time();
    // Start the algorithm

    t_bar->wait();

    std::pair<typename boost::graph_traits<Graph>::vertex_iterator,
              typename boost::graph_traits<Graph>::vertex_iterator> itepair = vertices(g);

    typename boost::graph_traits<Graph>::vertex_iterator startite = itepair.first;

    time_type atint = get_time();
    {
      amplusplus::scoped_epoch epoch(transport);

#ifdef CRAYPAT
    if (PAT_region_begin ( 1, "tcrun" ) == PAT_API_FAIL) {
      std::cout << "PAT begin failed ! " << std::endl;
      assert(false);
    }
#endif

      // Every thread walks all local vertices; the blocks of each vertex are
      // dealt out to the threads round-robin below.
      for(; startite != itepair.second; ++startite) {

        auto& neighbours = vertex_successors[*startite];

        // no successors or predecessors, then continue
        if (neighbours.size() == 0)
          continue;

        // Rotating the first block index by the vertex's local id spreads the
        // first blocks of different vertices over the threads.
        const uint64_t offset = (local_id(*startite) + tid) % num_threads;

#ifdef PRINT_DEBUG
        if (transport.rank() == 0) {
          // NOTE(review): this is true for every id that is NOT a multiple of
          // 1000 -- confirm whether `== 0` was intended.
          if (local_id(*startite) % 1000) {
            std::cout << "R: " << transport.rank() << "vertex : " << *startite << std::endl;
            std::cout << "R: " << transport.rank() << "offset : " << offset << std::endl;
          }
        }
#endif

        // Both sets come from the same list.
        const uint64_t pred_count = neighbours.size();
        const uint64_t succ_count = neighbours.size();

#ifdef PRINT_DEBUG
        std::cout << "pred_count : " << pred_count << std::endl;
        std::cout << "succ_count : " << succ_count << std::endl;
#endif
        const auto numblocks = (pred_count+block_size-1)/block_size;
        const auto numsucblocks = (succ_count+suc_block_size-1)/suc_block_size;

        for (auto pos = offset; pos < numblocks; pos=pos+num_threads) {
#ifdef TC_STATS
          if (max_degree_vertex[tid] < out_degree(*startite, g))
            max_degree_vertex[tid] = out_degree(*startite, g);

          if (max_suc_degree[tid] < succ_count)
            max_suc_degree[tid] = succ_count;
#endif
          // The block [begin, begin + size) is the first set of every
          // intersection below; the last block may be shorter.
          const auto begin = pos * block_size;
          const size_t size = std::min(block_size, (pred_count-begin));

          auto predbegin = neighbours.begin() + begin;

          for (uint64_t sucblkpos = 0; sucblkpos < numsucblocks; ++sucblkpos) {
#ifdef PRINT_DEBUG
            std::cout << "begin :" << begin << " end : " << (begin + size) << std::endl;
#endif
            auto sucpos = sucblkpos * suc_block_size;
            const auto sucposend = sucpos + std::min(suc_block_size, (succ_count-sucpos));

            // Group this chunk's members by owning rank.
            for(; sucpos < sucposend; ++sucpos) {
              auto succ_it = neighbours.begin() + sucpos;
              amplusplus::transport::rank_type dest = get(owner, *succ_it);
              if (rank_successors[dest].second == 0) {
                rank_successors[dest].first = succ_it;
              }

              rank_successors[dest].second++;
            }

            // now send these to appropriate ranks
            // first send to remote ranks
            for (amplusplus::transport::rank_type r=0;
                 r < transport.size(); ++r) {
              if ((r != transport.rank()) && (rank_successors[r].second != 0)) {

#ifdef TC_STATS
                send_msgs[tid]++;
#endif

                block_msg bmsg(rank_successors[r].first, rank_successors[r].second,
                               predbegin, size);
#ifdef TC_STATS
                // the count header is not counted as payload
                num_bytes_sent_over_nw[tid] += (bmsg.get_size() - sizeof(Vertex));
#endif

                msg_type.send(bmsg, r);
                rank_successors[r].second = 0;
              }
            }

            // now do the local rank
            if (rank_successors[transport.rank()].second != 0) {
#ifdef TC_STATS
              local_msgs[tid]++;
              auto sistart = std::chrono::system_clock::now();
#endif

              many_set_intersection(predbegin, size,
                                    rank_successors[transport.rank()].first,
                                    rank_successors[transport.rank()].second,
                                    tid);
#ifdef TC_STATS
              auto siend = std::chrono::system_clock::now();
              total_setints[tid]++;
              auto durationforthis
                = std::chrono::duration_cast<std::chrono::nanoseconds>(siend - sistart).count();
              tot_set_int_time[tid] += durationforthis;
#endif

              rank_successors[transport.rank()].second = 0;
            }
          }
        }
      }
#ifdef CRAYPAT
    if (PAT_region_end(1) == PAT_API_FAIL) {
      std::cout << "PAT end failed ! " << std::endl;
      assert(false);
    }
#endif

    }

    time_type etint = get_time();
    if (tid == 0)
      std::cout << "Set intersection time : " << (etint-atint) << std::endl;


#ifdef PRINT_DEBUG
    std::cout << "End of epoch ............................." << std::endl;
    std::cout << "Sends : " << no_sends
              << ", Receives : " << no_receives
              << ", Last Succ : " << lastsucc
              << ", Last Count : " << lastcount << std::endl;

#endif


    t_bar->wait();
    end_time = get_time();
  }
}


// Handler for incoming block messages: intersects the predecessor run carried
// by the message with the stored neighbour list of every successor it carries,
// on the thread that received the message.
template<TC_PARAMS_SUCSUC>
struct TC_TYPE_SUCSUC::
processing_function {

  processing_function() : self(NULL) {}
  processing_function(triangle_counting_sucsuc& self) : self(&self) {}

  // src is the sending rank (unused); msg must already be deserialized.
  void operator() (const amplusplus::transport::rank_type src,
                   block_msg& msg) const {
    (void)src;
    const int tid = amplusplus::detail::get_thread_id();

#ifdef TC_STATS
    self->recv_msgs[tid]++;
    const auto sistart = std::chrono::system_clock::now();
#endif

    self->many_set_intersection(msg.get_predecessor_array(),
                                msg.get_pred_count(),
                                msg.get_successor_array(),
                                msg.get_suc_count(),
                                tid);

#ifdef TC_STATS
    const auto siend = std::chrono::system_clock::now();
    self->total_setints[tid]++;
    // Accumulate nanoseconds, exactly as the local path in run() does:
    // print_stats() divides the summed time by the number of intersections
    // and labels the result as nanoseconds.
    self->tot_set_int_time[tid] +=
      std::chrono::duration_cast<std::chrono::nanoseconds>(siend - sistart).count();
#endif

  }

protected:
  triangle_counting_sucsuc* self;
};

// Handler for the degree-exchange messages: hands every received work item to
// populate_successors() of the owning algorithm object.
template<TC_PARAMS_SUCSUC>
struct TC_TYPE_SUCSUC::
degree_processing_function {

  degree_processing_function() : self(NULL) {}
  degree_processing_function(triangle_counting_sucsuc& self) : self(&self) {}

  void operator() (const work_item_t& data) const {
    self->populate_successors(data);
  }

protected:
  triangle_counting_sucsuc* self;
};


}}}
#endif