Skip to content

Commit

Permalink
Merge pull request #2465 from verilog-to-routing/par_router3_2
Browse files Browse the repository at this point in the history
Improved parallel router: add DecompNetlistRouter
  • Loading branch information
duck2 committed Feb 12, 2024
2 parents fe9089c + 45b1ff2 commit f3a08de
Show file tree
Hide file tree
Showing 25 changed files with 1,338 additions and 198 deletions.
7 changes: 4 additions & 3 deletions libs/EXTERNAL/libargparse/argparse_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -399,10 +399,11 @@ int main(
.show_in(argparse::ShowIn::HELP_ONLY);
route_grp.add_argument(args.router_algorithm, "--router_algorithm")
.help("Specifies the router algorithm to use.\n"
" * parallel: timing_driven with tricks to run on multiple cores (may be worse)\n"
" * timing driven: focuses on routability and circuit speed\n")
" * timing driven: focuses on routability and circuit speed [default]\n"
" * parallel: timing_driven with nets in different regions of the chip routed in parallel\n"
" * parallel_decomp: timing_driven with additional parallelism obtained by decomposing high-fanout nets, possibly reducing quality\n")
.default_value("timing_driven")
.choices({"parallel", "timing_driven"})
.choices({"parallel", "parallel_decomp", "timing_driven"})
.show_in(argparse::ShowIn::HELP_ONLY);
route_grp.add_argument(args.min_incremental_reroute_fanout, "--min_incremental_reroute_fanout")
.help("The net fanout thershold above which nets will be re-routed incrementally.")
Expand Down
5 changes: 0 additions & 5 deletions libs/librrgraph/src/base/rr_graph_storage.h
Original file line number Diff line number Diff line change
Expand Up @@ -667,11 +667,6 @@ class t_rr_graph_storage {
/**
 * @brief Return the `direction` field stored for rr node `id`.
 *
 * @param node_storage View over the per-node data, indexed by RRNodeId.
 * @param id           Node whose direction is requested.
 * @return `node_storage[id].dir_side_.direction`.
 *
 * As the lines read here, a node whose type is neither CHANX nor CHANY causes
 * VTR_LOG_ERROR to be invoked; nothing in this function returns early, so the
 * direction field is still read afterwards (unless VTR_LOG_ERROR itself
 * diverts control -- not visible here).
 *
 * NOTE(review): this hunk is listed as "0 additions & 5 deletions", so the five
 * lines from `auto& node_data` through the closing brace of the `if` are
 * presumably the lines this commit removes -- confirm against the raw diff.
 */
static inline Direction get_node_direction(
vtr::array_view_id<RRNodeId, const t_rr_node_data> node_storage,
RRNodeId id) {
// Fetch the node entry once for the type check below.
auto& node_data = node_storage[id];
// Only channel nodes (CHANX / CHANY) are expected to be queried for a direction.
if (node_data.type_ != CHANX && node_data.type_ != CHANY) {
VTR_LOG_ERROR("Attempted to access RR node 'direction' for non-channel type '%s'",
rr_node_typename[node_data.type_]);
}
return node_storage[id].dir_side_.direction;
}

Expand Down
38 changes: 38 additions & 0 deletions libs/libvtrutil/src/vtr_dynamic_bitset.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ class dynamic_bitset {
static_assert(std::numeric_limits<Storage>::is_integer,
"dynamic_bitset storage must be integer!");

///@brief Construct a bitset with default-initialized (empty) storage.
constexpr dynamic_bitset() = default;

///@brief Construct a bitset and immediately resize() it to hold `size` bits.
constexpr dynamic_bitset(Index size) {
    this->resize(size);
}

///@brief Resize to the determined size
void resize(size_t size) {
array_.resize((size + kWidth - 1) / kWidth);
Expand Down Expand Up @@ -63,6 +68,39 @@ class dynamic_bitset {
return (array_[index_value / kWidth] & (1u << (index_value % kWidth))) != 0;
}

///@brief Return count of set bits.
constexpr size_t count(void) const {
size_t out = 0;
for (auto x : array_)
out += __builtin_popcount(x);
return out;
}

///@brief In-place bitwise OR with `x`.
/// Only the storage words present in both operands are combined; if the
/// operands differ in length the operation is truncated to the shorter one.
constexpr dynamic_bitset<Index, Storage>& operator|=(const dynamic_bitset<Index, Storage>& x) {
    const size_t shared_words = std::min(array_.size(), x.array_.size());
    size_t word = 0;
    while (word < shared_words) {
        array_[word] |= x.array_[word];
        ++word;
    }
    return *this;
}

///@brief In-place bitwise AND with `x`.
/// Only the storage words present in both operands are combined; words of
/// `*this` past the end of a shorter `x` are left untouched (truncated operation).
constexpr dynamic_bitset<Index, Storage>& operator&=(const dynamic_bitset<Index, Storage>& x) {
    auto dst = array_.begin();
    auto src = x.array_.begin();
    for (size_t remaining = std::min(array_.size(), x.array_.size()); remaining > 0; --remaining) {
        *dst &= *src;
        ++dst;
        ++src;
    }
    return *this;
}

///@brief Return an inverted copy of this bitset.
/// The result is constructed with this bitset's size() and every storage word
/// is set to the bitwise complement of the corresponding word of `*this`.
inline dynamic_bitset<Index, Storage> operator~(void) const {
    dynamic_bitset<Index, Storage> flipped(size());
    for (size_t word = 0, num_words = array_.size(); word < num_words; ++word) {
        flipped.array_[word] = ~array_[word];
    }
    return flipped;
}

private:
std::vector<Storage> array_;
};
Expand Down
3 changes: 3 additions & 0 deletions vpr/src/base/ShowSetup.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,9 @@ static void ShowRouterOpts(const t_router_opts& RouterOpts) {
case PARALLEL:
VTR_LOG("PARALLEL\n");
break;
case PARALLEL_DECOMP:
VTR_LOG("PARALLEL_DECOMP\n");
break;
case TIMING_DRIVEN:
VTR_LOG("TIMING_DRIVEN\n");
break;
Expand Down
9 changes: 6 additions & 3 deletions vpr/src/base/read_options.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,8 @@ struct ParseRouterAlgorithm {
ConvertedValue<e_router_algorithm> conv_value;
if (str == "parallel")
conv_value.set_value(PARALLEL);
else if (str == "parallel_decomp")
conv_value.set_value(PARALLEL_DECOMP);
else if (str == "timing_driven")
conv_value.set_value(TIMING_DRIVEN);
else {
Expand Down Expand Up @@ -2403,10 +2405,11 @@ argparse::ArgumentParser create_arg_parser(std::string prog_name, t_options& arg
route_grp.add_argument<e_router_algorithm, ParseRouterAlgorithm>(args.RouterAlgorithm, "--router_algorithm")
.help(
"Specifies the router algorithm to use.\n"
" * parallel: [experimental] timing_driven but multithreaded\n"
" * timing_driven: focuses on routability and circuit speed\n")
" * timing driven: focuses on routability and circuit speed [default]\n"
" * parallel: timing_driven with nets in different regions of the chip routed in parallel\n"
" * parallel_decomp: timing_driven with additional parallelism obtained by decomposing high-fanout nets, possibly reducing quality\n")
.default_value("timing_driven")
.choices({"parallel", "timing_driven"})
.choices({"parallel", "parallel_decomp", "timing_driven"})
.show_in(argparse::ShowIn::HELP_ONLY);

route_grp.add_argument(args.min_incremental_reroute_fanout, "--min_incremental_reroute_fanout")
Expand Down
1 change: 1 addition & 0 deletions vpr/src/base/vpr_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -1345,6 +1345,7 @@ struct t_placer_opts {

/**
 * Router algorithm selectable through --router_algorithm.
 * Enumerator values are spelled out explicitly; they match the implicit
 * numbering of the declaration order.
 */
enum e_router_algorithm {
    PARALLEL = 0,        ///< "parallel": timing_driven with nets in different regions of the chip routed in parallel
    PARALLEL_DECOMP = 1, ///< "parallel_decomp": additional parallelism obtained by decomposing high-fanout nets
    TIMING_DRIVEN = 2,   ///< "timing_driven": focuses on routability and circuit speed
};

Expand Down
126 changes: 126 additions & 0 deletions vpr/src/route/DecompNetlistRouter.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
#pragma once

/** @file Parallel and net-decomposing case for NetlistRouter. Works like
* \see ParallelNetlistRouter, but tries to "decompose" nets and assign them to
* the next level of the partition tree where possible. */
#include "netlist_routers.h"

#include <tbb/task_group.h>

/** Last routing iteration budget for net decomposition (maximum number of iterations).
 * 5 is found experimentally: higher values get more speedup on initial iters but # of iters increases */
constexpr int MAX_DECOMP_ITER = 5;

/** Maximum # of times a single net may be decomposed: 2 means one net gets divided down to <4 virtual nets.
 * Higher values are more aggressive: better thread utilization but worse congestion resolving */
constexpr int MAX_DECOMP_DEPTH = 2;

/** A net needs at least this many fanouts to be considered for decomposition. */
constexpr int MIN_DECOMP_SINKS = 8;

/** A virtual net needs at least this many fanouts to be considered for decomposition. */
constexpr int MIN_DECOMP_SINKS_VNET = 8;

/**
 * NetlistRouter implementation which routes in parallel over a partition tree
 * and additionally tries to decompose nets into virtual nets.
 *
 * Member function definitions live in DecompNetlistRouter.tpp.
 *
 * @tparam HeapType Heap type forwarded to the ConnectionRouter instances owned by this router.
 */
template<typename HeapType>
class DecompNetlistRouter : public NetlistRouter {
  public:
    /**
     * Store the routing context and set up the per-net decomposition bookkeeping.
     * All reference parameters are stored as references (and \p pin_timing_invalidator as a
     * raw pointer), so the referenced objects must outlive this router.
     * \p router_lookahead is dereferenced while constructing the ConnectionRouter and must not be null.
     */
    DecompNetlistRouter(
        const Netlist<>& net_list,
        const RouterLookahead* router_lookahead,
        const t_router_opts& router_opts,
        CBRR& connections_inf,
        NetPinsMatrix<float>& net_delay,
        const ClusteredPinAtomPinsLookup& netlist_pin_lookup,
        std::shared_ptr<SetupHoldTimingInfo> timing_info,
        NetPinTimingInvalidator* pin_timing_invalidator,
        route_budgets& budgeting_inf,
        const RoutingPredictor& routing_predictor,
        const vtr::vector<ParentNetId, std::vector<std::unordered_map<RRNodeId, int>>>& choking_spots,
        bool is_flat)
        : _routers_th(_make_router(router_lookahead, is_flat))
        , _net_list(net_list)
        , _router_opts(router_opts)
        , _connections_inf(connections_inf)
        , _net_delay(net_delay)
        , _netlist_pin_lookup(netlist_pin_lookup)
        , _timing_info(timing_info)
        , _pin_timing_invalidator(pin_timing_invalidator)
        , _budgeting_inf(budgeting_inf)
        , _routing_predictor(routing_predictor)
        , _choking_spots(choking_spots)
        , _is_flat(is_flat)
        , _net_known_samples(net_list.nets().size())
        , _is_decomp_disabled(net_list.nets().size()) {}
    ~DecompNetlistRouter() {}

    /** Run a single iteration of netlist routing for this->_net_list. This usually means calling
     * \ref route_net for each net, which will handle other global updates.
     * \return RouteIterResults for this iteration. */
    RouteIterResults route_netlist(int itry, float pres_fac, float worst_neg_slack);
    /** Set RCV enable flag for all routers managed by this netlist router.
     * Net decomposition does not work with RCV, so calling this fn with x=true is a fatal error. */
    void set_rcv_enabled(bool x);
    /** Replace the timing info pointer held by this netlist router. */
    void set_timing_info(std::shared_ptr<SetupHoldTimingInfo> timing_info);

  private:
    /** Should we decompose this net? */
    bool should_decompose_net(ParentNetId net_id, const PartitionTreeNode& node);
    /** Get a bitset with sinks to route before net decomposition */
    vtr::dynamic_bitset<> get_decomposition_mask(ParentNetId net_id, const PartitionTreeNode& node);
    /** Get a bitset with sinks to route before virtual net decomposition */
    vtr::dynamic_bitset<> get_vnet_decomposition_mask(const VirtualNet& vnet, const PartitionTreeNode& node);
    /** Decompose and route a regular net. Output the resulting vnets to \p left and \p right.
     * \return Success status: true if routing is successful and left and right now contain valid virtual nets: false otherwise. */
    bool decompose_and_route_net(ParentNetId net_id, const PartitionTreeNode& node, VirtualNet& left, VirtualNet& right);
    /** Decompose and route a virtual net. Output the resulting vnets to \p left and \p right.
     * \return Success status: true if routing is successful and left and right now contain valid virtual nets: false otherwise. */
    bool decompose_and_route_vnet(VirtualNet& vnet, const PartitionTreeNode& node, VirtualNet& left, VirtualNet& right);
    /** A single task to route nets inside a PartitionTree node and add tasks for its child nodes to task group \p g. */
    void route_partition_tree_node(tbb::task_group& g, PartitionTreeNode& node);

    /** Build a ConnectionRouter from the global device and routing contexts.
     * Used as the exemplar passed to the per-thread router storage in the constructor. */
    ConnectionRouter<HeapType> _make_router(const RouterLookahead* router_lookahead, bool is_flat) {
        auto& device_ctx = g_vpr_ctx.device();
        auto& route_ctx = g_vpr_ctx.mutable_routing();

        return ConnectionRouter<HeapType>(
            device_ctx.grid,
            *router_lookahead,
            device_ctx.rr_graph.rr_nodes(),
            &device_ctx.rr_graph,
            device_ctx.rr_rc_data,
            device_ctx.rr_graph.rr_switch(),
            route_ctx.rr_node_route_inf,
            is_flat);
    }

    /* Context fields. Most of them will be forwarded to route_net (see route_net.tpp) */
    /** Per-thread storage for ConnectionRouters. */
    tbb::enumerable_thread_specific<ConnectionRouter<HeapType>> _routers_th;
    const Netlist<>& _net_list;
    const t_router_opts& _router_opts;
    CBRR& _connections_inf;
    /** Per-thread storage for RouteIterResults. */
    tbb::enumerable_thread_specific<RouteIterResults> _results_th;
    NetPinsMatrix<float>& _net_delay;
    const ClusteredPinAtomPinsLookup& _netlist_pin_lookup;
    std::shared_ptr<SetupHoldTimingInfo> _timing_info;
    NetPinTimingInvalidator* _pin_timing_invalidator;
    route_budgets& _budgeting_inf;
    const RoutingPredictor& _routing_predictor;
    const vtr::vector<ParentNetId, std::vector<std::unordered_map<RRNodeId, int>>>& _choking_spots;
    bool _is_flat;

    /** Cached routing parameters for current iteration (inputs to \see route_netlist()).
     * The constructor does not receive these, so they are given default member initializers:
     * they hold a determinate value even before the first call to route_netlist(). */
    int _itry = 0;
    float _pres_fac = 0.f;
    float _worst_neg_slack = 0.f;

    /** Sinks to be always sampled for decomposition for each net: [0.._net_list.size()-1]
     * (i.e. when routing fails after decomposition for a sink, sample it on next iteration) */
    vtr::vector<ParentNetId, vtr::dynamic_bitset<>> _net_known_samples;

    /** Is decomposition disabled for this net? [0.._net_list.size()-1] */
    vtr::vector<ParentNetId, bool> _is_decomp_disabled;
};

#include "DecompNetlistRouter.tpp"

0 comments on commit f3a08de

Please sign in to comment.