diff --git a/README.md b/README.md index 3a8e1c959..024c41dbe 100644 --- a/README.md +++ b/README.md @@ -365,13 +365,17 @@ location prefix "/society/" { A back end HTTP server is defined with `server` directive. The full syntax is as follows: ``` -server [:] [conns_n=]; +server [:] [conns_n=] [weight=]; ``` -`IPADDR` can be either IPv4 or IPv6 address. Hostnames are not allowed. +* `IPADDR` can be either IPv4 or IPv6 address. Hostnames are not allowed. IPv6 address must be enclosed in square brackets (e.g. "[::0]" but not "::0"). -`PORT` defaults to 80 if not specified. -`conns_n=` is the number of parallel connections to the server. +* `PORT` defaults to 80 if not specified. +* `conns_n=` is the number of parallel connections to the server. `N` defaults to 32 if not specified. +* `weight=` is the static weight of the server. The weight must be +in the range of 1 to 100. If not specified, then the default weight of 50 +is used with the static ratio scheduler. Just the weight that differs from +default value may be specified for convenience. Multiple back end servers may be defined. For example: ``` @@ -442,11 +446,12 @@ with this directive. If not specified, the queue size is set to 1000. Back end servers can be grouped together into a single unit for the purpose of load balancing. Servers within a group are considered interchangeable. The load is distributed evenly among servers within a group. -If a server goes offline, other servers in a group take the load. +If a server goes offline, then other servers in a group take the load. The full syntax is as follows: ``` srv_group { - server [:] [conns_n=]; + sched ; + server [:] [conns_n=] [weight=]; ... } ``` @@ -479,14 +484,18 @@ Scheduler is used to distribute load among servers within a group. The group can be either explicit, defined with `srv_group` directive, or implicit. The syntax is as follows: ``` -sched ; +sched [OPTIONS]; ``` `SCHED_NAME` is the name of a scheduler available in Tempesta. 
+`OPTIONS` are optional. Not all schedulers have additional options. Currently there are two schedulers available: -* **round-robin** - Rotates all servers in a group in round-robin manner so -that requests are distributed uniformly across servers. This is the default -scheduler. +* **ratio** - Balances the load across servers in a group based on each +server's weight. Requests are forwarded more to servers with more weight, +and less to servers with less weight. As a result, each server in a group +receives an optimal load. In default configuration where weights are not +specified, servers weights are considered equal, and the scheduler works +in pure round-robin fashion. This is the default scheduler. * **hash** - Chooses a server based on a URI/Host hash of a request. Requests are distributed uniformly, and requests with the same URI/Host are always sent to the same server. @@ -504,8 +513,70 @@ A scheduler defined for the implicit group becomes the scheduler for an explicit group defined with `srv_group` directive if the explicit group is missing the `sched` directive. -If no scheduler is defined for a group, then scheduler defaults -to `round-robin`. +If no scheduler is defined, then scheduler defaults to `ratio`. + +**ratio** scheduler may have the following options: +* **static** - The weight of each server in a group is defined statically +with `[weight=]` option of the `server` directive. This is the default +`ratio` scheduler option. +* **dynamic** - The weight of each server in a group is defined dynamically. +Specific type of dynamic weight is specified with additional options: + * **minimum** - The current minimum response time from a server; + * **maximum** - The current maximum response time from a server; + * **average** - The current average response time from a server; + * **percentile `[]`** - The current response time from a server that + is within specified percentile. The percentile may be one of 50, 75, 90, + 95, 99. 
If none is given, then the default percentile of 90 is used. +If a specific type of dynamic weight is not specified, then the default type +of `average` is used. +* **predict** - The weight of each server in a group is predicted dynamically +for a time in the future, based on server's behavior in the past. Additional +options include those that are defined for **dynamic** weight, as well as +the following options: + * **past** - Period of time (in seconds) to keep past response time + values from a server. The default value is 30 seconds. + * **rate** - Rate (times per second) of retrieval of past response time + values. The default value is 20 times per second. + * **ahead** - Period of time (in seconds) for which to make a prediction; + It can't be more than half of **past**. The default value is 15 seconds. + +Naturally, if a Dynamic Scheduler is specified for a group, and there's +a server in that group with the `weight` option, then an error is produced +as that combination is incompatible. Same is true for Predictive Scheduler. + +The following are examples of scheduler specification in configuration. +Again, only one `sched` directive is allowed per group. +``` +# Use hash scheduler +sched hash; +# Use ratio scheduler. By default, static weight distribution is used. +sched ratio; +# Use ratio scheduler with static weight distribution. +sched ratio static; +# Use dynamic scheduler. By default, current average response time is used +# for weight distribution. +sched dynamic; +# Use dynamic scheduler with maximum response time for weight distribution. +sched dynamic maximum; +# Use dynamic scheduler, default percentile of 90 is used. +sched dynamic percentile; +# Use dynamic scheduler, percentile of 75 is used for weight distribution. +sched dynamic percentile 75; +# Use predictive scheduler, percentile of 75 is used for weight distribution. 
+# The values of weights of each server are collected for past 60 seconds +# at the rate of 20 times per second, the weight of each server in predicted +# for the time of 2 seconds ahead. +sched predict percentile 75 past=60 rate=20 ahead=2; +``` + +Servers should be grouped together with proper care. Server groups should +be created with servers that handle similar resources. For instance, if +servers with static content that is served quickly are grouped together +with servers with dynamic content that is I/O bound, then the quick +response times from servers with static content will be nearly invisible +in comparison to longer response times from servers with dynamic content. +In that case the distribution of load among these servers will be severely +skewed. #### HTTP Scheduler diff --git a/etc/tempesta_fw.conf b/etc/tempesta_fw.conf index f469d6a5c..aa9f80a39 100644 --- a/etc/tempesta_fw.conf +++ b/etc/tempesta_fw.conf @@ -8,17 +8,49 @@ # a group. # # Syntax: -# sched SCHED_NAME; +# sched SCHED_NAME [OPTIONS]; # # SCHED_NAME is a name of a scheduler module that distributes the load # among servers within a group. There are two schedulers available: -# - "round-robin" (default) - rotates all servers in the group in -# the round-robin manner, so requests are distributed uniformly across -# servers. +# - "ratio" (default) - Balances the load across servers in a group based +# on each server's weight. Requests are forwarded more to servers with +# more weight, and less to servers with less weight. As a result, each +# server in a group receives an optimal load. In default configuration +# where weights are not specified, servers weights are considered equal, +# and the scheduler works in pure round-robin fashion. # - "hash" - chooses a server based on a URI/Host hash of a request. # Requests are still distributed uniformly, but a request with the same # URI/Host is always sent to the same server. # +# OPTIONS are optional. 
Not all schedulers have additional options. +# +# "ratio" scheduler may have the following options: +# - static - The weight of each server in a group is defined statically +# with [weight=] option of the `server` directive. This is the +# default Ratio scheduler option. +# - dynamic - The weight of each server in a group is defined dynamically. +# Specific type of dynamic weight is specified with additional options: +# - minimum - The current minimum response time from a server; +# - maximum - The current maximum response time from a server; +# - average - The current average response time from a server; +# - percentile [] - The current response time from a server +# that is within specified percentile. The percentile may be +# one of 50, 75, 90, 95, 99. If none is given, then the default +# percentile of 90 is used. +# If a specific type of dynamic weight is not specified, then +# the default type of "average" is used. +# - predict - The weight of each server in a group is predicted dynamically +# for a time in the future, based on server's behavior in the past. +# Additional options include those that are defined for "dynamic" weight, +# as well as the following options: +# - past - Period of time (in seconds) to keep past response time +# values from a server. The default value is 30 seconds. +# - rate - Rate (times per second) of retrieval of past response time +# values. The default value is 20 times per second. +# - ahead - Period of time (in seconds) for which to make a prediction; +# It can't be more than half of **past**. The default value is 15 +# seconds. +# # Note that there's also the HTTP scheduler. It dispatches requests among # server groups only. Round-robin or hash scheduler must be used to select # a server within a group. @@ -30,7 +62,7 @@ # the `sched` directive. # # Default: -# sched round-robin; +# sched ratio; # # TAG: server. @@ -38,17 +70,23 @@ # Specifies an IP address/port of a back-end HTTP server. 
# # Syntax: -# server IPADDR[:PORT] [conns_n=N] +# server IPADDR[:PORT] [conns_n=N] [weight=N]; # # IPADDR may be either IPv4 or IPv6 address, hostnames are not allowed. # IPv6 address must be enclosed in square brackets (e.g. "[::0]" but not "::0"). # PORT defaults to 80 if not set. # # conns_n=N is the number of parallel connections to the server. -# The N defaults to 32 if not set. +# The N defaults to 32 if the option is not specified. +# +# weight=N is the static weight of the server. The weight must be in +# the range of 1 to 100. If not specified, then the default weight of 50 +# is used with the static ratio scheduler. Just the weight that differs +# from default value may be specified for convenience. +# # # Multiple back-end servers may be specified, for example: -# server 10.1.0.1:80 +# server 10.1.0.1:80; # server [fc00::1]:80; # # Default: @@ -60,7 +98,7 @@ # Defines a request that is considered non-idempotent. # # Syntax: -# nonidempotent +# nonidempotent ; # # is one of supported HTTP methods, such as GET, HEAD, POST, etc. # is a string matching operator, one of "eq", "prefix", "suffix", or "*". 
diff --git a/tempesta_fw/apm.c b/tempesta_fw/apm.c index 6354e6d88..d863b9001 100644 --- a/tempesta_fw/apm.c +++ b/tempesta_fw/apm.c @@ -138,7 +138,7 @@ static void __range_shrink_left(TfwPcntRanges *rng, TfwPcntCtl *pc, int r) { int i; - unsigned long tmp; + unsigned long cnt_full, cnt_half; --pc->order; pc->begin = pc->end - ((TFW_STATS_BCKTS - 1) << pc->order); @@ -153,14 +153,15 @@ __range_shrink_left(TfwPcntRanges *rng, TfwPcntCtl *pc, int r) */ for (i = 1; i < TFW_STATS_BCKTS / 2; ++i) atomic_add(atomic_read(&rng->cnt[r][i]), &rng->cnt[r][0]); - tmp = atomic_read(&rng->cnt[r][TFW_STATS_BCKTS / 2]) / 2; - atomic_add(tmp, &rng->cnt[r][0]); - atomic_set(&rng->cnt[r][1], tmp); + cnt_full = atomic_read(&rng->cnt[r][TFW_STATS_BCKTS / 2]); + cnt_half = cnt_full / 2; + atomic_add(cnt_half, &rng->cnt[r][0]); + atomic_set(&rng->cnt[r][1], cnt_full - cnt_half); for (i = 1; i < TFW_STATS_BCKTS / 2; ++i) { - tmp = atomic_read(&rng->cnt[r][TFW_STATS_BCKTS / 2 + i]); - tmp /= 2; - atomic_set(&rng->cnt[r][i * 2], tmp); - atomic_set(&rng->cnt[r][i * 2 + 1], tmp); + cnt_full = atomic_read(&rng->cnt[r][TFW_STATS_BCKTS / 2 + i]); + cnt_half = cnt_full / 2; + atomic_set(&rng->cnt[r][i * 2], cnt_half); + atomic_set(&rng->cnt[r][i * 2 + 1], cnt_full - cnt_half); } } @@ -203,7 +204,7 @@ tfw_stats_extend(TfwPcntRanges *rng, unsigned int r_time) * largest response time faced. */ static void -__tfw_stats_adjust(TfwPcntRanges *rng, int r) +tfw_stats_adjust(TfwPcntRanges *rng, int r) { TfwPcntCtl pc; unsigned long i, cnt = 0, sum = 0, max = 0, i_max = 0; @@ -263,23 +264,6 @@ __tfw_stats_adjust(TfwPcntRanges *rng, int r) } } -/* - * See if the range @r contains large outliers. Adjust it if so. - * This is the locked version. - * - * If the lock is busy then either the ranges are being adjusted - * or the percentiles are being calculated at this very moment. - * Just skip the adjustment of ranges and do it next time. 
- */ -static inline void -tfw_stats_adjust(TfwPcntRanges *rng, int r, spinlock_t *slock) -{ - if (!spin_trylock(slock)) - return; - __tfw_stats_adjust(rng, r); - spin_unlock(slock); -} - /* * Set the new maximum value. * Return true if the new value has been set. @@ -289,14 +273,15 @@ static inline bool tfw_stats_adj_max(TfwPcntRanges *rng, unsigned int r_time) { int old_val, max_val = atomic_read(&rng->max_val); - while (1) { - if (r_time <= max_val) - return false; + + while (r_time > max_val) { old_val = atomic_cmpxchg(&rng->max_val, max_val, r_time); if (likely(old_val == max_val)) return true; max_val = old_val; } + + return false; } /* @@ -308,14 +293,15 @@ static inline bool tfw_stats_adj_min(TfwPcntRanges *rng, unsigned int r_time) { int old_val, min_val = atomic_read(&rng->min_val); - while (1) { - if (r_time >= min_val) - return false; + + while (r_time < min_val) { old_val = atomic_cmpxchg(&rng->min_val, min_val, r_time); if (likely(old_val == min_val)) return true; min_val = old_val; } + + return false; } /** @@ -328,7 +314,7 @@ tfw_stats_adj_min(TfwPcntRanges *rng, unsigned int r_time) * We only care about correct array indexing. 
*/ static void -tfw_stats_update(TfwPcntRanges *rng, unsigned int r_time, spinlock_t *slock) +tfw_stats_update(TfwPcntRanges *rng, unsigned int r_time) { TfwPcntCtl pc3, pc2 = { .atomic = rng->ctl[2].atomic }; @@ -343,7 +329,7 @@ tfw_stats_update(TfwPcntRanges *rng, unsigned int r_time, spinlock_t *slock) TfwPcntCtl pc0, pc1 = { .atomic = rng->ctl[1].atomic }; if (pc1.end < r_time) { atomic_inc(__rng(&pc2, rng->cnt[2], r_time)); - tfw_stats_adjust(rng, 2, slock); + tfw_stats_adjust(rng, 2); atomic64_inc(&rng->tot_cnt); return; } @@ -352,27 +338,24 @@ tfw_stats_update(TfwPcntRanges *rng, unsigned int r_time, spinlock_t *slock) BUG_ON(pc0.begin != 1); /* left bound is never moved */ if (pc0.end < r_time) { atomic_inc(__rng(&pc1, rng->cnt[1], r_time)); - tfw_stats_adjust(rng, 1, slock); + tfw_stats_adjust(rng, 1); atomic64_inc(&rng->tot_cnt); return; } atomic_inc(__rng(&pc0, rng->cnt[0], r_time)); - tfw_stats_adjust(rng, 0, slock); + tfw_stats_adjust(rng, 0); atomic64_inc(&rng->tot_cnt); return; } - if (!spin_trylock(slock)) - return; pc3.atomic = rng->ctl[3].atomic; if (unlikely(r_time > pc3.end)) { tfw_stats_extend(rng, r_time); pc3.atomic = rng->ctl[3].atomic; } atomic_inc(__rng(&pc3, rng->cnt[3], r_time)); - __tfw_stats_adjust(rng, 3); + tfw_stats_adjust(rng, 3); atomic64_inc(&rng->tot_cnt); - spin_unlock(slock); } /* @@ -385,6 +368,7 @@ tfw_stats_update(TfwPcntRanges *rng, unsigned int r_time, spinlock_t *slock) */ /* * A ring buffer entry structure. + * * @pcntrng - Struct for response time data by the percentiles algorithm. * @jtmistamp - The start of the time interval for the current entry. * @reset - The entry can be reset by one thread at a time. @@ -395,10 +379,25 @@ typedef struct { atomic_t reset; } TfwApmRBEnt; +/* + * The ring buffer structure. + * + * @rbent - Array of ring buffer entries. + * @slock - The lock to adjust the ranges in the current entry. + * @rbufsz - The size of @rbent. 
+ */ +typedef struct { + TfwApmRBEnt *rbent; + spinlock_t slock; + int rbufsz; +} TfwApmRBuf; + /* * The ring buffer contol structure. + * * This is a supporting structure. It keeps related data that is useful * in making decisions on the need of recalculation of percentiles. + * * @jtmwstamp - The start of the time window the percentiles are for. * @entry_cnt - The number of hits in the current buffer ring entry. * @total_cnt - The number of hits within the current time window. @@ -410,21 +409,10 @@ typedef struct { unsigned long total_cnt; } TfwApmRBCtl; -/* - * The ring buffer structure. - * @rbent - Array of ring buffer entries. - * @slock - The lock to adjust the ranges in the current entry. - * @rbufsz - The size of @rbent. - */ -typedef struct { - TfwApmRBEnt *rbent; - spinlock_t slock; - int rbufsz; -} TfwApmRBuf; - /* * The stats entry data structure. * Keeps the latest values of calculated percentiles. + * * @pstats - The percentile stats structure. * @rwlock - Protect updates. */ @@ -435,43 +423,90 @@ typedef struct { /* * The stats data structure. + * * There's only one updater that runs on timer. It calculates the latest * percentiles and updates the stored values. There are multiple readers * of the stored values. The stored values of the latest percentiles are * a shared resource that needs a lock to access. An array of two entries * is used to decrease the lock contention. Readers read the stored values - * at @prcntl[@rdidx % 2]. The writer writes the new percentile values to - * @prcntl[(@rdidx + 1) % 2], and then increments @rdidx. The reading and + * at @asent[@rdidx % 2]. The writer writes the new percentile values to + * @asent[(@rdidx + 1) % 2], and then increments @rdidx. The reading and * the writing are protected by a rwlock. + * * @asent - The stats entries for reading/writing (flip-flop manner). - * @rdidx - The current index in @prcntl for readers. + * @rdidx - The current index in @asent for readers. 
*/ typedef struct { TfwApmSEnt asent[2]; atomic_t rdidx; } TfwApmStats; +/* + * An update buffer entry that holds RTT data for updates. + * + * The value of @centry depends on @jtstamp that comes as part of data + * for the update. Ideally, @jtstamp and @rtt would be stored instead + * of @centry and @rtt. However, together they occupy more than 64 bits, + * and it's highly desirable to read/write them in a single operation. + * + * @centry - The entry number in the array of ring buffer entries. + * @rtt - The RTT of the message, in milliseconds. + */ +typedef union { + struct { + unsigned int centry; + unsigned int rtt; + } __attribute__((packed)); + uint64_t data; +} TfwApmUBEnt; + +/* + * The buffer that holds RTT data for updates, per CPU. + * + * The data for an update is stored in an array per CPU. The actual + * updates,and then the percentile recalculation is done periodically + * by a single thread, which removes concurrency between updates and + * the calculation. The data for an update is stored in one array of + * the two, while the processing thread processes the accumulated + * data in the other array. The switch between these two arrays is + * managed by way of @counter by the processing thread. + * + * @ubent - Arrays of ring buffer entries (flip-flop manner). + * @ubufsz - The size of @ubent. + * @counter - The counter that controls which @ubent to use. + */ +typedef struct { + TfwApmUBEnt *ubent[2]; + size_t ubufsz; + atomic64_t counter; +} TfwApmUBuf; /* * APM Data structure. + * * Note that the organization of the supporting data heavily depends * on the fact that there's only one party that does the calculation * of percentiles - the function that runs periodically on timer. * If there are several different parties that do the calculation, * then the data may need to be organized differently. + * * @rbuf - The ring buffer for the specified time window. * @rbctl - The control data helpful in taking optimizations. 
* @stats - The latest percentiles. + * @ubuf - The buffer that holds data for updates, per CPU. * @timer - The periodic timer handle. * @flags - The atomic flags (see below). */ #define TFW_APM_DATA_F_REARM (0x0001) /* Re-arm the timer. */ #define TFW_APM_DATA_F_RECALC (0x0002) /* Need to recalculate. */ -#define TFW_APM_TIMER_TIMEOUT (HZ/20) /* The timer periodicity. */ + +#define TFW_APM_TIMER_INTVL (HZ / 20) +#define TFW_APM_UBUF_SZ TFW_APM_TIMER_INTVL /* a slot per ms. */ typedef struct { TfwApmRBuf rbuf; TfwApmRBCtl rbctl; TfwApmStats stats; + TfwApmUBuf __percpu *ubuf; struct timer_list timer; unsigned long flags; } TfwApmData; @@ -487,11 +522,6 @@ static const TfwPcntCtl __read_mostly tfw_rngctl_init[TFW_STATS_RANGES] = { {{4, 109, 349}} }; -/* A superset of percentiles for all users. */ -static const TfwPrcntl __read_mostly tfw_apm_prcntl[] = { - {50}, {75}, {90}, {95}, {99} -}; - static int tfw_apm_jtmwindow; /* Time window in jiffies. */ static int tfw_apm_jtmintrvl; /* Time interval in jiffies. */ static int tfw_apm_tmwscale; /* Time window scale. */ @@ -503,6 +533,7 @@ static int tfw_apm_tmwscale; /* Time window scale. */ */ /* * Ring buffer entry state structure. + * * @v - The response time value. * @i - The current sequential bucket number across all ranges. * @r - The current range number. 
@@ -539,7 +570,8 @@ __tfw_apm_state_next(TfwPcntRanges *rng, TfwApmRBEState *st) return; } } - __tfw_apm_state_set(st, USHRT_MAX, i, r, b); + __tfw_apm_state_set(st, USHRT_MAX, TFW_STATS_RANGES * TFW_STATS_BCKTS, + TFW_STATS_RANGES, TFW_STATS_BCKTS); } static inline void @@ -570,8 +602,13 @@ tfw_apm_state_next(TfwPcntRanges *rng, TfwApmRBEState *st) static int tfw_apm_prnctl_calc(TfwApmRBuf *rbuf, TfwApmRBCtl *rbctl, TfwPrcntlStats *pstats) { +#define IDX_MIN TFW_PSTATS_IDX_MIN +#define IDX_MAX TFW_PSTATS_IDX_MAX +#define IDX_AVG TFW_PSTATS_IDX_AVG +#define IDX_ITH TFW_PSTATS_IDX_ITH + int i, p; - unsigned long cnt = 0, val, pval[pstats->prcntlsz]; + unsigned long cnt = 0, val, pval[pstats->psz]; TfwApmRBEState st[rbuf->rbufsz]; TfwPcntRanges *pcntrng; TfwApmRBEnt *rbent = rbuf->rbent; @@ -582,12 +619,12 @@ tfw_apm_prnctl_calc(TfwApmRBuf *rbuf, TfwApmRBCtl *rbctl, TfwPrcntlStats *pstats __tfw_apm_state_next(pcntrng, &st[i]); } /* The number of items to collect for each percentile. */ - for (i = 0, p = 0; i < pstats->prcntlsz; ++i) { - pval[i] = rbctl->total_cnt * pstats->prcntl[i].ith / 100; + for (i = p = IDX_ITH; i < pstats->psz; ++i) { + pval[i] = rbctl->total_cnt * pstats->ith[i] / 100; if (!pval[i]) - pstats->prcntl[p++].val = 0; + pstats->val[p++] = 0; } - while (p < pstats->prcntlsz) { + while (p < pstats->psz) { int v_min = USHRT_MAX; for (i = 0; i < rbuf->rbufsz; i++) { if (st[i].v < v_min) @@ -602,8 +639,10 @@ tfw_apm_prnctl_calc(TfwApmRBuf *rbuf, TfwApmRBCtl *rbctl, TfwPrcntlStats *pstats "cnt [%lu] total_cnt [%lu]\n", __func__, cnt, rbctl->total_cnt); TFW_DBG3("%s: [%lu] [%lu] [%lu] [%lu] [%lu] [%lu]\n", - __func__, pval[0], pval[1], pval[2], - pval[3], pval[4], pval[5]); + __func__, + pval[IDX_ITH], pval[IDX_ITH + 1], + pval[IDX_ITH + 2], pval[IDX_ITH + 3], + pval[IDX_ITH + 4], pval[IDX_ITH + 5]); break; } for (i = 0; i < rbuf->rbufsz; i++) { @@ -613,25 +652,30 @@ tfw_apm_prnctl_calc(TfwApmRBuf *rbuf, TfwApmRBCtl *rbctl, TfwPrcntlStats *pstats cnt += 
atomic_read(&pcntrng->cnt[st[i].r][st[i].b]); tfw_apm_state_next(pcntrng, &st[i]); } - for ( ; p < pstats->prcntlsz && pval[p] <= cnt; ++p) - pstats->prcntl[p].val = v_min; + for ( ; p < pstats->psz && pval[p] <= cnt; ++p) + pstats->val[p] = v_min; } cnt = val = 0; - pstats->max = 0; - pstats->min = UINT_MAX; + pstats->val[IDX_MAX] = 0; + pstats->val[IDX_MIN] = UINT_MAX; for (i = 0; i < rbuf->rbufsz; i++) { pcntrng = &rbent[i].pcntrng; - if (pstats->min > atomic_read(&pcntrng->min_val)) - pstats->min = atomic_read(&pcntrng->min_val); - if (pstats->max < atomic_read(&pcntrng->max_val)) - pstats->max = atomic_read(&pcntrng->max_val); + if (pstats->val[IDX_MIN] > atomic_read(&pcntrng->min_val)) + pstats->val[IDX_MIN] = atomic_read(&pcntrng->min_val); + if (pstats->val[IDX_MAX] < atomic_read(&pcntrng->max_val)) + pstats->val[IDX_MAX] = atomic_read(&pcntrng->max_val); cnt += atomic64_read(&pcntrng->tot_cnt); val += atomic64_read(&pcntrng->tot_val); } if (likely(cnt)) - pstats->avg = val / cnt; + pstats->val[IDX_AVG] = val / cnt; return p; + +#undef IDX_ITH +#undef IDX_AVG +#undef IDX_MAX +#undef IDX_MIN } /* @@ -701,8 +745,9 @@ tfw_apm_rbctl_update(TfwApmData *data, int recalc) for (i = 0; i < rbuf->rbufsz; ++i) total_cnt += atomic64_read(&rbent[i].pcntrng.tot_cnt); + entry_cnt = atomic64_read(&rbent[centry].pcntrng.tot_cnt); - rbctl->entry_cnt = 0; + rbctl->entry_cnt = entry_cnt; rbctl->total_cnt = total_cnt; rbctl->jtmwstamp = jtmwstart; @@ -743,81 +788,46 @@ tfw_apm_rbctl_update(TfwApmData *data, int recalc) /* * Calculate the latest percentiles if necessary. * - * Return the number of percentile values that have been filled - * if potentially new percentile values were calculated. - * Return 0 if the percentile values didn't need the recalculation. - * REturn -1 of the recalculation could not be performed. + * Return 0 if the calculation is successful. + * Return < 0 if there was a system error. + * Return > 0 and < @prcntlsz if the calculation is incomplete. 
*/ static int -__tfw_apm_calc(TfwApmData *data, TfwPrcntlStats *pstats, int recalc) -{ - int ret; - - if (!spin_trylock(&data->rbuf.slock)) - return -1; - if ((ret = tfw_apm_rbctl_update(data, recalc))) - ret = tfw_apm_prnctl_calc(&data->rbuf, &data->rbctl, pstats); - spin_unlock(&data->rbuf.slock); - - return ret; -} - -/* - * Calculate the latest percentiles if necessary. - * - * Note that this function may also be used concurrently by other users - * than the kernel timer function in this module, should the need arise. - * That should only be done in exceptional cases (like testing), because - * it would increase @data->rbuf->slock lock contention. - */ -static void tfw_apm_calc(TfwApmData *data) { - int nfilled, wridx, recalc; - TfwPrcntl prcntl[ARRAY_SIZE(tfw_apm_prcntl)]; - TfwPrcntlStats pstats = { prcntl, ARRAY_SIZE(prcntl) }; + int nfilled, recalc; + unsigned int rdidx; + unsigned int val[ARRAY_SIZE(tfw_pstats_ith)] = { 0 }; + TfwPrcntlStats pstats = { + .ith = tfw_pstats_ith, + .val = val, + .psz = ARRAY_SIZE(tfw_pstats_ith) + }; TfwApmSEnt *asent; - memcpy(prcntl, tfw_apm_prcntl, sizeof(tfw_apm_prcntl)); - - wridx = ((unsigned int)atomic_read(&data->stats.rdidx) + 1) % 2; - asent = &data->stats.asent[wridx]; + rdidx = atomic_read(&data->stats.rdidx); + asent = &data->stats.asent[(rdidx + 1) % 2]; recalc = test_and_clear_bit(TFW_APM_DATA_F_RECALC, &data->flags); - nfilled = __tfw_apm_calc(data, &pstats, recalc); + if (!tfw_apm_rbctl_update(data, recalc)) + return 0; + nfilled = tfw_apm_prnctl_calc(&data->rbuf, &data->rbctl, &pstats); if (!nfilled) - return; + return 0; - if (nfilled < asent->pstats.prcntlsz) { + if (nfilled < asent->pstats.psz) { TFW_DBG3("%s: Percentile calculation incomplete.\n", __func__); set_bit(TFW_APM_DATA_F_RECALC, &data->flags); } else { TFW_DBG3("%s: Percentile values may have changed.\n", __func__); write_lock(&asent->rwlock); - memcpy(asent->pstats.prcntl, prcntl, - asent->pstats.prcntlsz * sizeof(TfwPrcntl)); - 
asent->pstats.min = pstats.min; - asent->pstats.max = pstats.max; - asent->pstats.avg = pstats.avg; + memcpy(asent->pstats.val, pstats.val, + asent->pstats.psz * sizeof(asent->pstats.val[0])); atomic_inc(&data->stats.rdidx); write_unlock(&asent->rwlock); } -} - -/* - * Calculate the latest percentiles if necessary. - * Runs periodically on timer. - */ -static void -tfw_apm_prcntl_fn(unsigned long fndata) -{ - TfwApmData *data = (TfwApmData *)fndata; - tfw_apm_calc(data); - - smp_mb__before_atomic(); - if (test_bit(TFW_APM_DATA_F_REARM, &data->flags)) - mod_timer(&data->timer, jiffies + TFW_APM_TIMER_TIMEOUT); + return nfilled % asent->pstats.psz; } /* @@ -831,22 +841,19 @@ tfw_apm_prcntl_fn(unsigned long fndata) * tfw_apm_stats_bh() should be used for calls in user context. */ #define __tfw_apm_stats_body(apmdata, pstats, fn_lock, fn_unlock) \ - int rdidx, seq = pstats->seq; \ + unsigned int rdidx, seq = pstats->seq; \ TfwApmData *data = apmdata; \ TfwApmSEnt *asent; \ \ BUG_ON(!apmdata); \ \ smp_mb__before_atomic(); \ - rdidx = (unsigned int)atomic_read(&data->stats.rdidx) % 2; \ - asent = &data->stats.asent[rdidx]; \ + rdidx = atomic_read(&data->stats.rdidx); \ + asent = &data->stats.asent[rdidx % 2]; \ \ fn_lock(&asent->rwlock); \ - memcpy(pstats->prcntl, asent->pstats.prcntl, \ - pstats->prcntlsz * sizeof(TfwPrcntl)); \ - pstats->min = asent->pstats.min; \ - pstats->max = asent->pstats.max; \ - pstats->avg = asent->pstats.avg; \ + memcpy(pstats->val, asent->pstats.val, \ + pstats->psz * sizeof(pstats->val[0])); \ fn_unlock(&asent->rwlock); \ pstats->seq = rdidx; \ \ @@ -863,6 +870,7 @@ tfw_apm_stats(void *apmdata, TfwPrcntlStats *pstats) { __tfw_apm_stats_body(apmdata, pstats, read_lock, read_unlock); } +EXPORT_SYMBOL(tfw_apm_stats); /* * Verify that an APM Stats user using the same set of percentiles. @@ -871,58 +879,101 @@ tfw_apm_stats(void *apmdata, TfwPrcntlStats *pstats) * All APM Stats users must use the same set of percentiles. 
*/ int -tfw_apm_prcntl_verify(TfwPrcntl *prcntl, unsigned int prcntlsz) +tfw_apm_pstats_verify(TfwPrcntlStats *pstats) { int i; - if (prcntlsz != ARRAY_SIZE(tfw_apm_prcntl)) + if (pstats->psz != ARRAY_SIZE(tfw_pstats_ith)) return 1; - for (i = 0; i < prcntlsz; ++i) - if (prcntl[i].ith != tfw_apm_prcntl[i].ith) + for (i = 0; i < pstats->psz; ++i) + if (pstats->ith[i] != tfw_pstats_ith[i]) return 1; return 0; } -static inline void -__tfw_apm_update(TfwApmRBuf *rbuf, unsigned long jtstamp, unsigned int rtt) +/* + * Calculate the latest percentiles if necessary. + * Runs periodically on timer. + */ +static void +tfw_apm_prcntl_tmfn(unsigned long fndata) { - int centry = (jtstamp / tfw_apm_jtmintrvl) % rbuf->rbufsz; + int i, icpu, updone = 0; + TfwApmData *data = (TfwApmData *)fndata; + TfwApmRBuf *rbuf = &data->rbuf; + TfwApmRBEnt *rbent = rbuf->rbent; + + BUG_ON(!fndata); + + /* + * Increment the counter and make the updates use the other array + * of the two that are available. In the meanwhile, use the array + * filled with updates to process them and calculate percentiles. 
+ */ + for_each_online_cpu(icpu) { + TfwApmUBuf *ubuf = per_cpu_ptr(data->ubuf, icpu); + unsigned long idxval = atomic64_inc_return(&ubuf->counter); + TfwApmUBEnt *ubent = ubuf->ubent[(idxval - 1) % 2]; + TfwApmUBEnt rtt_data; + + for (i = 0; i < ubuf->ubufsz; ++i) { + rtt_data.data = READ_ONCE(ubent[i].data); + if (rtt_data.data == ULONG_MAX) + continue; + WRITE_ONCE(ubent[i].data, ULONG_MAX); + tfw_stats_update(&rbent[rtt_data.centry].pcntrng, + rtt_data.rtt); + ++updone; + } + } + if (updone && unlikely(tfw_apm_calc(data))) { + TFW_DBG2("%s: Incomplete calculation\n", __func__); + } + + smp_mb(); + if (test_bit(TFW_APM_DATA_F_REARM, &data->flags)) + mod_timer(&data->timer, jiffies + TFW_APM_TIMER_INTVL); +} + +static void +__tfw_apm_update(TfwApmData *data, unsigned long jtstamp, unsigned long rtt) +{ + TfwApmUBuf *ubuf = this_cpu_ptr(data->ubuf); + unsigned long idxval = atomic64_add_return(0, &ubuf->counter); + TfwApmUBEnt *ubent = ubuf->ubent[idxval % 2]; + int centry = (jtstamp / tfw_apm_jtmintrvl) % data->rbuf.rbufsz; unsigned long jtmistart = jtstamp - (jtstamp % tfw_apm_jtmintrvl); - TfwApmRBEnt *crbent = &rbuf->rbent[centry]; + TfwApmUBEnt rtt_data = { .centry = centry, .rtt = rtt }; - tfw_apm_rbent_checkreset(crbent, jtmistart); - tfw_stats_update(&crbent->pcntrng, rtt, &rbuf->slock); + tfw_apm_rbent_checkreset(&data->rbuf.rbent[centry], jtmistart); + WRITE_ONCE(ubent[jtstamp % ubuf->ubufsz].data, rtt_data.data); } void -tfw_apm_update(void *apmdata, unsigned long jtstamp, unsigned long jrtt) +tfw_apm_update(void *apmref, unsigned long jtstamp, unsigned long jrtt) { unsigned int rtt = jiffies_to_msecs(jrtt); - BUG_ON(!apmdata); + BUG_ON(!apmref); /* * APM stats can't handle response times that are greater than * the maximum value possible for TfwPcntCtl{}->end. Currently * the value is USHRT_MAX which is about 65 secs in milliseconds. 
*/ if (likely(rtt < (1UL << FIELD_SIZEOF(TfwPcntCtl, end) * 8))) - __tfw_apm_update(&((TfwApmData *)apmdata)->rbuf, jtstamp, rtt); + __tfw_apm_update(apmref, jtstamp, rtt); } -/* - * Destroy the specified APM ring buffer. - */ -void -tfw_apm_destroy(void *apmdata) +static void +tfw_apm_destroy(TfwApmData *data) { - TfwApmData *data = apmdata; - - if (!data) - return; - clear_bit(TFW_APM_DATA_F_REARM, &data->flags); - smp_mb__after_atomic(); - del_timer_sync(&data->timer); + int icpu; + for_each_online_cpu(icpu) { + TfwApmUBuf *ubuf = per_cpu_ptr(data->ubuf, icpu); + kfree(ubuf->ubent[0]); + } + free_percpu(data->ubuf); kfree(data); } @@ -938,16 +989,19 @@ tfw_apm_rbent_init(TfwApmRBEnt *rbent, unsigned long jtmistamp) /* * Create and initialize an APM ring buffer for a server. + * + * Note that due to specifics of Tempesta start up process this code + * is executed in SoftIRQ context (so that sleeping is not allowed). */ void * tfw_apm_create(void) { TfwApmData *data; TfwApmRBEnt *rbent; - TfwPrcntl *prcntl[2]; - int i, size; + int i, icpu, size; + unsigned int *val[2]; int rbufsz = tfw_apm_tmwscale; - int prcntlsz = ARRAY_SIZE(tfw_apm_prcntl); + int psz = ARRAY_SIZE(tfw_pstats_ith); if (!tfw_apm_tmwscale) { TFW_ERR("Late initialization of 'apm_stats' option\n"); @@ -955,43 +1009,100 @@ tfw_apm_create(void) } /* Keep complete stats for the full time window. */ - size = sizeof(TfwApmData) + rbufsz * sizeof(TfwApmRBEnt) - + 2 * sizeof(tfw_apm_prcntl); + size = sizeof(TfwApmData) + + rbufsz * sizeof(TfwApmRBEnt) + + 2 * psz * sizeof(unsigned int); if ((data = kzalloc(size, GFP_ATOMIC)) == NULL) return NULL; + size = sizeof(TfwApmUBuf); + data->ubuf = __alloc_percpu_gfp(size, sizeof(int64_t), GFP_ATOMIC); + if (!data->ubuf) { + kfree(data); + return NULL; + } + /* Set up memory areas. 
*/ rbent = (TfwApmRBEnt *)(data + 1); - prcntl[0] = (TfwPrcntl *)(rbent + rbufsz); - prcntl[1] = (TfwPrcntl *)(prcntl[0] + prcntlsz); + val[0] = (unsigned int *)(rbent + rbufsz); + val[1] = (unsigned int *)(val[0] + psz); data->rbuf.rbent = rbent; data->rbuf.rbufsz = rbufsz; - data->stats.asent[0].pstats.prcntl = prcntl[0]; - data->stats.asent[0].pstats.prcntlsz = prcntlsz; + data->stats.asent[0].pstats.ith = tfw_pstats_ith; + data->stats.asent[0].pstats.val = val[0]; + data->stats.asent[0].pstats.psz = psz; - data->stats.asent[1].pstats.prcntl = prcntl[1]; - data->stats.asent[1].pstats.prcntlsz = prcntlsz; + data->stats.asent[1].pstats.ith = tfw_pstats_ith; + data->stats.asent[1].pstats.val = val[1]; + data->stats.asent[1].pstats.psz = psz; /* Initialize data. */ for (i = 0; i < rbufsz; ++i) tfw_apm_rbent_init(&rbent[i], 0); spin_lock_init(&data->rbuf.slock); - memcpy(prcntl[0], tfw_apm_prcntl, sizeof(tfw_apm_prcntl)); - memcpy(prcntl[1], tfw_apm_prcntl, sizeof(tfw_apm_prcntl)); - rwlock_init(&data->stats.asent[0].rwlock); rwlock_init(&data->stats.asent[1].rwlock); atomic_set(&data->stats.rdidx, 0); + size = 2 * TFW_APM_UBUF_SZ * sizeof(TfwApmUBEnt); + for_each_online_cpu(icpu) { + TfwApmUBEnt *ubent; + TfwApmUBuf *ubuf = per_cpu_ptr(data->ubuf, icpu); + ubent = kmalloc_node(size, GFP_ATOMIC, cpu_to_node(icpu)); + if (!ubent) + goto cleanup; + for (i = 0; i < 2 * TFW_APM_UBUF_SZ; ++i) + WRITE_ONCE(ubent[i].data, ULONG_MAX); + ubuf->ubent[0] = ubent; + ubuf->ubent[1] = ubent + TFW_APM_UBUF_SZ; + ubuf->ubufsz = TFW_APM_UBUF_SZ; + } + + return data; + +cleanup: + tfw_apm_destroy(data); + return NULL; +} + +int +tfw_apm_add_srv(TfwServer *srv) +{ + TfwApmData *data; + + BUG_ON(srv->apmref); + + if (!(data = tfw_apm_create())) + return -ENOMEM; + /* Start the timer for the percentile calculation. 
*/ set_bit(TFW_APM_DATA_F_REARM, &data->flags); - setup_timer(&data->timer, tfw_apm_prcntl_fn, (unsigned long)data); - mod_timer(&data->timer, jiffies + TFW_APM_TIMER_TIMEOUT); + setup_timer(&data->timer, tfw_apm_prcntl_tmfn, (unsigned long)data); + mod_timer(&data->timer, jiffies + TFW_APM_TIMER_INTVL); - return data; + srv->apmref = data; + + return 0; +} + +void +tfw_apm_del_srv(TfwServer *srv) +{ + TfwApmData *data = srv->apmref; + + if (!data) + return; + + /* Stop the timer and the percentile calculation. */ + clear_bit(TFW_APM_DATA_F_REARM, &data->flags); + smp_mb__after_atomic(); + del_timer_sync(&data->timer); + + tfw_apm_destroy(data); + srv->apmref = NULL; } #define TFW_APM_MIN_TMWSCALE 1 /* Minimum time window scale. */ diff --git a/tempesta_fw/apm.h b/tempesta_fw/apm.h index f5ca5fbbe..3f529def2 100644 --- a/tempesta_fw/apm.h +++ b/tempesta_fw/apm.h @@ -21,38 +21,48 @@ #define __TFW_APM_H__ #include "pool.h" +#include "server.h" /* - * @ith - percentile number. - * @val - percentile value. + * @ith - array of percentile numbers, with space for min/max/avg; + * @val - array of percentile values, and values for min/max/avg; + * @psz - size of @ith and @val arrays; + * @seq - opaque data related to percentiles calculation; */ typedef struct { - unsigned int ith; - unsigned int val; -} TfwPrcntl; - -/* - * @stats - Percentile Stats array. - * @stsz - @stats array size. - * @min - Minimal value. - * @max - Maximal value. - * @avg - Average value. - * @seq - opaque data related to percentiles calculation. 
- */ -typedef struct { - TfwPrcntl *prcntl; - unsigned int prcntlsz; - unsigned int min; - unsigned int max; - unsigned int avg; - unsigned int seq; + const unsigned int *ith; + unsigned int *val; + unsigned int psz; + unsigned int seq; } TfwPrcntlStats; -void *tfw_apm_create(void); -void tfw_apm_destroy(void *data); -void tfw_apm_update(void *data, unsigned long jtstamp, unsigned long jrtime); -int tfw_apm_stats(void *data, TfwPrcntlStats *pstats); -int tfw_apm_stats_bh(void *data, TfwPrcntlStats *pstats); -int tfw_apm_prcntl_verify(TfwPrcntl *prcntl, unsigned int prcntlsz); +enum { + TFW_PSTATS_IDX_MIN = 0, + TFW_PSTATS_IDX_MAX, + TFW_PSTATS_IDX_AVG, + TFW_PSTATS_IDX_ITH, + TFW_PSTATS_IDX_P50 = TFW_PSTATS_IDX_ITH, + TFW_PSTATS_IDX_P75, + TFW_PSTATS_IDX_P90, + TFW_PSTATS_IDX_P95, + TFW_PSTATS_IDX_P99, + _TFW_PSTATS_IDX_COUNT +}; + +static const unsigned int __read_mostly tfw_pstats_ith[] = { + [TFW_PSTATS_IDX_MIN ... TFW_PSTATS_IDX_AVG] = 0, + [TFW_PSTATS_IDX_P50] = 50, + [TFW_PSTATS_IDX_P75] = 75, + [TFW_PSTATS_IDX_P90] = 90, + [TFW_PSTATS_IDX_P95] = 95, + [TFW_PSTATS_IDX_P99] = 99, +}; + +int tfw_apm_add_srv(TfwServer *srv); +void tfw_apm_del_srv(TfwServer *srv); +void tfw_apm_update(void *apmref, unsigned long jtstamp, unsigned long jrtime); +int tfw_apm_stats(void *apmref, TfwPrcntlStats *pstats); +int tfw_apm_stats_bh(void *apmref, TfwPrcntlStats *pstats); +int tfw_apm_pstats_verify(TfwPrcntlStats *pstats); #endif /* __TFW_APM_H__ */ diff --git a/tempesta_fw/http.c b/tempesta_fw/http.c index 9435e5357..6fb1955fe 100644 --- a/tempesta_fw/http.c +++ b/tempesta_fw/http.c @@ -344,7 +344,7 @@ tfw_http_send_502(TfwHttpReq *req, const char *reason) .flags = 4 << TFW_STR_CN_SHIFT }; - TFW_DBG("Send HTTP 502 response: %s:\n", reason); + TFW_DBG("Send HTTP 502 response: %s\n", reason); return tfw_http_send_resp(req, &rh, __TFW_STR_CH(&rh, 1)); } @@ -369,7 +369,7 @@ tfw_http_send_504(TfwHttpReq *req, const char *reason) .flags = 4 << TFW_STR_CN_SHIFT }; - TFW_DBG("Send 
HTTP 504 response: %s:\n", reason); + TFW_DBG("Send HTTP 504 response: %s\n", reason); return tfw_http_send_resp(req, &rh, __TFW_STR_CH(&rh, 1)); } @@ -2178,7 +2178,7 @@ tfw_http_resp_cache_cb(TfwHttpReq *req, TfwHttpResp *resp) * value of RTT has an upper boundary in the APM. */ if (resp->conn) - tfw_apm_update(((TfwServer *)resp->conn->peer)->apm, + tfw_apm_update(((TfwServer *)resp->conn->peer)->apmref, resp->jrxtstamp, resp->jrxtstamp - req->jtxtstamp); tfw_http_resp_fwd(req, resp); diff --git a/tempesta_fw/procfs.c b/tempesta_fw/procfs.c index 29d1ad4fa..8b18a0da1 100644 --- a/tempesta_fw/procfs.c +++ b/tempesta_fw/procfs.c @@ -141,43 +141,48 @@ tfw_perfstat_seq_open(struct inode *inode, struct file *file) return single_open(file, tfw_perfstat_seq_show, PDE_DATA(inode)); } -/* - * Individual server statistics. Note that 50% percentile - * is used to tell the median value. - */ -static const TfwPrcntl __read_mostly tfw_procfs_prcntl[] = { - {50}, {75}, {90}, {95}, {99} -}; - static int tfw_srvstats_seq_show(struct seq_file *seq, void *off) { #define SPRNE(m, e) seq_printf(seq, m": %dms\n", e) - int i; + size_t i, rc; TfwSrvConn *srv_conn; TfwServer *srv = seq->private; - TfwPrcntl prcntl[ARRAY_SIZE(tfw_procfs_prcntl)]; - TfwPrcntlStats pstats = { prcntl, ARRAY_SIZE(prcntl) }; + unsigned int qsize[srv->conn_n]; + unsigned int val[ARRAY_SIZE(tfw_pstats_ith)] = { 0 }; + TfwPrcntlStats pstats = { + .ith = tfw_pstats_ith, + .val = val, + .psz = ARRAY_SIZE(tfw_pstats_ith) + }; - memcpy(prcntl, tfw_procfs_prcntl, sizeof(prcntl)); + tfw_apm_stats_bh(srv->apmref, &pstats); - tfw_apm_stats_bh(srv->apm, &pstats); + SPRNE("Minimal response time\t\t", pstats.val[TFW_PSTATS_IDX_MIN]); + SPRNE("Average response time\t\t", pstats.val[TFW_PSTATS_IDX_AVG]); + SPRNE("Median response time\t\t", pstats.val[TFW_PSTATS_IDX_P50]); + SPRNE("Maximum response time\t\t", pstats.val[TFW_PSTATS_IDX_MAX]); - SPRNE("Minimal response time\t\t", pstats.min); - SPRNE("Average response time\t\t", 
pstats.avg); - SPRNE("Median response time\t\t", prcntl[0].val); - SPRNE("Maximum response time\t\t", pstats.max); seq_printf(seq, "Percentiles\n"); - for (i = 0; i < ARRAY_SIZE(prcntl); ++i) - seq_printf(seq, "\t%02d%%:\t%dms\n", - prcntl[i].ith, prcntl[i].val); - i = 0; + for (i = TFW_PSTATS_IDX_ITH; i < ARRAY_SIZE(tfw_pstats_ith); ++i) + seq_printf(seq, "%02d%%:\t%dms\n", + pstats.ith[i], pstats.val[i]); + + i = rc = 0; + list_for_each_entry(srv_conn, &srv->conn_list, list) { + qsize[i++] = READ_ONCE(srv_conn->qsize); + if (tfw_srv_conn_restricted(srv_conn)) + rc++; + } + + seq_printf(seq, "Total schedulable connections\t: %zd\n", + srv->conn_n - rc); seq_printf(seq, "Maximum forwarding queue size\t: %d\n", srv->sg->max_qsize); - list_for_each_entry(srv_conn, &srv->conn_list, list) - seq_printf(seq, "\tConnection %03d queue size\t: %d\n", - ++i, ACCESS_ONCE(srv_conn->qsize)); + for (i = 0; i < srv->conn_n; ++i) + seq_printf(seq, "\tConnection %03zd queue size\t: %d\n", + i, qsize[i]); return 0; #undef SPRNE @@ -241,13 +246,14 @@ static int tfw_procfs_cfg_start(void) { int i, ret; - TfwPrcntl prcntl[ARRAY_SIZE(tfw_procfs_prcntl)]; - - memcpy(prcntl, tfw_procfs_prcntl, sizeof(prcntl)); + TfwPrcntlStats pstats = { + .ith = tfw_pstats_ith, + .psz = ARRAY_SIZE(tfw_pstats_ith) + }; if (!tfw_procfs_tempesta) return -ENOENT; - if (tfw_apm_prcntl_verify(prcntl, ARRAY_SIZE(prcntl))) + if (tfw_apm_pstats_verify(&pstats)) return -EINVAL; tfw_procfs_srvstats = proc_mkdir("servers", tfw_procfs_tempesta); if (!tfw_procfs_srvstats) diff --git a/tempesta_fw/sched/Makefile b/tempesta_fw/sched/Makefile index a2f805f53..bafb70706 100644 --- a/tempesta_fw/sched/Makefile +++ b/tempesta_fw/sched/Makefile @@ -20,4 +20,4 @@ EXTRA_CFLAGS += $(TFW_CFLAGS) -I$(src)/../ -I$(src)/../../tempesta_db/core EXTRA_CFLAGS += $(TTLS_CFLAGS) -obj-m = tfw_sched_hash.o tfw_sched_http.o tfw_sched_rr.o +obj-m = tfw_sched_hash.o tfw_sched_http.o tfw_sched_ratio.o diff --git 
a/tempesta_fw/sched/tfw_sched_hash.c b/tempesta_fw/sched/tfw_sched_hash.c index 8f0b46879..e809ad56d 100644 --- a/tempesta_fw/sched/tfw_sched_hash.c +++ b/tempesta_fw/sched/tfw_sched_hash.c @@ -45,34 +45,22 @@ MODULE_AUTHOR(TFW_AUTHOR); MODULE_DESCRIPTION("Tempesta hash-based scheduler"); -MODULE_VERSION("0.3.0"); +MODULE_VERSION("0.4.0"); MODULE_LICENSE("GPL"); typedef struct { size_t conn_n; TfwServer *srv; - TfwSrvConn *conn[TFW_SRV_MAX_CONN]; - unsigned long hash[TFW_SRV_MAX_CONN]; + TfwSrvConn **conn; + unsigned long *hash; } TfwHashSrv; typedef struct { + size_t conn_n; size_t srv_n; - TfwHashSrv srvs[TFW_SG_MAX_SRV]; + TfwHashSrv *srvs; } TfwHashSrvList; -static void -tfw_sched_hash_alloc_data(TfwSrvGroup *sg) -{ - sg->sched_data = kzalloc(sizeof(TfwHashSrvList), GFP_KERNEL); - BUG_ON(!sg->sched_data); -} - -static void -tfw_sched_hash_free_data(TfwSrvGroup *sg) -{ - kfree(sg->sched_data); -} - static unsigned long __calc_conn_hash(TfwServer *srv, size_t conn_idx) { @@ -101,39 +89,6 @@ __calc_conn_hash(TfwServer *srv, size_t conn_idx) return hash_long(hash ^ conn_idx, BITS_PER_LONG); } -static void -tfw_sched_hash_add_conn(TfwSrvGroup *sg, TfwServer *srv, TfwSrvConn *conn) -{ - size_t s, c; - TfwHashSrv *srv_cl; - TfwHashSrvList *sl = sg->sched_data; - - BUG_ON(!sl); - - for (s = 0; s < sl->srv_n; ++s) - if (sl->srvs[s].srv == srv) - break; - if (s == sl->srv_n) { - sl->srvs[s].srv = srv; - ++sl->srv_n; - BUG_ON(sl->srv_n > TFW_SG_MAX_SRV); - srv->sched_data = &sl->srvs[s]; - } - - srv_cl = &sl->srvs[s]; - - for (c = 0; c < srv_cl->conn_n; ++c) - if (srv_cl->conn[c] == conn) { - TFW_WARN("sched_hash: Try to add existing connection," - " srv=%zu conn=%zu\n", s, c); - return; - } - srv_cl->conn[c] = conn; - srv_cl->hash[c] = __calc_conn_hash(srv, s * TFW_SRV_MAX_CONN + c); - ++srv_cl->conn_n; - BUG_ON(srv_cl->conn_n > TFW_SRV_MAX_CONN); -} - static inline void __find_best_conn(TfwSrvConn **best_conn, TfwHashSrv *srv_cl, unsigned long *best_weight, unsigned 
long msg_hash) @@ -186,9 +141,9 @@ __find_best_conn(TfwSrvConn **best_conn, TfwHashSrv *srv_cl, static TfwSrvConn * tfw_sched_hash_get_sg_conn(TfwMsg *msg, TfwSrvGroup *sg) { - unsigned long msg_hash; - unsigned long tries = TFW_SG_MAX_CONN; TfwHashSrvList *sl = sg->sched_data; + unsigned long msg_hash; + unsigned long tries = sl->conn_n; BUG_ON(!sl); @@ -249,12 +204,93 @@ tfw_sched_hash_get_srv_conn(TfwMsg *msg, TfwServer *srv) return NULL; } +static void +tfw_sched_hash_del_grp(TfwSrvGroup *sg) +{ + size_t si; + TfwHashSrvList *sl = sg->sched_data; + + if (!sl) + return; + + for (si = 0; si < sl->srv_n; ++si) + if (sl->srvs[si].conn) + kfree(sl->srvs[si].conn); + kfree(sl); + sg->sched_data = NULL; +} + +static int +tfw_sched_hash_add_grp(TfwSrvGroup *sg) +{ + int ret = -EINVAL; + size_t size, si, ci; + unsigned int sum_conn_n; + TfwServer *srv; + TfwHashSrv *hsrv; + TfwHashSrvList *sl; + + if (unlikely(!sg->srv_n || list_empty(&sg->srv_list))) + return -EINVAL; + + size = sizeof(TfwHashSrvList) + sizeof(TfwHashSrv) * sg->srv_n; + if (!(sg->sched_data = kzalloc(size, GFP_KERNEL))) + return -ENOMEM; + sl = sg->sched_data; + sl->srvs = sg->sched_data + sizeof(TfwHashSrvList); + sl->srv_n = sg->srv_n; + + si = sum_conn_n = 0; + hsrv = sl->srvs; + list_for_each_entry(srv, &sg->srv_list, list) { + TfwSrvConn **conn, *srv_conn; + unsigned long *hash; + + if (unlikely((si++ == sg->srv_n) || !srv->conn_n + || list_empty(&srv->conn_list))) + goto cleanup; + + size = (sizeof(hsrv->conn[0]) + sizeof(hsrv->hash[0])) + * srv->conn_n; + if (!(hsrv->conn = kzalloc(size, GFP_KERNEL))) { + ret = -ENOMEM; + goto cleanup; + } + hsrv->hash = (typeof(hsrv->hash))(hsrv->conn + srv->conn_n); + + ci = 0; + conn = hsrv->conn; + hash = hsrv->hash; + list_for_each_entry(srv_conn, &srv->conn_list, list) { + if (unlikely(ci++ == srv->conn_n)) + goto cleanup; + ++sum_conn_n; + *conn++ = srv_conn; + *hash++ = __calc_conn_hash(srv, sum_conn_n); + } + if (unlikely(ci != srv->conn_n)) + goto 
cleanup; + hsrv->conn_n = srv->conn_n; + hsrv->srv = srv; + srv->sched_data = hsrv; + ++hsrv; + } + if (unlikely(si != sg->srv_n)) + goto cleanup; + sl->conn_n = sum_conn_n; + + return 0; + +cleanup: + tfw_sched_hash_del_grp(sg); + return ret; +} + static TfwScheduler tfw_sched_hash = { .name = "hash", .list = LIST_HEAD_INIT(tfw_sched_hash.list), - .add_grp = tfw_sched_hash_alloc_data, - .del_grp = tfw_sched_hash_free_data, - .add_conn = tfw_sched_hash_add_conn, + .add_grp = tfw_sched_hash_add_grp, + .del_grp = tfw_sched_hash_del_grp, .sched_sg_conn = tfw_sched_hash_get_sg_conn, .sched_srv_conn = tfw_sched_hash_get_srv_conn, }; diff --git a/tempesta_fw/sched/tfw_sched_ratio.c b/tempesta_fw/sched/tfw_sched_ratio.c new file mode 100644 index 000000000..770b678dc --- /dev/null +++ b/tempesta_fw/sched/tfw_sched_ratio.c @@ -0,0 +1,1230 @@ +/** + * Tempesta FW + * + * Copyright (C) 2017 Tempesta Technologies, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#include +#include +#include + +#include "tempesta_fw.h" +#include "apm.h" +#include "log.h" +#include "server.h" + +MODULE_AUTHOR(TFW_AUTHOR); +MODULE_DESCRIPTION("Tempesta Ratio Scheduler"); +MODULE_VERSION("0.1.0"); +MODULE_LICENSE("GPL"); + +#define TFW_SCHED_RATIO_INTVL (HZ / 20) /* The timer periodicity. 
*/ + +/** + * Individual upstream server descriptor. + * + * Connections may go up or down during failover process. + * Only fully established connections are considered by scheduler. + * + * @srv - pointer to server structure. + * @conn - list of pointers to server connection structures. + * @counter - monotonic counter for choosing the next connection. + * @conn_n - number of connections to server. + * @seq - current sequence number for APM stats. + */ +typedef struct { + TfwServer *srv; + TfwSrvConn **conn; + atomic64_t counter; + size_t conn_n; + unsigned int seq; +} TfwRatioSrvDesc; + +/** + * Individual server data for scheduler. + * + * @sdidx - index of server descriptor this data is for. + * @weight - server weight. + * @cratio - current server ratio. + * @oratio - original server ratio. + */ +typedef struct { + size_t sdidx; + unsigned int weight; + unsigned int cratio; + unsigned int oratio; +} TfwRatioSrvData; + +/** + * Scheduler iteration data. + * + * @lock - must be in the same cache line for faster operations. + * @csidx - index of current server data entry. + * @reidx - index of next server data entry which ratio we need + * to reset, or @srv_n if no resetting is needed. + * @riter - ratio iteration, indicates the number of times we need + * to choose all servers before the current one until we + * can choose the current server. + * @crsum - current sum of all ratios, used to avoid scanning the + * list of servers with fully zeroed ratios. + * @orsum - original sum of all ratios, used to reset @crsum. + */ +typedef struct { + spinlock_t lock; + size_t csidx; + size_t reidx; + unsigned int riter; + unsigned long crsum; + unsigned long orsum; +} TfwRatioSchData; + +/** + * Historic (past) data unit for an individual upstream server. + * + * @cnt - count of timer function invocations. + * @rtt - RTT from APM in msecs. 
+ */ +typedef struct { + unsigned long cnt; + unsigned long rtt; +} TfwRatioHstUnit; + +/** + * Historic (past) data set for an individual upstream server. + * This is the data set for simple linear regression calculation. + * + * @coeff_a - coefficient for rtt = coeff_a + coeff_b * cnt + eps. + * @coeff_b - coefficient for rtt = coeff_a + coeff_b * cnt + eps. + * @cnt_avg - average cnt value. + * @rtt_avg - average rtt value. + * @cnt_rtt_avg - avg(cnt * rtt). + * @cnt_avg_rtt_avg - avg(cnt) * avg(rtt). + * @cnt_sq_avg - avg(cnt * cnt). + * @cnt_avg_sq - avg(cnt) * avg(cnt). + * @hist - array of history data units. + */ +typedef struct { + long coeff_a; + long coeff_b; + long cnt_avg; + long rtt_avg; + long cnt_rtt_avg; + long cnt_avg_rtt_avg; + long cnt_sq_avg; + long cnt_avg_sq; + TfwRatioHstUnit *hist; +} TfwRatioHstDesc; + +/** + * Historic (past) data for predictive scheduler. + * + * @ahead - predict for this number of @intvl ahead. + * @slot_n - total number of slots for past data. + * @counter - slot that is available for storing past data. + * @hstdesc - past data for each server (@hstdesc[@srv_n]). + */ +typedef struct { + unsigned int ahead; + size_t slot_n; + unsigned long counter; + TfwRatioHstDesc *hstdesc; +} TfwRatioHstData; + +/** + * The main Ratio Scheduler data structure. + * + * All servers, either dead or live, are present in the list during + * the whole run-time. That may change in the future. + * + * @rcu - RCU control structure. + * @srvdata - scheduler data specific to each server in the group. + * @schdata - scheduler data common to all servers in the group. + */ +typedef struct { + struct rcu_head rcu; + TfwRatioSrvData *srvdata; + TfwRatioSchData schdata; +} TfwRatioData; + +/** + * The main structure for the group. + * + * @srv_n - number of upstream servers. + * @psidx - APM pstats[] value index for dynamic ratios. + * @intvl - interval for re-arming the timer. + * @rearm - indicates if the timer can be re-armed. 
+ * @timer - periodic timer for dynamic APM data. + * @hstdata - historic data for predictive scheduler. + * @srvdesc - array of upstream server descriptors. + * @rtodata - pointer to the currently used scheduler data. + */ +typedef struct { + size_t srv_n; + size_t psidx; + unsigned int intvl; + atomic_t rearm; + struct timer_list timer; + TfwRatioHstData *hstdata; + TfwRatioSrvDesc *srvdesc; + TfwRatioData __rcu *rtodata; +} TfwRatio; + +/** + * Swap two server data entries. Required for sorting by sort(). + */ +static void +tfw_sched_ratio_srvdata_swap(void *lhs, void *rhs, int size) +{ + TfwRatioSrvData *lhs_data = (TfwRatioSrvData *)lhs; + TfwRatioSrvData *rhs_data = (TfwRatioSrvData *)rhs; + TfwRatioSrvData tmp = *lhs_data; + *lhs_data = *rhs_data; + *rhs_data = tmp; +} + +/** + * Sort server data entries by ratio in descending order. Entries + * with higher ratios are moved towards the start of the array. + */ +static int +tfw_sched_ratio_srvdata_cmp(const void *lhs, const void *rhs) +{ + unsigned int lhs_ratio = ((const TfwRatioSrvData *)lhs)->oratio; + unsigned int rhs_ratio = ((const TfwRatioSrvData *)rhs)->oratio; + + return (rhs_ratio < lhs_ratio) ? -1 : (rhs_ratio > lhs_ratio); +} + +/** + * Calculate and set up ratios for each server in the group. + * + * Return 0 if done with the ratios. + * Return a non-zero value if additional actions are needed. + */ +static int +tfw_sched_ratio_calc(TfwRatio *ratio, TfwRatioData *rtodata, + unsigned long sum_wgt, size_t max_val_idx, + size_t *arg_ovidx) +{ + size_t si, one_val_idx; + unsigned int diff, max_wgt, oratio; + unsigned long unit, sum_ratio = 0; + TfwRatioSrvData *srvdata = rtodata->srvdata; + TfwRatioSchData *schdata = &rtodata->schdata; + + /* Set up the common part of scheduler data. 
*/ + schdata->csidx = 0; + schdata->riter = 1; + schdata->reidx = ratio->srv_n; + + /* + * Calculate each server's ratio using the following formula: + * unit = (MAX_WEIGHT + SRV_NUM) * MAX_WEIGHT / sum(weight); + * ratio[i] = unit * weight[i] / MAX_WEIGHT; + * + * See if all calculated ratios are the same. Set scheduler data. + */ + diff = one_val_idx = 0; + max_wgt = srvdata[max_val_idx].weight; + unit = ((max_wgt + ratio->srv_n) * max_wgt) / sum_wgt; + for (si = 0; si < ratio->srv_n; ++si) { + oratio = (unit * srvdata[si].weight) / max_wgt ? : 1; + srvdata[si].cratio = srvdata[si].oratio = oratio; + diff |= (oratio != srvdata[0].oratio); + sum_ratio += oratio; + if ((oratio == 1) && !one_val_idx) + one_val_idx = si; + } + schdata->crsum = schdata->orsum = sum_ratio; + + /* Return the index of server data entry with value of 1. */ + *arg_ovidx = one_val_idx; + + return diff; +} + +/* + * Calculate and set up ratios for each server in a group based on + * weights that are statically defined in the configuration file. + */ +static void +tfw_sched_ratio_calc_static(TfwRatio *ratio, TfwRatioData *rtodata) +{ + unsigned long sum_wgt; + unsigned int diff; + size_t si, max_val_idx, one_val_idx; + TfwRatioSrvDesc *srvdesc = ratio->srvdesc; + TfwRatioSrvData *srvdata = rtodata->srvdata; + + /* + * Collect server weights from the configuration. Calculate the + * sum of server's weights in the group. Remember the index of + * server data entry with maximum weight. That same entry will + * also have the maximum ratio. See if all weights in the group + * are the same. 
+ */ + sum_wgt = diff = max_val_idx = 0; + for (si = 0; si < ratio->srv_n; ++si) { + unsigned int weight = srvdesc[si].srv->weight; + srvdata[si].sdidx = si; + srvdata[si].weight = weight; + srvdata[si].cratio = srvdata[si].oratio = 1; + if (srvdata[max_val_idx].weight < weight) + max_val_idx = si; + sum_wgt += weight; + diff |= (weight != srvdata[0].weight); + } + + /* + * If all server weights are the same, then there's no need to + * do anything else. Set up all ratios to 1 and be done with it. + */ + if (!diff) { + TfwRatioSchData *schdata = &rtodata->schdata; + + /* Set up the common part of scheduler data. */ + schdata->csidx = 0; + schdata->riter = 1; + schdata->reidx = ratio->srv_n; + + schdata->crsum = schdata->orsum = ratio->srv_n; + } + + /* Calculate ratios based on different weights of servers. */ + if (!tfw_sched_ratio_calc(ratio, rtodata, sum_wgt, + max_val_idx, &one_val_idx)) + return; + + /* Sort server data entries by ratio in descending order. */ + sort(srvdata, ratio->srv_n, sizeof(srvdata[0]), + tfw_sched_ratio_srvdata_cmp, tfw_sched_ratio_srvdata_swap); +} + +/** + * Calculate ratios for each server in a group based on dynamic data. + * + * Latest dynamic data is provided by APM module and represent RTT values + * for each server in a group. Ratios are calculated on those RTT values. + * However that way the ratios do not represent the real weight of each + * server. A bigger RTT value leads to a bigger ratio, while in fact that + * server is less favorable and should have a lesser, NOT bigger weight. + * + * Based on ratios calculated from RTT values, the algorithm here adjusts + * that and assigns a correct ratio to each server in the group. + * + * 1. If the minimal calculated ratio is 1, then find entries that have + * ratio of 1, and set them up with the weight and ratio of an entry + * with maximum calculated ratio. Likewise, set up entries with the + * maximum calculated ratio with weight and ratio of an entry with + * ratio of 1. 
+ * For example, this is after the calculation of ratios: + * sdidx: 1 2 3 4 5 6 7 8 9 10 + * ratio: 10 5 1 30 1 25 1 60 15 50 + * After this step the result will be: + * sdidx: 1 2 3 4 5 6 7 8 9 10 + * ratio: 10 5 60 30 60 25 60 1 15 50 + * + * 2. Sort the resulting array by ratio in descending order as required + * by the scheduling algorithm. The result will be as follows: + * sdidx: 7 5 3 10 4 6 9 1 2 8 + * ratio: 60 60 60 50 30 25 15 10 5 1 + * + * 3. Select the part of the array that omits entries from step 1 if any. + * Those are entries at the start and at the end of the array. Reverse + * the sequence of server descriptor indices in that part of the array. + * The resulting pairing of servers to ratios is the target. Servers + * with a lesser RTT are assigned a larger ratio. Servers with a larger + * RTT are assigned a lesser ratio. The result will be as follows: + * sdidx: 7 5 3 2 1 9 6 4 10 8 + * ratio: 60 60 60 50 30 25 15 10 5 1 + */ +static void +__tfw_sched_ratio_calc_dynamic(TfwRatio *ratio, TfwRatioData *rtodata, + unsigned long sum_wgt, size_t max_val_idx) +{ + size_t si, one_val_idx, left, right; + unsigned int max_ratio, has_one_val; + TfwRatioSrvData *srvdata = rtodata->srvdata; + + /* Calculate ratios based on server RTT values. */ + if (!tfw_sched_ratio_calc(ratio, rtodata, sum_wgt, + max_val_idx, &one_val_idx)) + return; + + /* + * It's guaranteed here that NOT all calculated ratio values are + * equal. See if there are ratio values that equal to 1. If so, + * do actions described in step 1 in the function's description. + * Adjust the sum of ratios that is changed in this procedure. + */ + has_one_val = (srvdata[one_val_idx].oratio == 1); + + if (has_one_val) { + unsigned long orsum = rtodata->schdata.orsum; + TfwRatioSrvData sdent_one = srvdata[one_val_idx]; + TfwRatioSrvData sdent_max = srvdata[max_val_idx]; + + /* Save maximum ratio value for future use. 
 */
	max_ratio = srvdata[max_val_idx].oratio;

	/*
	 * Swap the roles of the maximum-ratio and the ratio-of-1 entries:
	 * entries that had ratio 1 get the maximum ratio, and vice versa.
	 * @orsum is adjusted accordingly to keep the ratios' sum correct.
	 */
	for (si = 0; si < ratio->srv_n; ++si) {
		if (srvdata[si].oratio == 1) {
			srvdata[si].weight = sdent_max.weight;
			srvdata[si].oratio =
			srvdata[si].cratio = sdent_max.oratio;
			orsum += sdent_max.oratio - 1;
		} else if (srvdata[si].oratio == sdent_max.oratio) {
			srvdata[si].weight = sdent_one.weight;
			srvdata[si].oratio =
			srvdata[si].cratio = sdent_one.oratio;
			orsum -= sdent_max.oratio - 1;
		}
	}
	rtodata->schdata.crsum = rtodata->schdata.orsum = orsum;
	}

	/* Sort server data entries by ratio in descending order. */
	sort(srvdata, ratio->srv_n, sizeof(srvdata[0]),
	     tfw_sched_ratio_srvdata_cmp, tfw_sched_ratio_srvdata_swap);

	/*
	 * Do actions described in step 3 in the function's description.
	 * Select the part of the array that omits entries from step 1
	 * if there are any. Those are entries at the start and at the
	 * end of the array. Reverse the sequence of server descriptor
	 * indices in that part of the array.
	 */
	if (!has_one_val) {
		left = 0;
		right = ratio->srv_n - 1;
	} else {
		for (si = 0; si < ratio->srv_n; ++si)
			if (srvdata[si].oratio == max_ratio) {
				left = si + 1;
			} else if (srvdata[si].oratio == 1) {
				right = si - 1;
				break;
			}
	}
	/* In-place reversal: swap descriptor indices from both ends inward. */
	while (left < right) {
		size_t left_sdidx = srvdata[left].sdidx;
		srvdata[left++].sdidx = srvdata[right].sdidx;
		srvdata[right--].sdidx = left_sdidx;
	}

	return;
}

/**
 * Get specific server's data (RTT) from the APM module.
 *
 * While all stats values are returned by the APM, only one specific
 * value is taken as the current RTT. That is the configured value,
 * one of MIN, MAX, AVG, or a specific percentile.
 *
 * Return 0 if there is no new APM data.
 * Return a non-zero value otherwise.
 *
 * TODO: The following cases should be considered.
 * 1. It's possible that the actual stats values calculated by the APM
 *    module did not change. However, the APM doesn't know of that and
 *    just reports that the values may have changed. It would be great
 *    to catch that and avoid the recalculation of ratios in some cases.
 * 2. Depending on specific RTT value a small deviation from the previous
 *    value might be acceptable. That should not cause a recalculation
 *    of ratio.
 * 3. A typical case is that only a handful of servers misbehave in
 *    a large group of servers. Is there a way to detect that and do
 *    a partial recalculation of ratios?
 */
static inline int
__tfw_sched_ratio_get_rtt(size_t si, TfwRatio *ratio, TfwRatioData *rtodata)
{
	unsigned int recalc;
	unsigned int val[ARRAY_SIZE(tfw_pstats_ith)] = { 0 };
	TfwPrcntlStats pstats = {
		.ith = tfw_pstats_ith,
		.val = val,
		.psz = ARRAY_SIZE(tfw_pstats_ith)
	};
	TfwRatioSrvData *srvdata = rtodata->srvdata;
	TfwRatioSrvDesc *srvdesc = ratio->srvdesc;

	/*
	 * The per-server @seq counter is passed to the APM and stored back,
	 * which lets the APM tell whether new data arrived since the last
	 * poll (reflected in @recalc).
	 */
	pstats.seq = srvdesc[si].seq;
	recalc = tfw_apm_stats(srvdesc[si].srv->apmref, &pstats);
	srvdesc[si].seq = pstats.seq;

	srvdata[si].sdidx = si;
	/* A zero RTT is clamped to 1 so that a weight is never zero. */
	srvdata[si].weight = pstats.val[ratio->psidx] ? : 1;

	return recalc;
}

/**
 * Calculate ratios for each server in a group based on dynamic data.
 * Latest dynamic data is provided by APM module and represent RTT values
 * for each server in a group. Ratios are calculated on those RTT values.
 *
 * The function runs periodically on timer and provides the data that is
 * used by the ratio scheduler for outgoing requests.
 */
static void
tfw_sched_ratio_calc_dynamic(TfwRatio *ratio, TfwRatioData *rtodata)
{
	size_t si, max_val_idx = 0;
	unsigned long sum_wgt = 0;
	TfwRatioSrvData *srvdata = rtodata->srvdata;

	/*
	 * Calculate the sum of server's weights in the group. Remember
	 * the index of server data entry with maximum weight. That same
	 * entry will also have the maximum ratio.
	 */
	for (si = 0; si < ratio->srv_n; ++si) {
		__tfw_sched_ratio_get_rtt(si, ratio, rtodata);
		if (srvdata[max_val_idx].weight < srvdata[si].weight)
			max_val_idx = si;
		sum_wgt += srvdata[si].weight;
	}

	__tfw_sched_ratio_calc_dynamic(ratio, rtodata, sum_wgt, max_val_idx);
}

/**
 * Calculate ratios for each server in a group based on predicted values
 * derived from dynamic data. The dynamic data is provided by APM module
 * and represent RTT values for each server in a group. The RTT values
 * are collected within a latest period of time (time window) and then
 * used to predict the future RTT values that will be in action until
 * the next run of this function. Server ratios are calculated on those
 * predicted RTT values.
 *
 * A simple linear regression calculation on a sliding data window is
 * used to predict future RTT values for each server. @rtt is an RTT
 * value from APM, and @cnt is the current number of invocations of
 * this timer function (every @intvl msecs). Essentially, @cnt is
 * a measure of time.
 *
 * The POC (proof of concept) implementation of this algorithm can be
 * found in t/unit/user_space/slr.cc. @cnt corresponds to @x in the POC,
 * and @rtt corresponds to @y.
 *
 * The function runs periodically on timer and provides the data that
 * is used by the ratio scheduler for outgoing requests.
 */
static void
tfw_sched_ratio_calc_predict(TfwRatio *ratio, TfwRatioData *rtodata)
{
	/* Fixed-point scale factor for the regression arithmetic. */
	static const long MUL = 1000;
	int ni, sz;
	size_t si, max_val_idx;
	unsigned long sum_wgt;
	long cnt, rtt, ahead, prediction;
	TfwRatioHstData *hstdata = ratio->hstdata;
	TfwRatioSrvData *srvdata = rtodata->srvdata;

	/* @ni is the sliding-window slot to (re)fill on this invocation. */
	ni = hstdata->counter % hstdata->slot_n;
	cnt = hstdata->counter * MUL;
	/* Prediction point: @ahead invocations into the future. */
	ahead = hstdata->counter + hstdata->ahead;

	sum_wgt = max_val_idx = 0;
	for (si = 0; si < ratio->srv_n; ++si) {
		TfwRatioHstDesc *hd = &hstdata->hstdesc[si];

		__tfw_sched_ratio_get_rtt(si, ratio, rtodata);

		rtt = srvdata[si].weight * MUL;

		/*
		 * The calculations are slightly different for the case
		 * in the beginning where there's insufficient data for
		 * a whole window into the historic data set.
		 */
		if (unlikely(hstdata->counter < hstdata->slot_n)) {
			sz = ni + 1;
			hd->cnt_avg = (hd->cnt_avg * ni + cnt) / sz;
			hd->rtt_avg = (hd->rtt_avg * ni + rtt) / sz;
			hd->cnt_rtt_avg =
				(hd->cnt_rtt_avg * ni + cnt * rtt) / sz;
			hd->cnt_avg_rtt_avg = hd->cnt_avg * hd->rtt_avg;
			hd->cnt_sq_avg =
				(hd->cnt_sq_avg * ni + cnt * cnt) / sz;
			hd->cnt_avg_sq = hd->cnt_avg * hd->cnt_avg;
		} else {
			/* Full window: replace the oldest slot's contribution. */
			long h_cnt = hd->hist[ni].cnt;
			long h_rtt = hd->hist[ni].rtt;
			sz = hstdata->slot_n;
			hd->cnt_avg = hd->cnt_avg - (h_cnt - cnt) / sz;
			hd->rtt_avg = hd->rtt_avg - (h_rtt - rtt) / sz;
			hd->cnt_rtt_avg = hd->cnt_rtt_avg
					  - (h_cnt * h_rtt - cnt * rtt) / sz;
			hd->cnt_avg_rtt_avg = hd->cnt_avg * hd->rtt_avg;
			hd->cnt_sq_avg = hd->cnt_sq_avg
					 - (h_cnt * h_cnt - cnt * cnt) / sz;
			hd->cnt_avg_sq = hd->cnt_avg * hd->cnt_avg;
		}

		hd->hist[ni].cnt = cnt;
		hd->hist[ni].rtt = rtt;

		/*
		 * Zero variance of @cnt: the slope is undefined, so fall
		 * back to a constant (average) prediction.
		 */
		if (hd->cnt_sq_avg == hd->cnt_avg_sq) {
			hd->coeff_a = 0;
			hd->coeff_b = hd->cnt_avg
				      ? hd->rtt_avg / hd->cnt_avg : 1;
		} else {
			hd->coeff_b = (hd->cnt_rtt_avg - hd->cnt_avg_rtt_avg)
				      / (hd->cnt_sq_avg - hd->cnt_avg_sq);
			hd->coeff_a = (hd->rtt_avg - hd->coeff_b * hd->cnt_avg)
				      / MUL;
		}

		prediction = hd->coeff_a + hd->coeff_b * ahead;
		/* Non-positive predictions are clamped to a weight of 1. */
		srvdata[si].weight = prediction <= 0 ? 1 : prediction;

		if (srvdata[max_val_idx].weight < srvdata[si].weight)
			max_val_idx = si;
		sum_wgt += srvdata[si].weight;
	}

	++hstdata->counter;

	__tfw_sched_ratio_calc_dynamic(ratio, rtodata, sum_wgt, max_val_idx);
}

/**
 * Get and set up a new ratio data entry.
 *
 * NOTE(review): kmalloc() leaves schdata's counters (csidx, riter,
 * reidx, crsum, orsum) uninitialized; presumably the subsequent ratio
 * calculation fills them before the entry is published — confirm.
 */
static TfwRatioData *
tfw_sched_ratio_rtodata_get(TfwRatio *ratio)
{
	size_t size;
	TfwRatioData *rtodata;

	/* The per-server data array is co-allocated right after the header. */
	size = sizeof(TfwRatioData) + sizeof(TfwRatioSrvData) * ratio->srv_n;
	if (!(rtodata = kmalloc(size, GFP_ATOMIC)))
		return NULL;
	rtodata->srvdata = (TfwRatioSrvData *)(rtodata + 1);
	spin_lock_init(&rtodata->schdata.lock);

	return rtodata;
}

/**
 * Release a ratio data entry that is no longer used.
 * RCU callback: runs after all readers of the entry are gone.
 */
static void
tfw_sched_ratio_rtodata_put(struct rcu_head *rcup)
{
	TfwRatioData *rtodata = container_of(rcup, TfwRatioData, rcu);
	kfree(rtodata);
}

/**
 * Calculate the latest ratios for each server in the group in real time.
 *
 * RCU is used to avoid locks. When recalculation is in order, the new
 * data is placed in a new allocated entry. The new entry is seamlessly
 * set as the current entry by using RCU. The formerly active entry is
 * released in due time when all users of it are done and gone.
 */
static void
tfw_sched_ratio_calc_tmfn(TfwSrvGroup *sg,
			  void (*calc_fn)(TfwRatio *, TfwRatioData *))
{
	TfwRatio *ratio = sg->sched_data;
	TfwRatioData *crtodata, *nrtodata;

	/*
	 * Get a new ratio data entry. Usually, if unsuccessful, that's
	 * not a big deal. Scheduling of upstream servers will continue
	 * to run on currently active data. However, the lack of memory
	 * is a critical issue in itself.
 */
	if (!(nrtodata = tfw_sched_ratio_rtodata_get(ratio))) {
		TFW_ERR("Sched ratio: Insufficient memory for group '%s'\n",
			sg->name);
		goto rearm;
	}

	/* Calculate dynamic ratios. */
	calc_fn(ratio, nrtodata);

	/*
	 * Substitute the current ratio data entry with the new one for
	 * the scheduler. The former entry will be released when there
	 * are no users of it. Use the faster non-lazy RCU.
	 */
	crtodata = ratio->rtodata;
	rcu_assign_pointer(ratio->rtodata, nrtodata);
	call_rcu(&crtodata->rcu, tfw_sched_ratio_rtodata_put);

rearm:
	/* Re-arm only if the group is still active (see del_grp). */
	smp_mb();
	if (atomic_read(&ratio->rearm))
		mod_timer(&ratio->timer, jiffies + ratio->intvl);
}

/**
 * Periodic function for Dynamic Ratio Scheduler.
 */
static void
tfw_sched_ratio_dynamic_tmfn(unsigned long tmfn_data)
{
	tfw_sched_ratio_calc_tmfn((TfwSrvGroup *)tmfn_data,
				  tfw_sched_ratio_calc_dynamic);
}

/**
 * Periodic function for Predictive Ratio Scheduler.
 */
static void
tfw_sched_ratio_predict_tmfn(unsigned long tmfn_data)
{
	tfw_sched_ratio_calc_tmfn((TfwSrvGroup *)tmfn_data,
				  tfw_sched_ratio_calc_predict);
}

/*
 * Determine if it's the turn of the server described by the server
 * data entry at index @csidx.
 *
 * It's the turn of server at @csidx if sums of ratios to the left and
 * to the right of this entry are proportional to the current iteration.
 * As the scheduler algorithm moves forward, the sum of ratios on the
 * left side decreases. When a server is selected, its current ratio
 * is decremented, so the sum of ratios decreases by 1 as well.
 *
 * With that in mind, ratios that have a huge difference should not be
 * specified for servers in the same group. A decrement of a huge sum
 * would be too insignificant to affect the scheduling algorithm. Thus
 * weights like { 10, 1 } make more sense than weights like { 1000, 10 }.
 * Requests are distributed proportionally in both cases, but significant
 * bursts are possible in the first case.
 *
 * TODO: The algorithm may and should be improved.
 */
static inline bool
tfw_sched_ratio_is_srv_turn(TfwRatio *ratio, TfwRatioData *rtodata, size_t csidx)
{
	unsigned long headsum2, tailsum2;
	TfwRatioSrvData *srvdata = rtodata->srvdata;
	TfwRatioSchData *schdata = &rtodata->schdata;

	/* The highest-ratio server (index 0) is always eligible. */
	if (!csidx)
		return true;

	/*
	 * Approximate the head/tail ratio sums as (first + last) * count,
	 * i.e. twice the sum of an arithmetic progression — hence the "2"
	 * suffix; only the proportion matters here, not the exact sums.
	 */
	headsum2 = (srvdata[0].cratio + srvdata[csidx - 1].cratio) * csidx;
	tailsum2 = (srvdata[csidx].cratio
		    + (srvdata[ratio->srv_n - 1].cratio
		       ? : srvdata[ratio->srv_n - 1].oratio))
		   * (ratio->srv_n - csidx);

	return tailsum2 * schdata->riter > headsum2;
}

/*
 * Get the index of the next server descriptor.
 *
 * The array of server data entries used by the algorithm must be sorted
 * by ratio in descending order, with the higher weight entries moved
 * towards the start of the array.
 *
 * For concurrent use the algorithm is synchronized by a plain spin lock.
 * A lock-free implementation of the algorithm as it is would require too
 * many atomic operations including CMPXCHG and checking loops. It seems
 * that it won't give any advantage.
 */
static TfwRatioSrvDesc *
tfw_sched_ratio_next_srv(TfwRatio *ratio, TfwRatioData *rtodata)
{
	size_t csidx;
	TfwRatioSrvData *srvdata = rtodata->srvdata;
	TfwRatioSchData *schdata = &rtodata->schdata;

	/* Start with server that has the highest ratio. */
	spin_lock(&schdata->lock);
retry:
	csidx = schdata->csidx;
	if (!srvdata[csidx].cratio) {
		/*
		 * The server's counter (current ratio) is depleted, but
		 * the server is not due yet for re-arming. Don't choose
		 * this server. This is a likely branch for ratios like
		 * { N, 1, 1, 1, ... } where N > 1 at some point. This
		 * is not the case if all server weights (and therefore
		 * ratios) were specified as 1. In that case it's down
		 * to plain round-robin.
		 */
		if (schdata->reidx != csidx) {
			++schdata->csidx;
			if (schdata->csidx == ratio->srv_n) {
				schdata->csidx = 0;
				schdata->riter = 1;
			}
			goto retry;
		}
		srvdata[csidx].cratio = srvdata[csidx].oratio;
		++schdata->reidx;
		/* Fall through */
	}
	/*
	 * If it's the turn of the current server then take off a point
	 * from the server's current ratio (decrement it). Then prepare
	 * for the next time this function is called. If ratios of all
	 * servers got down to zero, then reset everything and start
	 * from the beginning. Otherwise, if it's the last server in
	 * the group, then also start from the beginning, but do not
	 * reset as it's been reset already (make sure of that).
	 */
	if (likely(tfw_sched_ratio_is_srv_turn(ratio, rtodata, csidx))) {
		--srvdata[csidx].cratio;
		if (unlikely(!--schdata->crsum)) {
			schdata->csidx = 0;
			schdata->riter = 1;
			schdata->crsum = schdata->orsum;
			schdata->reidx = 0;
		} else if (unlikely(++schdata->csidx == ratio->srv_n)) {
			BUG_ON(schdata->reidx != ratio->srv_n);
			schdata->csidx = 0;
			schdata->riter = 1;
		}
		spin_unlock(&schdata->lock);
		return ratio->srvdesc + srvdata[csidx].sdidx;
	}
	/*
	 * This is not the turn of the current server. Start
	 * a new iteration from the server with highest ratio.
	 */
	schdata->csidx = 0;
	++schdata->riter;
	goto retry;
}

/*
 * Find an available connection to the server described by @srvdesc.
 * Consider the following restrictions:
 * 1. connection is not in recovery mode.
 * 2. connection's queue is not full.
 * 3. connection doesn't have active non-idempotent requests.
 *
 * The restriction #3 is controlled by @skipnip and can be removed
 * to get a wider selection of available connections.
 */
static inline TfwSrvConn *
__sched_srv(TfwRatioSrvDesc *srvdesc, int skipnip, int *nipconn)
{
	size_t ci;

	for (ci = 0; ci < srvdesc->conn_n; ++ci) {
		/* Rotate connections in round-robin via a shared counter. */
		unsigned long idxval = atomic64_inc_return(&srvdesc->counter);
		TfwSrvConn *srv_conn = srvdesc->conn[idxval % srvdesc->conn_n];

		if (unlikely(tfw_srv_conn_restricted(srv_conn)
			     || tfw_srv_conn_queue_full(srv_conn)))
			continue;
		if (skipnip && tfw_srv_conn_hasnip(srv_conn)) {
			/* Count live non-idempotent conns for the relaxed pass. */
			if (likely(tfw_srv_conn_live(srv_conn)))
				++(*nipconn);
			continue;
		}
		if (likely(tfw_srv_conn_get_if_live(srv_conn)))
			return srv_conn;
	}

	return NULL;
}

/**
 * Same as @tfw_sched_ratio_sched_sg_conn(), but schedule a connection
 * to a specific server in a group.
 */
static TfwSrvConn *
tfw_sched_ratio_sched_srv_conn(TfwMsg *msg, TfwServer *srv)
{
	int skipnip = 1, nipconn = 0;
	TfwRatioSrvDesc *srvdesc = srv->sched_data;
	TfwSrvConn *srv_conn;

	/*
	 * For @srv without connections @srvdesc will be NULL. Normally,
	 * it doesn't happen in real life, but unit tests check this case.
	 */
	if (unlikely(!srvdesc))
		return NULL;
rerun:
	if ((srv_conn = __sched_srv(srvdesc, skipnip, &nipconn)))
		return srv_conn;

	/* Relax restriction #3 and retry if live NIP connections exist. */
	if (skipnip && nipconn) {
		skipnip = 0;
		goto rerun;
	}

	return NULL;
}

/**
 * On each subsequent call the function returns the next available
 * connection to one of the servers in the group. Connections to a
 * server are rotated in pure round-robin fashion.
 *
 * A server is chosen according to its current weight that can be
 * either static or dynamic. Servers with greater weight are chosen
 * more often than servers with lesser weight.
 *
 * Dead connections and servers w/o live connections are skipped.
 * Initially, connections with non-idempotent requests are also skipped
 * in attempt to increase throughput. However, if all live connections
 * contain a non-idempotent request, then re-run the algorithm and get
 * the first live connection the way it is usually done.
 *
 * Ratio scheduler must be the fastest scheduler. Also, it's essential
 * to maintain a completely fair distribution of requests to servers
 * according to servers weights.
 */
static TfwSrvConn *
tfw_sched_ratio_sched_sg_conn(TfwMsg *msg, TfwSrvGroup *sg)
{
	unsigned int attempts, skipnip = 1, nipconn = 0;
	TfwRatio *ratio = sg->sched_data;
	TfwRatioSrvDesc *srvdesc;
	TfwSrvConn *srv_conn;
	TfwRatioData *rtodata;

	BUG_ON(!ratio);

	rcu_read_lock();
	rtodata = rcu_dereference(ratio->rtodata);
	BUG_ON(!rtodata);
rerun:
	/*
	 * Try servers in a group according to their ratios. Attempt to
	 * schedule a connection that is not under a set of restrictions.
	 *
	 * NOTE: The way the algorithm works, same server may be chosen
	 * multiple times in a row, even if that's the server where all
	 * connections were under restrictions for one reason or another.
	 * The idea is that the conditions for server's connections may
	 * change any time, and so the next time one or more connections
	 * to the same server will not be restricted.
	 * Also, servers are chosen concurrently, so a particular thread
	 * may not be able to probe all servers in a group.
	 *
	 * These properties suggest that a limit is needed on the number
	 * of attempts to find the right connection. This limit appears
	 * to be purely empirical.
	 *
	 * A tricky issue here is that the algorithm assumes two passes.
	 * One runs under full set of restrictions, and the other runs
	 * under restrictions that are slightly relaxed. It's likely
	 * that servers probed in these two passes are not the same.
	 *
	 * It doesn't make sense to do lots of attempts. If a suitable
	 * connection can not be found after multiple attempts, then
	 * something is wrong with one or more upstream servers in
	 * this group. Spinning in the loop here would just aggravate
	 * the issue on Tempesta's side.
	 */
	attempts = ratio->srv_n;
	while (attempts--) {
		srvdesc = tfw_sched_ratio_next_srv(ratio, rtodata);
		if ((srv_conn = __sched_srv(srvdesc, skipnip, &nipconn))) {
			rcu_read_unlock();
			return srv_conn;
		}
	}
	/* Relax the restrictions and re-run the search cycle. */
	if (skipnip && nipconn) {
		skipnip = 0;
		goto rerun;
	}

	rcu_read_unlock();
	return NULL;
}

/**
 * Release Ratio Scheduler data from a server group.
 */
static void
tfw_sched_ratio_cleanup(TfwSrvGroup *sg)
{
	size_t si;
	TfwRatio *ratio = sg->sched_data;

	if (!ratio)
		return;

	/* Data that is shared between pool entries. */
	for (si = 0; si < sg->srv_n; ++si)
		kfree(ratio->srvdesc[si].conn);

	kfree(ratio->hstdata);
	kfree(ratio->rtodata);

	kfree(ratio);
	sg->sched_data = NULL;
}

/**
 * Delete a server group from Ratio Scheduler.
 *
 * Note that at this time the group is inactive. That means there are no
 * attempts to schedule to servers in this group and enter RCU read-side
 * critical section. There's no need for synchronize_rcu() to wait for
 * expiration of an RCU grace period.
 */
static void
tfw_sched_ratio_del_grp(TfwSrvGroup *sg)
{
	TfwRatio *ratio = sg->sched_data;

	/*
	 * Make sure the timer doesn't re-arm itself. This
	 * also ensures that no more RCU callbacks are created.
	 */
	if (sg->flags & (TFW_SG_F_SCHED_RATIO_DYNAMIC
			 | TFW_SG_F_SCHED_RATIO_PREDICT))
	{
		atomic_set(&ratio->rearm, 0);
		smp_mb__after_atomic();
		del_timer_sync(&ratio->timer);
	}

	/* Wait for outstanding RCU callbacks to complete. */
	rcu_barrier();

	/* Release all memory allocated for the group. */
	tfw_sched_ratio_cleanup(sg);
}

/**
 * Add a server group to Ratio Scheduler.
 *
 * At the time this function is called the server group is fully formed
 * and populated with all servers and connections.
 *
 * Additional configuration data required for Predictive scheduler are
 * passed via @sg->sched_data.
 */

/* Set up the upstream server descriptors. */
static int
tfw_sched_ratio_srvdesc_setup(TfwSrvGroup *sg)
{
	size_t size, si = 0, ci;
	TfwServer *srv;
	TfwRatio *ratio = sg->sched_data;
	TfwRatioSrvDesc *srvdesc = ratio->srvdesc;

	list_for_each_entry(srv, &sg->srv_list, list) {
		TfwSrvConn **conn, *srv_conn;

		/* Sanity: the list must match the precomputed @srv_n. */
		if (unlikely((si++ == sg->srv_n) || !srv->conn_n
			     || list_empty(&srv->conn_list)))
			return -EINVAL;

		size = sizeof(TfwSrvConn *) * srv->conn_n;
		if (!(srvdesc->conn = kzalloc(size, GFP_KERNEL)))
			return -ENOMEM;

		ci = 0;
		conn = srvdesc->conn;
		list_for_each_entry(srv_conn, &srv->conn_list, list) {
			if (unlikely(ci++ == srv->conn_n))
				return -EINVAL;
			*conn++ = srv_conn;
		}
		if (unlikely(ci != srv->conn_n))
			return -EINVAL;

		srvdesc->conn_n = srv->conn_n;
		srvdesc->srv = srv;
		atomic64_set(&srvdesc->counter, 0);
		srv->sched_data = srvdesc;
		++srvdesc;
	}
	if (unlikely(si != sg->srv_n))
		return -EINVAL;

	return 0;
}

/*
 * Common part of adding a group: allocate and populate the TfwRatio
 * structure with server descriptors and the initial ratio data entry.
 *
 * NOTE(review): "ration" in the function name looks like a typo for
 * "ratio"; left as-is since renaming would touch all call sites.
 */
static TfwRatio *
tfw_sched_ration_add_grp_common(TfwSrvGroup *sg)
{
	int ret;
	size_t size;
	TfwRatio *ratio;
	TfwRatioData *rtodata;

	TFW_DBG2("%s: SG=[%s]\n", __func__, sg->name);

	/* Server descriptors are co-allocated right after TfwRatio. */
	size = sizeof(TfwRatio) + sizeof(TfwRatioSrvDesc) * sg->srv_n;
	if (!(sg->sched_data = kzalloc(size, GFP_KERNEL)))
		return ERR_PTR(-ENOMEM);

	ratio = sg->sched_data;
	ratio->srv_n = sg->srv_n;
	ratio->psidx = sg->flags & TFW_SG_M_PSTATS_IDX;

	ratio->srvdesc = (TfwRatioSrvDesc *)(ratio + 1);
	if ((ret = tfw_sched_ratio_srvdesc_setup(sg)))
		return ERR_PTR(ret);

	if (!(rtodata = tfw_sched_ratio_rtodata_get(ratio)))
		return ERR_PTR(-ENOMEM);
	rcu_assign_pointer(ratio->rtodata, rtodata);

	return ratio;
}

static int
tfw_sched_ratio_add_grp_static(TfwSrvGroup *sg)
{
	TfwRatio *ratio;

	ratio = tfw_sched_ration_add_grp_common(sg);
	if (IS_ERR(ratio))
		return PTR_ERR(ratio);

	/* Calculate the static ratio data for each server. */
	tfw_sched_ratio_calc_static(ratio, ratio->rtodata);

	return 0;
}

static int
tfw_sched_ratio_add_grp_dynamic(TfwSrvGroup *sg)
{
	TfwRatio *ratio;
	/* @sched_data carries TfwSchrefPredict until replaced below. */
	TfwSchrefPredict *schref = sg->sched_data;

	TFW_DBG2("%s: SG=[%s]\n", __func__, sg->name);

	ratio = tfw_sched_ration_add_grp_common(sg);
	if (IS_ERR(ratio))
		return PTR_ERR(ratio);

	/* Set up the necessary workspace for predictive scheduler. */
	if (sg->flags & TFW_SG_F_SCHED_RATIO_PREDICT) {
		size_t size, slot_n;
		TfwRatioHstUnit *hunit;
		TfwRatioHstData *hdata;
		TfwRatioHstDesc *hdesc, *hdesc_end;

		BUG_ON(!schref);

		/* One history slot per sample over the whole past window. */
		slot_n = schref->past * schref->rate;
		size = sizeof(TfwRatioHstData)
		       + sizeof(TfwRatioHstDesc) * sg->srv_n
		       + sizeof(TfwRatioHstUnit) * sg->srv_n * slot_n;
		if (!(ratio->hstdata = kzalloc(size, GFP_KERNEL)))
			return -ENOMEM;

		hdata = ratio->hstdata;
		hdata->hstdesc = (TfwRatioHstDesc *)(hdata + 1);
		hdata->slot_n = slot_n;
		hdata->ahead = schref->ahead * schref->rate;

		/* Carve each server's history slots out of the tail area. */
		hdesc_end = hdata->hstdesc + sg->srv_n;
		hunit = (TfwRatioHstUnit *)hdesc_end;
		for (hdesc = hdata->hstdesc; hdesc < hdesc_end; ++hdesc) {
			hdesc->hist = hunit;
			hunit += slot_n;
		}
	}

	/*
	 * Calculate the initial ratio data for each server. That's
	 * based on equal initial (default) weights that are set by
	 * the configuration processing routines.
	 */
	tfw_sched_ratio_calc_static(ratio, ratio->rtodata);

	/* Set up periodic re-calculation of ratios. */
	if (sg->flags & TFW_SG_F_SCHED_RATIO_DYNAMIC) {
		ratio->intvl = TFW_SCHED_RATIO_INTVL;
		atomic_set(&ratio->rearm, 1);
		smp_mb__after_atomic();
		setup_timer(&ratio->timer,
			    tfw_sched_ratio_dynamic_tmfn, (unsigned long)sg);
		mod_timer(&ratio->timer, jiffies + ratio->intvl);
	} else if (sg->flags & TFW_SG_F_SCHED_RATIO_PREDICT) {
		ratio->intvl = msecs_to_jiffies(1000 / schref->rate);
		atomic_set(&ratio->rearm, 1);
		smp_mb__after_atomic();
		setup_timer(&ratio->timer,
			    tfw_sched_ratio_predict_tmfn, (unsigned long)sg);
		mod_timer(&ratio->timer, jiffies + ratio->intvl);
	}

	return 0;
}

static int
tfw_sched_ratio_add_grp(TfwSrvGroup *sg)
{
	int ret;

	if (unlikely(!sg->srv_n || list_empty(&sg->srv_list)))
		return -EINVAL;

	switch (sg->flags & TFW_SG_M_SCHED_RATIO_TYPE) {
	case TFW_SG_F_SCHED_RATIO_STATIC:
		if ((ret = tfw_sched_ratio_add_grp_static(sg)))
			goto cleanup;
		break;
	case TFW_SG_F_SCHED_RATIO_DYNAMIC:
	case TFW_SG_F_SCHED_RATIO_PREDICT:
		if ((ret = tfw_sched_ratio_add_grp_dynamic(sg)))
			goto cleanup;
		break;
	default:
		return -EINVAL;
	}

	return 0;

cleanup:
	tfw_sched_ratio_cleanup(sg);
	return ret;
}

static TfwScheduler tfw_sched_ratio = {
	.name		= "ratio",
	.list		= LIST_HEAD_INIT(tfw_sched_ratio.list),
	.add_grp	= tfw_sched_ratio_add_grp,
	.del_grp	= tfw_sched_ratio_del_grp,
	.sched_sg_conn	= tfw_sched_ratio_sched_sg_conn,
	.sched_srv_conn	= tfw_sched_ratio_sched_srv_conn,
};

int
tfw_sched_ratio_init(void)
{
	TFW_DBG("%s: init\n", tfw_sched_ratio.name);
	return tfw_sched_register(&tfw_sched_ratio);
}
module_init(tfw_sched_ratio_init);

void
tfw_sched_ratio_exit(void)
{
	TFW_DBG("%s: exit\n", tfw_sched_ratio.name);
	tfw_sched_unregister(&tfw_sched_ratio);
}
module_exit(tfw_sched_ratio_exit);
diff --git a/tempesta_fw/sched/tfw_sched_rr.c b/tempesta_fw/sched/tfw_sched_rr.c
deleted file mode 100644
index c6001e42d..000000000
--- a/tempesta_fw/sched/tfw_sched_rr.c
+++ /dev/null
@@
-1,223 +0,0 @@ -/** - * Tempesta FW - * - * Copyright (C) 2014 NatSys Lab. (info@natsys-lab.com). - * Copyright (C) 2015-2017 Tempesta Technologies, Inc. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, - * or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - */ -#include -#include - -#include "tempesta_fw.h" -#include "log.h" -#include "server.h" - -MODULE_AUTHOR(TFW_AUTHOR); -MODULE_DESCRIPTION("Tempesta round-robin scheduler"); -MODULE_VERSION("0.3.0"); -MODULE_LICENSE("GPL"); - -/** - * List of connections to an upstream server. - * Connections can up and down during failover process and shouldn't be - * taken into account by the scheduler. - */ -typedef struct { - atomic64_t rr_counter; - size_t conn_n; - TfwServer *srv; - TfwSrvConn *conns[TFW_SRV_MAX_CONN]; -} TfwRrSrv; - -/** - * List of upstream servers. - * The list is considered static, i.e. all the servers are alive during - * whole run-time. This can be changed in future. - */ -typedef struct { - atomic64_t rr_counter; - size_t srv_n; - TfwRrSrv srvs[TFW_SG_MAX_SRV]; -} TfwRrSrvList; - -static void -tfw_sched_rr_alloc_data(TfwSrvGroup *sg) -{ - sg->sched_data = kzalloc(sizeof(TfwRrSrvList), GFP_KERNEL); - BUG_ON(!sg->sched_data); -} - -static void -tfw_sched_rr_free_data(TfwSrvGroup *sg) -{ - kfree(sg->sched_data); -} - -/** - * Add connection and server, if new, to the scheduler. 
- * Called at configuration phase, no synchronization is required. - */ -static void -tfw_sched_rr_add_conn(TfwSrvGroup *sg, TfwServer *srv, TfwSrvConn *srv_conn) -{ - size_t s, c; - TfwRrSrv *srv_cl; - TfwRrSrvList *sl = sg->sched_data; - - BUG_ON(!sl); - - for (s = 0; s < sl->srv_n; ++s) - if (sl->srvs[s].srv == srv) - break; - if (s == sl->srv_n) { - sl->srvs[s].srv = srv; - ++sl->srv_n; - BUG_ON(sl->srv_n > TFW_SG_MAX_SRV); - srv->sched_data = &sl->srvs[s]; - } - - srv_cl = &sl->srvs[s]; - for (c = 0; c < srv_cl->conn_n; ++c) - if (srv_cl->conns[c] == srv_conn) { - TFW_WARN("sched_rr: Try to add existing connection," - " srv=%zu conn=%zu\n", s, c); - return; - } - srv_cl->conns[c] = srv_conn; - ++srv_cl->conn_n; - BUG_ON(srv_cl->conn_n > TFW_SRV_MAX_CONN); -} - -static inline TfwSrvConn * -__sched_srv(TfwRrSrv *srv_cl, int skipnip, int *nipconn) -{ - size_t c; - - for (c = 0; c < srv_cl->conn_n; ++c) { - unsigned long idxval = atomic64_inc_return(&srv_cl->rr_counter); - TfwSrvConn *srv_conn = srv_cl->conns[idxval % srv_cl->conn_n]; - - if (unlikely(tfw_srv_conn_restricted(srv_conn) - || tfw_srv_conn_queue_full(srv_conn))) - continue; - if (skipnip && tfw_srv_conn_hasnip(srv_conn)) { - if (likely(tfw_srv_conn_live(srv_conn))) - ++(*nipconn); - continue; - } - if (likely(tfw_srv_conn_get_if_live(srv_conn))) - return srv_conn; - } - - return NULL; -} - -/** - * On each subsequent call the function returns the next server in the - * group. Parallel connections to the same server are also rotated in - * the round-robin manner. - * - * Dead connections and servers w/o live connections are skipped. - * Initially, connections with non-idempotent requests are also skipped - * in attempt to increase throughput. However, if all live connections - * contain a non-idempotent request, then re-run the algorithm and get - * the first live connection they way it is usually done. - * - * RR scheduler must be the fastest scheduler. 
Also, it's essential - * to maintain strict round-robin fashion of getting the next server. - * Usually the optimistic approach gives the fastest solution: we are - * optimistic in that there are not many non-idempotent requests, and - * there are available server connections. - */ -static TfwSrvConn * -tfw_sched_rr_get_sg_conn(TfwMsg *msg, TfwSrvGroup *sg) -{ - size_t s; - int skipnip = 1, nipconn = 0; - TfwRrSrvList *sl = sg->sched_data; - - BUG_ON(!sl); -rerun: - for (s = 0; s < sl->srv_n; ++s) { - unsigned long idxval = atomic64_inc_return(&sl->rr_counter); - TfwRrSrv *srv_cl = &sl->srvs[idxval % sl->srv_n]; - TfwSrvConn *srv_conn; - - if ((srv_conn = __sched_srv(srv_cl, skipnip, &nipconn))) - return srv_conn; - } - if (skipnip && nipconn) { - skipnip = 0; - goto rerun; - } - return NULL; -} - -/** - * Same as @tfw_sched_rr_get_sg_conn(), but but schedule for a specific server - * in a group. - */ -static TfwSrvConn * -tfw_sched_rr_get_srv_conn(TfwMsg *msg, TfwServer *srv) -{ - int skipnip = 1, nipconn = 0; - TfwRrSrv *srv_cl = srv->sched_data; - TfwSrvConn *srv_conn; - - /* - * For @srv without connections srv_cl will be NULL, that normally - * does not happen in real life, but unit tests check that case. 
- */ - if (unlikely(!srv_cl)) - return NULL; - -rerun: - if ((srv_conn = __sched_srv(srv_cl, skipnip, &nipconn))) - return srv_conn; - - if (skipnip && nipconn) { - skipnip = 0; - goto rerun; - } - return NULL; -} - -static TfwScheduler tfw_sched_rr = { - .name = "round-robin", - .list = LIST_HEAD_INIT(tfw_sched_rr.list), - .add_grp = tfw_sched_rr_alloc_data, - .del_grp = tfw_sched_rr_free_data, - .add_conn = tfw_sched_rr_add_conn, - .sched_sg_conn = tfw_sched_rr_get_sg_conn, - .sched_srv_conn = tfw_sched_rr_get_srv_conn, -}; - -int -tfw_sched_rr_init(void) -{ - TFW_DBG("sched_rr: init\n"); - return tfw_sched_register(&tfw_sched_rr); -} -module_init(tfw_sched_rr_init); - -void -tfw_sched_rr_exit(void) -{ - TFW_DBG("sched_rr: exit\n"); - tfw_sched_unregister(&tfw_sched_rr); -} -module_exit(tfw_sched_rr_exit); - diff --git a/tempesta_fw/server.c b/tempesta_fw/server.c index 5057863a0..5ef4938c5 100644 --- a/tempesta_fw/server.c +++ b/tempesta_fw/server.c @@ -47,7 +47,7 @@ tfw_server_destroy(TfwServer *srv) /* Close all connections before freeing the server! */ BUG_ON(!list_empty(&srv->conn_list)); - tfw_apm_destroy(srv->apm); + tfw_apm_del_srv(srv); kmem_cache_free(srv_cache, srv); } @@ -64,16 +64,6 @@ tfw_server_create(const TfwAddr *addr) return srv; } -int -tfw_server_apm_create(TfwServer *srv) -{ - BUG_ON(!srv); - - if (!(srv->apm = tfw_apm_create())) - return -ENOMEM; - return 0; -} - /* * Look up Server Group by name, and return it to caller. 
* @@ -117,16 +107,13 @@ tfw_sg_new(const char *name, gfp_t flags) TFW_DBG("new server group: '%s'\n", name); - sg = kmalloc(sizeof(*sg) + name_size, flags); + sg = kzalloc(sizeof(*sg) + name_size, flags); if (!sg) return NULL; INIT_LIST_HEAD(&sg->list); INIT_LIST_HEAD(&sg->srv_list); rwlock_init(&sg->lock); - sg->sched = NULL; - sg->sched_data = NULL; - sg->flags = 0; memcpy(sg->name, name, name_size); write_lock(&sg_lock); @@ -162,9 +149,8 @@ tfw_sg_count(void) TfwSrvGroup *sg; read_lock(&sg_lock); - list_for_each_entry(sg, &sg_list, list) { + list_for_each_entry(sg, &sg_list, list) ++count; - } read_unlock(&sg_lock); return count; @@ -182,16 +168,10 @@ tfw_sg_add(TfwSrvGroup *sg, TfwServer *srv) TFW_DBG2("Add new backend server\n"); write_lock(&sg->lock); list_add(&srv->list, &sg->srv_list); + ++sg->srv_n; write_unlock(&sg->lock); } -void -tfw_sg_add_conn(TfwSrvGroup *sg, TfwServer *srv, TfwSrvConn *srv_conn) -{ - if (sg->sched && sg->sched->add_conn) - sg->sched->add_conn(sg, srv, srv_conn); -} - int tfw_sg_set_sched(TfwSrvGroup *sg, const char *sched_name) { @@ -202,7 +182,7 @@ tfw_sg_set_sched(TfwSrvGroup *sg, const char *sched_name) sg->sched = s; if (s->add_grp) - s->add_grp(sg); + return s->add_grp(sg); return 0; } @@ -242,6 +222,11 @@ tfw_sg_for_each_srv(int (*cb)(TfwServer *srv)) /** * Release all server groups with all servers. + * + * Note: The function is called at shutdown and in user context when + * it's guaranteed that all activity has stopped. Therefore the locks + * are not just not necessary, they can't be used as the code in user + * context may sleep. 
*/ void tfw_sg_release_all(void) @@ -249,25 +234,14 @@ tfw_sg_release_all(void) TfwServer *srv, *srv_tmp; TfwSrvGroup *sg, *sg_tmp; - write_lock(&sg_lock); - list_for_each_entry_safe(sg, sg_tmp, &sg_list, list) { - write_lock(&sg->lock); - list_for_each_entry_safe(srv, srv_tmp, &sg->srv_list, list) tfw_server_destroy(srv); - - write_unlock(&sg->lock); - if (sg->sched && sg->sched->del_grp) sg->sched->del_grp(sg); - kfree(sg); } - INIT_LIST_HEAD(&sg_list); - - write_unlock(&sg_lock); } int __init diff --git a/tempesta_fw/server.h b/tempesta_fw/server.h index 677f3badd..d871e53e0 100644 --- a/tempesta_fw/server.h +++ b/tempesta_fw/server.h @@ -25,9 +25,14 @@ #include "connection.h" #include "peer.h" -#define TFW_SRV_MAX_CONN 32 /* TfwSrvConn{} per TfwServer{} */ -#define TFW_SG_MAX_SRV 32 /* TfwServer{} per TfwSrvGroup{} */ -#define TFW_SG_MAX_CONN (TFW_SG_MAX_SRV * TFW_SRV_MAX_CONN) +/* + * Maximum values for the number of upstream servers in a group, + * and the number of connections of an upstream server. 
+ */ +#define TFW_SRV_MAX_CONN_N USHRT_MAX +#define TFW_SG_MAX_SRV_N USHRT_MAX +#define TFW_SG_MAX_CONN_N \ + ((unsigned long)TFW_SG_MAX_SRV_N * TFW_SRV_MAX_CONN_N) typedef struct tfw_srv_group_t TfwSrvGroup; typedef struct tfw_scheduler_t TfwScheduler; @@ -38,14 +43,18 @@ typedef struct tfw_scheduler_t TfwScheduler; * @list - member pointer in the list of servers of a server group; * @sg - back-reference to the server group; * @sched_data - private scheduler data for the server; - * @apm - opaque handle for APM stats; + * @apmref - opaque handle for APM stats; + * @weight - static server weight for load balancers; + * @conn_n - configured number of connections to the server; */ typedef struct { TFW_PEER_COMMON; struct list_head list; TfwSrvGroup *sg; void *sched_data; - void *apm; + void *apmref; + unsigned int weight; + size_t conn_n; } TfwServer; /** @@ -60,6 +69,7 @@ typedef struct { * @lock - synchronizes the group readers with updaters; * @sched - requests scheduling handler; * @sched_data - private scheduler data for the server group; + * @srv_n - configured number of servers in the group; * @max_qsize - maximum queue size of a server connection; * @max_refwd - maximum number of tries for forwarding a request; * @max_jqage - maximum age of a request in a server connection, in jiffies; @@ -73,6 +83,7 @@ struct tfw_srv_group_t { rwlock_t lock; TfwScheduler *sched; void *sched_data; + size_t srv_n; unsigned int max_qsize; unsigned int max_refwd; unsigned long max_jqage; @@ -81,43 +92,63 @@ struct tfw_srv_group_t { char name[0]; }; -/* Server related flags. */ -#define TFW_SRV_RETRY_NIP 0x0001 /* Retry non-idemporent req. */ -#define TFW_SRV_STICKY_FLAGS (TFW_SRV_STICKY | TFW_SRV_STICKY_FAILOVER) -#define TFW_SRV_STICKY 0x0002 /* Use sticky sessions. 
*/ -#define TFW_SRV_STICKY_FAILOVER 0x0004 /* Allow failovering of sticky - sessions*/ +/** + * @past - period of time (secs) to keep past APM values; + * @rate - rate (times per sec) of retrieval of past APM values; + * @ahead - period of time (secs) for a prediction; + */ +typedef struct { + unsigned int past; + unsigned int rate; + unsigned int ahead; +} TfwSchrefPredict; + +/* Server and server group related flags. + * Lower 4 bits keep an index into APM stats array. + */ +#define TFW_SG_M_PSTATS_IDX 0x000f +#define TFW_SG_F_SCHED_RATIO_STATIC 0x0010 +#define TFW_SG_F_SCHED_RATIO_DYNAMIC 0x0020 +#define TFW_SG_F_SCHED_RATIO_PREDICT 0x0040 +#define TFW_SG_M_SCHED_RATIO_TYPE (TFW_SG_F_SCHED_RATIO_STATIC \ + | TFW_SG_F_SCHED_RATIO_DYNAMIC \ + | TFW_SG_F_SCHED_RATIO_PREDICT) + +#define TFW_SRV_RETRY_NIP 0x0100 /* Retry non-idemporent req. */ +#define TFW_SRV_STICKY 0x0200 /* Use sticky sessions. */ +#define TFW_SRV_STICKY_FAILOVER 0x0400 /* Allow failovering. */ +#define TFW_SRV_STICKY_FLAGS \ + (TFW_SRV_STICKY | TFW_SRV_STICKY_FAILOVER) /** * Requests scheduling algorithm handler. * * @name - name of the algorithm; - * @list - list of registered schedulers; - * @add_grp - add server group to the scheduler; + * @list - member in the list of registered schedulers; + * @add_grp - add server group to the scheduler. + * Called in process context at configuration time. 
+ * Called only after all servers are set up with connections, + * and the group is set up with all servers; * @del_grp - delete server group from the scheduler; - * @add_conn - add connection and server if it's new, called in process - * context at configuration time; - * @sched_grp - server group scheduling virtual method, typically returns - * result of @tfw_sched_get_sg_srv_conn(); - * @sched_sg_conn - virtual method, schedules request to a server from given - * server group, returns server connection; - * @sched_srv_conn - schedule request to the given server, - * returns server connection; + * @sched_grp - server group scheduling virtual method. + * Typically returns the result of @tfw_sched_get_sg_srv_conn(); + * @sched_sg_conn - virtual method. Schedule a request to a server from + * given server group. Returns a server connection; + * @sched_srv_conn - schedule a request to the given server. + * Returns a server connection; * - * There can be 2 kind of schedulers. Tier-2 schedulers can determine + * There can be 2 kind of schedulers. Tier-2 schedulers can determine the * target server connection by server or server group (@sched_srv_conn and - * @sched_sg_conn callbacks). Every server group is bound to one of the tier-2 - * schedulers. Group schedulers can find out target server group - * by message content (@sched_grp callback) and then find and outgoing - * connection by @tfw_sched_get_sg_srv_conn(). + * @sched_sg_conn callbacks). Each server group is bound to one of tier-2 + * schedulers. Group schedulers can determine the target server group from + * request's content (@sched_grp callback) and then get an outgoing + * connection by calling @tfw_sched_get_sg_srv_conn(). 
*/ struct tfw_scheduler_t { const char *name; struct list_head list; - void (*add_grp)(TfwSrvGroup *sg); + int (*add_grp)(TfwSrvGroup *sg); void (*del_grp)(TfwSrvGroup *sg); - void (*add_conn)(TfwSrvGroup *sg, TfwServer *srv, - TfwSrvConn *srv_conn); TfwSrvConn *(*sched_grp)(TfwMsg *msg); TfwSrvConn *(*sched_sg_conn)(TfwMsg *msg, TfwSrvGroup *sg); TfwSrvConn *(*sched_srv_conn)(TfwMsg *msg, TfwServer *srv); @@ -125,7 +156,6 @@ struct tfw_scheduler_t { /* Server specific routines. */ TfwServer *tfw_server_create(const TfwAddr *addr); -int tfw_server_apm_create(TfwServer *srv); void tfw_server_destroy(TfwServer *srv); void tfw_srv_conn_release(TfwSrvConn *srv_conn); @@ -157,7 +187,6 @@ void tfw_sg_free(TfwSrvGroup *sg); unsigned int tfw_sg_count(void); void tfw_sg_add(TfwSrvGroup *sg, TfwServer *srv); -void tfw_sg_add_conn(TfwSrvGroup *sg, TfwServer *srv, TfwSrvConn *srv_conn); int tfw_sg_set_sched(TfwSrvGroup *sg, const char *sched); int tfw_sg_for_each_srv(int (*cb)(TfwServer *srv)); void tfw_sg_release_all(void); diff --git a/tempesta_fw/sock_srv.c b/tempesta_fw/sock_srv.c index 1b6f3db9c..20d317238 100644 --- a/tempesta_fw/sock_srv.c +++ b/tempesta_fw/sock_srv.c @@ -25,6 +25,7 @@ #include #include +#include "apm.h" #include "tempesta_fw.h" #include "connection.h" #include "http_sess.h" @@ -438,6 +439,13 @@ tfw_sock_srv_disconnect(TfwConn *conn) * not-yet-established connections in the TfwServer->conn_list. 
*/ +static inline int +__tfw_sock_srv_connect_try_later_cb(TfwSrvConn *srv_conn) +{ + tfw_sock_srv_connect_try_later(srv_conn); + return 0; +} + static int tfw_sock_srv_connect_srv(TfwServer *srv) { @@ -465,7 +473,8 @@ tfw_sock_srv_disconnect_srv(TfwServer *srv) { TfwConn *conn; - return tfw_peer_for_each_conn(srv, conn, list, tfw_sock_srv_disconnect); + return tfw_peer_for_each_conn(srv, conn, list, + tfw_sock_srv_disconnect); } /* @@ -521,17 +530,15 @@ tfw_srv_conn_free(TfwSrvConn *srv_conn) } static int -tfw_sock_srv_add_conns(TfwServer *srv, int conns_n) +tfw_sock_srv_add_conns(TfwServer *srv) { int i; TfwSrvConn *srv_conn; - for (i = 0; i < conns_n; ++i) { + for (i = 0; i < srv->conn_n; ++i) { if (!(srv_conn = tfw_srv_conn_alloc())) return -ENOMEM; - tfw_connection_link_peer((TfwConn *)srv_conn, - (TfwPeer *)srv); - tfw_sg_add_conn(srv->sg, srv, srv_conn); + tfw_connection_link_peer((TfwConn *)srv_conn, (TfwPeer *)srv); } return 0; @@ -546,6 +553,7 @@ tfw_sock_srv_del_conns(TfwServer *srv) tfw_connection_unlink_from_peer((TfwConn *)srv_conn); tfw_srv_conn_free(srv_conn); } + return 0; } @@ -560,65 +568,45 @@ tfw_sock_srv_delete_all_conns(void) * Configuration handling * ------------------------------------------------------------------------ */ - -/* - * Default values for various configuration directives and options. - */ -#define TFW_CFG_SRV_CONNS_N_DEF 32 /* Default # of connections */ -#define TFW_CFG_SRV_QUEUE_SIZE_DEF 1000 /* Max queue size */ -#define TFW_CFG_SRV_FWD_TIMEOUT_DEF 60 /* Default request timeout */ -#define TFW_CFG_SRV_FWD_RETRIES_DEF 5 /* Default number of tries */ -#define TFW_CFG_SRV_CNS_RETRIES_DEF 10 /* Reconnect tries. 
*/ -#define TFW_CFG_SRV_RETRY_NIP_DEF 0 /* Do NOT resend NIP reqs */ -#define TFW_CFG_SRV_STICKY_DEF 0 /* Don't use sticky sessions */ - -static TfwServer *tfw_cfg_in_slst[TFW_SG_MAX_SRV]; -static TfwServer *tfw_cfg_out_slst[TFW_SG_MAX_SRV]; -static int tfw_cfg_in_nconn[TFW_SG_MAX_SRV]; -static int tfw_cfg_out_nconn[TFW_SG_MAX_SRV]; -static int tfw_cfg_in_slstsz, tfw_cfg_out_slstsz; -static TfwScheduler *tfw_cfg_in_sched, *tfw_cfg_out_sched; -static TfwSrvGroup *tfw_cfg_in_sg, *tfw_cfg_out_sg; - -static int tfw_cfg_in_queue_size = TFW_CFG_SRV_QUEUE_SIZE_DEF; -static int tfw_cfg_in_fwd_timeout = TFW_CFG_SRV_FWD_TIMEOUT_DEF; -static int tfw_cfg_in_fwd_retries = TFW_CFG_SRV_FWD_RETRIES_DEF; -static int tfw_cfg_in_cns_retries = TFW_CFG_SRV_CNS_RETRIES_DEF; -static int tfw_cfg_in_retry_nip = TFW_CFG_SRV_RETRY_NIP_DEF; -static unsigned int tfw_cfg_in_sticky = TFW_CFG_SRV_STICKY_DEF; - -static int tfw_cfg_out_queue_size = TFW_CFG_SRV_QUEUE_SIZE_DEF; -static int tfw_cfg_out_fwd_timeout = TFW_CFG_SRV_FWD_TIMEOUT_DEF; -static int tfw_cfg_out_fwd_retries = TFW_CFG_SRV_FWD_RETRIES_DEF; -static int tfw_cfg_out_cns_retries = TFW_CFG_SRV_CNS_RETRIES_DEF; -static int tfw_cfg_out_retry_nip = TFW_CFG_SRV_RETRY_NIP_DEF; -static unsigned int tfw_cfg_out_sticky = TFW_CFG_SRV_STICKY_DEF; +#define TFW_CFG_DFLT_VAL "__dfltval__" /* Use a default value. 
*/ + +static struct list_head tfw_cfg_in_slst = LIST_HEAD_INIT(tfw_cfg_in_slst); +static struct list_head tfw_cfg_out_slst = LIST_HEAD_INIT(tfw_cfg_out_slst); +static struct list_head *tfw_cfg_slst; +static int tfw_cfg_slstsz, tfw_cfg_out_slstsz; +static TfwScheduler *tfw_cfg_sched, *tfw_cfg_out_sched; +static TfwSchrefPredict tfw_cfg_schref_predict, tfw_cfg_out_schref_predict; +static void *tfw_cfg_schref, *tfw_cfg_out_schref; +static TfwSrvGroup *tfw_cfg_sg, *tfw_cfg_out_sg; + +static int tfw_cfg_queue_size, tfw_cfg_out_queue_size; +static int tfw_cfg_fwd_timeout, tfw_cfg_out_fwd_timeout; +static int tfw_cfg_fwd_retries, tfw_cfg_out_fwd_retries; +static int tfw_cfg_cns_retries, tfw_cfg_out_cns_retries; +static unsigned int tfw_cfg_retry_nip, tfw_cfg_out_retry_nip; +static unsigned int tfw_cfg_sticky_sess, tfw_cfg_out_sticky_sess; +static unsigned int tfw_cfg_sg_flags, tfw_cfg_out_sg_flags; static int tfw_cfgop_intval(TfwCfgSpec *cs, TfwCfgEntry *ce, int *intval) { - int ret; - - if (ce->attr_n) { - TFW_ERR_NL("%s: Arguments may not have the \'=\' sign\n", - cs->name); - return -EINVAL; - } if (ce->val_n != 1) { - TFW_ERR_NL("%s: Invalid number of arguments: %d\n", - cs->name, (int)ce->val_n); + TFW_ERR_NL("Invalid number of arguments: %zu\n", ce->val_n); + return -EINVAL; + } + if (ce->attr_n) { + TFW_ERR_NL("Arguments may not have the \'=\' sign\n"); return -EINVAL; } - if ((ret = tfw_cfg_parse_int(ce->vals[0], intval))) - return ret; - return 0; + cs->dest = intval; + return tfw_cfg_set_int(cs, ce); } static int tfw_cfgop_in_queue_size(TfwCfgSpec *cs, TfwCfgEntry *ce) { - return tfw_cfgop_intval(cs, ce, &tfw_cfg_in_queue_size); + return tfw_cfgop_intval(cs, ce, &tfw_cfg_queue_size); } static int @@ -630,7 +618,7 @@ tfw_cfgop_out_queue_size(TfwCfgSpec *cs, TfwCfgEntry *ce) static int tfw_cfgop_in_fwd_timeout(TfwCfgSpec *cs, TfwCfgEntry *ce) { - return tfw_cfgop_intval(cs, ce, &tfw_cfg_in_fwd_timeout); + return tfw_cfgop_intval(cs, ce, &tfw_cfg_fwd_timeout); } 
static int @@ -642,7 +630,7 @@ tfw_cfgop_out_fwd_timeout(TfwCfgSpec *cs, TfwCfgEntry *ce) static int tfw_cfgop_in_fwd_retries(TfwCfgSpec *cs, TfwCfgEntry *ce) { - return tfw_cfgop_intval(cs, ce, &tfw_cfg_in_fwd_retries); + return tfw_cfgop_intval(cs, ce, &tfw_cfg_fwd_retries); } static int @@ -654,49 +642,52 @@ tfw_cfgop_out_fwd_retries(TfwCfgSpec *cs, TfwCfgEntry *ce) static inline int tfw_cfgop_retry_nip(TfwCfgSpec *cs, TfwCfgEntry *ce, int *retry_nip) { - if (ce->attr_n || ce->val_n) { - TFW_ERR_NL("%s: The option may not have arguments.\n", - cs->name); + if (ce->attr_n) { + TFW_ERR_NL("Arguments may not have the \'=\' sign\n"); return -EINVAL; } - *retry_nip = 1; + if (!ce->val_n) { + *retry_nip = TFW_SRV_RETRY_NIP; + } else if (!strcasecmp(ce->vals[0], TFW_CFG_DFLT_VAL)) { + BUG_ON(ce->val_n != 1); + *retry_nip = 0; + } else { + TFW_ERR_NL("Invalid number of arguments: %zu\n", ce->val_n); + return -EINVAL; + } + return 0; } static inline int -tfw_cfgop_sticky(TfwCfgSpec *cs, TfwCfgEntry *ce, unsigned int *use_sticky) +tfw_cfgop_sticky_sess(TfwCfgSpec *cs, TfwCfgEntry *ce, unsigned int *use_sticky) { if (ce->attr_n) { - TFW_ERR_NL("%s: Arguments may not have the \'=\' sign\n", - cs->name); + TFW_ERR_NL("Arguments may not have the \'=\' sign\n"); return -EINVAL; } if (ce->val_n > 1) { - TFW_ERR_NL("%s: Invalid number of arguments: %zu\n", - cs->name, ce->val_n); + TFW_ERR_NL("Invalid number of arguments: %zu\n", ce->val_n); return -EINVAL; } - - if (ce->val_n) { - if (!strcasecmp(ce->vals[0], "allow_failover")) { - *use_sticky |= TFW_SRV_STICKY_FAILOVER; - } - else { - TFW_ERR_NL("%s: Unsupported argument: %s\n", - cs->name, ce->vals[0]); - return -EINVAL; - } + if (!ce->val_n) { + *use_sticky = TFW_SRV_STICKY; + } else if (!strcasecmp(ce->vals[0], "allow_failover")) { + *use_sticky = TFW_SRV_STICKY | TFW_SRV_STICKY_FAILOVER; + } else if (!strcasecmp(ce->vals[0], TFW_CFG_DFLT_VAL)) { + *use_sticky = 0; + } else { + TFW_ERR_NL("Unsupported argument: %s\n", 
ce->vals[0]); + return -EINVAL; } - *use_sticky |= TFW_SRV_STICKY; - return 0; } static int tfw_cfgop_in_retry_nip(TfwCfgSpec *cs, TfwCfgEntry *ce) { - return tfw_cfgop_retry_nip(cs, ce, &tfw_cfg_in_retry_nip); + return tfw_cfgop_retry_nip(cs, ce, &tfw_cfg_retry_nip); } static int @@ -706,21 +697,21 @@ tfw_cfgop_out_retry_nip(TfwCfgSpec *cs, TfwCfgEntry *ce) } static int -tfw_cfgop_in_sticky(TfwCfgSpec *cs, TfwCfgEntry *ce) +tfw_cfgop_in_sticky_sess(TfwCfgSpec *cs, TfwCfgEntry *ce) { - return tfw_cfgop_sticky(cs, ce, &tfw_cfg_in_sticky); + return tfw_cfgop_sticky_sess(cs, ce, &tfw_cfg_sticky_sess); } static int -tfw_cfgop_out_sticky(TfwCfgSpec *cs, TfwCfgEntry *ce) +tfw_cfgop_out_sticky_sess(TfwCfgSpec *cs, TfwCfgEntry *ce) { - return tfw_cfgop_sticky(cs, ce, &tfw_cfg_out_sticky); + return tfw_cfgop_sticky_sess(cs, ce, &tfw_cfg_out_sticky_sess); } static int tfw_cfgop_in_conn_retries(TfwCfgSpec *cs, TfwCfgEntry *ce) { - return tfw_cfgop_intval(cs, ce, &tfw_cfg_in_cns_retries); + return tfw_cfgop_intval(cs, ce, &tfw_cfg_cns_retries); } static int @@ -729,93 +720,90 @@ tfw_cfgop_out_conn_retries(TfwCfgSpec *cs, TfwCfgEntry *ce) return tfw_cfgop_intval(cs, ce, &tfw_cfg_out_cns_retries); } -static int -tfw_cfgop_set_conn_retries(TfwSrvGroup *sg, int recns) -{ - if (!recns) { - sg->max_recns = UINT_MAX; - } else if (recns < ARRAY_SIZE(tfw_srv_tmo_vals)) { - sg->max_recns = ARRAY_SIZE(tfw_srv_tmo_vals); - } else { - sg->max_recns = recns; - } - - return 0; -} +/* Default and maximum values for "server" options. */ +#define TFW_CFG_SRV_CONNS_N_DEF 32 /* Default # of connections */ +#define TFW_CFG_SRV_WEIGHT_MIN 1 /* Min static weight value */ +#define TFW_CFG_SRV_WEIGHT_MAX 100 /* Max static weight value */ +#define TFW_CFG_SRV_WEIGHT_DEF 50 /* Dflt static weight value */ -/* +/** * Common code to handle 'server' directive. 
*/ static int -tfw_cfgop_server(TfwCfgSpec *cs, TfwCfgEntry *ce, - TfwSrvGroup *sg, TfwServer **arg_srv, int *arg_conns_n) +tfw_cfgop_server(TfwCfgSpec *cs, TfwCfgEntry *ce, struct list_head *slst) { TfwAddr addr; TfwServer *srv; - int i, conns_n = 0; - bool has_conns_n = false; - const char *key, *val, *saddr; + int i, conns_n = 0, weight = 0; + bool has_conns_n = false, has_weight = false; + const char *key, *val; if (ce->val_n != 1) { - TFW_ERR_NL("%s: %s %s: Invalid number of arguments: %zd\n", - sg->name, cs->name, ce->val_n ? ce->vals[0] : "", - ce->val_n); + TFW_ERR_NL("Invalid number of arguments: %zu\n", ce->val_n); return -EINVAL; } if (ce->attr_n > 2) { - TFW_ERR_NL("%s: %s %s: Invalid number of key=value pairs: %zd\n", - sg->name, cs->name, ce->vals[0], ce->attr_n); + TFW_ERR_NL("Invalid number of key=value pairs: %zu\n", + ce->attr_n); return -EINVAL; } - saddr = ce->vals[0]; - - if (tfw_addr_pton(&TFW_STR_FROM(saddr), &addr)) { - TFW_ERR_NL("%s: %s %s: Invalid IP address: '%s'\n", - sg->name, cs->name, saddr, saddr); + if (tfw_addr_pton(&TFW_STR_FROM(ce->vals[0]), &addr)) { + TFW_ERR_NL("Invalid IP address: '%s'\n", ce->vals[0]); return -EINVAL; } TFW_CFG_ENTRY_FOR_EACH_ATTR(ce, i, key, val) { if (!strcasecmp(key, "conns_n")) { if (has_conns_n) { - TFW_ERR_NL("%s: %s %s: Duplicate arg: '%s=%s'" - "\n", sg->name, cs->name, saddr, key, - val); + TFW_ERR_NL("Duplicate argument: '%s'\n", key); return -EINVAL; } if (tfw_cfg_parse_int(val, &conns_n)) { - TFW_ERR_NL("%s: %s %s: Invalid value: '%s=%s'" - "\n", sg->name, cs->name, saddr, key, - val); + TFW_ERR_NL("Invalid value: '%s'\n", val); return -EINVAL; } has_conns_n = true; + } else if (!strcasecmp(key, "weight")) { + if (has_weight) { + TFW_ERR_NL("Duplicate argument: '%s'\n", key); + return -EINVAL; + } + if (tfw_cfg_parse_int(val, &weight)) { + TFW_ERR_NL("Invalid value: '%s'\n", val); + return -EINVAL; + } + has_weight = true; } else { - TFW_ERR_NL("%s: %s %s: Unsupported argument: '%s=%s'\n", - 
sg->name, cs->name, saddr, key, val); + TFW_ERR_NL("Unsupported argument: '%s'\n", key); return -EINVAL; } } if (!has_conns_n) { conns_n = TFW_CFG_SRV_CONNS_N_DEF; - } else if ((conns_n < 1) || (conns_n > TFW_SRV_MAX_CONN)) { - TFW_ERR_NL("%s: %s %s: Out of range of [1..%d]: 'conns_n=%d'\n", - sg->name, cs->name, saddr, TFW_SRV_MAX_CONN, - conns_n); + } else if ((conns_n < 1) || (conns_n > TFW_SRV_MAX_CONN_N)) { + TFW_ERR_NL("Out of range of [1..%d]: 'conns_n=%d'\n", + TFW_SRV_MAX_CONN_N, conns_n); + return -EINVAL; + } + /* Default weight is set only for static ratio scheduler. */ + if (has_weight && ((weight < TFW_CFG_SRV_WEIGHT_MIN) + || (weight > TFW_CFG_SRV_WEIGHT_MAX))) + { + TFW_ERR_NL("Out of range of [%d..%d]: 'weight=%d'\n", + TFW_CFG_SRV_WEIGHT_MIN, TFW_CFG_SRV_WEIGHT_MAX, + weight); return -EINVAL; } if (!(srv = tfw_server_create(&addr))) { - TFW_ERR_NL("%s: %s %s: Error handling the server\n", - sg->name, cs->name, saddr); + TFW_ERR_NL("Error handling the server: '%s'\n", ce->vals[0]); return -EINVAL; } - tfw_sg_add(sg, srv); - - *arg_srv = srv; - *arg_conns_n = conns_n; + srv->weight = weight; + srv->conn_n = conns_n; + list_add_tail(&srv->list, slst); return 0; } @@ -827,21 +815,13 @@ tfw_cfgop_server(TfwCfgSpec *cs, TfwCfgEntry *ce, * server 10.0.0.2; * server 10.0.0.3 conns_n=1; * } - * - * Every server is simply added to the tfw_srv_cfg_curr_group. 
*/ static int tfw_cfgop_in_server(TfwCfgSpec *cs, TfwCfgEntry *ce) { - int nconn; - TfwServer *srv; - - if (tfw_cfg_in_slstsz >= TFW_SG_MAX_SRV) - return -EINVAL; - if (tfw_cfgop_server(cs, ce, tfw_cfg_in_sg, &srv, &nconn)) + if (tfw_cfgop_server(cs, ce, tfw_cfg_slst)) return -EINVAL; - tfw_cfg_in_nconn[tfw_cfg_in_slstsz] = nconn; - tfw_cfg_in_slst[tfw_cfg_in_slstsz++] = srv; + tfw_cfg_slstsz++; return 0; } @@ -868,28 +848,9 @@ tfw_cfgop_in_server(TfwCfgSpec *cs, TfwCfgEntry *ce) static int tfw_cfgop_out_server(TfwCfgSpec *cs, TfwCfgEntry *ce) { - int nconn; - TfwServer *srv; - - if (tfw_cfg_out_slstsz >= TFW_SG_MAX_SRV) - return -EINVAL; - /* - * The group "default" is created implicitly, and only when - * a server outside of any group is found in the configuration. - */ - if (!tfw_cfg_out_sg) { - static const char __read_mostly s_default[] = "default"; - - if (!(tfw_cfg_out_sg = tfw_sg_new(s_default, GFP_KERNEL))) { - TFW_ERR_NL("Unable to add default server group\n"); - return -EINVAL; - } - } - - if (tfw_cfgop_server(cs, ce, tfw_cfg_out_sg, &srv, &nconn)) + if (tfw_cfgop_server(cs, ce, &tfw_cfg_out_slst)) return -EINVAL; - tfw_cfg_out_nconn[tfw_cfg_out_slstsz] = nconn; - tfw_cfg_out_slst[tfw_cfg_out_slstsz++] = srv; + tfw_cfg_out_slstsz++; return 0; } @@ -910,32 +871,127 @@ static int tfw_cfgop_begin_srv_group(TfwCfgSpec *cs, TfwCfgEntry *ce) { if (ce->val_n != 1) { - TFW_ERR_NL("%s %s: Invalid number of arguments: %zd\n", - cs->name, ce->val_n ? 
ce->vals[0] : "", ce->val_n); - return -EINVAL; + TFW_ERR_NL("Invalid number of arguments: %zu\n", ce->val_n); + return -EINVAL; } if (ce->attr_n) { - TFW_ERR_NL("%s %s: Arguments may not have the \'=\' sign\n", - cs->name, ce->vals[0]); + TFW_ERR_NL("Arguments may not have the \'=\' sign\n"); return -EINVAL; } - if (!(tfw_cfg_in_sg = tfw_sg_new(ce->vals[0], GFP_KERNEL))) { - TFW_ERR_NL("%s %s: Unable to add group\n", cs->name, - ce->vals[0]); + if (!(tfw_cfg_sg = tfw_sg_new(ce->vals[0], GFP_KERNEL))) { + TFW_ERR_NL("Unable to add group: '%s'\n", ce->vals[0]); return -EINVAL; } - TFW_DBG("begin srv_group: %s\n", tfw_cfg_in_sg->name); + TFW_DBG("begin srv_group: %s\n", tfw_cfg_sg->name); - tfw_cfg_in_slstsz = 0; - tfw_cfg_in_sched = tfw_cfg_out_sched; - tfw_cfg_in_queue_size = tfw_cfg_out_queue_size; - tfw_cfg_in_fwd_timeout = tfw_cfg_out_fwd_timeout; - tfw_cfg_in_fwd_retries = tfw_cfg_out_fwd_retries; - tfw_cfg_in_cns_retries = tfw_cfg_out_cns_retries; - tfw_cfg_in_retry_nip = tfw_cfg_out_retry_nip; - tfw_cfg_in_sticky = tfw_cfg_out_sticky; + tfw_cfg_queue_size = tfw_cfg_out_queue_size; + tfw_cfg_fwd_timeout = tfw_cfg_out_fwd_timeout; + tfw_cfg_fwd_retries = tfw_cfg_out_fwd_retries; + tfw_cfg_cns_retries = tfw_cfg_out_cns_retries; + tfw_cfg_retry_nip = tfw_cfg_out_retry_nip; + tfw_cfg_sticky_sess = tfw_cfg_out_sticky_sess; + tfw_cfg_sg_flags = tfw_cfg_out_sg_flags; + tfw_cfg_sched = tfw_cfg_out_sched; + tfw_cfg_schref = tfw_cfg_out_schref; + + BUG_ON(!list_empty(&tfw_cfg_in_slst)); + tfw_cfg_slst = &tfw_cfg_in_slst; + tfw_cfg_slstsz = 0; + + return 0; +} + +static int +tfw_cfg_sg_ratio_adjust(struct list_head *slst) +{ + TfwServer *srv; + + list_for_each_entry(srv, slst, list) + if (!srv->weight) + srv->weight = TFW_CFG_SRV_WEIGHT_DEF; + return 0; +} + +static int +tfw_cfg_sg_ratio_verify(void) +{ + TfwServer *srv; + int count = 0; + + if (tfw_cfg_sg->flags & (TFW_SG_F_SCHED_RATIO_DYNAMIC + || TFW_SG_F_SCHED_RATIO_PREDICT)) + { + list_for_each_entry(srv, 
tfw_cfg_slst, list) { + if (srv->weight) + break; + ++count; + } + if (count < tfw_cfg_slstsz) { + TFW_ERR_NL("srv_group %s: static weight [%d] used " + "with 'dynamic' scheduler option\n", + tfw_cfg_sg->name, srv->weight); + return -EINVAL; + } + } + + return 0; +} + +static int +tfw_cfgop_setup_srv_group(void) +{ + int ret; + TfwServer *srv, *tmp; + + BUG_ON(!tfw_cfg_sg); + BUG_ON(!tfw_cfg_sched); + + tfw_cfg_sg->max_qsize = tfw_cfg_queue_size ? : UINT_MAX; + tfw_cfg_sg->max_jqage = tfw_cfg_fwd_timeout + ? msecs_to_jiffies(tfw_cfg_fwd_timeout * 1000) + : ULONG_MAX; + tfw_cfg_sg->max_refwd = tfw_cfg_fwd_retries ? : UINT_MAX; + tfw_cfg_sg->max_recns = tfw_cfg_cns_retries + ? max_t(int, tfw_cfg_cns_retries, + ARRAY_SIZE(tfw_srv_tmo_vals)) + : UINT_MAX; + + tfw_cfg_sg->flags = tfw_cfg_sg_flags; + tfw_cfg_sg->flags |= tfw_cfg_retry_nip | tfw_cfg_sticky_sess; + tfw_cfg_sg->sched_data = tfw_cfg_schref; + + /* + * Check 'ratio' scheduler configuration for incompatibilities. + * Set weight to default value for each server in the group + * if no weight is provided in the configuration. For dynamic + * or predictive ratios this sets initial equal weights to all + * servers. + */ + if (!strcasecmp(tfw_cfg_sched->name, "ratio")) { + if (tfw_cfg_sg_ratio_verify()) + return -EINVAL; + if (tfw_cfg_sg_ratio_adjust(tfw_cfg_slst)) + return -EINVAL; + } + /* Set up the server group with all servers that are in it. */ + list_for_each_entry_safe(srv, tmp, tfw_cfg_slst, list) { + if ((ret = tfw_sock_srv_add_conns(srv)) != 0) + return ret; + list_del(&srv->list); + tfw_sg_add(tfw_cfg_sg, srv); + } + /* + * Set up a scheduler and add the server group to the scheduler. + * Must be called only after the server group is set up with all + * servers (and all connections) that are in it. 
+ */ + if (tfw_sg_set_sched(tfw_cfg_sg, tfw_cfg_sched->name)) { + TFW_ERR_NL("Unable to add srv_group '%s' to scheduler '%s'\n", + tfw_cfg_sg->name, tfw_cfg_sched->name); + return -EINVAL; + } return 0; } @@ -953,41 +1009,173 @@ tfw_cfgop_begin_srv_group(TfwCfgSpec *cs, TfwCfgEntry *ce) static int tfw_cfgop_finish_srv_group(TfwCfgSpec *cs) { - int i; - TfwSrvGroup *sg = tfw_cfg_in_sg; - - BUG_ON(!sg); - BUG_ON(list_empty(&sg->srv_list)); - BUG_ON(!tfw_cfg_in_sched); - TFW_DBG("finish srv_group: %s\n", sg->name); - - tfw_cfgop_set_conn_retries(sg, tfw_cfg_in_cns_retries); - sg->max_qsize = tfw_cfg_in_queue_size ? : UINT_MAX; - sg->max_jqage = tfw_cfg_in_fwd_timeout - ? msecs_to_jiffies(tfw_cfg_in_fwd_timeout * 1000) - : ULONG_MAX; - sg->max_refwd = tfw_cfg_in_fwd_retries ? : UINT_MAX; - sg->flags |= tfw_cfg_in_retry_nip ? TFW_SRV_RETRY_NIP : 0; - sg->flags |= tfw_cfg_in_sticky; - - if (tfw_sg_set_sched(sg, tfw_cfg_in_sched->name)) { - TFW_ERR_NL("%s %s: Unable to set scheduler: '%s'\n", - cs->name, sg->name, tfw_cfg_in_sched->name); - return -EINVAL; + BUG_ON(list_empty(&tfw_cfg_sg->srv_list)); + TFW_DBG("finish srv_group: %s\n", tfw_cfg_sg->name); + + return tfw_cfgop_setup_srv_group(); +} + +static int +tfw_cfg_handle_ratio_predyn_opts(TfwCfgEntry *ce, unsigned int *arg_flags) +{ + unsigned int idx, value, flags = *arg_flags; + + if (ce->val_n < 3) { + /* Default dynamic type. */ + flags |= TFW_PSTATS_IDX_AVG; + goto done; } - /* Add connections only after a scheduler is set. 
*/ - for (i = 0; i < tfw_cfg_in_slstsz; ++i) { - TfwServer *srv = tfw_cfg_in_slst[i]; - if (tfw_sock_srv_add_conns(srv, tfw_cfg_in_nconn[i])) { - char as[TFW_ADDR_STR_BUF_SIZE] = { 0 }; - tfw_addr_ntop(&srv->addr, as, sizeof(as)); - TFW_ERR_NL("%s %s: server '%s': " - "Error adding connections\n", - cs->name, sg->name, as); + if (!strcasecmp(ce->vals[2], "minimum")) { + idx = TFW_PSTATS_IDX_MIN; + }else if (!strcasecmp(ce->vals[2], "maximum")) { + idx = TFW_PSTATS_IDX_MAX; + } else if (!strcasecmp(ce->vals[2], "average")) { + idx = TFW_PSTATS_IDX_AVG; + } else if (!strcasecmp(ce->vals[2], "percentile")) { + if (ce->val_n < 4) { + /* Default percentile. */ + flags |= TFW_PSTATS_IDX_P90; + goto done; + } + if (tfw_cfg_parse_int(ce->vals[3], &value)) { + TFW_ERR_NL("Invalid value: '%s'\n", ce->vals[3]); + return -EINVAL; + } + for (idx = 0; idx < ARRAY_SIZE(tfw_pstats_ith); ++idx) { + if (!tfw_pstats_ith[idx]) + continue; + if (tfw_pstats_ith[idx] == value) + break; + } + if (idx == ARRAY_SIZE(tfw_pstats_ith)) { + TFW_ERR_NL("Invalid value: '%s'\n", ce->vals[3]); return -EINVAL; } + } else { + TFW_ERR_NL("Unsupported argument: '%s'\n", ce->vals[2]); + return -EINVAL; + } + flags |= idx; + +done: + *arg_flags = flags; + return 0; +} + +/* Default and maximum values for "sched ratio predict" options. 
*/ +#define TFW_CFG_PAST_DEF 30 /* 30 secs of past APM vals */ +#define TFW_CFG_PAST_MAX 120 /* 120 secs of past APM vals */ +#define TFW_CFG_RATE_DEF 20 /* 20 times/sec */ +#define TFW_CFG_RATE_MAX 20 /* 20 times/sec */ + +static int +tfw_cfg_handle_ratio_predict(TfwCfgEntry *ce, + void *arg_schref, unsigned int *arg_flags) +{ + int i, ret; + const char *key, *val; + bool has_past = false, has_rate = false, has_ahead = false; + TfwSchrefPredict schref = { 0 }; + + if ((ret = tfw_cfg_handle_ratio_predyn_opts(ce, arg_flags))) + return ret; + + TFW_CFG_ENTRY_FOR_EACH_ATTR(ce, i, key, val) { + if (!strcasecmp(key, "past")) { + if (has_past) { + TFW_ERR_NL("Duplicate argument: '%s'\n", key); + return -EINVAL; + } + if (tfw_cfg_parse_int(val, &schref.past)) { + TFW_ERR_NL("Invalid value: '%s'\n", val); + return -EINVAL; + } + has_past = true; + } else if (!strcasecmp(key, "rate")) { + if (has_rate) { + TFW_ERR_NL("Duplicate argument: '%s'\n", key); + return -EINVAL; + } + if (tfw_cfg_parse_int(val, &schref.rate)) { + TFW_ERR_NL("Invalid value: '%s'\n", val); + return -EINVAL; + } + has_rate = true; + } else if (!strcasecmp(key, "ahead")) { + if (has_ahead) { + TFW_ERR_NL("Duplicate argument: '%s'\n", key); + return -EINVAL; + } + if (tfw_cfg_parse_int(val, &schref.ahead)) { + TFW_ERR_NL("Invalid value: '%s'\n", val); + return -EINVAL; + } + has_ahead = true; + } + } + if (!has_past) { + schref.past = TFW_CFG_PAST_DEF; + } else if ((schref.past < 1) || (schref.past > TFW_CFG_PAST_MAX)) { + TFW_ERR_NL("Out of range of [1..%d]: 'past=%d'\n", + TFW_CFG_PAST_MAX, schref.past); + return -EINVAL; + } + if (!has_rate) { + schref.rate = TFW_CFG_RATE_DEF; + } else if ((schref.rate < 1) || (schref.rate > TFW_CFG_RATE_MAX)) { + TFW_ERR_NL("Out of range of [1..%d]: 'rate=%d'\n", + TFW_CFG_RATE_MAX, schref.rate); + return -EINVAL; + } + if (!has_ahead) { + schref.ahead = schref.past > 1 ? 
schref.past / 2 : 1; + } else if ((schref.ahead < 1) || (schref.ahead > schref.past / 2)) { + TFW_ERR_NL("Out of range of [1..%d]: 'ahead=%d'." + "Can't be greater than half of 'past=%d'.\n", + schref.past / 2, schref.ahead, schref.past); + return -EINVAL; } + *(TfwSchrefPredict *)arg_schref = schref; + return 0; +} + +static int +tfw_cfg_handle_ratio_dynamic(TfwCfgEntry *ce, unsigned int *arg_flags) +{ + if (ce->attr_n) { + TFW_ERR_NL("Arguments may not have the \'=\' sign\n"); + return -EINVAL; + } + + return tfw_cfg_handle_ratio_predyn_opts(ce, arg_flags); +} + +static int +tfw_cfg_handle_ratio(TfwCfgEntry *ce, void *schref, unsigned int *sg_flags) +{ + int ret; + unsigned int flags; + + if (ce->val_n < 2) { + /* Default ratio scheduler type. */ + flags = TFW_SG_F_SCHED_RATIO_STATIC; + } else if (!strcasecmp(ce->vals[1], "static")) { + flags = TFW_SG_F_SCHED_RATIO_STATIC; + } else if (!strcasecmp(ce->vals[1], "dynamic")) { + flags = TFW_SG_F_SCHED_RATIO_DYNAMIC; + if ((ret = tfw_cfg_handle_ratio_dynamic(ce, &flags))) + return ret; + } else if (!strcasecmp(ce->vals[1], "predict")) { + flags = TFW_SG_F_SCHED_RATIO_PREDICT; + if ((ret = tfw_cfg_handle_ratio_predict(ce, schref, &flags))) + return ret; + } else { + TFW_ERR_NL("Unsupported argument: '%s'\n", ce->vals[1]); + return -EINVAL; + } + + *sg_flags = flags; return 0; } @@ -995,27 +1183,25 @@ tfw_cfgop_finish_srv_group(TfwCfgSpec *cs) * Common code to handle 'sched' directive. 
*/ static int -tfw_cfgop_sched(TfwCfgSpec *cs, TfwCfgEntry *ce, TfwScheduler **arg_sched) +tfw_cfgop_sched(TfwCfgSpec *cs, TfwCfgEntry *ce, TfwScheduler **arg_sched, + void *schref, unsigned int *sg_flags) { TfwScheduler *sched; if (!ce->val_n) { - TFW_ERR_NL("%s: Invalid number of arguments: %zd\n", - cs->name, ce->val_n); - return -EINVAL; - } - if (ce->attr_n) { - TFW_ERR_NL("%s %s: Arguments may not have the \'=\' sign\n", - cs->name, ce->vals[0]); + TFW_ERR_NL("Invalid number of arguments: %zu\n", ce->val_n); return -EINVAL; } if (!(sched = tfw_sched_lookup(ce->vals[0]))) { - TFW_ERR_NL("%s %s: Unrecognized scheduler: '%s'\n", - cs->name, ce->vals[0], ce->vals[0]); + TFW_ERR_NL("Unrecognized scheduler: '%s'\n", ce->vals[0]); return -EINVAL; } + if (!strcasecmp(sched->name, "ratio")) + if (tfw_cfg_handle_ratio(ce, schref, sg_flags)) + return -EINVAL; + *arg_sched = sched; return 0; @@ -1024,13 +1210,21 @@ tfw_cfgop_sched(TfwCfgSpec *cs, TfwCfgEntry *ce, TfwScheduler **arg_sched) static int tfw_cfgop_in_sched(TfwCfgSpec *cs, TfwCfgEntry *ce) { - return tfw_cfgop_sched(cs, ce, &tfw_cfg_in_sched); + tfw_cfg_schref = &tfw_cfg_schref_predict; + + return tfw_cfgop_sched(cs, ce, &tfw_cfg_sched, + tfw_cfg_schref, + &tfw_cfg_sg_flags); } static int tfw_cfgop_out_sched(TfwCfgSpec *cs, TfwCfgEntry *ce) { - return tfw_cfgop_sched(cs, ce, &tfw_cfg_out_sched); + tfw_cfg_out_schref = &tfw_cfg_out_schref_predict; + + return tfw_cfgop_sched(cs, ce, &tfw_cfg_out_sched, + tfw_cfg_out_schref, + &tfw_cfg_out_sg_flags); } /** @@ -1039,56 +1233,66 @@ tfw_cfgop_out_sched(TfwCfgSpec *cs, TfwCfgEntry *ce) static void tfw_clean_srv_groups(TfwCfgSpec *cs) { + TfwServer *srv, *tmp; + + list_for_each_entry_safe(srv, tmp, &tfw_cfg_in_slst, list) { + list_del(&srv->list); + tfw_sock_srv_del_conns(srv); + tfw_server_destroy(srv); + } + list_for_each_entry_safe(srv, tmp, &tfw_cfg_out_slst, list) { + list_del(&srv->list); + tfw_sock_srv_del_conns(srv); + tfw_server_destroy(srv); + } + + 
tfw_cfg_sg = tfw_cfg_out_sg = NULL; + tfw_cfg_sched = tfw_cfg_out_sched = NULL; + tfw_cfg_schref = tfw_cfg_out_schref = NULL; + tfw_cfg_slstsz = tfw_cfg_out_slstsz = 0; + tfw_sock_srv_delete_all_conns(); tfw_sg_release_all(); - - tfw_cfg_in_sg = tfw_cfg_out_sg = NULL; - tfw_cfg_in_sched = tfw_cfg_out_sched = NULL; - tfw_cfg_in_slstsz = tfw_cfg_out_slstsz = 0; } static int tfw_sock_srv_start(void) { - int i, ret; - TfwSrvGroup *sg = tfw_cfg_out_sg; - - if (sg) { - BUG_ON(!tfw_cfg_out_sched); - - tfw_cfgop_set_conn_retries(sg, tfw_cfg_out_cns_retries); - sg->max_qsize = tfw_cfg_out_queue_size ? : UINT_MAX; - sg->max_jqage = tfw_cfg_out_fwd_timeout - ? msecs_to_jiffies(tfw_cfg_out_fwd_timeout * 1000) - : ULONG_MAX; - sg->max_refwd = tfw_cfg_out_fwd_retries ? : UINT_MAX; - sg->flags |= tfw_cfg_out_retry_nip ? TFW_SRV_RETRY_NIP : 0; - sg->flags |= tfw_cfg_out_sticky; + int ret; - if (tfw_sg_set_sched(sg, tfw_cfg_out_sched->name)) { - TFW_ERR_NL("srv_group %s: Unable to set scheduler: " - "'%s'\n", sg->name, tfw_cfg_out_sched->name); + /* + * The group "default" is created implicitly, and only when + * a server outside of any group is found in the configuration. + */ + if (tfw_cfg_out_slstsz) { + tfw_cfg_out_sg = tfw_sg_new("default", GFP_KERNEL); + if (!tfw_cfg_out_sg) { + TFW_ERR_NL("Unable to add default server group\n"); return -EINVAL; } - /* Add connections only after a scheduler is set. 
*/ - for (i = 0; i < tfw_cfg_out_slstsz; ++i) { - TfwServer *srv = tfw_cfg_out_slst[i]; - if (tfw_sock_srv_add_conns(srv, tfw_cfg_out_nconn[i])) { - char as[TFW_ADDR_STR_BUF_SIZE] = { 0 }; - tfw_addr_ntop(&srv->addr, as, sizeof(as)); - TFW_ERR_NL("srv_group %s: server '%s': " - "Error adding connections\n", - sg->name, as); - return -EINVAL; - } - } + + tfw_cfg_cns_retries = tfw_cfg_out_cns_retries; + tfw_cfg_queue_size = tfw_cfg_out_queue_size; + tfw_cfg_fwd_timeout = tfw_cfg_out_fwd_timeout; + tfw_cfg_fwd_retries = tfw_cfg_out_fwd_retries; + tfw_cfg_sticky_sess = tfw_cfg_out_sticky_sess; + tfw_cfg_retry_nip = tfw_cfg_out_retry_nip; + tfw_cfg_sg_flags = tfw_cfg_out_sg_flags; + tfw_cfg_slst = &tfw_cfg_out_slst; + tfw_cfg_slstsz = tfw_cfg_out_slstsz; + tfw_cfg_sched = tfw_cfg_out_sched; + tfw_cfg_schref = tfw_cfg_out_schref; + tfw_cfg_sg = tfw_cfg_out_sg; + + if ((ret = tfw_cfgop_setup_srv_group())) + return ret; } /* * This must be executed only after the complete configuration * has been processed as it depends on configuration directives * that can be located anywhere in the configuration file. 
*/ - if ((ret = tfw_sg_for_each_srv(tfw_server_apm_create)) != 0) + if ((ret = tfw_sg_for_each_srv(tfw_apm_add_srv)) != 0) return ret; return tfw_sg_for_each_srv(tfw_sock_srv_connect_srv); @@ -1108,50 +1312,62 @@ static TfwCfgSpec tfw_srv_group_specs[] = { .cleanup = tfw_clean_srv_groups }, { - "sched", "round-robin", + "sched", "ratio static", tfw_cfgop_in_sched, .allow_none = true, .allow_repeat = false, .cleanup = tfw_clean_srv_groups, }, { - "server_queue_size", NULL, + "server_queue_size", "1000", tfw_cfgop_in_queue_size, .allow_none = true, .allow_repeat = false, .cleanup = tfw_clean_srv_groups, + .spec_ext = &(TfwCfgSpecInt) { + .range = { 0, INT_MAX }, + }, }, { - "server_forward_timeout", NULL, + "server_forward_timeout", "60", tfw_cfgop_in_fwd_timeout, .allow_none = true, .allow_repeat = false, .cleanup = tfw_clean_srv_groups, + .spec_ext = &(TfwCfgSpecInt) { + .range = { 0, INT_MAX }, + }, }, { - "server_forward_retries", NULL, + "server_forward_retries", "5", tfw_cfgop_in_fwd_retries, .allow_none = true, .allow_repeat = false, .cleanup = tfw_clean_srv_groups, + .spec_ext = &(TfwCfgSpecInt) { + .range = { 0, INT_MAX }, + }, }, { - "server_retry_nonidempotent", NULL, + "server_retry_nonidempotent", TFW_CFG_DFLT_VAL, tfw_cfgop_in_retry_nip, .allow_none = true, .allow_repeat = false, .cleanup = tfw_clean_srv_groups, }, { - "server_connect_retries", NULL, + "server_connect_retries", "10", tfw_cfgop_in_conn_retries, .allow_none = true, .allow_repeat = false, .cleanup = tfw_clean_srv_groups, + .spec_ext = &(TfwCfgSpecInt) { + .range = { 0, INT_MAX }, + }, }, { - "sticky_sessions", NULL, - tfw_cfgop_in_sticky, + "sticky_sessions", TFW_CFG_DFLT_VAL, + tfw_cfgop_in_sticky_sess, .allow_none = true, .allow_repeat = false, .cleanup = tfw_clean_srv_groups, @@ -1172,50 +1388,62 @@ TfwCfgMod tfw_sock_srv_cfg_mod = { .cleanup = tfw_clean_srv_groups, }, { - "sched", "round-robin", + "sched", "ratio static", tfw_cfgop_out_sched, .allow_none = true, .allow_repeat = false, 
.cleanup = tfw_clean_srv_groups, }, { - "server_queue_size", NULL, + "server_queue_size", "1000", tfw_cfgop_out_queue_size, .allow_none = true, - .allow_repeat = true, + .allow_repeat = false, .cleanup = tfw_clean_srv_groups, + .spec_ext = &(TfwCfgSpecInt) { + .range = { 0, INT_MAX }, + }, }, { - "server_forward_timeout", NULL, + "server_forward_timeout", "60", tfw_cfgop_out_fwd_timeout, .allow_none = true, - .allow_repeat = true, + .allow_repeat = false, .cleanup = tfw_clean_srv_groups, + .spec_ext = &(TfwCfgSpecInt) { + .range = { 0, INT_MAX }, + }, }, { - "server_forward_retries", NULL, + "server_forward_retries", "5", tfw_cfgop_out_fwd_retries, .allow_none = true, - .allow_repeat = true, + .allow_repeat = false, .cleanup = tfw_clean_srv_groups, + .spec_ext = &(TfwCfgSpecInt) { + .range = { 0, INT_MAX }, + }, }, { - "server_retry_non_idempotent", NULL, + "server_retry_non_idempotent", TFW_CFG_DFLT_VAL, tfw_cfgop_out_retry_nip, .allow_none = true, - .allow_repeat = true, + .allow_repeat = false, .cleanup = tfw_clean_srv_groups, }, { - "server_connect_retries", NULL, + "server_connect_retries", "10", tfw_cfgop_out_conn_retries, .allow_none = true, - .allow_repeat = true, + .allow_repeat = false, .cleanup = tfw_clean_srv_groups, + .spec_ext = &(TfwCfgSpecInt) { + .range = { 0, INT_MAX }, + }, }, { - "sticky_sessions", NULL, - tfw_cfgop_out_sticky, + "sticky_sessions", TFW_CFG_DFLT_VAL, + tfw_cfgop_out_sticky_sess, .allow_none = true, .allow_repeat = false, .cleanup = tfw_clean_srv_groups, @@ -1245,6 +1473,7 @@ TfwCfgMod tfw_sock_srv_cfg_mod = { int tfw_sock_srv_init(void) { + BUILD_BUG_ON(_TFW_PSTATS_IDX_COUNT > TFW_SG_M_PSTATS_IDX); BUG_ON(tfw_srv_conn_cache); tfw_srv_conn_cache = kmem_cache_create("tfw_srv_conn_cache", sizeof(TfwSrvConn), 0, 0, NULL); diff --git a/tempesta_fw/t/unit/Makefile b/tempesta_fw/t/unit/Makefile index 7dee41b58..57c6ff6d3 100644 --- a/tempesta_fw/t/unit/Makefile +++ b/tempesta_fw/t/unit/Makefile @@ -36,7 +36,7 @@ tfw_test-objs = \ 
test_tfw_str.o \ test_http_parser.o \ sched_helper.o \ - test_sched_rr.o \ + test_sched_ratio.o \ test_sched_hash.o \ test_sched_http.o \ test_http_sticky.o \ diff --git a/tempesta_fw/t/unit/sched_helper.c b/tempesta_fw/t/unit/sched_helper.c index 0bd4366d4..f22bff929 100644 --- a/tempesta_fw/t/unit/sched_helper.c +++ b/tempesta_fw/t/unit/sched_helper.c @@ -49,7 +49,7 @@ test_spec_cleanup(TfwCfgSpec specs[]) } TfwSrvGroup * -test_create_sg(const char *name, const char *sched_name) +test_create_sg(const char *name) { TfwSrvGroup *sg; @@ -58,11 +58,6 @@ test_create_sg(const char *name, const char *sched_name) sg = tfw_sg_new(name, GFP_ATOMIC); BUG_ON(!sg); - { - int r = tfw_sg_set_sched(sg, sched_name); - BUG_ON(r); - } - sg->max_qsize = 100; kernel_fpu_begin(); @@ -70,6 +65,20 @@ test_create_sg(const char *name, const char *sched_name) return sg; } +void +test_start_sg(TfwSrvGroup *sg, const char *sched_name, unsigned int flags) +{ + int r; + + kernel_fpu_end(); + + sg->flags = flags; + r = tfw_sg_set_sched(sg, sched_name); + BUG_ON(r); + + kernel_fpu_begin(); +} + void test_sg_release_all(void) { @@ -100,28 +109,30 @@ test_create_srv(const char *in_addr, TfwSrvGroup *sg) } TfwSrvConn * -test_create_conn(TfwPeer *peer) +test_create_srv_conn(TfwServer *srv) { static struct sock __test_sock = { .sk_state = TCP_ESTABLISHED, }; - TfwConn *conn; + TfwSrvConn *srv_conn; kernel_fpu_end(); if (!tfw_srv_conn_cache) tfw_sock_srv_init(); - conn = (TfwConn *)tfw_srv_conn_alloc(); - BUG_ON(!conn); + srv_conn = tfw_srv_conn_alloc(); + BUG_ON(!srv_conn); - tfw_connection_link_peer(conn, peer); - conn->sk = &__test_sock; + tfw_connection_link_peer((TfwConn *)srv_conn, (TfwPeer *)srv); + srv_conn->sk = &__test_sock; /* A connection is skipped by schedulers if (refcnt <= 0). 
*/ - tfw_connection_revive(conn); + tfw_connection_revive((TfwConn *)srv_conn); + + srv->conn_n++; kernel_fpu_begin(); - return (TfwSrvConn *)conn; + return srv_conn; } void @@ -156,7 +167,8 @@ test_sched_sg_empty_sg(struct TestSchedHelper *sched_helper) BUG_ON(!sched_helper->get_sched_arg); BUG_ON(!sched_helper->free_sched_arg); - sg = test_create_sg("test", sched_helper->sched); + sg = test_create_sg("test"); + test_start_sg(sg, sched_helper->sched, sched_helper->flags); for (i = 0; i < sched_helper->conn_types; ++i) { TfwMsg *msg = sched_helper->get_sched_arg(i); @@ -185,9 +197,9 @@ test_sched_sg_one_srv_zero_conn(struct TestSchedHelper *sched_helper) BUG_ON(!sched_helper->get_sched_arg); BUG_ON(!sched_helper->free_sched_arg); - sg = test_create_sg("test", sched_helper->sched); - + sg = test_create_sg("test"); test_create_srv("127.0.0.1", sg); + test_start_sg(sg, sched_helper->sched, sched_helper->flags); for (i = 0; i < sched_helper->conn_types; ++i) { TfwMsg *msg = sched_helper->get_sched_arg(i); @@ -217,22 +229,23 @@ test_sched_sg_max_srv_zero_conn(struct TestSchedHelper *sched_helper) BUG_ON(!sched_helper->get_sched_arg); BUG_ON(!sched_helper->free_sched_arg); - sg = test_create_sg("test", sched_helper->sched); + sg = test_create_sg("test"); - for (j = 0; j < TFW_SG_MAX_SRV; ++j) + for (j = 0; j < TFW_TEST_SG_MAX_SRV_N; ++j) test_create_srv("127.0.0.1", sg); + test_start_sg(sg, sched_helper->sched, sched_helper->flags); for (i = 0; i < sched_helper->conn_types; ++i) { TfwMsg *msg = sched_helper->get_sched_arg(i); - for (j = 0; j < TFW_SG_MAX_SRV; ++j) { + for (j = 0; j < sg->srv_n; ++j) { TfwSrvConn *srv_conn = - sg->sched->sched_sg_conn(msg, sg); + sg->sched->sched_sg_conn(msg, sg); EXPECT_NULL(srv_conn); /* - * Don't let wachtdog wuppose that we have stucked - * on long cycles. + * Don't let the kernel watchdog decide + * that we're stuck in a locked context. 
*/ kernel_fpu_end(); schedule(); @@ -261,9 +274,9 @@ test_sched_srv_one_srv_zero_conn(struct TestSchedHelper *sched_helper) BUG_ON(!sched_helper->get_sched_arg); BUG_ON(!sched_helper->free_sched_arg); - sg = test_create_sg("test", sched_helper->sched); - + sg = test_create_sg("test"); srv = test_create_srv("127.0.0.1", sg); + test_start_sg(sg, sched_helper->sched, sched_helper->flags); for (i = 0; i < sched_helper->conn_types; ++i) { TfwMsg *msg = sched_helper->get_sched_arg(i); @@ -292,10 +305,11 @@ test_sched_srv_max_srv_zero_conn(struct TestSchedHelper *sched_helper) BUG_ON(!sched_helper->get_sched_arg); BUG_ON(!sched_helper->free_sched_arg); - sg = test_create_sg("test", sched_helper->sched); + sg = test_create_sg("test"); - for (j = 0; j < TFW_SG_MAX_SRV; ++j) + for (j = 0; j < TFW_TEST_SG_MAX_SRV_N; ++j) test_create_srv("127.0.0.1", sg); + test_start_sg(sg, sched_helper->sched, sched_helper->flags); for (i = 0; i < sched_helper->conn_types; ++i) { TfwMsg *msg = sched_helper->get_sched_arg(i); @@ -303,12 +317,12 @@ test_sched_srv_max_srv_zero_conn(struct TestSchedHelper *sched_helper) list_for_each_entry(srv, &sg->srv_list, list) { TfwSrvConn *srv_conn = - sg->sched->sched_srv_conn(msg, srv); + sg->sched->sched_srv_conn(msg, srv); EXPECT_NULL(srv_conn); /* - * Don't let wachtdog wuppose that we have stucked - * on long cycles. + * Don't let the kernel watchdog decide + * that we're stuck in a locked context. 
*/ kernel_fpu_end(); schedule(); @@ -331,41 +345,47 @@ test_sched_srv_offline_srv(struct TestSchedHelper *sched_helper) size_t offline_num = 3; TfwServer *offline_srv = NULL; TfwSrvGroup *sg; + TfwServer *srv; + TfwSrvConn *srv_conn; BUG_ON(!sched_helper); BUG_ON(!sched_helper->sched); BUG_ON(!sched_helper->conn_types); BUG_ON(!sched_helper->get_sched_arg); BUG_ON(!sched_helper->free_sched_arg); - BUG_ON(offline_num >= TFW_SG_MAX_SRV); + BUG_ON(offline_num >= TFW_TEST_SG_MAX_SRV_N); - sg = test_create_sg("test", sched_helper->sched); + sg = test_create_sg("test"); - for (i = 0; i < TFW_SG_MAX_SRV; ++i) { - TfwServer *srv = test_create_srv("127.0.0.1", sg); - TfwSrvConn *srv_conn = test_create_conn((TfwPeer *)srv); - sg->sched->add_conn(sg, srv, srv_conn); + for (i = 0; i < TFW_TEST_SG_MAX_SRV_N; ++i) { + srv = test_create_srv("127.0.0.1", sg); + srv_conn = test_create_srv_conn(srv); - if (i == offline_num) { + if (i == offline_num) offline_srv = srv; - atomic_set(&srv_conn->refcnt, 0); + } + list_for_each_entry(srv, &sg->srv_list, list) { + if (srv == offline_srv) { + list_for_each_entry(srv_conn, &srv->conn_list, list) + atomic_set(&srv_conn->refcnt, 0); + break; } } + test_start_sg(sg, sched_helper->sched, sched_helper->flags); for (i = 0; i < sched_helper->conn_types; ++i) { TfwMsg *msg = sched_helper->get_sched_arg(i); - TfwServer *srv; + list_for_each_entry(srv, &sg->srv_list, list) { - TfwSrvConn *srv_conn = - sg->sched->sched_srv_conn(msg, srv); + srv_conn = sg->sched->sched_srv_conn(msg, srv); if (srv == offline_srv) EXPECT_NULL(srv_conn); else EXPECT_NOT_NULL(srv_conn); /* - * Don't let wachtdog wuppose that we have stucked - * on long cycles. + * Don't let the kernel watchdog decide + * that we're stuck in a locked context. 
*/ kernel_fpu_end(); schedule(); diff --git a/tempesta_fw/t/unit/sched_helper.h b/tempesta_fw/t/unit/sched_helper.h index 9f2546e14..cdc179513 100644 --- a/tempesta_fw/t/unit/sched_helper.h +++ b/tempesta_fw/t/unit/sched_helper.h @@ -26,23 +26,29 @@ #include "cfg.h" #include "connection.h" +#define TFW_TEST_SG_MAX_SRV_N 64 +#define TFW_TEST_SRV_MAX_CONN_N 64 +#define TFW_TEST_SG_MAX_CONN_N \ + (TFW_TEST_SG_MAX_SRV_N * TFW_TEST_SRV_MAX_CONN_N) + int tfw_server_init(void); -int tfw_sched_rr_init(void); +int tfw_sched_ratio_init(void); void sched_helper_init(void); void test_spec_cleanup(TfwCfgSpec specs[]); -TfwSrvGroup *test_create_sg(const char *name, const char *sched_name); +TfwSrvGroup *test_create_sg(const char *name); +void test_start_sg(TfwSrvGroup *sg, const char *sched_name, unsigned int flags); void test_sg_release_all(void); TfwServer *test_create_srv(const char *in_addr, TfwSrvGroup *sg); - -TfwSrvConn *test_create_conn(TfwPeer *peer); +TfwSrvConn *test_create_srv_conn(TfwServer *srv); void test_conn_release_all(TfwSrvGroup *sg); struct TestSchedHelper { const char *sched; size_t conn_types; + unsigned int flags; TfwMsg *(*get_sched_arg)(size_t conn_type); void (*free_sched_arg)(TfwMsg *); }; diff --git a/tempesta_fw/t/unit/test.c b/tempesta_fw/t/unit/test.c index cda0aa42b..8829c1c99 100644 --- a/tempesta_fw/t/unit/test.c +++ b/tempesta_fw/t/unit/test.c @@ -22,6 +22,9 @@ #include #include "test.h" +#undef tfw_apm_stats +#define tfw_apm_stats test_tfw_apm_stats + #include "apm.c" #include "vhost.c" @@ -92,7 +95,7 @@ TEST_SUITE(http_sticky); TEST_SUITE(http_match); TEST_SUITE(hash); TEST_SUITE(addr); -TEST_SUITE(sched_rr); +TEST_SUITE(sched_ratio); TEST_SUITE(sched_hash); TEST_SUITE(sched_http); @@ -117,7 +120,7 @@ test_run_all(void) TEST_SUITE_RUN(http_sticky); TEST_SUITE_RUN(hash); TEST_SUITE_RUN(addr); - TEST_SUITE_RUN(sched_rr); + TEST_SUITE_RUN(sched_ratio); TEST_SUITE_RUN(sched_hash); TEST_SUITE_RUN(sched_http); diff --git 
a/tempesta_fw/t/unit/test_sched_hash.c b/tempesta_fw/t/unit/test_sched_hash.c index 5c6c03682..f4e94d2a3 100644 --- a/tempesta_fw/t/unit/test_sched_hash.c +++ b/tempesta_fw/t/unit/test_sched_hash.c @@ -53,12 +53,7 @@ static char *req_strs[] = { }; static TfwMsg *sched_hash_get_arg(size_t conn_type); - -static void -sched_hash_free_arg(TfwMsg *msg) -{ - test_req_free((TfwHttpReq *)msg); -} +static void sched_hash_free_arg(TfwMsg *msg); static struct TestSchedHelper sched_helper_hash = { .sched = "hash", @@ -67,6 +62,12 @@ static struct TestSchedHelper sched_helper_hash = { .free_sched_arg = &sched_hash_free_arg, }; +static void +sched_hash_free_arg(TfwMsg *msg) +{ + test_req_free((TfwHttpReq *)msg); +} + static TfwMsg * sched_hash_get_arg(size_t conn_type) { @@ -96,20 +97,19 @@ TEST(tfw_sched_hash, sched_sg_one_srv_max_conn) { size_t i, j; - TfwSrvGroup *sg = test_create_sg("test", sched_helper_hash.sched); + TfwSrvGroup *sg = test_create_sg("test"); TfwServer *srv = test_create_srv("127.0.0.1", sg); - for (i = 0; i < TFW_SRV_MAX_CONN; ++i) { - TfwSrvConn *srv_conn = test_create_conn((TfwPeer *)srv); - sg->sched->add_conn(sg, srv, srv_conn); - } + for (i = 0; i < TFW_TEST_SRV_MAX_CONN_N; ++i) + test_create_srv_conn(srv); + test_start_sg(sg, sched_helper_hash.sched, 0); /* Check that every request is scheduled to the same connection. */ for (i = 0; i < sched_helper_hash.conn_types; ++i) { TfwMsg *msg = sched_helper_hash.get_sched_arg(i); TfwSrvConn *exp_conn = NULL; - for (j = 0; j < TFW_SRV_MAX_CONN; ++j) { + for (j = 0; j < srv->conn_n; ++j) { TfwSrvConn *srv_conn = sg->sched->sched_sg_conn(msg, sg); EXPECT_NOT_NULL(srv_conn); @@ -123,8 +123,8 @@ TEST(tfw_sched_hash, sched_sg_one_srv_max_conn) tfw_srv_conn_put(srv_conn); /* - * Don't let wachtdog suppose that we have stucked - * on long cycles. + * Don't let the kernel watchdog decide + * that we are stuck in locked context. 
*/ kernel_fpu_end(); schedule(); @@ -144,25 +144,24 @@ TEST(tfw_sched_hash, sched_sg_max_srv_zero_conn) TEST(tfw_sched_hash, sched_sg_max_srv_max_conn) { - size_t i, j; + unsigned long i, j; - TfwSrvGroup *sg = test_create_sg("test", sched_helper_hash.sched); + TfwSrvGroup *sg = test_create_sg("test"); - for (i = 0; i < TFW_SG_MAX_SRV; ++i) { + for (i = 0; i < TFW_TEST_SG_MAX_SRV_N; ++i) { TfwServer *srv = test_create_srv("127.0.0.1", sg); - for (j = 0; j < TFW_SRV_MAX_CONN; ++j) { - TfwSrvConn *srv_conn = test_create_conn((TfwPeer *)srv); - sg->sched->add_conn(sg, srv, srv_conn); - } + for (j = 0; j < TFW_TEST_SRV_MAX_CONN_N; ++j) + test_create_srv_conn(srv); } + test_start_sg(sg, sched_helper_hash.sched, 0); /* Check that every request is scheduled to the same connection. */ for (i = 0; i < sched_helper_hash.conn_types; ++i) { TfwMsg *msg = sched_helper_hash.get_sched_arg(i); TfwSrvConn *exp_conn = NULL; - for (j = 0; j < TFW_SG_MAX_SRV * TFW_SRV_MAX_CONN; ++j) { + for (j = 0; j < TFW_TEST_SG_MAX_CONN_N; ++j) { TfwSrvConn *srv_conn = sg->sched->sched_sg_conn(msg, sg); EXPECT_NOT_NULL(srv_conn); @@ -176,8 +175,8 @@ TEST(tfw_sched_hash, sched_sg_max_srv_max_conn) tfw_srv_conn_put(srv_conn); /* - * Don't let wachtdog suppose that we have stucked - * on long cycles. + * Don't let the kernel watchdog decide + * that we are stuck in locked context. */ kernel_fpu_end(); schedule(); @@ -199,20 +198,19 @@ TEST(tfw_sched_hash, sched_srv_one_srv_max_conn) { size_t i, j; - TfwSrvGroup *sg = test_create_sg("test", sched_helper_hash.sched); + TfwSrvGroup *sg = test_create_sg("test"); TfwServer *srv = test_create_srv("127.0.0.1", sg); - for (i = 0; i < TFW_SRV_MAX_CONN; ++i) { - TfwSrvConn *srv_conn = test_create_conn((TfwPeer *)srv); - sg->sched->add_conn(sg, srv, srv_conn); - } + for (i = 0; i < TFW_TEST_SRV_MAX_CONN_N; ++i) + test_create_srv_conn(srv); + test_start_sg(sg, sched_helper_hash.sched, 0); /* Check that every request is scheduled to the same connection. 
*/ for (i = 0; i < sched_helper_hash.conn_types; ++i) { TfwMsg *msg = sched_helper_hash.get_sched_arg(i); TfwSrvConn *exp_conn = NULL; - for (j = 0; j < TFW_SRV_MAX_CONN; ++j) { + for (j = 0; j < srv->conn_n; ++j) { TfwSrvConn *srv_conn = sg->sched->sched_srv_conn(msg, srv); @@ -228,8 +226,8 @@ TEST(tfw_sched_hash, sched_srv_one_srv_max_conn) tfw_srv_conn_put(srv_conn); /* - * Don't let wachtdog suppose that we have stucked - * on long cycles. + * Don't let the kernel watchdog decide + * that we are stuck in locked context. */ kernel_fpu_end(); schedule(); @@ -251,17 +249,15 @@ TEST(tfw_sched_hash, sched_srv_max_srv_max_conn) { size_t i, j; - TfwSrvGroup *sg = test_create_sg("test", sched_helper_hash.sched); + TfwSrvGroup *sg = test_create_sg("test"); - for (i = 0; i < TFW_SG_MAX_SRV; ++i) { + for (i = 0; i < TFW_TEST_SG_MAX_SRV_N; ++i) { TfwServer *srv = test_create_srv("127.0.0.1", sg); - for (j = 0; j < TFW_SRV_MAX_CONN; ++j) { - TfwSrvConn *srv_conn = - test_create_conn((TfwPeer *)srv); - sg->sched->add_conn(sg, srv, srv_conn); - } + for (j = 0; j < TFW_TEST_SRV_MAX_CONN_N; ++j) + test_create_srv_conn(srv); } + test_start_sg(sg, sched_helper_hash.sched, 0); /* Check that every request is scheduled to the same connection. */ for (i = 0; i < sched_helper_hash.conn_types; ++i) { @@ -271,7 +267,7 @@ TEST(tfw_sched_hash, sched_srv_max_srv_max_conn) list_for_each_entry(srv, &sg->srv_list, list) { TfwSrvConn *exp_conn = NULL; - for (j = 0; j < TFW_SG_MAX_SRV * TFW_SRV_MAX_CONN; ++j) { + for (j = 0; j < TFW_TEST_SG_MAX_CONN_N; ++j) { TfwSrvConn *srv_conn = sg->sched->sched_srv_conn(msg, srv); @@ -288,8 +284,8 @@ TEST(tfw_sched_hash, sched_srv_max_srv_max_conn) tfw_srv_conn_put(srv_conn); /* - * Don't let wachtdog suppose that we have - * stucked on long cycles. + * Don't let the kernel watchdog decide + * that we are stuck in locked context. 
*/ kernel_fpu_end(); schedule(); diff --git a/tempesta_fw/t/unit/test_sched_http.c b/tempesta_fw/t/unit/test_sched_http.c index 6ec104fe4..4e87cff01 100644 --- a/tempesta_fw/t/unit/test_sched_http.c +++ b/tempesta_fw/t/unit/test_sched_http.c @@ -110,7 +110,8 @@ TEST(tfw_sched_http, zero_rules_and_zero_conns) TEST(tfw_sched_http, one_rule_and_zero_conns) { - test_create_sg("default", "round-robin"); + TfwSrvGroup *sg = test_create_sg("default"); + test_start_sg(sg, "ratio", TFW_SG_F_SCHED_RATIO_STATIC); if (parse_cfg("sched_http_rules {\nmatch default * * *;\n}\n")) { TEST_FAIL("can't parse rules\n"); @@ -128,10 +129,10 @@ TEST(tfw_sched_http, one_wildcard_rule) TfwServer *srv; TfwSrvConn *expect_conn; - sg = test_create_sg("default", "round-robin"); + sg = test_create_sg("default"); srv = test_create_srv("127.0.0.1", sg); - expect_conn = test_create_conn((TfwPeer *)srv); - sg->sched->add_conn(sg, srv, expect_conn); + expect_conn = test_create_srv_conn(srv); + test_start_sg(sg, "ratio", TFW_SG_F_SCHED_RATIO_STATIC); if (parse_cfg("sched_http_rules {\nmatch default * * *;\n}\n")) { TEST_FAIL("can't parse rules\n"); @@ -153,55 +154,55 @@ TEST(tfw_sched_http, some_rules) *expect_conn5, *expect_conn6, *expect_conn7, *expect_conn8, *expect_conn9, *expect_conn10; - sg1 = test_create_sg("sg1", "round-robin"); + sg1 = test_create_sg("sg1"); srv = test_create_srv("127.0.0.1", sg1); - expect_conn1 = test_create_conn((TfwPeer *)srv); - sg1->sched->add_conn(sg1, srv, expect_conn1); + expect_conn1 = test_create_srv_conn(srv); + test_start_sg(sg1, "ratio", TFW_SG_F_SCHED_RATIO_STATIC); - sg2 = test_create_sg("sg2", "round-robin"); + sg2 = test_create_sg("sg2"); srv = test_create_srv("127.0.0.1", sg2); - expect_conn2 = test_create_conn((TfwPeer *)srv); - sg2->sched->add_conn(sg2, srv, expect_conn2); + expect_conn2 = test_create_srv_conn(srv); + test_start_sg(sg2, "ratio", TFW_SG_F_SCHED_RATIO_STATIC); - sg3 = test_create_sg("sg3", "round-robin"); + sg3 = test_create_sg("sg3"); srv 
= test_create_srv("127.0.0.1", sg3); - expect_conn3 = test_create_conn((TfwPeer *)srv); - sg3->sched->add_conn(sg3, srv, expect_conn3); + expect_conn3 = test_create_srv_conn(srv); + test_start_sg(sg3, "ratio", TFW_SG_F_SCHED_RATIO_STATIC); - sg4 = test_create_sg("sg4", "round-robin"); + sg4 = test_create_sg("sg4"); srv = test_create_srv("127.0.0.1", sg4); - expect_conn4 = test_create_conn((TfwPeer *)srv); - sg4->sched->add_conn(sg4, srv, expect_conn4); + expect_conn4 = test_create_srv_conn(srv); + test_start_sg(sg4, "ratio", TFW_SG_F_SCHED_RATIO_STATIC); - sg5 = test_create_sg("sg5", "round-robin"); + sg5 = test_create_sg("sg5"); srv = test_create_srv("127.0.0.1", sg5); - expect_conn5 = test_create_conn((TfwPeer *)srv); - sg5->sched->add_conn(sg5, srv, expect_conn5); + expect_conn5 = test_create_srv_conn(srv); + test_start_sg(sg5, "ratio", TFW_SG_F_SCHED_RATIO_STATIC); - sg6 = test_create_sg("sg6", "round-robin"); + sg6 = test_create_sg("sg6"); srv = test_create_srv("127.0.0.1", sg6); - expect_conn6 = test_create_conn((TfwPeer *)srv); - sg6->sched->add_conn(sg6, srv, expect_conn6); + expect_conn6 = test_create_srv_conn(srv); + test_start_sg(sg6, "ratio", TFW_SG_F_SCHED_RATIO_STATIC); - sg7 = test_create_sg("sg7", "round-robin"); + sg7 = test_create_sg("sg7"); srv = test_create_srv("127.0.0.1", sg7); - expect_conn7 = test_create_conn((TfwPeer *)srv); - sg7->sched->add_conn(sg7, srv, expect_conn7); + expect_conn7 = test_create_srv_conn(srv); + test_start_sg(sg7, "ratio", TFW_SG_F_SCHED_RATIO_STATIC); - sg8 = test_create_sg("sg8", "round-robin"); + sg8 = test_create_sg("sg8"); srv = test_create_srv("127.0.0.1", sg8); - expect_conn8 = test_create_conn((TfwPeer *)srv); - sg8->sched->add_conn(sg8, srv, expect_conn8); + expect_conn8 = test_create_srv_conn(srv); + test_start_sg(sg8, "ratio", TFW_SG_F_SCHED_RATIO_STATIC); - sg9 = test_create_sg("sg9", "round-robin"); + sg9 = test_create_sg("sg9"); srv = test_create_srv("127.0.0.1", sg9); - expect_conn9 = 
test_create_conn((TfwPeer *)srv); - sg9->sched->add_conn(sg9, srv, expect_conn9); + expect_conn9 = test_create_srv_conn(srv); + test_start_sg(sg9, "ratio", TFW_SG_F_SCHED_RATIO_STATIC); - sg10 = test_create_sg("sg10", "round-robin"); + sg10 = test_create_sg("sg10"); srv = test_create_srv("127.0.0.1", sg10); - expect_conn10 = test_create_conn((TfwPeer *)srv); - sg10->sched->add_conn(sg10, srv, expect_conn10); + expect_conn10 = test_create_srv_conn(srv); + test_start_sg(sg10, "ratio", TFW_SG_F_SCHED_RATIO_STATIC); if (parse_cfg("sched_http_rules {\nmatch sg1 uri eq /foo;\n\ match sg2 uri prefix /foo/bar;\n\ @@ -313,10 +314,10 @@ TEST(tfw_sched_http, one_rule) TfwServer *srv; TfwSrvConn *expect_conn; - sg = test_create_sg("default", "round-robin"); + sg = test_create_sg("default"); srv = test_create_srv("127.0.0.1", sg); - expect_conn = test_create_conn((TfwPeer *)srv); - sg->sched->add_conn(sg, srv, expect_conn); + expect_conn = test_create_srv_conn(srv); + test_start_sg(sg, "ratio", TFW_SG_F_SCHED_RATIO_STATIC); if (parse_cfg(test_cases[i].rule_str)) { TEST_FAIL("can't parse rules\n"); @@ -337,9 +338,9 @@ TEST_SUITE(sched_http) kernel_fpu_end(); - s = tfw_sched_lookup("round-robin"); + s = tfw_sched_lookup("ratio"); if (!s) - tfw_sched_rr_init(); + tfw_sched_ratio_init(); tfw_sched_http_init(); tfw_server_init(); diff --git a/tempesta_fw/t/unit/test_sched_ratio.c b/tempesta_fw/t/unit/test_sched_ratio.c new file mode 100644 index 000000000..21f6ea7eb --- /dev/null +++ b/tempesta_fw/t/unit/test_sched_ratio.c @@ -0,0 +1,340 @@ +/** + * Tempesta FW + * + * Copyright (C) 2014 NatSys Lab. (info@natsys-lab.com). + * Copyright (C) 2015-2017 Tempesta Technologies, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, + * or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#include + +#undef tfw_sock_srv_init +#define tfw_sock_srv_init test_ratio_sock_srv_conn_init +#undef tfw_sock_srv_exit +#define tfw_sock_srv_exit test_ratio_sock_srv_exit +#undef tfw_srv_conn_release +#define tfw_srv_conn_release test_ratio_srv_conn_release +#undef tfw_sock_srv_cfg_mod +#define tfw_sock_srv_cfg_mod test_ratio_srv_cfg_mod + +#include "sock_srv.c" + +#ifdef module_init +#undef module_init +#undef module_exit +#define module_init(func) +#define module_exit(func) +#endif + +#include "../../sched/tfw_sched_ratio.c" + +#include "sched_helper.h" +#include "server.h" +#include "test.h" + +static TfwMsg * +sched_ratio_get_arg(size_t conn_type __attribute__((unused))) +{ + return NULL; +} + +static void +sched_ratio_free_arg(TfwMsg *msg __attribute__((unused))) +{ +} + +static struct TestSchedHelper sched_helper_ratio = { + .sched = "ratio", + .flags = TFW_SG_F_SCHED_RATIO_STATIC, + .conn_types = 1, + .get_sched_arg = &sched_ratio_get_arg, + .free_sched_arg = &sched_ratio_free_arg, +}; + +TEST(tfw_sched_ratio, sg_empty) +{ + test_sched_sg_empty_sg(&sched_helper_ratio); +} + +TEST(tfw_sched_ratio, sched_sg_one_srv_zero_conn) +{ + test_sched_sg_one_srv_zero_conn(&sched_helper_ratio); +} + +TEST(tfw_sched_ratio, sched_sg_one_srv_max_conn) +{ + size_t i, j; + long long conn_acc = 0, conn_acc_check = 0; + + TfwSrvGroup *sg = test_create_sg("test"); + TfwServer *srv = test_create_srv("127.0.0.1", sg); + TfwSrvConn *srv_conn; + + for (i = 0; i < TFW_TEST_SRV_MAX_CONN_N; ++i) { + srv_conn = 
test_create_srv_conn(srv); + conn_acc ^= (long long)srv_conn; + } + test_start_sg(sg, sched_helper_ratio.sched, sched_helper_ratio.flags); + + /* + * Check that connections are scheduled in fair way: + * every connection will be scheduled only once + */ + for (i = 0; i < sched_helper_ratio.conn_types; ++i) { + TfwMsg *msg = sched_helper_ratio.get_sched_arg(i); + conn_acc_check = 0; + + for (j = 0; j < srv->conn_n; ++j) { + srv_conn = sg->sched->sched_sg_conn(msg, sg); + EXPECT_NOT_NULL(srv_conn); + if (!srv_conn) + goto err; + + conn_acc_check ^= (long long)srv_conn; + tfw_srv_conn_put(srv_conn); + /* + * Don't let the kernel watchdog decide + * that we are stuck in locked context. + */ + kernel_fpu_end(); + schedule(); + kernel_fpu_begin(); + } + + EXPECT_EQ(conn_acc, conn_acc_check); + sched_helper_ratio.free_sched_arg(msg); + } +err: + test_conn_release_all(sg); + test_sg_release_all(); +} + +TEST(tfw_sched_ratio, sched_sg_max_srv_zero_conn) +{ + test_sched_sg_max_srv_zero_conn(&sched_helper_ratio); +} + +TEST(tfw_sched_ratio, sched_sg_max_srv_max_conn) +{ + unsigned long i, j; + long long conn_acc = 0, conn_acc_check = 0; + + TfwSrvGroup *sg = test_create_sg("test"); + TfwServer *srv; + TfwSrvConn *srv_conn; + + for (i = 0; i < TFW_TEST_SG_MAX_SRV_N; ++i) { + srv = test_create_srv("127.0.0.1", sg); + + for (j = 0; j < TFW_TEST_SRV_MAX_CONN_N; ++j) { + srv_conn = test_create_srv_conn(srv); + conn_acc ^= (long long)srv_conn; + } + } + test_start_sg(sg, sched_helper_ratio.sched, sched_helper_ratio.flags); + + /* + * Check that connections are scheduled in fair way: + * every connection will be scheduled only once + */ + for (i = 0; i < sched_helper_ratio.conn_types; ++i) { + TfwMsg *msg = sched_helper_ratio.get_sched_arg(i); + conn_acc_check = 0; + + for (j = 0; j < TFW_TEST_SG_MAX_CONN_N; ++j) { + srv_conn = sg->sched->sched_sg_conn(msg, sg); + EXPECT_NOT_NULL(srv_conn); + if (!srv_conn) + goto err; + + conn_acc_check ^= (long long)srv_conn; + 
tfw_srv_conn_put(srv_conn); + } + + EXPECT_EQ(conn_acc, conn_acc_check); + sched_helper_ratio.free_sched_arg(msg); + } +err: + test_conn_release_all(sg); + test_sg_release_all(); +} + +TEST(tfw_sched_ratio, sched_srv_one_srv_zero_conn) +{ + test_sched_srv_one_srv_zero_conn(&sched_helper_ratio); +} + +TEST(tfw_sched_ratio, sched_srv_one_srv_max_conn) +{ + size_t i, j; + long long conn_acc = 0, conn_acc_check = 0; + + TfwSrvGroup *sg = test_create_sg("test"); + TfwServer *srv = test_create_srv("127.0.0.1", sg); + TfwSrvConn *srv_conn; + + for (i = 0; i < TFW_TEST_SRV_MAX_CONN_N; ++i) { + srv_conn = test_create_srv_conn(srv); + conn_acc ^= (long long)srv_conn; + } + test_start_sg(sg, sched_helper_ratio.sched, sched_helper_ratio.flags); + + /* + * Check that connections are scheduled in fair way: + * every connection will be scheduled only once + */ + for (i = 0; i < sched_helper_ratio.conn_types; ++i) { + TfwMsg *msg = sched_helper_ratio.get_sched_arg(i); + conn_acc_check = 0; + + for (j = 0; j < srv->conn_n; ++j) { + srv_conn = sg->sched->sched_srv_conn(msg, srv); + EXPECT_NOT_NULL(srv_conn); + if (!srv_conn) + goto err; + EXPECT_EQ((TfwServer *)srv_conn->peer, srv); + + conn_acc_check ^= (long long)srv_conn; + tfw_srv_conn_put(srv_conn); + + /* + * Don't let the kernel watchdog decide + * that we are stuck in locked context. 
+ */ + kernel_fpu_end(); + schedule(); + kernel_fpu_begin(); + } + + EXPECT_EQ(conn_acc, conn_acc_check); + sched_helper_ratio.free_sched_arg(msg); + } +err: + test_conn_release_all(sg); + test_sg_release_all(); +} + +TEST(tfw_sched_ratio, sched_srv_max_srv_zero_conn) +{ + test_sched_srv_max_srv_zero_conn(&sched_helper_ratio); +} + +TEST(tfw_sched_ratio, sched_srv_max_srv_max_conn) +{ + size_t i, j; + long long conn_acc_check = 0; + struct { + TfwServer *srv; + long long conn_acc; + } srv_acc[TFW_TEST_SG_MAX_SRV_N] = { 0 }; + TfwServer *srv; + TfwSrvConn *srv_conn; + + TfwSrvGroup *sg = test_create_sg("test"); + + for (i = 0; i < TFW_TEST_SG_MAX_SRV_N; ++i) { + srv = test_create_srv("127.0.0.1", sg); + srv_acc[i].srv = srv; + + for (j = 0; j < TFW_TEST_SRV_MAX_CONN_N; ++j) { + srv_conn = test_create_srv_conn(srv); + srv_acc[i].conn_acc ^= (long long)srv_conn; + } + } + test_start_sg(sg, sched_helper_ratio.sched, sched_helper_ratio.flags); + + /* + * Check that connections are scheduled in fair way: + * every connection will be scheduled only once + */ + for (i = 0; i < sched_helper_ratio.conn_types; ++i) { + TfwMsg *msg = sched_helper_ratio.get_sched_arg(i); + + list_for_each_entry(srv, &sg->srv_list, list) { + size_t k = 0; + conn_acc_check = 0; + + for (j = 0; j < srv->conn_n; ++j) { + srv_conn = sg->sched->sched_srv_conn(msg, srv); + EXPECT_NOT_NULL(srv_conn); + if (!srv_conn) + goto err; + EXPECT_EQ((TfwServer *)srv_conn->peer, srv); + + conn_acc_check ^= (long long)srv_conn; + tfw_srv_conn_put(srv_conn); + + /* + * Don't let the kernel watchdog decide + * that we are stuck in locked context. 
+ */ + kernel_fpu_end(); + schedule(); + kernel_fpu_begin(); + } + + for (k = 0; k < srv->conn_n; ++k) { + if (srv_acc[k].srv == srv) + EXPECT_EQ(srv_acc[k].conn_acc, + conn_acc_check); + } + } + sched_helper_ratio.free_sched_arg(msg); + } +err: + test_conn_release_all(sg); + test_sg_release_all(); +} + +TEST(tfw_sched_ratio, sched_srv_offline_srv) +{ + test_sched_srv_offline_srv(&sched_helper_ratio); +} + +TEST_SUITE(sched_ratio) +{ + kernel_fpu_end(); + + tfw_server_init(); + tfw_sched_ratio_init(); + + kernel_fpu_begin(); + + /* + * Schedulers have the same interface so some test cases can use generic + * implementations. Some test cases still have to know how scheduler + * work at low level. Please, keep same structure for implementation + * aware test cases across all schedulers. + * + * Implementation aware cases: + * sched_sg_one_srv_max_conn + * sched_sg_max_srv_max_conn + * sched_srv_one_srv_max_conn + * sched_srv_max_srv_max_conn + */ + + TEST_RUN(tfw_sched_ratio, sg_empty); + + TEST_RUN(tfw_sched_ratio, sched_sg_one_srv_zero_conn); + TEST_RUN(tfw_sched_ratio, sched_sg_one_srv_max_conn); + TEST_RUN(tfw_sched_ratio, sched_sg_max_srv_zero_conn); + TEST_RUN(tfw_sched_ratio, sched_sg_max_srv_max_conn); + + TEST_RUN(tfw_sched_ratio, sched_srv_one_srv_zero_conn); + TEST_RUN(tfw_sched_ratio, sched_srv_one_srv_max_conn); + TEST_RUN(tfw_sched_ratio, sched_srv_max_srv_zero_conn); + TEST_RUN(tfw_sched_ratio, sched_srv_max_srv_max_conn); + TEST_RUN(tfw_sched_ratio, sched_srv_offline_srv); +} diff --git a/tempesta_fw/t/unit/test_sched_rr.c b/tempesta_fw/t/unit/test_sched_rr.c deleted file mode 100644 index 918f0ae39..000000000 --- a/tempesta_fw/t/unit/test_sched_rr.c +++ /dev/null @@ -1,338 +0,0 @@ -/** - * Tempesta FW - * - * Copyright (C) 2014 NatSys Lab. (info@natsys-lab.com). - * Copyright (C) 2015-2017 Tempesta Technologies, Inc. 
- * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, - * or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - */ -#include - -#undef tfw_sock_srv_init -#define tfw_sock_srv_init test_rr_sock_srv_conn_init -#undef tfw_sock_srv_exit -#define tfw_sock_srv_exit test_rr_sock_srv_exit -#undef tfw_srv_conn_release -#define tfw_srv_conn_release test_rr_srv_conn_release -#undef tfw_sock_srv_cfg_mod -#define tfw_sock_srv_cfg_mod test_rr_srv_cfg_mod - -#include "sock_srv.c" - -#ifdef module_init -#undef module_init -#undef module_exit -#define module_init(func) -#define module_exit(func) -#endif - -#include "../../sched/tfw_sched_rr.c" - -#include "sched_helper.h" -#include "server.h" -#include "test.h" - -static TfwMsg * -sched_rr_get_arg(size_t conn_type __attribute__((unused))) -{ - return NULL; -} - -static void -sched_rr_free_arg(TfwMsg *msg __attribute__((unused))) -{ -} - -static struct TestSchedHelper sched_helper_rr = { - .sched = "round-robin", - .conn_types = 1, - .get_sched_arg = &sched_rr_get_arg, - .free_sched_arg = &sched_rr_free_arg, -}; - -TEST(tfw_sched_rr, sg_empty) -{ - test_sched_sg_empty_sg(&sched_helper_rr); -} - -TEST(tfw_sched_rr, sched_sg_one_srv_zero_conn) -{ - test_sched_sg_one_srv_zero_conn(&sched_helper_rr); -} - -TEST(tfw_sched_rr, sched_sg_one_srv_max_conn) -{ - size_t i, j; - long long conn_acc = 0, conn_acc_check = 0; - - 
TfwSrvGroup *sg = test_create_sg("test", sched_helper_rr.sched); - TfwServer *srv = test_create_srv("127.0.0.1", sg); - - for (i = 0; i < TFW_SRV_MAX_CONN; ++i) { - TfwSrvConn *srv_conn = test_create_conn((TfwPeer *)srv); - sg->sched->add_conn(sg, srv, srv_conn); - conn_acc ^= (long long)srv_conn; - } - - /* - * Check that connections is scheduled in the fair way: - * every connection will be scheduled only once - */ - for (i = 0; i < sched_helper_rr.conn_types; ++i) { - TfwMsg *msg = sched_helper_rr.get_sched_arg(i); - conn_acc_check = 0; - - for (j = 0; j < TFW_SRV_MAX_CONN; ++j) { - TfwSrvConn *srv_conn = - sg->sched->sched_sg_conn(msg, sg); - EXPECT_NOT_NULL(srv_conn); - if (!srv_conn) - goto err; - - conn_acc_check ^= (long long)srv_conn; - tfw_srv_conn_put(srv_conn); - /* - * Don't let wachtdog suppose that we have stucked - * on long cycles. - */ - kernel_fpu_end(); - schedule(); - kernel_fpu_begin(); - } - - EXPECT_EQ(conn_acc, conn_acc_check); - sched_helper_rr.free_sched_arg(msg); - } -err: - test_conn_release_all(sg); - test_sg_release_all(); -} - -TEST(tfw_sched_rr, sched_sg_max_srv_zero_conn) -{ - test_sched_sg_max_srv_zero_conn(&sched_helper_rr); -} - -TEST(tfw_sched_rr, sched_sg_max_srv_max_conn) -{ - size_t i, j; - long long conn_acc = 0, conn_acc_check = 0; - - TfwSrvGroup *sg = test_create_sg("test", sched_helper_rr.sched); - - for (i = 0; i < TFW_SG_MAX_SRV; ++i) { - TfwServer *srv = test_create_srv("127.0.0.1", sg); - - for (j = 0; j < TFW_SRV_MAX_CONN; ++j) { - TfwSrvConn *srv_conn = test_create_conn((TfwPeer *)srv); - sg->sched->add_conn(sg, srv, srv_conn); - conn_acc ^= (long long)srv_conn; - } - } - - /* - * Check that connections is scheduled in the fair way: - * every connection will be scheduled only once - */ - for (i = 0; i < sched_helper_rr.conn_types; ++i) { - TfwMsg *msg = sched_helper_rr.get_sched_arg(i); - conn_acc_check = 0; - - for (j = 0; j < TFW_SG_MAX_SRV * TFW_SRV_MAX_CONN; ++j) { - TfwSrvConn *srv_conn = - 
sg->sched->sched_sg_conn(msg, sg); - EXPECT_NOT_NULL(srv_conn); - if (!srv_conn) - goto err; - - conn_acc_check ^= (long long)srv_conn; - tfw_srv_conn_put(srv_conn); - } - - EXPECT_EQ(conn_acc, conn_acc_check); - sched_helper_rr.free_sched_arg(msg); - } -err: - test_conn_release_all(sg); - test_sg_release_all(); -} - -TEST(tfw_sched_rr, sched_srv_one_srv_zero_conn) -{ - test_sched_srv_one_srv_zero_conn(&sched_helper_rr); -} - -TEST(tfw_sched_rr, sched_srv_one_srv_max_conn) -{ - size_t i, j; - long long conn_acc = 0, conn_acc_check = 0; - - TfwSrvGroup *sg = test_create_sg("test", sched_helper_rr.sched); - TfwServer *srv = test_create_srv("127.0.0.1", sg); - - for (i = 0; i < TFW_SRV_MAX_CONN; ++i) { - TfwSrvConn *srv_conn = test_create_conn((TfwPeer *)srv); - sg->sched->add_conn(sg, srv, srv_conn); - conn_acc ^= (long long)srv_conn; - } - - /* - * Check that connections is scheduled in the fair way: - * every connection will be scheduled only once - */ - for (i = 0; i < sched_helper_rr.conn_types; ++i) { - TfwMsg *msg = sched_helper_rr.get_sched_arg(i); - conn_acc_check = 0; - - for (j = 0; j < TFW_SRV_MAX_CONN; ++j) { - TfwSrvConn *srv_conn = - sg->sched->sched_srv_conn(msg, srv); - EXPECT_NOT_NULL(srv_conn); - if (!srv_conn) - goto err; - EXPECT_EQ((TfwServer *)srv_conn->peer, srv); - - conn_acc_check ^= (long long)srv_conn; - tfw_srv_conn_put(srv_conn); - - /* - * Don't let wachtdog wuppose that we have stucked - * on long cycles. 
- */ - kernel_fpu_end(); - schedule(); - kernel_fpu_begin(); - } - - EXPECT_EQ(conn_acc, conn_acc_check); - sched_helper_rr.free_sched_arg(msg); - } -err: - test_conn_release_all(sg); - test_sg_release_all(); -} - -TEST(tfw_sched_rr, sched_srv_max_srv_zero_conn) -{ - test_sched_srv_max_srv_zero_conn(&sched_helper_rr); -} - -TEST(tfw_sched_rr, sched_srv_max_srv_max_conn) -{ - size_t i, j; - long long conn_acc_check = 0; - struct { - TfwServer *srv; - long long conn_acc; - } srv_acc[TFW_SG_MAX_SRV] = { {0} }; - - TfwSrvGroup *sg = test_create_sg("test", sched_helper_rr.sched); - - for (i = 0; i < TFW_SG_MAX_SRV; ++i) { - TfwServer *srv = test_create_srv("127.0.0.1", sg); - srv_acc[i].srv = srv; - - for (j = 0; j < TFW_SRV_MAX_CONN; ++j) { - TfwSrvConn *srv_conn = test_create_conn((TfwPeer *)srv); - sg->sched->add_conn(sg, srv, srv_conn); - srv_acc[i].conn_acc ^= (long long)srv_conn; - } - } - - /* - * Check that connections is scheduled in the fair way: - * every connection will be scheduled only once - */ - for (i = 0; i < sched_helper_rr.conn_types; ++i) { - TfwMsg *msg = sched_helper_rr.get_sched_arg(i); - TfwServer *srv; - - list_for_each_entry(srv, &sg->srv_list, list) { - size_t k = 0; - conn_acc_check = 0; - - for (j = 0; j < TFW_SRV_MAX_CONN; ++j) { - TfwSrvConn *srv_conn = - sg->sched->sched_srv_conn(msg, srv); - EXPECT_NOT_NULL(srv_conn); - if (!srv_conn) - goto err; - EXPECT_EQ((TfwServer *)srv_conn->peer, srv); - - conn_acc_check ^= (long long)srv_conn; - tfw_srv_conn_put(srv_conn); - - /* - * Don't let wachtdog wuppose that we have - * stucked on long cycles. 
- */ - kernel_fpu_end(); - schedule(); - kernel_fpu_begin(); - } - - for (k = 0; k < TFW_SG_MAX_SRV; ++k) { - if (srv_acc[k].srv == srv) - EXPECT_EQ(srv_acc[k].conn_acc, - conn_acc_check); - } - } - sched_helper_rr.free_sched_arg(msg); - } -err: - test_conn_release_all(sg); - test_sg_release_all(); -} - -TEST(tfw_sched_rr, sched_srv_offline_srv) -{ - test_sched_srv_offline_srv(&sched_helper_rr); -} - -TEST_SUITE(sched_rr) -{ - kernel_fpu_end(); - - tfw_server_init(); - tfw_sched_rr_init(); - - kernel_fpu_begin(); - - /* - * Schedulers have the same interface so some test cases can use generic - * implementations. Some test cases still have to know how scheduler - * work at low level. Please, keep same structure for implementation - * aware test cases across all schedulers. - * - * Implementation aware cases: - * sched_sg_one_srv_max_conn - * sched_sg_max_srv_max_conn - * sched_srv_one_srv_max_conn - * sched_srv_max_srv_max_conn - */ - - TEST_RUN(tfw_sched_rr, sg_empty); - - TEST_RUN(tfw_sched_rr, sched_sg_one_srv_zero_conn); - TEST_RUN(tfw_sched_rr, sched_sg_one_srv_max_conn); - TEST_RUN(tfw_sched_rr, sched_sg_max_srv_zero_conn); - TEST_RUN(tfw_sched_rr, sched_sg_max_srv_max_conn); - - TEST_RUN(tfw_sched_rr, sched_srv_one_srv_zero_conn); - TEST_RUN(tfw_sched_rr, sched_srv_one_srv_max_conn); - TEST_RUN(tfw_sched_rr, sched_srv_max_srv_zero_conn); - TEST_RUN(tfw_sched_rr, sched_srv_max_srv_max_conn); - TEST_RUN(tfw_sched_rr, sched_srv_offline_srv); -} diff --git a/tempesta_fw/t/unit/user_space/Makefile b/tempesta_fw/t/unit/user_space/Makefile index 212f114b8..bdec6b733 100644 --- a/tempesta_fw/t/unit/user_space/Makefile +++ b/tempesta_fw/t/unit/user_space/Makefile @@ -30,6 +30,7 @@ CACHELINE := $(shell getconf LEVEL1_DCACHE_LINESIZE) CFLAGS = -O0 -ggdb -Wall -Werror \ -pthread -DL1_CACHE_BYTES=$(CACHELINE) \ -I../../../../ktest +CXXFLAGS = -std=c++11 ${CFLAGS} TARGETS = alb percentiles slr all : $(TARGETS) @@ -41,7 +42,7 @@ alb : alb.c $(CC) $(CFLAGS) -o $@ $^ slr : 
slr.cc - $(CXX) $(CFLAGS) -o $@ $^ + $(CXX) $(CXXFLAGS) -o $@ $^ clean : FORCE rm -f *.o *~ *.orig $(TARGETS) diff --git a/tempesta_fw/t/unit/user_space/percentiles.c b/tempesta_fw/t/unit/user_space/percentiles.c index 278c72821..021695450 100644 --- a/tempesta_fw/t/unit/user_space/percentiles.c +++ b/tempesta_fw/t/unit/user_space/percentiles.c @@ -174,7 +174,7 @@ static void __range_shrink_left(TfwPcntRanges *rng, TfwPcntCtl *pc, int r) { int i; - unsigned long tmp; + unsigned long cnt_full, cnt_half; --pc->order; pc->begin = pc->end - ((TFW_STAT_BCKTS - 1) << pc->order); @@ -190,14 +190,15 @@ __range_shrink_left(TfwPcntRanges *rng, TfwPcntCtl *pc, int r) */ for (i = 1; i < TFW_STAT_BCKTS / 2; ++i) atomic_add(atomic_read(&rng->cnt[r][i]), &rng->cnt[r][0]); - tmp = atomic_read(&rng->cnt[r][TFW_STAT_BCKTS / 2]) / 2; - atomic_add(tmp, &rng->cnt[r][0]); - atomic_set(&rng->cnt[r][1], tmp); + cnt_full = atomic_read(&rng->cnt[r][TFW_STAT_BCKTS / 2]); + cnt_half = cnt_full / 2; + atomic_add(cnt_half, &rng->cnt[r][0]); + atomic_set(&rng->cnt[r][1], cnt_full - cnt_half); for (i = 1; i < TFW_STAT_BCKTS / 2; ++i) { - tmp = atomic_read(&rng->cnt[r][TFW_STAT_BCKTS / 2 + i]); - tmp /= 2; - atomic_set(&rng->cnt[r][i * 2], tmp); - atomic_set(&rng->cnt[r][i * 2 + 1], tmp); + cnt_full = atomic_read(&rng->cnt[r][TFW_STAT_BCKTS / 2 + i]); + cnt_half = cnt_full / 2; + atomic_set(&rng->cnt[r][i * 2], cnt_half); + atomic_set(&rng->cnt[r][i * 2 + 1], cnt_full - cnt_half); } } diff --git a/tempesta_fw/t/unit/user_space/slr.cc b/tempesta_fw/t/unit/user_space/slr.cc index afb4c05eb..7974e5acc 100644 --- a/tempesta_fw/t/unit/user_space/slr.cc +++ b/tempesta_fw/t/unit/user_space/slr.cc @@ -24,14 +24,13 @@ #include -template +template class SLR { - static const long WSZ = 5; // Use the multiplier to calculate @y with 1/MUL // precission on integer arithmetic. 
*/ static const long MUL = 1000; - unsigned long n; /* observation number */ + long n; /* observation number */ T x_avg, y_avg; T xy_avg; /* avg(x * y) */ T x_avg_y_avg; /* avg(x) * avg(y) */ @@ -41,7 +40,7 @@ class SLR { struct { T x; T y; - } win[WSZ]; + } win[wsz]; public: SLR() @@ -52,30 +51,30 @@ class SLR { void slr_upd(long x, long y) { - size_t ni, cnt; + int ni, sz; y *= MUL; x *= MUL; + ni = n % wsz; - if (n < WSZ) { - ni = n; - cnt = n + 1; - x_avg = (x_avg * n + x) / cnt; - y_avg = (y_avg * n + y) / cnt; - xy_avg = (xy_avg * n + y * x) / cnt; + if (n < wsz) { + sz = ni + 1; + x_avg = (x_avg * n + x) / sz; + y_avg = (y_avg * n + y) / sz; + xy_avg = (xy_avg * n + y * x) / sz; x_avg_y_avg = x_avg * y_avg; - x_sq_avg = (x_sq_avg * n + x * x) / cnt; + x_sq_avg = (x_sq_avg * n + x * x) / sz; x_avg_sq = x_avg * x_avg; } else { // Forget history before the window // to adopt to new pattern. - ni = n % WSZ; - x_avg = x_avg - (win[ni].x - x) / WSZ; - y_avg = y_avg - (win[ni].y - y) / WSZ; - xy_avg = xy_avg - (win[ni].x * win[ni].y - y * x) / WSZ; + sz = wsz; + x_avg = x_avg - (win[ni].x - x) / sz; + y_avg = y_avg - (win[ni].y - y) / sz; + xy_avg = xy_avg - (win[ni].x * win[ni].y - y * x) / sz; x_avg_y_avg = x_avg * y_avg; x_sq_avg = x_sq_avg - (win[ni].x * win[ni].x - x * x) - / WSZ; + / sz; x_avg_sq = x_avg * x_avg; } @@ -127,11 +126,11 @@ class SLR { } }; -template +template void test() { - SLR slr; + SLR slr; slr.add_data(1, 3); slr.add_data(2, 5); @@ -149,14 +148,44 @@ test() slr.predict(15); } +// The major thing this test verifies is that the calculations +// don't break when they're switched from working on partial +// history to working on full-size history. 
+template +void +test_verified() +{ + SLR slr; + + slr.add_data(1, 1); + slr.add_data(2, 1); + slr.add_data(3, 1); + slr.add_data(4, 1); + slr.add_data(5, 1); + slr.add_data(6, 1); + slr.add_data(7, 1); + slr.add_data(8, 1); + slr.add_data(9, 1); + slr.add_data(10, 1); + slr.add_data(11, 1); + slr.add_data(12, 1); + slr.add_data(13, 1); + + slr.predict(15); +} + int main(int argc, char *argv[]) { std::cout << "TEST for double" << std::endl; - test(); + test(); std::cout << "TEST for long" << std::endl; - test(); + test(); + + std::cout << "Verified test for long, the result should be '1'" << std::endl; + test_verified(); + return 0; std::cout << std::endl; @@ -170,7 +199,7 @@ main(int argc, char *argv[]) std::cout << "> "; long x, y, pred_x; - SLR slr; + SLR slr; while (std::cin >> x >> y >> pred_x) { slr.slr_upd(x, y); std::cout << "(x=" << x << " y=" << y